-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 1998-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
# ====================================================================
# [Re]written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# Westmere 5.1/+94%(**)
# Sandy Bridge 5.0/+8%
# Atom 12.6/+6%
+# VIA Nano 6.4/+9%
+# Ivy Bridge 4.9/±0%
+# Bulldozer 4.9/+15%
#
# (*) PIII can actually deliver 6.6 cycles per byte with MMX code,
# but this specific code performs poorly on Core2. And vice
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
-&asm_init($ARGV[0],"rc4-586.pl");
+$output=pop;
+open STDOUT,">$output";
+
+&asm_init($ARGV[0],"rc4-586.pl",$x86only = $ARGV[$#ARGV] eq "386");
$xx="eax";
$yy="ebx";
&movd ($i>0?"mm1":"mm2",&DWP(0,$dat,$ty,4));
# (*) This is the key to Core2 and Westmere performance.
- # Whithout movz out-of-order execution logic confuses
+ # Without movz out-of-order execution logic confuses
# itself and fails to reorder loads and stores. Problem
# appears to be fixed in Sandy Bridge...
}
&and ($ty,-4); # how many 4-byte chunks?
&jz (&label("loop1"));
- &test ($ty,-8);
&mov (&wparam(3),$out); # $out as accumulator in these loops
+ if ($x86only) {
+ &jmp (&label("go4loop4"));
+ } else {
+ &test ($ty,-8);
&jz (&label("go4loop4"));
&picmeup($out,"OPENSSL_ia32cap_P");
&cmp ($inp,&wparam(1)); # compare to input+len
&je (&label("done"));
&jmp (&label("loop1"));
+ }
&set_label("go4loop4",16);
&lea ($ty,&DWP(-4,$inp,$ty));
$idx="edx";
# void RC4_set_key(RC4_KEY *key,int len,const unsigned char *data);
-&function_begin("private_RC4_set_key");
+&function_begin("RC4_set_key");
&mov ($out,&wparam(0)); # load key
&mov ($idi,&wparam(1)); # load len
&mov ($inp,&wparam(2)); # load data
&xor ("eax","eax");
&mov (&DWP(-8,$out),"eax"); # key->x=0;
&mov (&DWP(-4,$out),"eax"); # key->y=0;
-&function_end("private_RC4_set_key");
+&function_end("RC4_set_key");
# const char *RC4_options(void);
&function_begin_B("RC4_options");
&asm_finish();
+close STDOUT;