upd: openssl to 1.1.0

[cassiopeia.git] / lib / openssl / crypto / aes / asm / aes-586.pl
diff --git a/lib/openssl/crypto/aes/asm/aes-586.pl b/lib/openssl/crypto/aes/asm/aes-586.pl

index 687ed811be4796639d79841ee11fc882470a9dc9..1ba356508a42b12583193b32680dd89db4291239 100755 (executable)
--- a/lib/openssl/crypto/aes/asm/aes-586.pl
+++ b/lib/openssl/crypto/aes/asm/aes-586.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2004-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
  #
  # ====================================================================
  # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
  #
  # ====================================================================
  # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
@@ -39,13 +46,13 @@
  # but exhibits up to 10% improvement on other cores.
  #
  # Second version is "monolithic" replacement for aes_core.c, which in
  # but exhibits up to 10% improvement on other cores.
  #
  # Second version is "monolithic" replacement for aes_core.c, which in
-# addition to AES_[de|en]crypt implements private_AES_set_[de|en]cryption_key.
+# addition to AES_[de|en]crypt implements AES_set_[de|en]cryption_key.
  # This made it possible to implement little-endian variant of the
  # algorithm without modifying the base C code. Motivating factor for
  # the undertaken effort was that it appeared that in tight IA-32
  # register window little-endian flavor could achieve slightly higher
  # Instruction Level Parallelism, and it indeed resulted in up to 15%
  # This made it possible to implement little-endian variant of the
  # algorithm without modifying the base C code. Motivating factor for
  # the undertaken effort was that it appeared that in tight IA-32
  # register window little-endian flavor could achieve slightly higher
  # Instruction Level Parallelism, and it indeed resulted in up to 15%
-# better performance on most recent µ-archs...
+# better performance on most recent µ-archs...
  #
  # Third version adds AES_cbc_encrypt implementation, which resulted in
  # up to 40% performance improvement of CBC benchmark results. 40% was
  #
  # Third version adds AES_cbc_encrypt implementation, which resulted in
  # up to 40% performance improvement of CBC benchmark results. 40% was
@@ -103,11 +110,12 @@
  # byte for 128-bit key.
  #
  #              ECB encrypt     ECB decrypt     CBC large chunk
  # byte for 128-bit key.
  #
  #              ECB encrypt     ECB decrypt     CBC large chunk
-# P4           56[60]          84[100]         23
-# AMD K8       48[44]          70[79]          18
-# PIII         41[50]          61[91]          24
-# Core 2       32[38]          45[70]          18.5
-# Pentium      120             160             77
+# P4           52[54]          83[95]          23
+# AMD K8       46[41]          66[70]          18
+# PIII         41[50]          60[77]          24
+# Core 2       31[36]          45[64]          18.5
+# Atom         76[100]         96[138]         60
+# Pentium      115             150             77
  #
  # Version 4.1 switches to compact S-box even in key schedule setup.
  #
  #
  # Version 4.1 switches to compact S-box even in key schedule setup.
  #
@@ -190,6 +198,10 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  push(@INC,"${dir}","${dir}../../perlasm");
  require "x86asm.pl";
  
  push(@INC,"${dir}","${dir}../../perlasm");
  require "x86asm.pl";
  
+$output = pop;
+open OUT,">$output";
+*STDOUT=*OUT;
+
  &asm_init($ARGV[0],"aes-586.pl",$x86only = $ARGV[$#ARGV] eq "386");
  &static_label("AES_Te");
  &static_label("AES_Td");
  &asm_init($ARGV[0],"aes-586.pl",$x86only = $ARGV[$#ARGV] eq "386");
  &static_label("AES_Te");
  &static_label("AES_Td");
@@ -223,7 +235,7 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
  $speed_limit=512;      # chunks smaller than $speed_limit are
                         # processed with compact routine in CBC mode
  $small_footprint=1;    # $small_footprint=1 code is ~5% slower [on
  $speed_limit=512;      # chunks smaller than $speed_limit are
                         # processed with compact routine in CBC mode
  $small_footprint=1;    # $small_footprint=1 code is ~5% slower [on
-                       # recent µ-archs], but ~5 times smaller!
+                       # recent µ-archs], but ~5 times smaller!
                         # I favor compact code to minimize cache
                         # contention and in hope to "collect" 5% back
                         # in real-life applications...
                         # I favor compact code to minimize cache
                         # contention and in hope to "collect" 5% back
                         # in real-life applications...
@@ -242,7 +254,7 @@ $vertical_spin=0;   # shift "verticaly" defaults to 0, because of
  
  sub encvert()
  { my ($te,@s) = @_;
  
  sub encvert()
  { my ($te,@s) = @_;
-  my $v0 = $acc, $v1 = $key;
+  my ($v0,$v1) = ($acc,$key);
  
         &mov    ($v0,$s[3]);                            # copy s3
         &mov    (&DWP(4,"esp"),$s[2]);                  # save s2
  
         &mov    ($v0,$s[3]);                            # copy s3
         &mov    (&DWP(4,"esp"),$s[2]);                  # save s2
@@ -299,7 +311,7 @@ sub encvert()
  # Another experimental routine, which features "horizontal spin," but
  # eliminates one reference to stack. Strangely enough runs slower...
  sub enchoriz()
  # Another experimental routine, which features "horizontal spin," but
  # eliminates one reference to stack. Strangely enough runs slower...
  sub enchoriz()
-{ my $v0 = $key, $v1 = $acc;
+{ my ($v0,$v1) = ($key,$acc);
  
         &movz   ($v0,&LB($s0));                 #  3, 2, 1, 0*
         &rotr   ($s2,8);                        #  8,11,10, 9
  
         &movz   ($v0,&LB($s0));                 #  3, 2, 1, 0*
         &rotr   ($s2,8);                        #  8,11,10, 9
@@ -427,7 +439,7 @@ sub sse_encbody()
  ######################################################################
  
  sub enccompact()
  ######################################################################
  
  sub enccompact()
-{ my $Fn = mov;
+{ my $Fn = \&mov;
    while ($#_>5) { pop(@_); $Fn=sub{}; }
    my ($i,$te,@s)=@_;
    my $tmp = $key;
    while ($#_>5) { pop(@_); $Fn=sub{}; }
    my ($i,$te,@s)=@_;
    my $tmp = $key;
@@ -476,24 +488,25 @@ sub enctransform()
    my $tmp = $tbl;
    my $r2  = $key ;
  
    my $tmp = $tbl;
    my $r2  = $key ;
  
-       &mov    ($acc,$s[$i]);
-       &and    ($acc,0x80808080);
-       &mov    ($tmp,$acc);
-       &shr    ($tmp,7);
+       &and    ($tmp,$s[$i]);
         &lea    ($r2,&DWP(0,$s[$i],$s[$i]));
         &lea    ($r2,&DWP(0,$s[$i],$s[$i]));
-       &sub    ($acc,$tmp);
+       &mov    ($acc,$tmp);
+       &shr    ($tmp,7);
         &and    ($r2,0xfefefefe);
         &and    ($r2,0xfefefefe);
-       &and    ($acc,0x1b1b1b1b);
+       &sub    ($acc,$tmp);
         &mov    ($tmp,$s[$i]);
         &mov    ($tmp,$s[$i]);
+       &and    ($acc,0x1b1b1b1b);
+       &rotr   ($tmp,16);
         &xor    ($acc,$r2);     # r2
         &xor    ($acc,$r2);     # r2
+       &mov    ($r2,$s[$i]);
  
         &xor    ($s[$i],$acc);  # r0 ^ r2
  
         &xor    ($s[$i],$acc);  # r0 ^ r2
+       &rotr   ($r2,16+8);
+       &xor    ($acc,$tmp);
         &rotl   ($s[$i],24);
         &rotl   ($s[$i],24);
-       &xor    ($s[$i],$acc)   # ROTATE(r2^r0,24) ^ r2
-       &rotr   ($tmp,16);
-       &xor    ($s[$i],$tmp);
-       &rotr   ($tmp,8);
-       &xor    ($s[$i],$tmp);
+       &xor    ($acc,$r2);
+       &mov    ($tmp,0x80808080)       if ($i!=1);
+       &xor    ($s[$i],$acc);  # ROTATE(r2^r0,24) ^ r2
  }
  
  &function_begin_B("_x86_AES_encrypt_compact");
  }
  
  &function_begin_B("_x86_AES_encrypt_compact");
@@ -526,6 +539,7 @@ sub enctransform()
                 &enccompact(1,$tbl,$s1,$s2,$s3,$s0,1);
                 &enccompact(2,$tbl,$s2,$s3,$s0,$s1,1);
                 &enccompact(3,$tbl,$s3,$s0,$s1,$s2,1);
                 &enccompact(1,$tbl,$s1,$s2,$s3,$s0,1);
                 &enccompact(2,$tbl,$s2,$s3,$s0,$s1,1);
                 &enccompact(3,$tbl,$s3,$s0,$s1,$s2,1);
+               &mov    ($tbl,0x80808080);
                 &enctransform(2);
                 &enctransform(3);
                 &enctransform(0);
                 &enctransform(2);
                 &enctransform(3);
                 &enctransform(0);
@@ -562,7 +576,7 @@ sub enctransform()
  # Performance is not actually extraordinary in comparison to pure
  # x86 code. In particular encrypt performance is virtually the same.
  # Decrypt performance on the other hand is 15-20% better on newer
  # Performance is not actually extraordinary in comparison to pure
  # x86 code. In particular encrypt performance is virtually the same.
  # Decrypt performance on the other hand is 15-20% better on newer
-# µ-archs [but we're thankful for *any* improvement here], and ~50%
+# µ-archs [but we're thankful for *any* improvement here], and ~50%
  # better on PIII:-) And additionally on the pros side this code
  # eliminates redundant references to stack and thus relieves/
  # minimizes the pressure on the memory bus.
  # better on PIII:-) And additionally on the pros side this code
  # eliminates redundant references to stack and thus relieves/
  # minimizes the pressure on the memory bus.
@@ -607,82 +621,84 @@ sub sse_enccompact()
         &pshufw ("mm5","mm4",0x0d);             # 15,14,11,10
         &movd   ("eax","mm1");                  #  5, 4, 1, 0
         &movd   ("ebx","mm5");                  # 15,14,11,10
         &pshufw ("mm5","mm4",0x0d);             # 15,14,11,10
         &movd   ("eax","mm1");                  #  5, 4, 1, 0
         &movd   ("ebx","mm5");                  # 15,14,11,10
+       &mov    ($__key,$key);
  
         &movz   ($acc,&LB("eax"));              #  0
  
         &movz   ($acc,&LB("eax"));              #  0
-       &movz   ("ecx",&BP(-128,$tbl,$acc,1));  #  0
-       &pshufw ("mm2","mm0",0x0d);             #  7, 6, 3, 2
         &movz   ("edx",&HB("eax"));             #  1
         &movz   ("edx",&HB("eax"));             #  1
+       &pshufw ("mm2","mm0",0x0d);             #  7, 6, 3, 2
+       &movz   ("ecx",&BP(-128,$tbl,$acc,1));  #  0
+       &movz   ($key,&LB("ebx"));              # 10
         &movz   ("edx",&BP(-128,$tbl,"edx",1)); #  1
         &movz   ("edx",&BP(-128,$tbl,"edx",1)); #  1
-       &shl    ("edx",8);                      #  1
         &shr    ("eax",16);                     #  5, 4
         &shr    ("eax",16);                     #  5, 4
+       &shl    ("edx",8);                      #  1
  
  
-       &movz   ($acc,&LB("ebx"));              # 10
-       &movz   ($acc,&BP(-128,$tbl,$acc,1));   # 10
+       &movz   ($acc,&BP(-128,$tbl,$key,1));   # 10
+       &movz   ($key,&HB("ebx"));              # 11
         &shl    ($acc,16);                      # 10
         &shl    ($acc,16);                      # 10
-       &or     ("ecx",$acc);                   # 10
         &pshufw ("mm6","mm4",0x08);             # 13,12, 9, 8
         &pshufw ("mm6","mm4",0x08);             # 13,12, 9, 8
-       &movz   ($acc,&HB("ebx"));              # 11
-       &movz   ($acc,&BP(-128,$tbl,$acc,1));   # 11
+       &or     ("ecx",$acc);                   # 10
+       &movz   ($acc,&BP(-128,$tbl,$key,1));   # 11
+       &movz   ($key,&HB("eax"));              #  5
         &shl    ($acc,24);                      # 11
         &shl    ($acc,24);                      # 11
-       &or     ("edx",$acc);                   # 11
         &shr    ("ebx",16);                     # 15,14
         &shr    ("ebx",16);                     # 15,14
+       &or     ("edx",$acc);                   # 11
  
  
-       &movz   ($acc,&HB("eax"));              #  5
-       &movz   ($acc,&BP(-128,$tbl,$acc,1));   #  5
+       &movz   ($acc,&BP(-128,$tbl,$key,1));   #  5
+       &movz   ($key,&HB("ebx"));              # 15
         &shl    ($acc,8);                       #  5
         &or     ("ecx",$acc);                   #  5
         &shl    ($acc,8);                       #  5
         &or     ("ecx",$acc);                   #  5
-       &movz   ($acc,&HB("ebx"));              # 15
-       &movz   ($acc,&BP(-128,$tbl,$acc,1));   # 15
+       &movz   ($acc,&BP(-128,$tbl,$key,1));   # 15
+       &movz   ($key,&LB("eax"));              #  4
         &shl    ($acc,24);                      # 15
         &or     ("ecx",$acc);                   # 15
         &shl    ($acc,24);                      # 15
         &or     ("ecx",$acc);                   # 15
-       &movd   ("mm0","ecx");                  # t[0] collected
  
  
-       &movz   ($acc,&LB("eax"));              #  4
-       &movz   ("ecx",&BP(-128,$tbl,$acc,1));  #  4
+       &movz   ($acc,&BP(-128,$tbl,$key,1));   #  4
+       &movz   ($key,&LB("ebx"));              # 14
         &movd   ("eax","mm2");                  #  7, 6, 3, 2
         &movd   ("eax","mm2");                  #  7, 6, 3, 2
-       &movz   ($acc,&LB("ebx"));              # 14
-       &movz   ($acc,&BP(-128,$tbl,$acc,1));   # 14
-       &shl    ($acc,16);                      # 14
+       &movd   ("mm0","ecx");                  # t[0] collected
+       &movz   ("ecx",&BP(-128,$tbl,$key,1));  # 14
+       &movz   ($key,&HB("eax"));              #  3
+       &shl    ("ecx",16);                     # 14
+       &movd   ("ebx","mm6");                  # 13,12, 9, 8
         &or     ("ecx",$acc);                   # 14
  
         &or     ("ecx",$acc);                   # 14
  
-       &movd   ("ebx","mm6");                  # 13,12, 9, 8
-       &movz   ($acc,&HB("eax"));              #  3
-       &movz   ($acc,&BP(-128,$tbl,$acc,1));   #  3
+       &movz   ($acc,&BP(-128,$tbl,$key,1));   #  3
+       &movz   ($key,&HB("ebx"));              #  9
         &shl    ($acc,24);                      #  3
         &or     ("ecx",$acc);                   #  3
         &shl    ($acc,24);                      #  3
         &or     ("ecx",$acc);                   #  3
-       &movz   ($acc,&HB("ebx"));              #  9
-       &movz   ($acc,&BP(-128,$tbl,$acc,1));   #  9
+       &movz   ($acc,&BP(-128,$tbl,$key,1));   #  9
+       &movz   ($key,&LB("ebx"));              #  8
         &shl    ($acc,8);                       #  9
         &shl    ($acc,8);                       #  9
+       &shr    ("ebx",16);                     # 13,12
         &or     ("ecx",$acc);                   #  9
         &or     ("ecx",$acc);                   #  9
-       &movd   ("mm1","ecx");                  # t[1] collected
  
  
-       &movz   ($acc,&LB("ebx"));              #  8
-       &movz   ("ecx",&BP(-128,$tbl,$acc,1));  #  8
-       &shr    ("ebx",16);                     # 13,12
-       &movz   ($acc,&LB("eax"));              #  2
-       &movz   ($acc,&BP(-128,$tbl,$acc,1));   #  2
-       &shl    ($acc,16);                      #  2
-       &or     ("ecx",$acc);                   #  2
+       &movz   ($acc,&BP(-128,$tbl,$key,1));   #  8
+       &movz   ($key,&LB("eax"));              #  2
         &shr    ("eax",16);                     #  7, 6
         &shr    ("eax",16);                     #  7, 6
+       &movd   ("mm1","ecx");                  # t[1] collected
+       &movz   ("ecx",&BP(-128,$tbl,$key,1));  #  2
+       &movz   ($key,&HB("eax"));              #  7
+       &shl    ("ecx",16);                     #  2
+       &and    ("eax",0xff);                   #  6
+       &or     ("ecx",$acc);                   #  2
  
         &punpckldq      ("mm0","mm1");          # t[0,1] collected
  
  
         &punpckldq      ("mm0","mm1");          # t[0,1] collected
  
-       &movz   ($acc,&HB("eax"));              #  7
-       &movz   ($acc,&BP(-128,$tbl,$acc,1));   #  7
+       &movz   ($acc,&BP(-128,$tbl,$key,1));   #  7
+       &movz   ($key,&HB("ebx"));              # 13
         &shl    ($acc,24);                      #  7
         &shl    ($acc,24);                      #  7
-       &or     ("ecx",$acc);                   #  7
-       &and    ("eax",0xff);                   #  6
+       &and    ("ebx",0xff);                   # 12
         &movz   ("eax",&BP(-128,$tbl,"eax",1)); #  6
         &movz   ("eax",&BP(-128,$tbl,"eax",1)); #  6
+       &or     ("ecx",$acc);                   #  7
         &shl    ("eax",16);                     #  6
         &shl    ("eax",16);                     #  6
+       &movz   ($acc,&BP(-128,$tbl,$key,1));   # 13
         &or     ("edx","eax");                  #  6
         &or     ("edx","eax");                  #  6
-       &movz   ($acc,&HB("ebx"));              # 13
-       &movz   ($acc,&BP(-128,$tbl,$acc,1));   # 13
         &shl    ($acc,8);                       # 13
         &shl    ($acc,8);                       # 13
-       &or     ("ecx",$acc);                   # 13
-       &movd   ("mm4","ecx");                  # t[2] collected
-       &and    ("ebx",0xff);                   # 12
         &movz   ("ebx",&BP(-128,$tbl,"ebx",1)); # 12
         &movz   ("ebx",&BP(-128,$tbl,"ebx",1)); # 12
+       &or     ("ecx",$acc);                   # 13
         &or     ("edx","ebx");                  # 12
         &or     ("edx","ebx");                  # 12
+       &mov    ($key,$__key);
+       &movd   ("mm4","ecx");                  # t[2] collected
         &movd   ("mm5","edx");                  # t[3] collected
  
         &punpckldq      ("mm4","mm5");          # t[2,3] collected
         &movd   ("mm5","edx");                  # t[3] collected
  
         &punpckldq      ("mm4","mm5");          # t[2,3] collected
@@ -1222,7 +1238,7 @@ sub enclast()
  ######################################################################
  
  sub deccompact()
  ######################################################################
  
  sub deccompact()
-{ my $Fn = mov;
+{ my $Fn = \&mov;
    while ($#_>5) { pop(@_); $Fn=sub{}; }
    my ($i,$td,@s)=@_;
    my $tmp = $key;
    while ($#_>5) { pop(@_); $Fn=sub{}; }
    my ($i,$td,@s)=@_;
    my $tmp = $key;
@@ -1270,30 +1286,30 @@ sub dectransform()
    my $tp4 = @s[($i+3)%4]; $tp4 = @s[3] if ($i==1);
    my $tp8 = $tbl;
  
    my $tp4 = @s[($i+3)%4]; $tp4 = @s[3] if ($i==1);
    my $tp8 = $tbl;
  
-       &mov    ($acc,$s[$i]);
-       &and    ($acc,0x80808080);
-       &mov    ($tmp,$acc);
+       &mov    ($tmp,0x80808080);
+       &and    ($tmp,$s[$i]);
+       &mov    ($acc,$tmp);
         &shr    ($tmp,7);
         &lea    ($tp2,&DWP(0,$s[$i],$s[$i]));
         &sub    ($acc,$tmp);
         &and    ($tp2,0xfefefefe);
         &and    ($acc,0x1b1b1b1b);
         &shr    ($tmp,7);
         &lea    ($tp2,&DWP(0,$s[$i],$s[$i]));
         &sub    ($acc,$tmp);
         &and    ($tp2,0xfefefefe);
         &and    ($acc,0x1b1b1b1b);
-       &xor    ($acc,$tp2);
-       &mov    ($tp2,$acc);
+       &xor    ($tp2,$acc);
+       &mov    ($tmp,0x80808080);
  
  
-       &and    ($acc,0x80808080);
-       &mov    ($tmp,$acc);
+       &and    ($tmp,$tp2);
+       &mov    ($acc,$tmp);
         &shr    ($tmp,7);
         &lea    ($tp4,&DWP(0,$tp2,$tp2));
         &sub    ($acc,$tmp);
         &and    ($tp4,0xfefefefe);
         &and    ($acc,0x1b1b1b1b);
          &xor   ($tp2,$s[$i]);  # tp2^tp1
         &shr    ($tmp,7);
         &lea    ($tp4,&DWP(0,$tp2,$tp2));
         &sub    ($acc,$tmp);
         &and    ($tp4,0xfefefefe);
         &and    ($acc,0x1b1b1b1b);
          &xor   ($tp2,$s[$i]);  # tp2^tp1
-       &xor    ($acc,$tp4);
-       &mov    ($tp4,$acc);
+       &xor    ($tp4,$acc);
+       &mov    ($tmp,0x80808080);
  
  
-       &and    ($acc,0x80808080);
-       &mov    ($tmp,$acc);
+       &and    ($tmp,$tp4);
+       &mov    ($acc,$tmp);
         &shr    ($tmp,7);
         &lea    ($tp8,&DWP(0,$tp4,$tp4));
         &sub    ($acc,$tmp);
         &shr    ($tmp,7);
         &lea    ($tp8,&DWP(0,$tp4,$tp4));
         &sub    ($acc,$tmp);
@@ -1305,13 +1321,13 @@ sub dectransform()
  
         &xor    ($s[$i],$tp2);
         &xor    ($tp2,$tp8);
  
         &xor    ($s[$i],$tp2);
         &xor    ($tp2,$tp8);
-       &rotl   ($tp2,24);
         &xor    ($s[$i],$tp4);
         &xor    ($tp4,$tp8);
         &xor    ($s[$i],$tp4);
         &xor    ($tp4,$tp8);
-       &rotl   ($tp4,16);
+       &rotl   ($tp2,24);
         &xor    ($s[$i],$tp8);  # ^= tp8^(tp4^tp1)^(tp2^tp1)
         &xor    ($s[$i],$tp8);  # ^= tp8^(tp4^tp1)^(tp2^tp1)
-       &rotl   ($tp8,8);
+       &rotl   ($tp4,16);
         &xor    ($s[$i],$tp2);  # ^= ROTATE(tp8^tp2^tp1,24)
         &xor    ($s[$i],$tp2);  # ^= ROTATE(tp8^tp2^tp1,24)
+       &rotl   ($tp8,8);
         &xor    ($s[$i],$tp4);  # ^= ROTATE(tp8^tp4^tp1,16)
          &mov   ($s[0],$__s0)                   if($i==2); #prefetch $s0
          &mov   ($s[1],$__s1)                   if($i==3); #prefetch $s1
         &xor    ($s[$i],$tp4);  # ^= ROTATE(tp8^tp4^tp1,16)
          &mov   ($s[0],$__s0)                   if($i==2); #prefetch $s0
          &mov   ($s[1],$__s1)                   if($i==3); #prefetch $s1
@@ -1389,85 +1405,87 @@ sub dectransform()
  sub sse_deccompact()
  {
         &pshufw ("mm1","mm0",0x0c);             #  7, 6, 1, 0
  sub sse_deccompact()
  {
         &pshufw ("mm1","mm0",0x0c);             #  7, 6, 1, 0
+       &pshufw ("mm5","mm4",0x09);             # 13,12,11,10
         &movd   ("eax","mm1");                  #  7, 6, 1, 0
         &movd   ("eax","mm1");                  #  7, 6, 1, 0
+       &movd   ("ebx","mm5");                  # 13,12,11,10
+       &mov    ($__key,$key);
  
  
-       &pshufw ("mm5","mm4",0x09);             # 13,12,11,10
         &movz   ($acc,&LB("eax"));              #  0
         &movz   ($acc,&LB("eax"));              #  0
-       &movz   ("ecx",&BP(-128,$tbl,$acc,1));  #  0
-       &movd   ("ebx","mm5");                  # 13,12,11,10
         &movz   ("edx",&HB("eax"));             #  1
         &movz   ("edx",&HB("eax"));             #  1
+       &pshufw ("mm2","mm0",0x06);             #  3, 2, 5, 4
+       &movz   ("ecx",&BP(-128,$tbl,$acc,1));  #  0
+       &movz   ($key,&LB("ebx"));              # 10
         &movz   ("edx",&BP(-128,$tbl,"edx",1)); #  1
         &movz   ("edx",&BP(-128,$tbl,"edx",1)); #  1
+       &shr    ("eax",16);                     #  7, 6
         &shl    ("edx",8);                      #  1
  
         &shl    ("edx",8);                      #  1
  
-       &pshufw ("mm2","mm0",0x06);             #  3, 2, 5, 4
-       &movz   ($acc,&LB("ebx"));              # 10
-       &movz   ($acc,&BP(-128,$tbl,$acc,1));   # 10
+       &movz   ($acc,&BP(-128,$tbl,$key,1));   # 10
+       &movz   ($key,&HB("ebx"));              # 11
         &shl    ($acc,16);                      # 10
         &shl    ($acc,16);                      # 10
+       &pshufw ("mm6","mm4",0x03);             # 9, 8,15,14
         &or     ("ecx",$acc);                   # 10
         &or     ("ecx",$acc);                   # 10
-       &shr    ("eax",16);                     #  7, 6
-       &movz   ($acc,&HB("ebx"));              # 11
-       &movz   ($acc,&BP(-128,$tbl,$acc,1));   # 11
+       &movz   ($acc,&BP(-128,$tbl,$key,1));   # 11
+       &movz   ($key,&HB("eax"));              #  7
         &shl    ($acc,24);                      # 11
         &shl    ($acc,24);                      # 11
-       &or     ("edx",$acc);                   # 11
         &shr    ("ebx",16);                     # 13,12
         &shr    ("ebx",16);                     # 13,12
+       &or     ("edx",$acc);                   # 11
  
  
-       &pshufw ("mm6","mm4",0x03);             # 9, 8,15,14
-       &movz   ($acc,&HB("eax"));              #  7
-       &movz   ($acc,&BP(-128,$tbl,$acc,1));   #  7
+       &movz   ($acc,&BP(-128,$tbl,$key,1));   #  7
+       &movz   ($key,&HB("ebx"));              # 13
         &shl    ($acc,24);                      #  7
         &or     ("ecx",$acc);                   #  7
         &shl    ($acc,24);                      #  7
         &or     ("ecx",$acc);                   #  7
-       &movz   ($acc,&HB("ebx"));              # 13
-       &movz   ($acc,&BP(-128,$tbl,$acc,1));   # 13
+       &movz   ($acc,&BP(-128,$tbl,$key,1));   # 13
+       &movz   ($key,&LB("eax"));              #  6
         &shl    ($acc,8);                       # 13
         &shl    ($acc,8);                       # 13
+       &movd   ("eax","mm2");                  #  3, 2, 5, 4
         &or     ("ecx",$acc);                   # 13
         &or     ("ecx",$acc);                   # 13
-       &movd   ("mm0","ecx");                  # t[0] collected
  
  
-       &movz   ($acc,&LB("eax"));              #  6
-       &movd   ("eax","mm2");                  #  3, 2, 5, 4
-       &movz   ("ecx",&BP(-128,$tbl,$acc,1));  #  6
-       &shl    ("ecx",16);                     #  6
-       &movz   ($acc,&LB("ebx"));              # 12
+       &movz   ($acc,&BP(-128,$tbl,$key,1));   #  6
+       &movz   ($key,&LB("ebx"));              # 12
+       &shl    ($acc,16);                      #  6
         &movd   ("ebx","mm6");                  #  9, 8,15,14
         &movd   ("ebx","mm6");                  #  9, 8,15,14
-       &movz   ($acc,&BP(-128,$tbl,$acc,1));   # 12
+       &movd   ("mm0","ecx");                  # t[0] collected
+       &movz   ("ecx",&BP(-128,$tbl,$key,1));  # 12
+       &movz   ($key,&LB("eax"));              #  4
         &or     ("ecx",$acc);                   # 12
  
         &or     ("ecx",$acc);                   # 12
  
-       &movz   ($acc,&LB("eax"));              #  4
-       &movz   ($acc,&BP(-128,$tbl,$acc,1));   #  4
+       &movz   ($acc,&BP(-128,$tbl,$key,1));   #  4
+       &movz   ($key,&LB("ebx"));              # 14
         &or     ("edx",$acc);                   #  4
         &or     ("edx",$acc);                   #  4
-       &movz   ($acc,&LB("ebx"));              # 14
-       &movz   ($acc,&BP(-128,$tbl,$acc,1));   # 14
+       &movz   ($acc,&BP(-128,$tbl,$key,1));   # 14
+       &movz   ($key,&HB("eax"));              #  5
         &shl    ($acc,16);                      # 14
         &shl    ($acc,16);                      # 14
+       &shr    ("eax",16);                     #  3, 2
         &or     ("edx",$acc);                   # 14
         &or     ("edx",$acc);                   # 14
-       &movd   ("mm1","edx");                  # t[1] collected
  
  
-       &movz   ($acc,&HB("eax"));              #  5
-       &movz   ("edx",&BP(-128,$tbl,$acc,1));  #  5
-       &shl    ("edx",8);                      #  5
-       &movz   ($acc,&HB("ebx"));              # 15
-       &shr    ("eax",16);                     #  3, 2
-       &movz   ($acc,&BP(-128,$tbl,$acc,1));   # 15
-       &shl    ($acc,24);                      # 15
-       &or     ("edx",$acc);                   # 15
+       &movz   ($acc,&BP(-128,$tbl,$key,1));   #  5
+       &movz   ($key,&HB("ebx"));              # 15
         &shr    ("ebx",16);                     #  9, 8
         &shr    ("ebx",16);                     #  9, 8
+       &shl    ($acc,8);                       #  5
+       &movd   ("mm1","edx");                  # t[1] collected
+       &movz   ("edx",&BP(-128,$tbl,$key,1));  # 15
+       &movz   ($key,&HB("ebx"));              #  9
+       &shl    ("edx",24);                     # 15
+       &and    ("ebx",0xff);                   #  8
+       &or     ("edx",$acc);                   # 15
  
         &punpckldq      ("mm0","mm1");          # t[0,1] collected
  
  
         &punpckldq      ("mm0","mm1");          # t[0,1] collected
  
-       &movz   ($acc,&HB("ebx"));              #  9
-       &movz   ($acc,&BP(-128,$tbl,$acc,1));   #  9
+       &movz   ($acc,&BP(-128,$tbl,$key,1));   #  9
+       &movz   ($key,&LB("eax"));              #  2
         &shl    ($acc,8);                       #  9
         &shl    ($acc,8);                       #  9
-       &or     ("ecx",$acc);                   #  9
-       &and    ("ebx",0xff);                   #  8
+       &movz   ("eax",&HB("eax"));             #  3
         &movz   ("ebx",&BP(-128,$tbl,"ebx",1)); #  8
         &movz   ("ebx",&BP(-128,$tbl,"ebx",1)); #  8
+       &or     ("ecx",$acc);                   #  9
+       &movz   ($acc,&BP(-128,$tbl,$key,1));   #  2
         &or     ("edx","ebx");                  #  8
         &or     ("edx","ebx");                  #  8
-       &movz   ($acc,&LB("eax"));              #  2
-       &movz   ($acc,&BP(-128,$tbl,$acc,1));   #  2
         &shl    ($acc,16);                      #  2
         &shl    ($acc,16);                      #  2
-       &or     ("edx",$acc);                   #  2
-       &movd   ("mm4","edx");                  # t[2] collected
-       &movz   ("eax",&HB("eax"));             #  3
         &movz   ("eax",&BP(-128,$tbl,"eax",1)); #  3
         &movz   ("eax",&BP(-128,$tbl,"eax",1)); #  3
+       &or     ("edx",$acc);                   #  2
         &shl    ("eax",24);                     #  3
         &or     ("ecx","eax");                  #  3
         &shl    ("eax",24);                     #  3
         &or     ("ecx","eax");                  #  3
+       &mov    ($key,$__key);
+       &movd   ("mm4","edx");                  # t[2] collected
         &movd   ("mm5","ecx");                  # t[3] collected
  
         &punpckldq      ("mm4","mm5");          # t[2,3] collected
         &movd   ("mm5","ecx");                  # t[3] collected
  
         &punpckldq      ("mm4","mm5");          # t[2,3] collected
@@ -2181,8 +2199,8 @@ my $mark=&DWP(76+240,"esp");      # copy of aes_key->rounds
         &mov    ("ecx",240/4);
         &xor    ("eax","eax");
         &align  (4);
         &mov    ("ecx",240/4);
         &xor    ("eax","eax");
         &align  (4);
-       &data_word(0xABF3F689); # rep stosd
-       &set_label("skip_ezero")
+       &data_word(0xABF3F689);         # rep stosd
+       &set_label("skip_ezero");
         &mov    ("esp",$_esp);
         &popf   ();
      &set_label("drop_out");
         &mov    ("esp",$_esp);
         &popf   ();
      &set_label("drop_out");
@@ -2301,8 +2319,8 @@ my $mark=&DWP(76+240,"esp");      # copy of aes_key->rounds
         &mov    ("ecx",240/4);
         &xor    ("eax","eax");
         &align  (4);
         &mov    ("ecx",240/4);
         &xor    ("eax","eax");
         &align  (4);
-       &data_word(0xABF3F689); # rep stosd
-       &set_label("skip_dzero")
+       &data_word(0xABF3F689);         # rep stosd
+       &set_label("skip_dzero");
         &mov    ("esp",$_esp);
         &popf   ();
         &function_end_A();
         &mov    ("esp",$_esp);
         &popf   ();
         &function_end_A();
@@ -2854,43 +2872,43 @@ sub enckey()
      &set_label("exit");
  &function_end("_x86_AES_set_encrypt_key");
  
      &set_label("exit");
  &function_end("_x86_AES_set_encrypt_key");
  
-# int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+# int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
  #                        AES_KEY *key)
  #                        AES_KEY *key)
-&function_begin_B("private_AES_set_encrypt_key");
+&function_begin_B("AES_set_encrypt_key");
         &call   ("_x86_AES_set_encrypt_key");
         &ret    ();
         &call   ("_x86_AES_set_encrypt_key");
         &ret    ();
-&function_end_B("private_AES_set_encrypt_key");
+&function_end_B("AES_set_encrypt_key");
  
  sub deckey()
  { my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_;
    my $tmp = $tbl;
  
  
  sub deckey()
  { my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_;
    my $tmp = $tbl;
  
-       &mov    ($acc,$tp1);
-       &and    ($acc,0x80808080);
-       &mov    ($tmp,$acc);
-       &shr    ($tmp,7);
+       &mov    ($tmp,0x80808080);
+       &and    ($tmp,$tp1);
         &lea    ($tp2,&DWP(0,$tp1,$tp1));
         &lea    ($tp2,&DWP(0,$tp1,$tp1));
+       &mov    ($acc,$tmp);
+       &shr    ($tmp,7);
         &sub    ($acc,$tmp);
         &and    ($tp2,0xfefefefe);
         &and    ($acc,0x1b1b1b1b);
         &sub    ($acc,$tmp);
         &and    ($tp2,0xfefefefe);
         &and    ($acc,0x1b1b1b1b);
-       &xor    ($acc,$tp2);
-       &mov    ($tp2,$acc);
+       &xor    ($tp2,$acc);
+       &mov    ($tmp,0x80808080);
  
  
-       &and    ($acc,0x80808080);
-       &mov    ($tmp,$acc);
-       &shr    ($tmp,7);
+       &and    ($tmp,$tp2);
         &lea    ($tp4,&DWP(0,$tp2,$tp2));
         &lea    ($tp4,&DWP(0,$tp2,$tp2));
+       &mov    ($acc,$tmp);
+       &shr    ($tmp,7);
         &sub    ($acc,$tmp);
         &and    ($tp4,0xfefefefe);
         &and    ($acc,0x1b1b1b1b);
          &xor   ($tp2,$tp1);    # tp2^tp1
         &sub    ($acc,$tmp);
         &and    ($tp4,0xfefefefe);
         &and    ($acc,0x1b1b1b1b);
          &xor   ($tp2,$tp1);    # tp2^tp1
-       &xor    ($acc,$tp4);
-       &mov    ($tp4,$acc);
+       &xor    ($tp4,$acc);
+       &mov    ($tmp,0x80808080);
  
  
-       &and    ($acc,0x80808080);
-       &mov    ($tmp,$acc);
-       &shr    ($tmp,7);
+       &and    ($tmp,$tp4);
         &lea    ($tp8,&DWP(0,$tp4,$tp4));
         &lea    ($tp8,&DWP(0,$tp4,$tp4));
+       &mov    ($acc,$tmp);
+       &shr    ($tmp,7);
          &xor   ($tp4,$tp1);    # tp4^tp1
         &sub    ($acc,$tmp);
         &and    ($tp8,0xfefefefe);
          &xor   ($tp4,$tp1);    # tp4^tp1
         &sub    ($acc,$tmp);
         &and    ($tp8,0xfefefefe);
@@ -2916,9 +2934,9 @@ sub deckey()
         &mov    (&DWP(4*$i,$key),$tp1);
  }
  
         &mov    (&DWP(4*$i,$key),$tp1);
  }
  
-# int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+# int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
  #                        AES_KEY *key)
  #                        AES_KEY *key)
-&function_begin_B("private_AES_set_decrypt_key");
+&function_begin_B("AES_set_decrypt_key");
         &call   ("_x86_AES_set_encrypt_key");
         &cmp    ("eax",0);
         &je     (&label("proceed"));
         &call   ("_x86_AES_set_encrypt_key");
         &cmp    ("eax",0);
         &je     (&label("proceed"));
@@ -2974,7 +2992,9 @@ sub deckey()
         &jb     (&label("permute"));
  
         &xor    ("eax","eax");                  # return success
         &jb     (&label("permute"));
  
         &xor    ("eax","eax");                  # return success
-&function_end("private_AES_set_decrypt_key");
+&function_end("AES_set_decrypt_key");
  &asciz("AES for x86, CRYPTOGAMS by <appro\@openssl.org>");
  
  &asm_finish();
  &asciz("AES for x86, CRYPTOGAMS by <appro\@openssl.org>");
  
  &asm_finish();
+
+close STDOUT;