+#! /usr/bin/env perl
+# Copyright 2004-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# but exhibits up to 10% improvement on other cores.
#
# Second version is "monolithic" replacement for aes_core.c, which in
# but exhibits up to 10% improvement on other cores.
#
# Second version is "monolithic" replacement for aes_core.c, which in
# This made it possible to implement little-endian variant of the
# algorithm without modifying the base C code. Motivating factor for
# the undertaken effort was that it appeared that in tight IA-32
# register window little-endian flavor could achieve slightly higher
# Instruction Level Parallelism, and it indeed resulted in up to 15%
# This made it possible to implement little-endian variant of the
# algorithm without modifying the base C code. Motivating factor for
# the undertaken effort was that it appeared that in tight IA-32
# register window little-endian flavor could achieve slightly higher
# Instruction Level Parallelism, and it indeed resulted in up to 15%
#
# Third version adds AES_cbc_encrypt implementation, which resulted in
# up to 40% performance imrovement of CBC benchmark results. 40% was
#
# Third version adds AES_cbc_encrypt implementation, which resulted in
# up to 40% performance imrovement of CBC benchmark results. 40% was
-# P4 56[60] 84[100] 23
-# AMD K8 48[44] 70[79] 18
-# PIII 41[50] 61[91] 24
-# Core 2 32[38] 45[70] 18.5
-# Pentium 120 160 77
+# P4 52[54] 83[95] 23
+# AMD K8 46[41] 66[70] 18
+# PIII 41[50] 60[77] 24
+# Core 2 31[36] 45[64] 18.5
+# Atom 76[100] 96[138] 60
+# Pentium 115 150 77
$speed_limit=512; # chunks smaller than $speed_limit are
# processed with compact routine in CBC mode
$small_footprint=1; # $small_footprint=1 code is ~5% slower [on
$speed_limit=512; # chunks smaller than $speed_limit are
# processed with compact routine in CBC mode
$small_footprint=1; # $small_footprint=1 code is ~5% slower [on
# I favor compact code to minimize cache
# contention and in hope to "collect" 5% back
# in real-life applications...
# I favor compact code to minimize cache
# contention and in hope to "collect" 5% back
# in real-life applications...
&mov ($v0,$s[3]); # copy s3
&mov (&DWP(4,"esp"),$s[2]); # save s2
&mov ($v0,$s[3]); # copy s3
&mov (&DWP(4,"esp"),$s[2]); # save s2
# Another experimental routine, which features "horizontal spin," but
# eliminates one reference to stack. Strangely enough runs slower...
sub enchoriz()
# Another experimental routine, which features "horizontal spin," but
# eliminates one reference to stack. Strangely enough runs slower...
sub enchoriz()
&movz ($v0,&LB($s0)); # 3, 2, 1, 0*
&rotr ($s2,8); # 8,11,10, 9
&movz ($v0,&LB($s0)); # 3, 2, 1, 0*
&rotr ($s2,8); # 8,11,10, 9
&lea ($r2,&DWP(0,$s[$i],$s[$i]));
&lea ($r2,&DWP(0,$s[$i],$s[$i]));
- &xor ($s[$i],$acc) # ROTATE(r2^r0,24) ^ r2
- &rotr ($tmp,16);
- &xor ($s[$i],$tmp);
- &rotr ($tmp,8);
- &xor ($s[$i],$tmp);
+ &xor ($acc,$r2);
+ &mov ($tmp,0x80808080) if ($i!=1);
+ &xor ($s[$i],$acc); # ROTATE(r2^r0,24) ^ r2
&enccompact(1,$tbl,$s1,$s2,$s3,$s0,1);
&enccompact(2,$tbl,$s2,$s3,$s0,$s1,1);
&enccompact(3,$tbl,$s3,$s0,$s1,$s2,1);
&enccompact(1,$tbl,$s1,$s2,$s3,$s0,1);
&enccompact(2,$tbl,$s2,$s3,$s0,$s1,1);
&enccompact(3,$tbl,$s3,$s0,$s1,$s2,1);
# Performance is not actually extraordinary in comparison to pure
# x86 code. In particular encrypt performance is virtually the same.
# Decrypt performance on the other hand is 15-20% better on newer
# Performance is not actually extraordinary in comparison to pure
# x86 code. In particular encrypt performance is virtually the same.
# Decrypt performance on the other hand is 15-20% better on newer
# better on PIII:-) And additionally on the pros side this code
# eliminates redundant references to stack and thus relieves/
# minimizes the pressure on the memory bus.
# better on PIII:-) And additionally on the pros side this code
# eliminates redundant references to stack and thus relieves/
# minimizes the pressure on the memory bus.
&pshufw ("mm5","mm4",0x0d); # 15,14,11,10
&movd ("eax","mm1"); # 5, 4, 1, 0
&movd ("ebx","mm5"); # 15,14,11,10
&pshufw ("mm5","mm4",0x0d); # 15,14,11,10
&movd ("eax","mm1"); # 5, 4, 1, 0
&movd ("ebx","mm5"); # 15,14,11,10
- &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0
- &pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2
+ &pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2
+ &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0
+ &movz ($key,&LB("ebx")); # 10
&movz ("edx",&BP(-128,$tbl,"edx",1)); # 1
&movz ("edx",&BP(-128,$tbl,"edx",1)); # 1
- &movz ($acc,&LB("ebx")); # 10
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 10
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 10
+ &movz ($key,&HB("ebx")); # 11
&pshufw ("mm6","mm4",0x08); # 13,12, 9, 8
&pshufw ("mm6","mm4",0x08); # 13,12, 9, 8
- &movz ($acc,&HB("ebx")); # 11
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 11
+ &or ("ecx",$acc); # 10
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 11
+ &movz ($key,&HB("eax")); # 5
- &movz ($acc,&HB("eax")); # 5
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 5
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 5
+ &movz ($key,&HB("ebx")); # 15
- &movz ($acc,&HB("ebx")); # 15
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 15
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 15
+ &movz ($key,&LB("eax")); # 4
- &movz ($acc,&LB("eax")); # 4
- &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 4
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 4
+ &movz ($key,&LB("ebx")); # 14
&movd ("eax","mm2"); # 7, 6, 3, 2
&movd ("eax","mm2"); # 7, 6, 3, 2
- &movz ($acc,&LB("ebx")); # 14
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 14
- &shl ($acc,16); # 14
+ &movd ("mm0","ecx"); # t[0] collected
+ &movz ("ecx",&BP(-128,$tbl,$key,1)); # 14
+ &movz ($key,&HB("eax")); # 3
+ &shl ("ecx",16); # 14
+ &movd ("ebx","mm6"); # 13,12, 9, 8
- &movd ("ebx","mm6"); # 13,12, 9, 8
- &movz ($acc,&HB("eax")); # 3
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 3
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 3
+ &movz ($key,&HB("ebx")); # 9
- &movz ($acc,&HB("ebx")); # 9
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 9
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 9
+ &movz ($key,&LB("ebx")); # 8
- &movz ($acc,&LB("ebx")); # 8
- &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 8
- &shr ("ebx",16); # 13,12
- &movz ($acc,&LB("eax")); # 2
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 2
- &shl ($acc,16); # 2
- &or ("ecx",$acc); # 2
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 8
+ &movz ($key,&LB("eax")); # 2
+ &movd ("mm1","ecx"); # t[1] collected
+ &movz ("ecx",&BP(-128,$tbl,$key,1)); # 2
+ &movz ($key,&HB("eax")); # 7
+ &shl ("ecx",16); # 2
+ &and ("eax",0xff); # 6
+ &or ("ecx",$acc); # 2
- &movz ($acc,&HB("eax")); # 7
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 7
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 7
+ &movz ($key,&HB("ebx")); # 13
&movz ("eax",&BP(-128,$tbl,"eax",1)); # 6
&movz ("eax",&BP(-128,$tbl,"eax",1)); # 6
&movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 12
&movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 12
&movd ("mm5","edx"); # t[3] collected
&punpckldq ("mm4","mm5"); # t[2,3] collected
&movd ("mm5","edx"); # t[3] collected
&punpckldq ("mm4","mm5"); # t[2,3] collected
&shr ($tmp,7);
&lea ($tp2,&DWP(0,$s[$i],$s[$i]));
&sub ($acc,$tmp);
&and ($tp2,0xfefefefe);
&and ($acc,0x1b1b1b1b);
&shr ($tmp,7);
&lea ($tp2,&DWP(0,$s[$i],$s[$i]));
&sub ($acc,$tmp);
&and ($tp2,0xfefefefe);
&and ($acc,0x1b1b1b1b);
&shr ($tmp,7);
&lea ($tp4,&DWP(0,$tp2,$tp2));
&sub ($acc,$tmp);
&and ($tp4,0xfefefefe);
&and ($acc,0x1b1b1b1b);
&xor ($tp2,$s[$i]); # tp2^tp1
&shr ($tmp,7);
&lea ($tp4,&DWP(0,$tp2,$tp2));
&sub ($acc,$tmp);
&and ($tp4,0xfefefefe);
&and ($acc,0x1b1b1b1b);
&xor ($tp2,$s[$i]); # tp2^tp1
&shr ($tmp,7);
&lea ($tp8,&DWP(0,$tp4,$tp4));
&sub ($acc,$tmp);
&shr ($tmp,7);
&lea ($tp8,&DWP(0,$tp4,$tp4));
&sub ($acc,$tmp);
&xor ($s[$i],$tp4); # ^= ROTATE(tp8^tp4^tp1,16)
&mov ($s[0],$__s0) if($i==2); #prefetch $s0
&mov ($s[1],$__s1) if($i==3); #prefetch $s1
&xor ($s[$i],$tp4); # ^= ROTATE(tp8^tp4^tp1,16)
&mov ($s[0],$__s0) if($i==2); #prefetch $s0
&mov ($s[1],$__s1) if($i==3); #prefetch $s1
sub sse_deccompact()
{
&pshufw ("mm1","mm0",0x0c); # 7, 6, 1, 0
sub sse_deccompact()
{
&pshufw ("mm1","mm0",0x0c); # 7, 6, 1, 0
&movd ("eax","mm1"); # 7, 6, 1, 0
&movd ("eax","mm1"); # 7, 6, 1, 0
- &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0
- &movd ("ebx","mm5"); # 13,12,11,10
+ &pshufw ("mm2","mm0",0x06); # 3, 2, 5, 4
+ &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0
+ &movz ($key,&LB("ebx")); # 10
&movz ("edx",&BP(-128,$tbl,"edx",1)); # 1
&movz ("edx",&BP(-128,$tbl,"edx",1)); # 1
- &pshufw ("mm2","mm0",0x06); # 3, 2, 5, 4
- &movz ($acc,&LB("ebx")); # 10
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 10
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 10
+ &movz ($key,&HB("ebx")); # 11
- &shr ("eax",16); # 7, 6
- &movz ($acc,&HB("ebx")); # 11
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 11
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 11
+ &movz ($key,&HB("eax")); # 7
- &pshufw ("mm6","mm4",0x03); # 9, 8,15,14
- &movz ($acc,&HB("eax")); # 7
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 7
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 7
+ &movz ($key,&HB("ebx")); # 13
- &movz ($acc,&HB("ebx")); # 13
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 13
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 13
+ &movz ($key,&LB("eax")); # 6
- &movz ($acc,&LB("eax")); # 6
- &movd ("eax","mm2"); # 3, 2, 5, 4
- &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 6
- &shl ("ecx",16); # 6
- &movz ($acc,&LB("ebx")); # 12
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 6
+ &movz ($key,&LB("ebx")); # 12
+ &shl ($acc,16); # 6
&movd ("ebx","mm6"); # 9, 8,15,14
&movd ("ebx","mm6"); # 9, 8,15,14
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 12
+ &movd ("mm0","ecx"); # t[0] collected
+ &movz ("ecx",&BP(-128,$tbl,$key,1)); # 12
+ &movz ($key,&LB("eax")); # 4
- &movz ($acc,&LB("eax")); # 4
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 4
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 4
+ &movz ($key,&LB("ebx")); # 14
- &movz ($acc,&LB("ebx")); # 14
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 14
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 14
+ &movz ($key,&HB("eax")); # 5
- &movz ($acc,&HB("eax")); # 5
- &movz ("edx",&BP(-128,$tbl,$acc,1)); # 5
- &shl ("edx",8); # 5
- &movz ($acc,&HB("ebx")); # 15
- &shr ("eax",16); # 3, 2
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 15
- &shl ($acc,24); # 15
- &or ("edx",$acc); # 15
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 5
+ &movz ($key,&HB("ebx")); # 15
+ &shl ($acc,8); # 5
+ &movd ("mm1","edx"); # t[1] collected
+ &movz ("edx",&BP(-128,$tbl,$key,1)); # 15
+ &movz ($key,&HB("ebx")); # 9
+ &shl ("edx",24); # 15
+ &and ("ebx",0xff); # 8
+ &or ("edx",$acc); # 15
- &movz ($acc,&HB("ebx")); # 9
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 9
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 9
+ &movz ($key,&LB("eax")); # 2
&movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 8
&movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 8
&movz ("eax",&BP(-128,$tbl,"eax",1)); # 3
&movz ("eax",&BP(-128,$tbl,"eax",1)); # 3
&movd ("mm5","ecx"); # t[3] collected
&punpckldq ("mm4","mm5"); # t[2,3] collected
&movd ("mm5","ecx"); # t[3] collected
&punpckldq ("mm4","mm5"); # t[2,3] collected
sub deckey()
{ my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_;
my $tmp = $tbl;
sub deckey()
{ my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_;
my $tmp = $tbl;
&lea ($tp2,&DWP(0,$tp1,$tp1));
&lea ($tp2,&DWP(0,$tp1,$tp1));
&lea ($tp4,&DWP(0,$tp2,$tp2));
&lea ($tp4,&DWP(0,$tp2,$tp2));
&lea ($tp8,&DWP(0,$tp4,$tp4));
&lea ($tp8,&DWP(0,$tp4,$tp4));