3 ###################################################################
4 ### AES-128 [originally in CTR mode] ###
5 ### bitsliced implementation for Intel Core 2 processors ###
6 ### requires support of SSE extensions up to SSSE3 ###
7 ### Author: Emilia Käsper and Peter Schwabe ###
8 ### Date: 2009-03-19 ###
11 ### See http://homes.esat.kuleuven.be/~ekasper/#software for ###
12 ### further information. ###
13 ###################################################################
17 # Started as transliteration to "perlasm" the original code has
18 # undergone following changes:
20 # - code was made position-independent;
21 # - rounds were folded into a loop resulting in >5x size reduction
22 # from 12.5KB to 2.2KB;
23 # - above was possible thanks to mixcolumns() modification that
24 # allowed to feed its output back to aesenc[last], this was
25 # achieved at the cost of two additional inter-register moves;
26 # - some instruction reordering and interleaving;
27 # - this module doesn't implement key setup subroutine, instead it
28 # relies on conversion of "conventional" key schedule as returned
29 # by AES_set_encrypt_key (see discussion below);
30 # - first and last round keys are treated differently, which allowed
31 # to skip one shiftrows(), reduce bit-sliced key schedule and
32 # speed-up conversion by 22%;
33 # - support for 192- and 256-bit keys was added;
35 # Resulting performance in CPU cycles spent to encrypt one byte out
36 # of 4096-byte buffer with 128-bit key is:
38 # Emilia's this(*) difference
40 # Core 2 9.30 8.69 +7%
41 # Nehalem(**) 7.63 6.98 +9%
42 # Atom 17.1 17.4 -2%(***)
44 # (*) Comparison is not completely fair, because "this" is ECB,
45 # i.e. no extra processing such as counter values calculation
46 # and xor-ing input as in Emilia's CTR implementation is
47 # performed. However, the CTR calculations stand for not more
48 # than 1% of total time, so comparison is *rather* fair.
50 # (**) Results were collected on Westmere, which is considered to
51 # be equivalent to Nehalem for this code.
53 # (***) Slowdown on Atom is rather strange per se, because original
54 # implementation has a number of 9+-byte instructions, which
55 # are bad for Atom front-end, and which I eliminated completely.
56 # In attempt to address deterioration sbox() was tested in FP
57 # SIMD "domain" (movaps instead of movdqa, xorps instead of
58 # pxor, etc.). While it resulted in a nominal 4% improvement on
59 # Atom, it hurt Westmere by more than a 2x factor.
61 # As for the key schedule conversion subroutine: the interface to OpenSSL
62 # relies on per-invocation on-the-fly conversion. This naturally
63 # has impact on performance, especially for short inputs. Conversion
64 # time in CPU cycles and its ratio to CPU cycles spent in 8x block
67 # conversion conversion/8x block
72 # The ratio values mean that 128-byte blocks will be processed
73 # 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
74 # etc. Then keep in mind that input sizes not divisible by 128 are
75 # *effectively* slower, especially the shortest ones, e.g. consecutive
76 # 144-byte blocks are processed 44% slower than one would expect,
77 # 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
78 # it's still faster than ["hyper-threading-safe" code path in]
79 # aes-x86_64.pl on all lengths above 64 bytes...
83 # Add decryption procedure. Performance in CPU cycles spent to decrypt
84 # one byte out of 4096-byte buffer with 128-bit key is:
92 # Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
93 # suboptimal, but XTS is meant to be used with larger blocks...
99 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
101 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
103 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
104 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
105 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
106 die "can't locate x86_64-xlate.pl";
108 open OUT,"| \"$^X\" $xlate $flavour $output";
111 my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
112 my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15)
113 my $ecb=0; # suppress unreferenced ECB subroutines, spare some space...
116 my ($key,$rounds,$const)=("%rax","%r10d","%r11");
119 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
120 # output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
125 &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
126 &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
130 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
131 # output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
153 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
154 # output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
174 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
175 # output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
179 &InvInBasisChange (@b);
180 &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
181 &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
184 sub InvInBasisChange { # OutBasisChange in reverse
185 my @b=@_[5,1,2,6,3,7,0,4];
203 sub InvOutBasisChange { # InBasisChange in reverse
204 my @b=@_[2,5,7,3,6,1,0,4];
225 #;*************************************************************
226 #;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
227 #;*************************************************************
228 my ($x0,$x1,$y0,$y1,$t0)=@_;
241 sub Mul_GF4_N { # not used, see next subroutine
242 # multiply and scale by N
243 my ($x0,$x1,$y0,$y1,$t0)=@_;
257 # interleaved Mul_GF4_N and Mul_GF4
258 my ($x0,$x1,$y0,$y1,$t0,
259 $x2,$x3,$y2,$y3,$t1)=@_;
287 &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]);
294 Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
295 @x[2], @x[3], @y[2], @y[3], @t[2]);
307 &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
308 @x[6], @x[7], @y[2], @y[3], @t[2]);
313 &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]);
322 #;********************************************************************
323 #;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
324 #;********************************************************************
328 # direct optimizations from hardware
383 #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
385 # new smaller inversion
419 # output in s3, s2, s1, t1
421 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
423 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
424 &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
426 ### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
429 # AES linear components
435 pxor 0x00($key),@x[0]
436 pxor 0x10($key),@x[1]
438 pxor 0x20($key),@x[2]
440 pxor 0x30($key),@x[3]
442 pxor 0x40($key),@x[4]
444 pxor 0x50($key),@x[5]
446 pxor 0x60($key),@x[6]
448 pxor 0x70($key),@x[7]
456 # modified to emit output in order suitable for feeding back to aesenc[last]
459 my $inv=@_[16]; # optional
461 pshufd \$0x93, @x[0], @t[0] # x0 <<< 32
462 pshufd \$0x93, @x[1], @t[1]
463 pxor @t[0], @x[0] # x0 ^ (x0 <<< 32)
464 pshufd \$0x93, @x[2], @t[2]
466 pshufd \$0x93, @x[3], @t[3]
468 pshufd \$0x93, @x[4], @t[4]
470 pshufd \$0x93, @x[5], @t[5]
472 pshufd \$0x93, @x[6], @t[6]
474 pshufd \$0x93, @x[7], @t[7]
481 pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64)
483 pshufd \$0x4E, @x[1], @x[1]
489 pshufd \$0x4E, @x[4], @t[0]
491 pshufd \$0x4E, @x[5], @t[1]
493 pshufd \$0x4E, @x[3], @x[4]
495 pshufd \$0x4E, @x[7], @x[5]
497 pshufd \$0x4E, @x[6], @x[3]
499 pshufd \$0x4E, @x[2], @x[6]
502 $code.=<<___ if (!$inv);
510 $code.=<<___ if ($inv);
523 sub InvMixColumns_orig {
528 # multiplication by 0x0e
529 pshufd \$0x93, @x[7], @t[7]
531 pxor @x[5], @x[7] # 7 5
532 pxor @x[5], @x[2] # 2 5
533 pshufd \$0x93, @x[0], @t[0]
535 pxor @x[0], @x[5] # 5 0 [1]
536 pxor @x[1], @x[0] # 0 1
537 pshufd \$0x93, @x[1], @t[1]
538 pxor @x[2], @x[1] # 1 25
539 pxor @x[6], @x[0] # 01 6 [2]
540 pxor @x[3], @x[1] # 125 3 [4]
541 pshufd \$0x93, @x[3], @t[3]
542 pxor @x[0], @x[2] # 25 016 [3]
543 pxor @x[7], @x[3] # 3 75
544 pxor @x[6], @x[7] # 75 6 [0]
545 pshufd \$0x93, @x[6], @t[6]
547 pxor @x[4], @x[6] # 6 4
548 pxor @x[3], @x[4] # 4 375 [6]
549 pxor @x[7], @x[3] # 375 756=36
550 pxor @t[5], @x[6] # 64 5 [7]
551 pxor @t[2], @x[3] # 36 2
552 pxor @t[4], @x[3] # 362 4 [5]
553 pshufd \$0x93, @t[5], @t[5]
555 my @y = @x[7,5,0,2,1,3,4,6];
557 # multiplication by 0x0b
561 pshufd \$0x93, @t[2], @t[2]
565 pshufd \$0x93, @t[4], @t[4]
566 pxor @t[6], @t[7] # clobber t[7]
570 pshufd \$0x93, @t[0], @t[0]
574 pshufd \$0x93, @t[1], @t[1]
578 pshufd \$0x93, @t[2], @t[2]
582 pshufd \$0x93, @t[3], @t[3]
588 pxor @t[5], @t[7] # clobber t[7] even more
591 pshufd \$0x93, @t[4], @t[4]
596 pshufd \$0x93, @t[5], @t[5]
597 pxor @t[6], @t[7] # restore t[7]
599 # multiplication by 0x0d
602 pshufd \$0x93, @t[6], @t[6]
606 pshufd \$0x93, @t[7], @t[7]
615 pshufd \$0x93, @t[0], @t[0]
619 pshufd \$0x93, @t[1], @t[1]
624 pshufd \$0x93, @t[2], @t[2]
626 pxor @t[3], @t[6] # clobber t[6]
633 pshufd \$0x93, @t[4], @t[4]
636 pxor @t[3], @t[6] # restore t[6]
638 pshufd \$0x93, @t[5], @t[5]
639 pshufd \$0x93, @t[6], @t[6]
640 pshufd \$0x93, @t[7], @t[7]
641 pshufd \$0x93, @t[3], @t[3]
643 # multiplication by 0x09
645 pxor @y[1], @t[1] # t[1]=y[1]
646 pxor @t[5], @t[0] # clobber t[0]
649 pxor @y[0], @t[0] # t[0]=y[0]
651 pxor @t[7], @t[6] # clobber t[6]
654 pxor @y[4], @t[4] # t[4]=y[4]
656 pxor @y[3], @t[3] # t[3]=y[3]
658 pxor @y[2], @t[2] # t[2]=y[2]
660 pxor @y[5], @t[5] # t[5]=y[5]
663 pxor @y[6], @t[6] # t[6]=y[6]
664 pxor @y[7], @t[7] # t[7]=y[7]
681 # Thanks to Jussi Kivilinna for providing pointer to
683 # | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
684 # | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
685 # | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
686 # | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
689 # multiplication by 0x05-0x00-0x04-0x00
690 pshufd \$0x4E, @x[0], @t[0]
691 pshufd \$0x4E, @x[6], @t[6]
693 pshufd \$0x4E, @x[7], @t[7]
695 pshufd \$0x4E, @x[1], @t[1]
697 pshufd \$0x4E, @x[2], @t[2]
699 pshufd \$0x4E, @x[3], @t[3]
703 pshufd \$0x4E, @x[4], @t[4]
707 pshufd \$0x4E, @x[5], @t[5]
722 &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6
725 sub aesenc { # not used
729 movdqa 0x30($const),@t[0] # .LSR
731 &ShiftRows (@b,@t[0]);
733 &MixColumns (@b[0,1,4,6,3,7,2,5],@t);
736 sub aesenclast { # not used
740 movdqa 0x40($const),@t[0] # .LSRM0
742 &ShiftRows (@b,@t[0]);
745 pxor 0x00($key),@b[0]
746 pxor 0x10($key),@b[1]
747 pxor 0x20($key),@b[4]
748 pxor 0x30($key),@b[6]
749 pxor 0x40($key),@b[3]
750 pxor 0x50($key),@b[7]
751 pxor 0x60($key),@b[2]
752 pxor 0x70($key),@b[5]
757 my ($a,$b,$n,$mask,$t)=@_;
769 my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
789 my @x=reverse(@_[0..7]);
790 my ($t0,$t1,$t2,$t3)=@_[8..11];
792 movdqa 0x00($const),$t0 # .LBS0
793 movdqa 0x10($const),$t1 # .LBS1
795 &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
796 &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
798 movdqa 0x20($const),$t0 # .LBS2
800 &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
801 &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
803 &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
804 &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
810 .extern asm_AES_encrypt
811 .extern asm_AES_decrypt
813 .type _bsaes_encrypt8,\@abi-omnipotent
816 lea .LBS0(%rip), $const # constants table
818 movdqa ($key), @XMM[9] # round 0 key
820 movdqa 0x50($const), @XMM[8] # .LM0SR
821 pxor @XMM[9], @XMM[0] # xor with round0 key
822 pxor @XMM[9], @XMM[1]
823 pshufb @XMM[8], @XMM[0]
824 pxor @XMM[9], @XMM[2]
825 pshufb @XMM[8], @XMM[1]
826 pxor @XMM[9], @XMM[3]
827 pshufb @XMM[8], @XMM[2]
828 pxor @XMM[9], @XMM[4]
829 pshufb @XMM[8], @XMM[3]
830 pxor @XMM[9], @XMM[5]
831 pshufb @XMM[8], @XMM[4]
832 pxor @XMM[9], @XMM[6]
833 pshufb @XMM[8], @XMM[5]
834 pxor @XMM[9], @XMM[7]
835 pshufb @XMM[8], @XMM[6]
836 pshufb @XMM[8], @XMM[7]
837 _bsaes_encrypt8_bitslice:
839 &bitslice (@XMM[0..7, 8..11]);
846 &ShiftRows (@XMM[0..7, 8]);
847 $code.=".Lenc_sbox:\n";
848 &Sbox (@XMM[0..7, 8..15]);
853 &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
855 movdqa 0x30($const), @XMM[8] # .LSR
857 movdqa 0x40($const), @XMM[8] # .LSRM0
862 # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
863 &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
865 movdqa ($key), @XMM[8] # last round key
866 pxor @XMM[8], @XMM[4]
867 pxor @XMM[8], @XMM[6]
868 pxor @XMM[8], @XMM[3]
869 pxor @XMM[8], @XMM[7]
870 pxor @XMM[8], @XMM[2]
871 pxor @XMM[8], @XMM[5]
872 pxor @XMM[8], @XMM[0]
873 pxor @XMM[8], @XMM[1]
875 .size _bsaes_encrypt8,.-_bsaes_encrypt8
877 .type _bsaes_decrypt8,\@abi-omnipotent
880 lea .LBS0(%rip), $const # constants table
882 movdqa ($key), @XMM[9] # round 0 key
884 movdqa -0x30($const), @XMM[8] # .LM0ISR
885 pxor @XMM[9], @XMM[0] # xor with round0 key
886 pxor @XMM[9], @XMM[1]
887 pshufb @XMM[8], @XMM[0]
888 pxor @XMM[9], @XMM[2]
889 pshufb @XMM[8], @XMM[1]
890 pxor @XMM[9], @XMM[3]
891 pshufb @XMM[8], @XMM[2]
892 pxor @XMM[9], @XMM[4]
893 pshufb @XMM[8], @XMM[3]
894 pxor @XMM[9], @XMM[5]
895 pshufb @XMM[8], @XMM[4]
896 pxor @XMM[9], @XMM[6]
897 pshufb @XMM[8], @XMM[5]
898 pxor @XMM[9], @XMM[7]
899 pshufb @XMM[8], @XMM[6]
900 pshufb @XMM[8], @XMM[7]
902 &bitslice (@XMM[0..7, 8..11]);
909 &ShiftRows (@XMM[0..7, 8]);
910 $code.=".Ldec_sbox:\n";
911 &InvSbox (@XMM[0..7, 8..15]);
916 &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
918 movdqa -0x10($const), @XMM[8] # .LISR
920 movdqa -0x20($const), @XMM[8] # .LISRM0
925 &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
927 movdqa ($key), @XMM[8] # last round key
928 pxor @XMM[8], @XMM[6]
929 pxor @XMM[8], @XMM[4]
930 pxor @XMM[8], @XMM[2]
931 pxor @XMM[8], @XMM[7]
932 pxor @XMM[8], @XMM[3]
933 pxor @XMM[8], @XMM[5]
934 pxor @XMM[8], @XMM[0]
935 pxor @XMM[8], @XMM[1]
937 .size _bsaes_decrypt8,.-_bsaes_decrypt8
941 my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
944 my @x=reverse(@_[0..7]);
945 my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
947 &swapmove (@x[0,1],1,$bs0,$t2,$t3);
949 #&swapmove(@x[2,3],1,$t0,$t2,$t3);
953 #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
955 &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
957 #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
963 &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
964 &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
968 .type _bsaes_key_convert,\@abi-omnipotent
971 lea .Lmasks(%rip), $const
972 movdqu ($inp), %xmm7 # load round 0 key
974 movdqa 0x00($const), %xmm0 # 0x01...
975 movdqa 0x10($const), %xmm1 # 0x02...
976 movdqa 0x20($const), %xmm2 # 0x04...
977 movdqa 0x30($const), %xmm3 # 0x08...
978 movdqa 0x40($const), %xmm4 # .LM0
979 pcmpeqd %xmm5, %xmm5 # .LNOT
981 movdqu ($inp), %xmm6 # load round 1 key
982 movdqa %xmm7, ($out) # save round 0 key
988 pshufb %xmm4, %xmm6 # .LM0
997 psllq \$4, %xmm0 # 0x10...
1000 psllq \$4, %xmm1 # 0x20...
1004 movdqa %xmm0, %xmm12
1005 pcmpeqb %xmm2, %xmm10
1006 psllq \$4, %xmm2 # 0x40...
1007 movdqa %xmm1, %xmm13
1008 pcmpeqb %xmm3, %xmm11
1009 psllq \$4, %xmm3 # 0x80...
1011 movdqa %xmm2, %xmm14
1012 movdqa %xmm3, %xmm15
1013 pxor %xmm5, %xmm8 # "pnot"
1018 movdqa %xmm8, 0x00($out) # write bit-sliced round key
1019 pcmpeqb %xmm0, %xmm12
1020 psrlq \$4, %xmm0 # 0x01...
1021 movdqa %xmm9, 0x10($out)
1022 pcmpeqb %xmm1, %xmm13
1023 psrlq \$4, %xmm1 # 0x02...
1024 lea 0x10($inp), $inp
1028 movdqa %xmm10, 0x20($out)
1029 pcmpeqb %xmm2, %xmm14
1030 psrlq \$4, %xmm2 # 0x04...
1031 movdqa %xmm11, 0x30($out)
1032 pcmpeqb %xmm3, %xmm15
1033 psrlq \$4, %xmm3 # 0x08...
1034 movdqu ($inp), %xmm6 # load next round key
1036 pxor %xmm5, %xmm13 # "pnot"
1038 movdqa %xmm12, 0x40($out)
1039 movdqa %xmm13, 0x50($out)
1040 movdqa %xmm14, 0x60($out)
1041 movdqa %xmm15, 0x70($out)
1046 movdqa 0x50($const), %xmm7 # .L63
1047 #movdqa %xmm6, ($out) # don't save last round key
1049 .size _bsaes_key_convert,.-_bsaes_key_convert
1053 if (0 && !$win64) { # following four functions are unsupported interface
1054 # used for benchmarking...
1056 .globl bsaes_enc_key_convert
1057 .type bsaes_enc_key_convert,\@function,2
1059 bsaes_enc_key_convert:
1060 mov 240($inp),%r10d # pass rounds
1061 mov $inp,%rcx # pass key
1062 mov $out,%rax # pass key schedule
1063 call _bsaes_key_convert
1064 pxor %xmm6,%xmm7 # fix up last round key
1065 movdqa %xmm7,(%rax) # save last round key
1067 .size bsaes_enc_key_convert,.-bsaes_enc_key_convert
1069 .globl bsaes_encrypt_128
1070 .type bsaes_encrypt_128,\@function,4
1074 movdqu 0x00($inp), @XMM[0] # load input
1075 movdqu 0x10($inp), @XMM[1]
1076 movdqu 0x20($inp), @XMM[2]
1077 movdqu 0x30($inp), @XMM[3]
1078 movdqu 0x40($inp), @XMM[4]
1079 movdqu 0x50($inp), @XMM[5]
1080 movdqu 0x60($inp), @XMM[6]
1081 movdqu 0x70($inp), @XMM[7]
1082 mov $key, %rax # pass the $key
1083 lea 0x80($inp), $inp
1086 call _bsaes_encrypt8
1088 movdqu @XMM[0], 0x00($out) # write output
1089 movdqu @XMM[1], 0x10($out)
1090 movdqu @XMM[4], 0x20($out)
1091 movdqu @XMM[6], 0x30($out)
1092 movdqu @XMM[3], 0x40($out)
1093 movdqu @XMM[7], 0x50($out)
1094 movdqu @XMM[2], 0x60($out)
1095 movdqu @XMM[5], 0x70($out)
1096 lea 0x80($out), $out
1100 .size bsaes_encrypt_128,.-bsaes_encrypt_128
1102 .globl bsaes_dec_key_convert
1103 .type bsaes_dec_key_convert,\@function,2
1105 bsaes_dec_key_convert:
1106 mov 240($inp),%r10d # pass rounds
1107 mov $inp,%rcx # pass key
1108 mov $out,%rax # pass key schedule
1109 call _bsaes_key_convert
1110 pxor ($out),%xmm7 # fix up round 0 key
1111 movdqa %xmm6,(%rax) # save last round key
1114 .size bsaes_dec_key_convert,.-bsaes_dec_key_convert
1116 .globl bsaes_decrypt_128
1117 .type bsaes_decrypt_128,\@function,4
1121 movdqu 0x00($inp), @XMM[0] # load input
1122 movdqu 0x10($inp), @XMM[1]
1123 movdqu 0x20($inp), @XMM[2]
1124 movdqu 0x30($inp), @XMM[3]
1125 movdqu 0x40($inp), @XMM[4]
1126 movdqu 0x50($inp), @XMM[5]
1127 movdqu 0x60($inp), @XMM[6]
1128 movdqu 0x70($inp), @XMM[7]
1129 mov $key, %rax # pass the $key
1130 lea 0x80($inp), $inp
1133 call _bsaes_decrypt8
1135 movdqu @XMM[0], 0x00($out) # write output
1136 movdqu @XMM[1], 0x10($out)
1137 movdqu @XMM[6], 0x20($out)
1138 movdqu @XMM[4], 0x30($out)
1139 movdqu @XMM[2], 0x40($out)
1140 movdqu @XMM[7], 0x50($out)
1141 movdqu @XMM[3], 0x60($out)
1142 movdqu @XMM[5], 0x70($out)
1143 lea 0x80($out), $out
1147 .size bsaes_decrypt_128,.-bsaes_decrypt_128
1151 ######################################################################
1155 my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
1156 : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1157 my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
1161 .globl bsaes_ecb_encrypt_blocks
1162 .type bsaes_ecb_encrypt_blocks,\@abi-omnipotent
1164 bsaes_ecb_encrypt_blocks:
1173 lea -0x48(%rsp),%rsp
1175 $code.=<<___ if ($win64);
1176 lea -0xa0(%rsp), %rsp
1177 movaps %xmm6, 0x40(%rsp)
1178 movaps %xmm7, 0x50(%rsp)
1179 movaps %xmm8, 0x60(%rsp)
1180 movaps %xmm9, 0x70(%rsp)
1181 movaps %xmm10, 0x80(%rsp)
1182 movaps %xmm11, 0x90(%rsp)
1183 movaps %xmm12, 0xa0(%rsp)
1184 movaps %xmm13, 0xb0(%rsp)
1185 movaps %xmm14, 0xc0(%rsp)
1186 movaps %xmm15, 0xd0(%rsp)
1190 mov %rsp,%rbp # backup %rsp
1191 mov 240($arg4),%eax # rounds
1192 mov $arg1,$inp # backup arguments
1199 mov %eax,%ebx # backup rounds
1200 shl \$7,%rax # 128 bytes per inner round key
1201 sub \$`128-32`,%rax # size of bit-sliced key schedule
1203 mov %rsp,%rax # pass key schedule
1204 mov $key,%rcx # pass key
1205 mov %ebx,%r10d # pass rounds
1206 call _bsaes_key_convert
1207 pxor %xmm6,%xmm7 # fix up last round key
1208 movdqa %xmm7,(%rax) # save last round key
1212 movdqu 0x00($inp), @XMM[0] # load input
1213 movdqu 0x10($inp), @XMM[1]
1214 movdqu 0x20($inp), @XMM[2]
1215 movdqu 0x30($inp), @XMM[3]
1216 movdqu 0x40($inp), @XMM[4]
1217 movdqu 0x50($inp), @XMM[5]
1218 mov %rsp, %rax # pass key schedule
1219 movdqu 0x60($inp), @XMM[6]
1220 mov %ebx,%r10d # pass rounds
1221 movdqu 0x70($inp), @XMM[7]
1222 lea 0x80($inp), $inp
1224 call _bsaes_encrypt8
1226 movdqu @XMM[0], 0x00($out) # write output
1227 movdqu @XMM[1], 0x10($out)
1228 movdqu @XMM[4], 0x20($out)
1229 movdqu @XMM[6], 0x30($out)
1230 movdqu @XMM[3], 0x40($out)
1231 movdqu @XMM[7], 0x50($out)
1232 movdqu @XMM[2], 0x60($out)
1233 movdqu @XMM[5], 0x70($out)
1234 lea 0x80($out), $out
1241 movdqu 0x00($inp), @XMM[0] # load input
1242 mov %rsp, %rax # pass key schedule
1243 mov %ebx,%r10d # pass rounds
1246 movdqu 0x10($inp), @XMM[1]
1248 movdqu 0x20($inp), @XMM[2]
1251 movdqu 0x30($inp), @XMM[3]
1253 movdqu 0x40($inp), @XMM[4]
1256 movdqu 0x50($inp), @XMM[5]
1258 movdqu 0x60($inp), @XMM[6]
1259 call _bsaes_encrypt8
1260 movdqu @XMM[0], 0x00($out) # write output
1261 movdqu @XMM[1], 0x10($out)
1262 movdqu @XMM[4], 0x20($out)
1263 movdqu @XMM[6], 0x30($out)
1264 movdqu @XMM[3], 0x40($out)
1265 movdqu @XMM[7], 0x50($out)
1266 movdqu @XMM[2], 0x60($out)
1270 call _bsaes_encrypt8
1271 movdqu @XMM[0], 0x00($out) # write output
1272 movdqu @XMM[1], 0x10($out)
1273 movdqu @XMM[4], 0x20($out)
1274 movdqu @XMM[6], 0x30($out)
1275 movdqu @XMM[3], 0x40($out)
1276 movdqu @XMM[7], 0x50($out)
1280 call _bsaes_encrypt8
1281 movdqu @XMM[0], 0x00($out) # write output
1282 movdqu @XMM[1], 0x10($out)
1283 movdqu @XMM[4], 0x20($out)
1284 movdqu @XMM[6], 0x30($out)
1285 movdqu @XMM[3], 0x40($out)
1289 call _bsaes_encrypt8
1290 movdqu @XMM[0], 0x00($out) # write output
1291 movdqu @XMM[1], 0x10($out)
1292 movdqu @XMM[4], 0x20($out)
1293 movdqu @XMM[6], 0x30($out)
1297 call _bsaes_encrypt8
1298 movdqu @XMM[0], 0x00($out) # write output
1299 movdqu @XMM[1], 0x10($out)
1300 movdqu @XMM[4], 0x20($out)
1304 call _bsaes_encrypt8
1305 movdqu @XMM[0], 0x00($out) # write output
1306 movdqu @XMM[1], 0x10($out)
1310 call _bsaes_encrypt8
1311 movdqu @XMM[0], 0x00($out) # write output
1318 call asm_AES_encrypt
1327 .Lecb_enc_bzero: # wipe key schedule [if any]
1328 movdqa %xmm0, 0x00(%rax)
1329 movdqa %xmm0, 0x10(%rax)
1330 lea 0x20(%rax), %rax
1334 lea (%rbp),%rsp # restore %rsp
1336 $code.=<<___ if ($win64);
1337 movaps 0x40(%rbp), %xmm6
1338 movaps 0x50(%rbp), %xmm7
1339 movaps 0x60(%rbp), %xmm8
1340 movaps 0x70(%rbp), %xmm9
1341 movaps 0x80(%rbp), %xmm10
1342 movaps 0x90(%rbp), %xmm11
1343 movaps 0xa0(%rbp), %xmm12
1344 movaps 0xb0(%rbp), %xmm13
1345 movaps 0xc0(%rbp), %xmm14
1346 movaps 0xd0(%rbp), %xmm15
1347 lea 0xa0(%rbp), %rsp
1350 mov 0x48(%rsp), %r15
1351 mov 0x50(%rsp), %r14
1352 mov 0x58(%rsp), %r13
1353 mov 0x60(%rsp), %r12
1354 mov 0x68(%rsp), %rbx
1355 mov 0x70(%rsp), %rax
1356 lea 0x78(%rsp), %rsp
1360 .size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
1362 .globl bsaes_ecb_decrypt_blocks
1363 .type bsaes_ecb_decrypt_blocks,\@abi-omnipotent
1365 bsaes_ecb_decrypt_blocks:
1374 lea -0x48(%rsp),%rsp
1376 $code.=<<___ if ($win64);
1377 lea -0xa0(%rsp), %rsp
1378 movaps %xmm6, 0x40(%rsp)
1379 movaps %xmm7, 0x50(%rsp)
1380 movaps %xmm8, 0x60(%rsp)
1381 movaps %xmm9, 0x70(%rsp)
1382 movaps %xmm10, 0x80(%rsp)
1383 movaps %xmm11, 0x90(%rsp)
1384 movaps %xmm12, 0xa0(%rsp)
1385 movaps %xmm13, 0xb0(%rsp)
1386 movaps %xmm14, 0xc0(%rsp)
1387 movaps %xmm15, 0xd0(%rsp)
1391 mov %rsp,%rbp # backup %rsp
1392 mov 240($arg4),%eax # rounds
1393 mov $arg1,$inp # backup arguments
1400 mov %eax,%ebx # backup rounds
1401 shl \$7,%rax # 128 bytes per inner round key
1402 sub \$`128-32`,%rax # size of bit-sliced key schedule
1404 mov %rsp,%rax # pass key schedule
1405 mov $key,%rcx # pass key
1406 mov %ebx,%r10d # pass rounds
1407 call _bsaes_key_convert
1408 pxor (%rsp),%xmm7 # fix up 0 round key
1409 movdqa %xmm6,(%rax) # save last round key
1414 movdqu 0x00($inp), @XMM[0] # load input
1415 movdqu 0x10($inp), @XMM[1]
1416 movdqu 0x20($inp), @XMM[2]
1417 movdqu 0x30($inp), @XMM[3]
1418 movdqu 0x40($inp), @XMM[4]
1419 movdqu 0x50($inp), @XMM[5]
1420 mov %rsp, %rax # pass key schedule
1421 movdqu 0x60($inp), @XMM[6]
1422 mov %ebx,%r10d # pass rounds
1423 movdqu 0x70($inp), @XMM[7]
1424 lea 0x80($inp), $inp
1426 call _bsaes_decrypt8
1428 movdqu @XMM[0], 0x00($out) # write output
1429 movdqu @XMM[1], 0x10($out)
1430 movdqu @XMM[6], 0x20($out)
1431 movdqu @XMM[4], 0x30($out)
1432 movdqu @XMM[2], 0x40($out)
1433 movdqu @XMM[7], 0x50($out)
1434 movdqu @XMM[3], 0x60($out)
1435 movdqu @XMM[5], 0x70($out)
1436 lea 0x80($out), $out
1443 movdqu 0x00($inp), @XMM[0] # load input
1444 mov %rsp, %rax # pass key schedule
1445 mov %ebx,%r10d # pass rounds
1448 movdqu 0x10($inp), @XMM[1]
1450 movdqu 0x20($inp), @XMM[2]
1453 movdqu 0x30($inp), @XMM[3]
1455 movdqu 0x40($inp), @XMM[4]
1458 movdqu 0x50($inp), @XMM[5]
1460 movdqu 0x60($inp), @XMM[6]
1461 call _bsaes_decrypt8
1462 movdqu @XMM[0], 0x00($out) # write output
1463 movdqu @XMM[1], 0x10($out)
1464 movdqu @XMM[6], 0x20($out)
1465 movdqu @XMM[4], 0x30($out)
1466 movdqu @XMM[2], 0x40($out)
1467 movdqu @XMM[7], 0x50($out)
1468 movdqu @XMM[3], 0x60($out)
1472 call _bsaes_decrypt8
1473 movdqu @XMM[0], 0x00($out) # write output
1474 movdqu @XMM[1], 0x10($out)
1475 movdqu @XMM[6], 0x20($out)
1476 movdqu @XMM[4], 0x30($out)
1477 movdqu @XMM[2], 0x40($out)
1478 movdqu @XMM[7], 0x50($out)
1482 call _bsaes_decrypt8
1483 movdqu @XMM[0], 0x00($out) # write output
1484 movdqu @XMM[1], 0x10($out)
1485 movdqu @XMM[6], 0x20($out)
1486 movdqu @XMM[4], 0x30($out)
1487 movdqu @XMM[2], 0x40($out)
1491 call _bsaes_decrypt8
1492 movdqu @XMM[0], 0x00($out) # write output
1493 movdqu @XMM[1], 0x10($out)
1494 movdqu @XMM[6], 0x20($out)
1495 movdqu @XMM[4], 0x30($out)
1499 call _bsaes_decrypt8
1500 movdqu @XMM[0], 0x00($out) # write output
1501 movdqu @XMM[1], 0x10($out)
1502 movdqu @XMM[6], 0x20($out)
1506 call _bsaes_decrypt8
1507 movdqu @XMM[0], 0x00($out) # write output
1508 movdqu @XMM[1], 0x10($out)
1512 call _bsaes_decrypt8
1513 movdqu @XMM[0], 0x00($out) # write output
1520 call asm_AES_decrypt
1529 .Lecb_dec_bzero: # wipe key schedule [if any]
1530 movdqa %xmm0, 0x00(%rax)
1531 movdqa %xmm0, 0x10(%rax)
1532 lea 0x20(%rax), %rax
1536 lea (%rbp),%rsp # restore %rsp
1538 $code.=<<___ if ($win64);
1539 movaps 0x40(%rbp), %xmm6
1540 movaps 0x50(%rbp), %xmm7
1541 movaps 0x60(%rbp), %xmm8
1542 movaps 0x70(%rbp), %xmm9
1543 movaps 0x80(%rbp), %xmm10
1544 movaps 0x90(%rbp), %xmm11
1545 movaps 0xa0(%rbp), %xmm12
1546 movaps 0xb0(%rbp), %xmm13
1547 movaps 0xc0(%rbp), %xmm14
1548 movaps 0xd0(%rbp), %xmm15
1549 lea 0xa0(%rbp), %rsp
1552 mov 0x48(%rsp), %r15
1553 mov 0x50(%rsp), %r14
1554 mov 0x58(%rsp), %r13
1555 mov 0x60(%rsp), %r12
1556 mov 0x68(%rsp), %rbx
1557 mov 0x70(%rsp), %rax
1558 lea 0x78(%rsp), %rsp
1562 .size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
1566 .extern asm_AES_cbc_encrypt
1567 .globl bsaes_cbc_encrypt
1568 .type bsaes_cbc_encrypt,\@abi-omnipotent
1572 $code.=<<___ if ($win64);
1573 mov 48(%rsp),$arg6 # pull direction flag
1577 jne asm_AES_cbc_encrypt
1579 jb asm_AES_cbc_encrypt
1589 lea -0x48(%rsp), %rsp
1591 $code.=<<___ if ($win64);
1592 mov 0xa0(%rsp),$arg5 # pull ivp
1593 lea -0xa0(%rsp), %rsp
1594 movaps %xmm6, 0x40(%rsp)
1595 movaps %xmm7, 0x50(%rsp)
1596 movaps %xmm8, 0x60(%rsp)
1597 movaps %xmm9, 0x70(%rsp)
1598 movaps %xmm10, 0x80(%rsp)
1599 movaps %xmm11, 0x90(%rsp)
1600 movaps %xmm12, 0xa0(%rsp)
1601 movaps %xmm13, 0xb0(%rsp)
1602 movaps %xmm14, 0xc0(%rsp)
1603 movaps %xmm15, 0xd0(%rsp)
1607 mov %rsp, %rbp # backup %rsp
1608 mov 240($arg4), %eax # rounds
1609 mov $arg1, $inp # backup arguments
1614 shr \$4, $len # bytes to blocks
1616 mov %eax, %edx # rounds
1617 shl \$7, %rax # 128 bytes per inner round key
1618 sub \$`128-32`, %rax # size of bit-sliced key schedule
1621 mov %rsp, %rax # pass key schedule
1622 mov $key, %rcx # pass key
1623 mov %edx, %r10d # pass rounds
1624 call _bsaes_key_convert
1625 pxor (%rsp),%xmm7 # fix up 0 round key
1626 movdqa %xmm6,(%rax) # save last round key
1629 movdqu (%rbx), @XMM[15] # load IV
1632 movdqu 0x00($inp), @XMM[0] # load input
1633 movdqu 0x10($inp), @XMM[1]
1634 movdqu 0x20($inp), @XMM[2]
1635 movdqu 0x30($inp), @XMM[3]
1636 movdqu 0x40($inp), @XMM[4]
1637 movdqu 0x50($inp), @XMM[5]
1638 mov %rsp, %rax # pass key schedule
1639 movdqu 0x60($inp), @XMM[6]
1640 mov %edx,%r10d # pass rounds
1641 movdqu 0x70($inp), @XMM[7]
1642 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1644 call _bsaes_decrypt8
1646 pxor 0x20(%rbp), @XMM[0] # ^= IV
1647 movdqu 0x00($inp), @XMM[8] # re-load input
1648 movdqu 0x10($inp), @XMM[9]
1649 pxor @XMM[8], @XMM[1]
1650 movdqu 0x20($inp), @XMM[10]
1651 pxor @XMM[9], @XMM[6]
1652 movdqu 0x30($inp), @XMM[11]
1653 pxor @XMM[10], @XMM[4]
1654 movdqu 0x40($inp), @XMM[12]
1655 pxor @XMM[11], @XMM[2]
1656 movdqu 0x50($inp), @XMM[13]
1657 pxor @XMM[12], @XMM[7]
1658 movdqu 0x60($inp), @XMM[14]
1659 pxor @XMM[13], @XMM[3]
1660 movdqu 0x70($inp), @XMM[15] # IV
1661 pxor @XMM[14], @XMM[5]
1662 movdqu @XMM[0], 0x00($out) # write output
1663 lea 0x80($inp), $inp
1664 movdqu @XMM[1], 0x10($out)
1665 movdqu @XMM[6], 0x20($out)
1666 movdqu @XMM[4], 0x30($out)
1667 movdqu @XMM[2], 0x40($out)
1668 movdqu @XMM[7], 0x50($out)
1669 movdqu @XMM[3], 0x60($out)
1670 movdqu @XMM[5], 0x70($out)
1671 lea 0x80($out), $out
1678 movdqu 0x00($inp), @XMM[0] # load input
1679 mov %rsp, %rax # pass key schedule
1680 mov %edx, %r10d # pass rounds
1683 movdqu 0x10($inp), @XMM[1]
1685 movdqu 0x20($inp), @XMM[2]
1688 movdqu 0x30($inp), @XMM[3]
1690 movdqu 0x40($inp), @XMM[4]
1693 movdqu 0x50($inp), @XMM[5]
1695 movdqu 0x60($inp), @XMM[6]
1696 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1697 call _bsaes_decrypt8
1698 pxor 0x20(%rbp), @XMM[0] # ^= IV
1699 movdqu 0x00($inp), @XMM[8] # re-load input
1700 movdqu 0x10($inp), @XMM[9]
1701 pxor @XMM[8], @XMM[1]
1702 movdqu 0x20($inp), @XMM[10]
1703 pxor @XMM[9], @XMM[6]
1704 movdqu 0x30($inp), @XMM[11]
1705 pxor @XMM[10], @XMM[4]
1706 movdqu 0x40($inp), @XMM[12]
1707 pxor @XMM[11], @XMM[2]
1708 movdqu 0x50($inp), @XMM[13]
1709 pxor @XMM[12], @XMM[7]
1710 movdqu 0x60($inp), @XMM[15] # IV
1711 pxor @XMM[13], @XMM[3]
1712 movdqu @XMM[0], 0x00($out) # write output
1713 movdqu @XMM[1], 0x10($out)
1714 movdqu @XMM[6], 0x20($out)
1715 movdqu @XMM[4], 0x30($out)
1716 movdqu @XMM[2], 0x40($out)
1717 movdqu @XMM[7], 0x50($out)
1718 movdqu @XMM[3], 0x60($out)
1722 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1723 call _bsaes_decrypt8
1724 pxor 0x20(%rbp), @XMM[0] # ^= IV
1725 movdqu 0x00($inp), @XMM[8] # re-load input
1726 movdqu 0x10($inp), @XMM[9]
1727 pxor @XMM[8], @XMM[1]
1728 movdqu 0x20($inp), @XMM[10]
1729 pxor @XMM[9], @XMM[6]
1730 movdqu 0x30($inp), @XMM[11]
1731 pxor @XMM[10], @XMM[4]
1732 movdqu 0x40($inp), @XMM[12]
1733 pxor @XMM[11], @XMM[2]
1734 movdqu 0x50($inp), @XMM[15] # IV
1735 pxor @XMM[12], @XMM[7]
1736 movdqu @XMM[0], 0x00($out) # write output
1737 movdqu @XMM[1], 0x10($out)
1738 movdqu @XMM[6], 0x20($out)
1739 movdqu @XMM[4], 0x30($out)
1740 movdqu @XMM[2], 0x40($out)
1741 movdqu @XMM[7], 0x50($out)
1745 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1746 call _bsaes_decrypt8
1747 pxor 0x20(%rbp), @XMM[0] # ^= IV
1748 movdqu 0x00($inp), @XMM[8] # re-load input
1749 movdqu 0x10($inp), @XMM[9]
1750 pxor @XMM[8], @XMM[1]
1751 movdqu 0x20($inp), @XMM[10]
1752 pxor @XMM[9], @XMM[6]
1753 movdqu 0x30($inp), @XMM[11]
1754 pxor @XMM[10], @XMM[4]
1755 movdqu 0x40($inp), @XMM[15] # IV
1756 pxor @XMM[11], @XMM[2]
1757 movdqu @XMM[0], 0x00($out) # write output
1758 movdqu @XMM[1], 0x10($out)
1759 movdqu @XMM[6], 0x20($out)
1760 movdqu @XMM[4], 0x30($out)
1761 movdqu @XMM[2], 0x40($out)
1765 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1766 call _bsaes_decrypt8
1767 pxor 0x20(%rbp), @XMM[0] # ^= IV
1768 movdqu 0x00($inp), @XMM[8] # re-load input
1769 movdqu 0x10($inp), @XMM[9]
1770 pxor @XMM[8], @XMM[1]
1771 movdqu 0x20($inp), @XMM[10]
1772 pxor @XMM[9], @XMM[6]
1773 movdqu 0x30($inp), @XMM[15] # IV
1774 pxor @XMM[10], @XMM[4]
1775 movdqu @XMM[0], 0x00($out) # write output
1776 movdqu @XMM[1], 0x10($out)
1777 movdqu @XMM[6], 0x20($out)
1778 movdqu @XMM[4], 0x30($out)
1782 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1783 call _bsaes_decrypt8
1784 pxor 0x20(%rbp), @XMM[0] # ^= IV
1785 movdqu 0x00($inp), @XMM[8] # re-load input
1786 movdqu 0x10($inp), @XMM[9]
1787 pxor @XMM[8], @XMM[1]
1788 movdqu 0x20($inp), @XMM[15] # IV
1789 pxor @XMM[9], @XMM[6]
1790 movdqu @XMM[0], 0x00($out) # write output
1791 movdqu @XMM[1], 0x10($out)
1792 movdqu @XMM[6], 0x20($out)
1796 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1797 call _bsaes_decrypt8
1798 pxor 0x20(%rbp), @XMM[0] # ^= IV
1799 movdqu 0x00($inp), @XMM[8] # re-load input
1800 movdqu 0x10($inp), @XMM[15] # IV
1801 pxor @XMM[8], @XMM[1]
1802 movdqu @XMM[0], 0x00($out) # write output
1803 movdqu @XMM[1], 0x10($out)
1808 lea 0x20(%rbp), $arg2 # buffer output
1810 call asm_AES_decrypt # doesn't touch %xmm
1811 pxor 0x20(%rbp), @XMM[15] # ^= IV
1812 movdqu @XMM[15], ($out) # write output
1813 movdqa @XMM[0], @XMM[15] # IV
1816 movdqu @XMM[15], (%rbx) # return IV
1819 .Lcbc_dec_bzero: # wipe key schedule [if any]
1820 movdqa %xmm0, 0x00(%rax)
1821 movdqa %xmm0, 0x10(%rax)
1822 lea 0x20(%rax), %rax
1826 lea (%rbp),%rsp # restore %rsp
1828 $code.=<<___ if ($win64);
1829 movaps 0x40(%rbp), %xmm6
1830 movaps 0x50(%rbp), %xmm7
1831 movaps 0x60(%rbp), %xmm8
1832 movaps 0x70(%rbp), %xmm9
1833 movaps 0x80(%rbp), %xmm10
1834 movaps 0x90(%rbp), %xmm11
1835 movaps 0xa0(%rbp), %xmm12
1836 movaps 0xb0(%rbp), %xmm13
1837 movaps 0xc0(%rbp), %xmm14
1838 movaps 0xd0(%rbp), %xmm15
1839 lea 0xa0(%rbp), %rsp
1842 mov 0x48(%rsp), %r15
1843 mov 0x50(%rsp), %r14
1844 mov 0x58(%rsp), %r13
1845 mov 0x60(%rsp), %r12
1846 mov 0x68(%rsp), %rbx
1847 mov 0x70(%rsp), %rax
1848 lea 0x78(%rsp), %rsp
1852 .size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
1854 .globl bsaes_ctr32_encrypt_blocks
1855 .type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
1857 bsaes_ctr32_encrypt_blocks:
1866 lea -0x48(%rsp), %rsp
1868 $code.=<<___ if ($win64);
1869 mov 0xa0(%rsp),$arg5 # pull ivp
1870 lea -0xa0(%rsp), %rsp
1871 movaps %xmm6, 0x40(%rsp)
1872 movaps %xmm7, 0x50(%rsp)
1873 movaps %xmm8, 0x60(%rsp)
1874 movaps %xmm9, 0x70(%rsp)
1875 movaps %xmm10, 0x80(%rsp)
1876 movaps %xmm11, 0x90(%rsp)
1877 movaps %xmm12, 0xa0(%rsp)
1878 movaps %xmm13, 0xb0(%rsp)
1879 movaps %xmm14, 0xc0(%rsp)
1880 movaps %xmm15, 0xd0(%rsp)
1884 mov %rsp, %rbp # backup %rsp
1885 movdqu ($arg5), %xmm0 # load counter
1886 mov 240($arg4), %eax # rounds
1887 mov $arg1, $inp # backup arguments
1891 movdqa %xmm0, 0x20(%rbp) # copy counter
1895 mov %eax, %ebx # rounds
1896 shl \$7, %rax # 128 bytes per inner round key
1897 sub \$`128-32`, %rax # size of bit-sliced key schedule
1900 mov %rsp, %rax # pass key schedule
1901 mov $key, %rcx # pass key
1902 mov %ebx, %r10d # pass rounds
1903 call _bsaes_key_convert
1904 pxor %xmm6,%xmm7 # fix up last round key
1905 movdqa %xmm7,(%rax) # save last round key
1907 movdqa (%rsp), @XMM[9] # load round0 key
1908 lea .LADD1(%rip), %r11
1909 movdqa 0x20(%rbp), @XMM[0] # counter copy
1910 movdqa -0x20(%r11), @XMM[8] # .LSWPUP
1911 pshufb @XMM[8], @XMM[9] # byte swap upper part
1912 pshufb @XMM[8], @XMM[0]
1913 movdqa @XMM[9], (%rsp) # save adjusted round0 key
1917 movdqa @XMM[0], 0x20(%rbp) # save counter
1918 movdqa @XMM[0], @XMM[1] # prepare 8 counter values
1919 movdqa @XMM[0], @XMM[2]
1920 paddd 0x00(%r11), @XMM[1] # .LADD1
1921 movdqa @XMM[0], @XMM[3]
1922 paddd 0x10(%r11), @XMM[2] # .LADD2
1923 movdqa @XMM[0], @XMM[4]
1924 paddd 0x20(%r11), @XMM[3] # .LADD3
1925 movdqa @XMM[0], @XMM[5]
1926 paddd 0x30(%r11), @XMM[4] # .LADD4
1927 movdqa @XMM[0], @XMM[6]
1928 paddd 0x40(%r11), @XMM[5] # .LADD5
1929 movdqa @XMM[0], @XMM[7]
1930 paddd 0x50(%r11), @XMM[6] # .LADD6
1931 paddd 0x60(%r11), @XMM[7] # .LADD7
1933 # Borrow prologue from _bsaes_encrypt8 to use the opportunity
1934 # to flip byte order in 32-bit counter
1935 movdqa (%rsp), @XMM[9] # round 0 key
1936 lea 0x10(%rsp), %rax # pass key schedule
1937 movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR
1938 pxor @XMM[9], @XMM[0] # xor with round0 key
1939 pxor @XMM[9], @XMM[1]
1940 pshufb @XMM[8], @XMM[0]
1941 pxor @XMM[9], @XMM[2]
1942 pshufb @XMM[8], @XMM[1]
1943 pxor @XMM[9], @XMM[3]
1944 pshufb @XMM[8], @XMM[2]
1945 pxor @XMM[9], @XMM[4]
1946 pshufb @XMM[8], @XMM[3]
1947 pxor @XMM[9], @XMM[5]
1948 pshufb @XMM[8], @XMM[4]
1949 pxor @XMM[9], @XMM[6]
1950 pshufb @XMM[8], @XMM[5]
1951 pxor @XMM[9], @XMM[7]
1952 pshufb @XMM[8], @XMM[6]
1953 lea .LBS0(%rip), %r11 # constants table
1954 pshufb @XMM[8], @XMM[7]
1955 mov %ebx,%r10d # pass rounds
1957 call _bsaes_encrypt8_bitslice
1960 jc .Lctr_enc_loop_done
1962 movdqu 0x00($inp), @XMM[8] # load input
1963 movdqu 0x10($inp), @XMM[9]
1964 movdqu 0x20($inp), @XMM[10]
1965 movdqu 0x30($inp), @XMM[11]
1966 movdqu 0x40($inp), @XMM[12]
1967 movdqu 0x50($inp), @XMM[13]
1968 movdqu 0x60($inp), @XMM[14]
1969 movdqu 0x70($inp), @XMM[15]
1971 pxor @XMM[0], @XMM[8]
1972 movdqa 0x20(%rbp), @XMM[0] # load counter
1973 pxor @XMM[9], @XMM[1]
1974 movdqu @XMM[8], 0x00($out) # write output
1975 pxor @XMM[10], @XMM[4]
1976 movdqu @XMM[1], 0x10($out)
1977 pxor @XMM[11], @XMM[6]
1978 movdqu @XMM[4], 0x20($out)
1979 pxor @XMM[12], @XMM[3]
1980 movdqu @XMM[6], 0x30($out)
1981 pxor @XMM[13], @XMM[7]
1982 movdqu @XMM[3], 0x40($out)
1983 pxor @XMM[14], @XMM[2]
1984 movdqu @XMM[7], 0x50($out)
1985 pxor @XMM[15], @XMM[5]
1986 movdqu @XMM[2], 0x60($out)
1987 lea .LADD1(%rip), %r11
1988 movdqu @XMM[5], 0x70($out)
1989 lea 0x80($out), $out
1990 paddd 0x70(%r11), @XMM[0] # .LADD8
1995 .Lctr_enc_loop_done:
1997 movdqu 0x00($inp), @XMM[8] # load input
1998 pxor @XMM[8], @XMM[0]
1999 movdqu @XMM[0], 0x00($out) # write output
2002 movdqu 0x10($inp), @XMM[9]
2003 pxor @XMM[9], @XMM[1]
2004 movdqu @XMM[1], 0x10($out)
2006 movdqu 0x20($inp), @XMM[10]
2007 pxor @XMM[10], @XMM[4]
2008 movdqu @XMM[4], 0x20($out)
2011 movdqu 0x30($inp), @XMM[11]
2012 pxor @XMM[11], @XMM[6]
2013 movdqu @XMM[6], 0x30($out)
2015 movdqu 0x40($inp), @XMM[12]
2016 pxor @XMM[12], @XMM[3]
2017 movdqu @XMM[3], 0x40($out)
2020 movdqu 0x50($inp), @XMM[13]
2021 pxor @XMM[13], @XMM[7]
2022 movdqu @XMM[7], 0x50($out)
2024 movdqu 0x60($inp), @XMM[14]
2025 pxor @XMM[14], @XMM[2]
2026 movdqu @XMM[2], 0x60($out)
2031 lea 0x20(%rbp), $arg1
2032 lea 0x30(%rbp), $arg2
2034 call asm_AES_encrypt
2035 movdqu ($inp), @XMM[1]
2037 mov 0x2c(%rbp), %eax # load 32-bit counter
2039 pxor 0x30(%rbp), @XMM[1]
2040 inc %eax # increment
2041 movdqu @XMM[1], ($out)
2044 mov %eax, 0x2c(%rsp) # save 32-bit counter
2051 .Lctr_enc_bzero: # wipe key schedule [if any]
2052 movdqa %xmm0, 0x00(%rax)
2053 movdqa %xmm0, 0x10(%rax)
2054 lea 0x20(%rax), %rax
2058 lea (%rbp),%rsp # restore %rsp
2060 $code.=<<___ if ($win64);
2061 movaps 0x40(%rbp), %xmm6
2062 movaps 0x50(%rbp), %xmm7
2063 movaps 0x60(%rbp), %xmm8
2064 movaps 0x70(%rbp), %xmm9
2065 movaps 0x80(%rbp), %xmm10
2066 movaps 0x90(%rbp), %xmm11
2067 movaps 0xa0(%rbp), %xmm12
2068 movaps 0xb0(%rbp), %xmm13
2069 movaps 0xc0(%rbp), %xmm14
2070 movaps 0xd0(%rbp), %xmm15
2071 lea 0xa0(%rbp), %rsp
2074 mov 0x48(%rsp), %r15
2075 mov 0x50(%rsp), %r14
2076 mov 0x58(%rsp), %r13
2077 mov 0x60(%rsp), %r12
2078 mov 0x68(%rsp), %rbx
2079 mov 0x70(%rsp), %rax
2080 lea 0x78(%rsp), %rsp
2084 .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
2086 ######################################################################
2087 # void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
2088 # const AES_KEY *key1, const AES_KEY *key2,
2089 # const unsigned char iv[16]);
# Scratch XMM registers reserved for the XTS tweak computation:
#   $twmask - mask loaded from .Lxts_magic (isolates carry/residue bits),
#   $twres  - per-step residue produced by pshufd/pand and xor-ed into the tweak,
#   $twtmp  - temporary holding the sign-broadcast from pcmpgtd.
# They alias @XMM[13..15], so they are clobbered across _bsaes_{en,de}crypt8 calls.
2091 my ($twmask,$twres,$twtmp)=@XMM[13..15];
2095 .globl bsaes_xts_encrypt
2096 .type bsaes_xts_encrypt,\@abi-omnipotent
2107 lea -0x48(%rsp), %rsp
2109 $code.=<<___ if ($win64);
2110 mov 0xa0(%rsp),$arg5 # pull key2
2111 mov 0xa8(%rsp),$arg6 # pull ivp
2112 lea -0xa0(%rsp), %rsp
2113 movaps %xmm6, 0x40(%rsp)
2114 movaps %xmm7, 0x50(%rsp)
2115 movaps %xmm8, 0x60(%rsp)
2116 movaps %xmm9, 0x70(%rsp)
2117 movaps %xmm10, 0x80(%rsp)
2118 movaps %xmm11, 0x90(%rsp)
2119 movaps %xmm12, 0xa0(%rsp)
2120 movaps %xmm13, 0xb0(%rsp)
2121 movaps %xmm14, 0xc0(%rsp)
2122 movaps %xmm15, 0xd0(%rsp)
2126 mov %rsp, %rbp # backup %rsp
2127 mov $arg1, $inp # backup arguments
2133 lea 0x20(%rbp), $arg2
2135 call asm_AES_encrypt # generate initial tweak
2137 mov 240($key), %eax # rounds
2138 mov $len, %rbx # backup $len
2140 mov %eax, %edx # rounds
2141 shl \$7, %rax # 128 bytes per inner round key
2142 sub \$`128-32`, %rax # size of bit-sliced key schedule
2145 mov %rsp, %rax # pass key schedule
2146 mov $key, %rcx # pass key
2147 mov %edx, %r10d # pass rounds
2148 call _bsaes_key_convert
2149 pxor %xmm6, %xmm7 # fix up last round key
2150 movdqa %xmm7, (%rax) # save last round key
2153 sub \$0x80, %rsp # place for tweak[8]
2154 movdqa 0x20(%rbp), @XMM[7] # initial tweak
2157 movdqa .Lxts_magic(%rip), $twmask
2158 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2167 for ($i=0;$i<7;$i++) {
2169 pshufd \$0x13, $twtmp, $twres
2171 movdqa @XMM[7], @XMM[$i]
2172 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2173 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2174 pand $twmask, $twres # isolate carry and residue
2175 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2176 pxor $twres, @XMM[7]
2178 $code.=<<___ if ($i>=1);
2179 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2181 $code.=<<___ if ($i>=2);
2182 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2186 movdqu 0x60($inp), @XMM[8+6]
2187 pxor @XMM[8+5], @XMM[5]
2188 movdqu 0x70($inp), @XMM[8+7]
2189 lea 0x80($inp), $inp
2190 movdqa @XMM[7], 0x70(%rsp)
2191 pxor @XMM[8+6], @XMM[6]
2192 lea 0x80(%rsp), %rax # pass key schedule
2193 pxor @XMM[8+7], @XMM[7]
2194 mov %edx, %r10d # pass rounds
2196 call _bsaes_encrypt8
2198 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2199 pxor 0x10(%rsp), @XMM[1]
2200 movdqu @XMM[0], 0x00($out) # write output
2201 pxor 0x20(%rsp), @XMM[4]
2202 movdqu @XMM[1], 0x10($out)
2203 pxor 0x30(%rsp), @XMM[6]
2204 movdqu @XMM[4], 0x20($out)
2205 pxor 0x40(%rsp), @XMM[3]
2206 movdqu @XMM[6], 0x30($out)
2207 pxor 0x50(%rsp), @XMM[7]
2208 movdqu @XMM[3], 0x40($out)
2209 pxor 0x60(%rsp), @XMM[2]
2210 movdqu @XMM[7], 0x50($out)
2211 pxor 0x70(%rsp), @XMM[5]
2212 movdqu @XMM[2], 0x60($out)
2213 movdqu @XMM[5], 0x70($out)
2214 lea 0x80($out), $out
2216 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
2218 movdqa .Lxts_magic(%rip), $twmask
2219 pcmpgtd @XMM[7], $twtmp
2220 pshufd \$0x13, $twtmp, $twres
2222 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2223 pand $twmask, $twres # isolate carry and residue
2224 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2225 pxor $twres, @XMM[7]
2234 for ($i=0;$i<7;$i++) {
2236 pshufd \$0x13, $twtmp, $twres
2238 movdqa @XMM[7], @XMM[$i]
2239 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2240 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2241 pand $twmask, $twres # isolate carry and residue
2242 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2243 pxor $twres, @XMM[7]
2245 $code.=<<___ if ($i>=1);
2246 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2247 cmp \$`0x10*$i`,$len
2250 $code.=<<___ if ($i>=2);
2251 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2255 movdqu 0x60($inp), @XMM[8+6]
2256 pxor @XMM[8+5], @XMM[5]
2257 movdqa @XMM[7], 0x70(%rsp)
2258 lea 0x70($inp), $inp
2259 pxor @XMM[8+6], @XMM[6]
2260 lea 0x80(%rsp), %rax # pass key schedule
2261 mov %edx, %r10d # pass rounds
2263 call _bsaes_encrypt8
2265 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2266 pxor 0x10(%rsp), @XMM[1]
2267 movdqu @XMM[0], 0x00($out) # write output
2268 pxor 0x20(%rsp), @XMM[4]
2269 movdqu @XMM[1], 0x10($out)
2270 pxor 0x30(%rsp), @XMM[6]
2271 movdqu @XMM[4], 0x20($out)
2272 pxor 0x40(%rsp), @XMM[3]
2273 movdqu @XMM[6], 0x30($out)
2274 pxor 0x50(%rsp), @XMM[7]
2275 movdqu @XMM[3], 0x40($out)
2276 pxor 0x60(%rsp), @XMM[2]
2277 movdqu @XMM[7], 0x50($out)
2278 movdqu @XMM[2], 0x60($out)
2279 lea 0x70($out), $out
2281 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
2285 pxor @XMM[8+4], @XMM[4]
2286 lea 0x60($inp), $inp
2287 pxor @XMM[8+5], @XMM[5]
2288 lea 0x80(%rsp), %rax # pass key schedule
2289 mov %edx, %r10d # pass rounds
2291 call _bsaes_encrypt8
2293 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2294 pxor 0x10(%rsp), @XMM[1]
2295 movdqu @XMM[0], 0x00($out) # write output
2296 pxor 0x20(%rsp), @XMM[4]
2297 movdqu @XMM[1], 0x10($out)
2298 pxor 0x30(%rsp), @XMM[6]
2299 movdqu @XMM[4], 0x20($out)
2300 pxor 0x40(%rsp), @XMM[3]
2301 movdqu @XMM[6], 0x30($out)
2302 pxor 0x50(%rsp), @XMM[7]
2303 movdqu @XMM[3], 0x40($out)
2304 movdqu @XMM[7], 0x50($out)
2305 lea 0x60($out), $out
2307 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
2311 pxor @XMM[8+3], @XMM[3]
2312 lea 0x50($inp), $inp
2313 pxor @XMM[8+4], @XMM[4]
2314 lea 0x80(%rsp), %rax # pass key schedule
2315 mov %edx, %r10d # pass rounds
2317 call _bsaes_encrypt8
2319 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2320 pxor 0x10(%rsp), @XMM[1]
2321 movdqu @XMM[0], 0x00($out) # write output
2322 pxor 0x20(%rsp), @XMM[4]
2323 movdqu @XMM[1], 0x10($out)
2324 pxor 0x30(%rsp), @XMM[6]
2325 movdqu @XMM[4], 0x20($out)
2326 pxor 0x40(%rsp), @XMM[3]
2327 movdqu @XMM[6], 0x30($out)
2328 movdqu @XMM[3], 0x40($out)
2329 lea 0x50($out), $out
2331 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
2335 pxor @XMM[8+2], @XMM[2]
2336 lea 0x40($inp), $inp
2337 pxor @XMM[8+3], @XMM[3]
2338 lea 0x80(%rsp), %rax # pass key schedule
2339 mov %edx, %r10d # pass rounds
2341 call _bsaes_encrypt8
2343 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2344 pxor 0x10(%rsp), @XMM[1]
2345 movdqu @XMM[0], 0x00($out) # write output
2346 pxor 0x20(%rsp), @XMM[4]
2347 movdqu @XMM[1], 0x10($out)
2348 pxor 0x30(%rsp), @XMM[6]
2349 movdqu @XMM[4], 0x20($out)
2350 movdqu @XMM[6], 0x30($out)
2351 lea 0x40($out), $out
2353 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
2357 pxor @XMM[8+1], @XMM[1]
2358 lea 0x30($inp), $inp
2359 pxor @XMM[8+2], @XMM[2]
2360 lea 0x80(%rsp), %rax # pass key schedule
2361 mov %edx, %r10d # pass rounds
2363 call _bsaes_encrypt8
2365 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2366 pxor 0x10(%rsp), @XMM[1]
2367 movdqu @XMM[0], 0x00($out) # write output
2368 pxor 0x20(%rsp), @XMM[4]
2369 movdqu @XMM[1], 0x10($out)
2370 movdqu @XMM[4], 0x20($out)
2371 lea 0x30($out), $out
2373 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
2377 pxor @XMM[8+0], @XMM[0]
2378 lea 0x20($inp), $inp
2379 pxor @XMM[8+1], @XMM[1]
2380 lea 0x80(%rsp), %rax # pass key schedule
2381 mov %edx, %r10d # pass rounds
2383 call _bsaes_encrypt8
2385 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2386 pxor 0x10(%rsp), @XMM[1]
2387 movdqu @XMM[0], 0x00($out) # write output
2388 movdqu @XMM[1], 0x10($out)
2389 lea 0x20($out), $out
2391 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
2395 pxor @XMM[0], @XMM[8]
2396 lea 0x10($inp), $inp
2397 movdqa @XMM[8], 0x20(%rbp)
2398 lea 0x20(%rbp), $arg1
2399 lea 0x20(%rbp), $arg2
2401 call asm_AES_encrypt # doesn't touch %xmm
2402 pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
2403 #pxor @XMM[8], @XMM[0]
2404 #lea 0x80(%rsp), %rax # pass key schedule
2405 #mov %edx, %r10d # pass rounds
2406 #call _bsaes_encrypt8
2407 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2408 movdqu @XMM[0], 0x00($out) # write output
2409 lea 0x10($out), $out
2411 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
2420 movzb -16(%rdx), %ecx
2428 movdqu -16($out), @XMM[0]
2429 lea 0x20(%rbp), $arg1
2430 pxor @XMM[7], @XMM[0]
2431 lea 0x20(%rbp), $arg2
2432 movdqa @XMM[0], 0x20(%rbp)
2434 call asm_AES_encrypt # doesn't touch %xmm
2435 pxor 0x20(%rbp), @XMM[7]
2436 movdqu @XMM[7], -16($out)
2441 .Lxts_enc_bzero: # wipe key schedule [if any]
2442 movdqa %xmm0, 0x00(%rax)
2443 movdqa %xmm0, 0x10(%rax)
2444 lea 0x20(%rax), %rax
2448 lea (%rbp),%rsp # restore %rsp
2450 $code.=<<___ if ($win64);
2451 movaps 0x40(%rbp), %xmm6
2452 movaps 0x50(%rbp), %xmm7
2453 movaps 0x60(%rbp), %xmm8
2454 movaps 0x70(%rbp), %xmm9
2455 movaps 0x80(%rbp), %xmm10
2456 movaps 0x90(%rbp), %xmm11
2457 movaps 0xa0(%rbp), %xmm12
2458 movaps 0xb0(%rbp), %xmm13
2459 movaps 0xc0(%rbp), %xmm14
2460 movaps 0xd0(%rbp), %xmm15
2461 lea 0xa0(%rbp), %rsp
2464 mov 0x48(%rsp), %r15
2465 mov 0x50(%rsp), %r14
2466 mov 0x58(%rsp), %r13
2467 mov 0x60(%rsp), %r12
2468 mov 0x68(%rsp), %rbx
2469 mov 0x70(%rsp), %rax
2470 lea 0x78(%rsp), %rsp
2474 .size bsaes_xts_encrypt,.-bsaes_xts_encrypt
2476 .globl bsaes_xts_decrypt
2477 .type bsaes_xts_decrypt,\@abi-omnipotent
2488 lea -0x48(%rsp), %rsp
2490 $code.=<<___ if ($win64);
2491 mov 0xa0(%rsp),$arg5 # pull key2
2492 mov 0xa8(%rsp),$arg6 # pull ivp
2493 lea -0xa0(%rsp), %rsp
2494 movaps %xmm6, 0x40(%rsp)
2495 movaps %xmm7, 0x50(%rsp)
2496 movaps %xmm8, 0x60(%rsp)
2497 movaps %xmm9, 0x70(%rsp)
2498 movaps %xmm10, 0x80(%rsp)
2499 movaps %xmm11, 0x90(%rsp)
2500 movaps %xmm12, 0xa0(%rsp)
2501 movaps %xmm13, 0xb0(%rsp)
2502 movaps %xmm14, 0xc0(%rsp)
2503 movaps %xmm15, 0xd0(%rsp)
2507 mov %rsp, %rbp # backup %rsp
2508 mov $arg1, $inp # backup arguments
2514 lea 0x20(%rbp), $arg2
2516 call asm_AES_encrypt # generate initial tweak
2518 mov 240($key), %eax # rounds
2519 mov $len, %rbx # backup $len
2521 mov %eax, %edx # rounds
2522 shl \$7, %rax # 128 bytes per inner round key
2523 sub \$`128-32`, %rax # size of bit-sliced key schedule
2526 mov %rsp, %rax # pass key schedule
2527 mov $key, %rcx # pass key
2528 mov %edx, %r10d # pass rounds
2529 call _bsaes_key_convert
2530 pxor (%rsp), %xmm7 # fix up round 0 key
2531 movdqa %xmm6, (%rax) # save last round key
2532 movdqa %xmm7, (%rsp)
2534 xor %eax, %eax # if ($len%16) len-=16;
2541 sub \$0x80, %rsp # place for tweak[8]
2542 movdqa 0x20(%rbp), @XMM[7] # initial tweak
2545 movdqa .Lxts_magic(%rip), $twmask
2546 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2555 for ($i=0;$i<7;$i++) {
2557 pshufd \$0x13, $twtmp, $twres
2559 movdqa @XMM[7], @XMM[$i]
2560 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2561 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2562 pand $twmask, $twres # isolate carry and residue
2563 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2564 pxor $twres, @XMM[7]
2566 $code.=<<___ if ($i>=1);
2567 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2569 $code.=<<___ if ($i>=2);
2570 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2574 movdqu 0x60($inp), @XMM[8+6]
2575 pxor @XMM[8+5], @XMM[5]
2576 movdqu 0x70($inp), @XMM[8+7]
2577 lea 0x80($inp), $inp
2578 movdqa @XMM[7], 0x70(%rsp)
2579 pxor @XMM[8+6], @XMM[6]
2580 lea 0x80(%rsp), %rax # pass key schedule
2581 pxor @XMM[8+7], @XMM[7]
2582 mov %edx, %r10d # pass rounds
2584 call _bsaes_decrypt8
2586 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2587 pxor 0x10(%rsp), @XMM[1]
2588 movdqu @XMM[0], 0x00($out) # write output
2589 pxor 0x20(%rsp), @XMM[6]
2590 movdqu @XMM[1], 0x10($out)
2591 pxor 0x30(%rsp), @XMM[4]
2592 movdqu @XMM[6], 0x20($out)
2593 pxor 0x40(%rsp), @XMM[2]
2594 movdqu @XMM[4], 0x30($out)
2595 pxor 0x50(%rsp), @XMM[7]
2596 movdqu @XMM[2], 0x40($out)
2597 pxor 0x60(%rsp), @XMM[3]
2598 movdqu @XMM[7], 0x50($out)
2599 pxor 0x70(%rsp), @XMM[5]
2600 movdqu @XMM[3], 0x60($out)
2601 movdqu @XMM[5], 0x70($out)
2602 lea 0x80($out), $out
2604 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
2606 movdqa .Lxts_magic(%rip), $twmask
2607 pcmpgtd @XMM[7], $twtmp
2608 pshufd \$0x13, $twtmp, $twres
2610 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2611 pand $twmask, $twres # isolate carry and residue
2612 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2613 pxor $twres, @XMM[7]
2622 for ($i=0;$i<7;$i++) {
2624 pshufd \$0x13, $twtmp, $twres
2626 movdqa @XMM[7], @XMM[$i]
2627 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2628 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2629 pand $twmask, $twres # isolate carry and residue
2630 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2631 pxor $twres, @XMM[7]
2633 $code.=<<___ if ($i>=1);
2634 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2635 cmp \$`0x10*$i`,$len
2638 $code.=<<___ if ($i>=2);
2639 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2643 movdqu 0x60($inp), @XMM[8+6]
2644 pxor @XMM[8+5], @XMM[5]
2645 movdqa @XMM[7], 0x70(%rsp)
2646 lea 0x70($inp), $inp
2647 pxor @XMM[8+6], @XMM[6]
2648 lea 0x80(%rsp), %rax # pass key schedule
2649 mov %edx, %r10d # pass rounds
2651 call _bsaes_decrypt8
2653 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2654 pxor 0x10(%rsp), @XMM[1]
2655 movdqu @XMM[0], 0x00($out) # write output
2656 pxor 0x20(%rsp), @XMM[6]
2657 movdqu @XMM[1], 0x10($out)
2658 pxor 0x30(%rsp), @XMM[4]
2659 movdqu @XMM[6], 0x20($out)
2660 pxor 0x40(%rsp), @XMM[2]
2661 movdqu @XMM[4], 0x30($out)
2662 pxor 0x50(%rsp), @XMM[7]
2663 movdqu @XMM[2], 0x40($out)
2664 pxor 0x60(%rsp), @XMM[3]
2665 movdqu @XMM[7], 0x50($out)
2666 movdqu @XMM[3], 0x60($out)
2667 lea 0x70($out), $out
2669 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
2673 pxor @XMM[8+4], @XMM[4]
2674 lea 0x60($inp), $inp
2675 pxor @XMM[8+5], @XMM[5]
2676 lea 0x80(%rsp), %rax # pass key schedule
2677 mov %edx, %r10d # pass rounds
2679 call _bsaes_decrypt8
2681 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2682 pxor 0x10(%rsp), @XMM[1]
2683 movdqu @XMM[0], 0x00($out) # write output
2684 pxor 0x20(%rsp), @XMM[6]
2685 movdqu @XMM[1], 0x10($out)
2686 pxor 0x30(%rsp), @XMM[4]
2687 movdqu @XMM[6], 0x20($out)
2688 pxor 0x40(%rsp), @XMM[2]
2689 movdqu @XMM[4], 0x30($out)
2690 pxor 0x50(%rsp), @XMM[7]
2691 movdqu @XMM[2], 0x40($out)
2692 movdqu @XMM[7], 0x50($out)
2693 lea 0x60($out), $out
2695 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
2699 pxor @XMM[8+3], @XMM[3]
2700 lea 0x50($inp), $inp
2701 pxor @XMM[8+4], @XMM[4]
2702 lea 0x80(%rsp), %rax # pass key schedule
2703 mov %edx, %r10d # pass rounds
2705 call _bsaes_decrypt8
2707 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2708 pxor 0x10(%rsp), @XMM[1]
2709 movdqu @XMM[0], 0x00($out) # write output
2710 pxor 0x20(%rsp), @XMM[6]
2711 movdqu @XMM[1], 0x10($out)
2712 pxor 0x30(%rsp), @XMM[4]
2713 movdqu @XMM[6], 0x20($out)
2714 pxor 0x40(%rsp), @XMM[2]
2715 movdqu @XMM[4], 0x30($out)
2716 movdqu @XMM[2], 0x40($out)
2717 lea 0x50($out), $out
2719 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
2723 pxor @XMM[8+2], @XMM[2]
2724 lea 0x40($inp), $inp
2725 pxor @XMM[8+3], @XMM[3]
2726 lea 0x80(%rsp), %rax # pass key schedule
2727 mov %edx, %r10d # pass rounds
2729 call _bsaes_decrypt8
2731 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2732 pxor 0x10(%rsp), @XMM[1]
2733 movdqu @XMM[0], 0x00($out) # write output
2734 pxor 0x20(%rsp), @XMM[6]
2735 movdqu @XMM[1], 0x10($out)
2736 pxor 0x30(%rsp), @XMM[4]
2737 movdqu @XMM[6], 0x20($out)
2738 movdqu @XMM[4], 0x30($out)
2739 lea 0x40($out), $out
2741 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
2745 pxor @XMM[8+1], @XMM[1]
2746 lea 0x30($inp), $inp
2747 pxor @XMM[8+2], @XMM[2]
2748 lea 0x80(%rsp), %rax # pass key schedule
2749 mov %edx, %r10d # pass rounds
2751 call _bsaes_decrypt8
2753 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2754 pxor 0x10(%rsp), @XMM[1]
2755 movdqu @XMM[0], 0x00($out) # write output
2756 pxor 0x20(%rsp), @XMM[6]
2757 movdqu @XMM[1], 0x10($out)
2758 movdqu @XMM[6], 0x20($out)
2759 lea 0x30($out), $out
2761 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
2765 pxor @XMM[8+0], @XMM[0]
2766 lea 0x20($inp), $inp
2767 pxor @XMM[8+1], @XMM[1]
2768 lea 0x80(%rsp), %rax # pass key schedule
2769 mov %edx, %r10d # pass rounds
2771 call _bsaes_decrypt8
2773 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2774 pxor 0x10(%rsp), @XMM[1]
2775 movdqu @XMM[0], 0x00($out) # write output
2776 movdqu @XMM[1], 0x10($out)
2777 lea 0x20($out), $out
2779 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
2783 pxor @XMM[0], @XMM[8]
2784 lea 0x10($inp), $inp
2785 movdqa @XMM[8], 0x20(%rbp)
2786 lea 0x20(%rbp), $arg1
2787 lea 0x20(%rbp), $arg2
2789 call asm_AES_decrypt # doesn't touch %xmm
2790 pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
2791 #pxor @XMM[8], @XMM[0]
2792 #lea 0x80(%rsp), %rax # pass key schedule
2793 #mov %edx, %r10d # pass rounds
2794 #call _bsaes_decrypt8
2795 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2796 movdqu @XMM[0], 0x00($out) # write output
2797 lea 0x10($out), $out
2799 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
2806 movdqa .Lxts_magic(%rip), $twmask
2807 pcmpgtd @XMM[7], $twtmp
2808 pshufd \$0x13, $twtmp, $twres
2809 movdqa @XMM[7], @XMM[6]
2810 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2811 pand $twmask, $twres # isolate carry and residue
2812 movdqu ($inp), @XMM[0]
2813 pxor $twres, @XMM[7]
2815 lea 0x20(%rbp), $arg1
2816 pxor @XMM[7], @XMM[0]
2817 lea 0x20(%rbp), $arg2
2818 movdqa @XMM[0], 0x20(%rbp)
2820 call asm_AES_decrypt # doesn't touch %xmm
2821 pxor 0x20(%rbp), @XMM[7]
2823 movdqu @XMM[7], ($out)
2826 movzb 16($inp), %eax
2835 movdqu ($out), @XMM[0]
2836 lea 0x20(%rbp), $arg1
2837 pxor @XMM[6], @XMM[0]
2838 lea 0x20(%rbp), $arg2
2839 movdqa @XMM[0], 0x20(%rbp)
2841 call asm_AES_decrypt # doesn't touch %xmm
2842 pxor 0x20(%rbp), @XMM[6]
2843 movdqu @XMM[6], ($out)
2848 .Lxts_dec_bzero: # wipe key schedule [if any]
2849 movdqa %xmm0, 0x00(%rax)
2850 movdqa %xmm0, 0x10(%rax)
2851 lea 0x20(%rax), %rax
2855 lea (%rbp),%rsp # restore %rsp
2857 $code.=<<___ if ($win64);
2858 movaps 0x40(%rbp), %xmm6
2859 movaps 0x50(%rbp), %xmm7
2860 movaps 0x60(%rbp), %xmm8
2861 movaps 0x70(%rbp), %xmm9
2862 movaps 0x80(%rbp), %xmm10
2863 movaps 0x90(%rbp), %xmm11
2864 movaps 0xa0(%rbp), %xmm12
2865 movaps 0xb0(%rbp), %xmm13
2866 movaps 0xc0(%rbp), %xmm14
2867 movaps 0xd0(%rbp), %xmm15
2868 lea 0xa0(%rbp), %rsp
2871 mov 0x48(%rsp), %r15
2872 mov 0x50(%rsp), %r14
2873 mov 0x58(%rsp), %r13
2874 mov 0x60(%rsp), %r12
2875 mov 0x68(%rsp), %rbx
2876 mov 0x70(%rsp), %rax
2877 lea 0x78(%rsp), %rsp
2881 .size bsaes_xts_decrypt,.-bsaes_xts_decrypt
2885 .type _bsaes_const,\@object
2888 .LM0ISR: # InvShiftRows constants
2889 .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
2891 .quad 0x01040b0e0205080f, 0x0306090c00070a0d
2893 .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
2894 .LBS0: # bit-slice constants
2895 .quad 0x5555555555555555, 0x5555555555555555
2897 .quad 0x3333333333333333, 0x3333333333333333
2899 .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
2900 .LSR: # shiftrows constants
2901 .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
2903 .quad 0x0304090e00050a0f, 0x01060b0c0207080d
2905 .quad 0x0a0e02060f03070b, 0x0004080c05090d01
2906 .LSWPUP: # byte-swap upper dword
2907 .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
2909 .quad 0x0a0d02060c03070b, 0x0004080f05090e01
2910 .LADD1: # counter increment constants
2911 .quad 0x0000000000000000, 0x0000000100000000
2913 .quad 0x0000000000000000, 0x0000000200000000
2915 .quad 0x0000000000000000, 0x0000000300000000
2917 .quad 0x0000000000000000, 0x0000000400000000
2919 .quad 0x0000000000000000, 0x0000000500000000
2921 .quad 0x0000000000000000, 0x0000000600000000
2923 .quad 0x0000000000000000, 0x0000000700000000
2925 .quad 0x0000000000000000, 0x0000000800000000
2929 .quad 0x0101010101010101, 0x0101010101010101
2930 .quad 0x0202020202020202, 0x0202020202020202
2931 .quad 0x0404040404040404, 0x0404040404040404
2932 .quad 0x0808080808080808, 0x0808080808080808
2934 .quad 0x02060a0e03070b0f, 0x0004080c0105090d
2936 .quad 0x6363636363636363, 0x6363636363636363
2937 .asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
2939 .size _bsaes_const,.-_bsaes_const
2942 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2943 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
2951 .extern __imp_RtlVirtualUnwind
2952 .type se_handler,\@abi-omnipotent
2966 mov 120($context),%rax # pull context->Rax
2967 mov 248($context),%rbx # pull context->Rip
2969 mov 8($disp),%rsi # disp->ImageBase
2970 mov 56($disp),%r11 # disp->HandlerData
2972 mov 0(%r11),%r10d # HandlerData[0]
2973 lea (%rsi,%r10),%r10 # prologue label
2974 cmp %r10,%rbx # context->Rip<prologue label
2977 mov 152($context),%rax # pull context->Rsp
2979 mov 4(%r11),%r10d # HandlerData[1]
2980 lea (%rsi,%r10),%r10 # epilogue label
2981 cmp %r10,%rbx # context->Rip>=epilogue label
2984 mov 160($context),%rax # pull context->Rbp
2986 lea 0x40(%rax),%rsi # %xmm save area
2987 lea 512($context),%rdi # &context.Xmm6
2988 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
2989 .long 0xa548f3fc # cld; rep movsq
2990 lea 0xa0(%rax),%rax # adjust stack pointer
2998 lea 0x78(%rax),%rax # adjust stack pointer
2999 mov %rbx,144($context) # restore context->Rbx
3000 mov %rbp,160($context) # restore context->Rbp
3001 mov %r12,216($context) # restore context->R12
3002 mov %r13,224($context) # restore context->R13
3003 mov %r14,232($context) # restore context->R14
3004 mov %r15,240($context) # restore context->R15
3007 mov %rax,152($context) # restore context->Rsp
3009 mov 40($disp),%rdi # disp->ContextRecord
3010 mov $context,%rsi # context
3011 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
3012 .long 0xa548f3fc # cld; rep movsq
3015 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
3016 mov 8(%rsi),%rdx # arg2, disp->ImageBase
3017 mov 0(%rsi),%r8 # arg3, disp->ControlPc
3018 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
3019 mov 40(%rsi),%r10 # disp->ContextRecord
3020 lea 56(%rsi),%r11 # &disp->HandlerData
3021 lea 24(%rsi),%r12 # &disp->EstablisherFrame
3022 mov %r10,32(%rsp) # arg5
3023 mov %r11,40(%rsp) # arg6
3024 mov %r12,48(%rsp) # arg7
3025 mov %rcx,56(%rsp) # arg8, (NULL)
3026 call *__imp_RtlVirtualUnwind(%rip)
3028 mov \$1,%eax # ExceptionContinueSearch
3040 .size se_handler,.-se_handler
3045 $code.=<<___ if ($ecb);
3046 .rva .Lecb_enc_prologue
3047 .rva .Lecb_enc_epilogue
3050 .rva .Lecb_dec_prologue
3051 .rva .Lecb_dec_epilogue
3055 .rva .Lcbc_dec_prologue
3056 .rva .Lcbc_dec_epilogue
3059 .rva .Lctr_enc_prologue
3060 .rva .Lctr_enc_epilogue
3063 .rva .Lxts_enc_prologue
3064 .rva .Lxts_enc_epilogue
3067 .rva .Lxts_dec_prologue
3068 .rva .Lxts_dec_epilogue
3074 $code.=<<___ if ($ecb);
3078 .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[]
3082 .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[]
3088 .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[]
3092 .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[]
3096 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
3100 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
# Final perlasm pass: replace every `...` (backtick-quoted) fragment in the
# accumulated $code with the result of eval-ing it as Perl.  This performs
# the compile-time arithmetic used throughout the generator, e.g.
# sub \$`128-32` or movdqa `0x10*$i`(%rsp).  Flags: /g all occurrences,
# /e evaluate replacement, /m multi-line (conventional in perlasm scripts).
3104 $code =~ s/\`([^\`]*)\`/eval($1)/gem;