2 # Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 ###################################################################
11 ### AES-128 [originally in CTR mode] ###
12 ### bitsliced implementation for Intel Core 2 processors ###
13 ### requires support of SSE extensions up to SSSE3 ###
14 ### Author: Emilia Käsper and Peter Schwabe ###
15 ### Date: 2009-03-19 ###
18 ### See http://homes.esat.kuleuven.be/~ekasper/#software for ###
19 ### further information. ###
20 ###################################################################
24 # Started as transliteration to "perlasm" the original code has
25 # undergone following changes:
27 # - code was made position-independent;
28 # - rounds were folded into a loop resulting in >5x size reduction
29 # from 12.5KB to 2.2KB;
30 # - above was possible thanks to mixcolumns() modification that
31 # allowed to feed its output back to aesenc[last], this was
32 # achieved at cost of two additional inter-register moves;
33 # - some instruction reordering and interleaving;
34 # - this module doesn't implement key setup subroutine, instead it
35 # relies on conversion of "conventional" key schedule as returned
36 # by AES_set_encrypt_key (see discussion below);
37 # - first and last round keys are treated differently, which allowed
38 # to skip one shiftrows(), reduce bit-sliced key schedule and
39 # speed-up conversion by 22%;
40 # - support for 192- and 256-bit keys was added;
42 # Resulting performance in CPU cycles spent to encrypt one byte out
43 # of 4096-byte buffer with 128-bit key is:
45 # Emilia's this(*) difference
47 # Core 2 9.30 8.69 +7%
48 # Nehalem(**) 7.63 6.88 +11%
53 # (*) Comparison is not completely fair, because "this" is ECB,
54 # i.e. no extra processing such as counter values calculation
55 # and xor-ing input as in Emilia's CTR implementation is
56 # performed. However, the CTR calculations stand for not more
57 # than 1% of total time, so comparison is *rather* fair.
59 # (**) Results were collected on Westmere, which is considered to
60 # be equivalent to Nehalem for this code.
62 # As for the key schedule conversion subroutine: the interface to OpenSSL
63 # relies on per-invocation on-the-fly conversion. This naturally
64 # has impact on performance, especially for short inputs. Conversion
65 # time in CPU cycles and its ratio to CPU cycles spent in 8x block
68 # conversion conversion/8x block
73 # The ratio values mean that 128-byte blocks will be processed
74 # 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
75 # etc. Then keep in mind that input sizes not divisible by 128 are
76 # *effectively* slower, especially shortest ones, e.g. consecutive
77 # 144-byte blocks are processed 44% slower than one would expect,
78 # 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
79 # it's still faster than ["hyper-threading-safe" code path in]
80 # aes-x86_64.pl on all lengths above 64 bytes...
84 # Add decryption procedure. Performance in CPU cycles spent to decrypt
85 # one byte out of 4096-byte buffer with 128-bit key is:
95 # Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
96 # suboptimal, but XTS is meant to be used with larger blocks...
102 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
104 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
106 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
107 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
108 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
109 die "can't locate x86_64-xlate.pl";
111 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
114 my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
115 my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15)
116 my $ecb=0; # suppress unreferenced ECB subroutines, spare some space...
119 my ($key,$rounds,$const)=("%rax","%r10d","%r11");
122 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
123 # output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
128 &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
129 &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
133 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
134 # output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
156 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
157 # output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
177 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
178 # output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
182 &InvInBasisChange (@b);
183 &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
184 &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
187 sub InvInBasisChange { # OutBasisChange in reverse
188 my @b=@_[5,1,2,6,3,7,0,4];
206 sub InvOutBasisChange { # InBasisChange in reverse
207 my @b=@_[2,5,7,3,6,1,0,4];
228 #;*************************************************************
229 #;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
230 #;*************************************************************
231 my ($x0,$x1,$y0,$y1,$t0)=@_;
244 sub Mul_GF4_N { # not used, see next subroutine
245 # multiply and scale by N
246 my ($x0,$x1,$y0,$y1,$t0)=@_;
260 # interleaved Mul_GF4_N and Mul_GF4
261 my ($x0,$x1,$y0,$y1,$t0,
262 $x2,$x3,$y2,$y3,$t1)=@_;
290 &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]);
297 Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
298 @x[2], @x[3], @y[2], @y[3], @t[2]);
310 &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
311 @x[6], @x[7], @y[2], @y[3], @t[2]);
316 &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]);
325 #;********************************************************************
326 #;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
327 #;********************************************************************
331 # direct optimizations from hardware
386 #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
388 # new smaller inversion
422 # output in s3, s2, s1, t1
424 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
426 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
427 &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
429 ### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
432 # AES linear components
438 pxor 0x00($key),@x[0]
439 pxor 0x10($key),@x[1]
440 pxor 0x20($key),@x[2]
441 pxor 0x30($key),@x[3]
444 pxor 0x40($key),@x[4]
445 pxor 0x50($key),@x[5]
448 pxor 0x60($key),@x[6]
449 pxor 0x70($key),@x[7]
459 # modified to emit output in order suitable for feeding back to aesenc[last]
462 my $inv=@_[16]; # optional
464 pshufd \$0x93, @x[0], @t[0] # x0 <<< 32
465 pshufd \$0x93, @x[1], @t[1]
466 pxor @t[0], @x[0] # x0 ^ (x0 <<< 32)
467 pshufd \$0x93, @x[2], @t[2]
469 pshufd \$0x93, @x[3], @t[3]
471 pshufd \$0x93, @x[4], @t[4]
473 pshufd \$0x93, @x[5], @t[5]
475 pshufd \$0x93, @x[6], @t[6]
477 pshufd \$0x93, @x[7], @t[7]
484 pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64)
486 pshufd \$0x4E, @x[1], @x[1]
492 pshufd \$0x4E, @x[4], @t[0]
494 pshufd \$0x4E, @x[5], @t[1]
496 pshufd \$0x4E, @x[3], @x[4]
498 pshufd \$0x4E, @x[7], @x[5]
500 pshufd \$0x4E, @x[6], @x[3]
502 pshufd \$0x4E, @x[2], @x[6]
505 $code.=<<___ if (!$inv);
513 $code.=<<___ if ($inv);
526 sub InvMixColumns_orig {
531 # multiplication by 0x0e
532 pshufd \$0x93, @x[7], @t[7]
534 pxor @x[5], @x[7] # 7 5
535 pxor @x[5], @x[2] # 2 5
536 pshufd \$0x93, @x[0], @t[0]
538 pxor @x[0], @x[5] # 5 0 [1]
539 pxor @x[1], @x[0] # 0 1
540 pshufd \$0x93, @x[1], @t[1]
541 pxor @x[2], @x[1] # 1 25
542 pxor @x[6], @x[0] # 01 6 [2]
543 pxor @x[3], @x[1] # 125 3 [4]
544 pshufd \$0x93, @x[3], @t[3]
545 pxor @x[0], @x[2] # 25 016 [3]
546 pxor @x[7], @x[3] # 3 75
547 pxor @x[6], @x[7] # 75 6 [0]
548 pshufd \$0x93, @x[6], @t[6]
550 pxor @x[4], @x[6] # 6 4
551 pxor @x[3], @x[4] # 4 375 [6]
552 pxor @x[7], @x[3] # 375 756=36
553 pxor @t[5], @x[6] # 64 5 [7]
554 pxor @t[2], @x[3] # 36 2
555 pxor @t[4], @x[3] # 362 4 [5]
556 pshufd \$0x93, @t[5], @t[5]
558 my @y = @x[7,5,0,2,1,3,4,6];
560 # multiplication by 0x0b
564 pshufd \$0x93, @t[2], @t[2]
568 pshufd \$0x93, @t[4], @t[4]
569 pxor @t[6], @t[7] # clobber t[7]
573 pshufd \$0x93, @t[0], @t[0]
577 pshufd \$0x93, @t[1], @t[1]
581 pshufd \$0x93, @t[2], @t[2]
585 pshufd \$0x93, @t[3], @t[3]
591 pxor @t[5], @t[7] # clobber t[7] even more
594 pshufd \$0x93, @t[4], @t[4]
599 pshufd \$0x93, @t[5], @t[5]
600 pxor @t[6], @t[7] # restore t[7]
602 # multiplication by 0x0d
605 pshufd \$0x93, @t[6], @t[6]
609 pshufd \$0x93, @t[7], @t[7]
618 pshufd \$0x93, @t[0], @t[0]
622 pshufd \$0x93, @t[1], @t[1]
627 pshufd \$0x93, @t[2], @t[2]
629 pxor @t[3], @t[6] # clobber t[6]
636 pshufd \$0x93, @t[4], @t[4]
639 pxor @t[3], @t[6] # restore t[6]
641 pshufd \$0x93, @t[5], @t[5]
642 pshufd \$0x93, @t[6], @t[6]
643 pshufd \$0x93, @t[7], @t[7]
644 pshufd \$0x93, @t[3], @t[3]
646 # multiplication by 0x09
648 pxor @y[1], @t[1] # t[1]=y[1]
649 pxor @t[5], @t[0] # clobber t[0]
652 pxor @y[0], @t[0] # t[0]=y[0]
654 pxor @t[7], @t[6] # clobber t[6]
657 pxor @y[4], @t[4] # t[4]=y[4]
659 pxor @y[3], @t[3] # t[3]=y[3]
661 pxor @y[2], @t[2] # t[2]=y[2]
663 pxor @y[5], @t[5] # t[5]=y[5]
666 pxor @y[6], @t[6] # t[6]=y[6]
667 pxor @y[7], @t[7] # t[7]=y[7]
684 # Thanks to Jussi Kivilinna for providing pointer to
686 # | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
687 # | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
688 # | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
689 # | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
692 # multiplication by 0x05-0x00-0x04-0x00
693 pshufd \$0x4E, @x[0], @t[0]
694 pshufd \$0x4E, @x[6], @t[6]
696 pshufd \$0x4E, @x[7], @t[7]
698 pshufd \$0x4E, @x[1], @t[1]
700 pshufd \$0x4E, @x[2], @t[2]
702 pshufd \$0x4E, @x[3], @t[3]
706 pshufd \$0x4E, @x[4], @t[4]
710 pshufd \$0x4E, @x[5], @t[5]
725 &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6
728 sub aesenc { # not used
732 movdqa 0x30($const),@t[0] # .LSR
734 &ShiftRows (@b,@t[0]);
736 &MixColumns (@b[0,1,4,6,3,7,2,5],@t);
739 sub aesenclast { # not used
743 movdqa 0x40($const),@t[0] # .LSRM0
745 &ShiftRows (@b,@t[0]);
748 pxor 0x00($key),@b[0]
749 pxor 0x10($key),@b[1]
750 pxor 0x20($key),@b[4]
751 pxor 0x30($key),@b[6]
752 pxor 0x40($key),@b[3]
753 pxor 0x50($key),@b[7]
754 pxor 0x60($key),@b[2]
755 pxor 0x70($key),@b[5]
760 my ($a,$b,$n,$mask,$t)=@_;
772 my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
792 my @x=reverse(@_[0..7]);
793 my ($t0,$t1,$t2,$t3)=@_[8..11];
795 movdqa 0x00($const),$t0 # .LBS0
796 movdqa 0x10($const),$t1 # .LBS1
798 &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
799 &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
801 movdqa 0x20($const),$t0 # .LBS2
803 &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
804 &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
806 &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
807 &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
813 .extern asm_AES_encrypt
814 .extern asm_AES_decrypt
816 .type _bsaes_encrypt8,\@abi-omnipotent
819 lea .LBS0(%rip), $const # constants table
821 movdqa ($key), @XMM[9] # round 0 key
823 movdqa 0x50($const), @XMM[8] # .LM0SR
824 pxor @XMM[9], @XMM[0] # xor with round0 key
825 pxor @XMM[9], @XMM[1]
826 pxor @XMM[9], @XMM[2]
827 pxor @XMM[9], @XMM[3]
828 pshufb @XMM[8], @XMM[0]
829 pshufb @XMM[8], @XMM[1]
830 pxor @XMM[9], @XMM[4]
831 pxor @XMM[9], @XMM[5]
832 pshufb @XMM[8], @XMM[2]
833 pshufb @XMM[8], @XMM[3]
834 pxor @XMM[9], @XMM[6]
835 pxor @XMM[9], @XMM[7]
836 pshufb @XMM[8], @XMM[4]
837 pshufb @XMM[8], @XMM[5]
838 pshufb @XMM[8], @XMM[6]
839 pshufb @XMM[8], @XMM[7]
840 _bsaes_encrypt8_bitslice:
842 &bitslice (@XMM[0..7, 8..11]);
849 &ShiftRows (@XMM[0..7, 8]);
850 $code.=".Lenc_sbox:\n";
851 &Sbox (@XMM[0..7, 8..15]);
856 &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
858 movdqa 0x30($const), @XMM[8] # .LSR
860 movdqa 0x40($const), @XMM[8] # .LSRM0
865 # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
866 &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
868 movdqa ($key), @XMM[8] # last round key
869 pxor @XMM[8], @XMM[4]
870 pxor @XMM[8], @XMM[6]
871 pxor @XMM[8], @XMM[3]
872 pxor @XMM[8], @XMM[7]
873 pxor @XMM[8], @XMM[2]
874 pxor @XMM[8], @XMM[5]
875 pxor @XMM[8], @XMM[0]
876 pxor @XMM[8], @XMM[1]
878 .size _bsaes_encrypt8,.-_bsaes_encrypt8
880 .type _bsaes_decrypt8,\@abi-omnipotent
883 lea .LBS0(%rip), $const # constants table
885 movdqa ($key), @XMM[9] # round 0 key
887 movdqa -0x30($const), @XMM[8] # .LM0ISR
888 pxor @XMM[9], @XMM[0] # xor with round0 key
889 pxor @XMM[9], @XMM[1]
890 pxor @XMM[9], @XMM[2]
891 pxor @XMM[9], @XMM[3]
892 pshufb @XMM[8], @XMM[0]
893 pshufb @XMM[8], @XMM[1]
894 pxor @XMM[9], @XMM[4]
895 pxor @XMM[9], @XMM[5]
896 pshufb @XMM[8], @XMM[2]
897 pshufb @XMM[8], @XMM[3]
898 pxor @XMM[9], @XMM[6]
899 pxor @XMM[9], @XMM[7]
900 pshufb @XMM[8], @XMM[4]
901 pshufb @XMM[8], @XMM[5]
902 pshufb @XMM[8], @XMM[6]
903 pshufb @XMM[8], @XMM[7]
905 &bitslice (@XMM[0..7, 8..11]);
912 &ShiftRows (@XMM[0..7, 8]);
913 $code.=".Ldec_sbox:\n";
914 &InvSbox (@XMM[0..7, 8..15]);
919 &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
921 movdqa -0x10($const), @XMM[8] # .LISR
923 movdqa -0x20($const), @XMM[8] # .LISRM0
928 &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
930 movdqa ($key), @XMM[8] # last round key
931 pxor @XMM[8], @XMM[6]
932 pxor @XMM[8], @XMM[4]
933 pxor @XMM[8], @XMM[2]
934 pxor @XMM[8], @XMM[7]
935 pxor @XMM[8], @XMM[3]
936 pxor @XMM[8], @XMM[5]
937 pxor @XMM[8], @XMM[0]
938 pxor @XMM[8], @XMM[1]
940 .size _bsaes_decrypt8,.-_bsaes_decrypt8
944 my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
947 my @x=reverse(@_[0..7]);
948 my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
950 &swapmove (@x[0,1],1,$bs0,$t2,$t3);
952 #&swapmove(@x[2,3],1,$t0,$t2,$t3);
956 #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
958 &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
960 #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
966 &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
967 &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
971 .type _bsaes_key_convert,\@abi-omnipotent
974 lea .Lmasks(%rip), $const
975 movdqu ($inp), %xmm7 # load round 0 key
977 movdqa 0x00($const), %xmm0 # 0x01...
978 movdqa 0x10($const), %xmm1 # 0x02...
979 movdqa 0x20($const), %xmm2 # 0x04...
980 movdqa 0x30($const), %xmm3 # 0x08...
981 movdqa 0x40($const), %xmm4 # .LM0
982 pcmpeqd %xmm5, %xmm5 # .LNOT
984 movdqu ($inp), %xmm6 # load round 1 key
985 movdqa %xmm7, ($out) # save round 0 key
991 pshufb %xmm4, %xmm6 # .LM0
1000 psllq \$4, %xmm0 # 0x10...
1001 movdqa %xmm3, %xmm11
1002 pcmpeqb %xmm1, %xmm9
1003 psllq \$4, %xmm1 # 0x20...
1007 movdqa %xmm0, %xmm12
1008 pcmpeqb %xmm2, %xmm10
1009 psllq \$4, %xmm2 # 0x40...
1010 movdqa %xmm1, %xmm13
1011 pcmpeqb %xmm3, %xmm11
1012 psllq \$4, %xmm3 # 0x80...
1014 movdqa %xmm2, %xmm14
1015 movdqa %xmm3, %xmm15
1016 pxor %xmm5, %xmm8 # "pnot"
1021 movdqa %xmm8, 0x00($out) # write bit-sliced round key
1022 pcmpeqb %xmm0, %xmm12
1023 psrlq \$4, %xmm0 # 0x01...
1024 movdqa %xmm9, 0x10($out)
1025 pcmpeqb %xmm1, %xmm13
1026 psrlq \$4, %xmm1 # 0x02...
1027 lea 0x10($inp), $inp
1031 movdqa %xmm10, 0x20($out)
1032 pcmpeqb %xmm2, %xmm14
1033 psrlq \$4, %xmm2 # 0x04...
1034 movdqa %xmm11, 0x30($out)
1035 pcmpeqb %xmm3, %xmm15
1036 psrlq \$4, %xmm3 # 0x08...
1037 movdqu ($inp), %xmm6 # load next round key
1039 pxor %xmm5, %xmm13 # "pnot"
1041 movdqa %xmm12, 0x40($out)
1042 movdqa %xmm13, 0x50($out)
1043 movdqa %xmm14, 0x60($out)
1044 movdqa %xmm15, 0x70($out)
1049 movdqa 0x50($const), %xmm7 # .L63
1050 #movdqa %xmm6, ($out) # don't save last round key
1052 .size _bsaes_key_convert,.-_bsaes_key_convert
1056 if (0 && !$win64) { # following four functions are unsupported interface
1057 # used for benchmarking...
1059 .globl bsaes_enc_key_convert
1060 .type bsaes_enc_key_convert,\@function,2
1062 bsaes_enc_key_convert:
1063 mov 240($inp),%r10d # pass rounds
1064 mov $inp,%rcx # pass key
1065 mov $out,%rax # pass key schedule
1066 call _bsaes_key_convert
1067 pxor %xmm6,%xmm7 # fix up last round key
1068 movdqa %xmm7,(%rax) # save last round key
1070 .size bsaes_enc_key_convert,.-bsaes_enc_key_convert
1072 .globl bsaes_encrypt_128
1073 .type bsaes_encrypt_128,\@function,4
1077 movdqu 0x00($inp), @XMM[0] # load input
1078 movdqu 0x10($inp), @XMM[1]
1079 movdqu 0x20($inp), @XMM[2]
1080 movdqu 0x30($inp), @XMM[3]
1081 movdqu 0x40($inp), @XMM[4]
1082 movdqu 0x50($inp), @XMM[5]
1083 movdqu 0x60($inp), @XMM[6]
1084 movdqu 0x70($inp), @XMM[7]
1085 mov $key, %rax # pass the $key
1086 lea 0x80($inp), $inp
1089 call _bsaes_encrypt8
1091 movdqu @XMM[0], 0x00($out) # write output
1092 movdqu @XMM[1], 0x10($out)
1093 movdqu @XMM[4], 0x20($out)
1094 movdqu @XMM[6], 0x30($out)
1095 movdqu @XMM[3], 0x40($out)
1096 movdqu @XMM[7], 0x50($out)
1097 movdqu @XMM[2], 0x60($out)
1098 movdqu @XMM[5], 0x70($out)
1099 lea 0x80($out), $out
1103 .size bsaes_encrypt_128,.-bsaes_encrypt_128
1105 .globl bsaes_dec_key_convert
1106 .type bsaes_dec_key_convert,\@function,2
1108 bsaes_dec_key_convert:
1109 mov 240($inp),%r10d # pass rounds
1110 mov $inp,%rcx # pass key
1111 mov $out,%rax # pass key schedule
1112 call _bsaes_key_convert
1113 pxor ($out),%xmm7 # fix up round 0 key
1114 movdqa %xmm6,(%rax) # save last round key
1117 .size bsaes_dec_key_convert,.-bsaes_dec_key_convert
1119 .globl bsaes_decrypt_128
1120 .type bsaes_decrypt_128,\@function,4
1124 movdqu 0x00($inp), @XMM[0] # load input
1125 movdqu 0x10($inp), @XMM[1]
1126 movdqu 0x20($inp), @XMM[2]
1127 movdqu 0x30($inp), @XMM[3]
1128 movdqu 0x40($inp), @XMM[4]
1129 movdqu 0x50($inp), @XMM[5]
1130 movdqu 0x60($inp), @XMM[6]
1131 movdqu 0x70($inp), @XMM[7]
1132 mov $key, %rax # pass the $key
1133 lea 0x80($inp), $inp
1136 call _bsaes_decrypt8
1138 movdqu @XMM[0], 0x00($out) # write output
1139 movdqu @XMM[1], 0x10($out)
1140 movdqu @XMM[6], 0x20($out)
1141 movdqu @XMM[4], 0x30($out)
1142 movdqu @XMM[2], 0x40($out)
1143 movdqu @XMM[7], 0x50($out)
1144 movdqu @XMM[3], 0x60($out)
1145 movdqu @XMM[5], 0x70($out)
1146 lea 0x80($out), $out
1150 .size bsaes_decrypt_128,.-bsaes_decrypt_128
1154 ######################################################################
1158 my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
1159 : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1160 my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
1164 .globl bsaes_ecb_encrypt_blocks
1165 .type bsaes_ecb_encrypt_blocks,\@abi-omnipotent
1167 bsaes_ecb_encrypt_blocks:
1176 lea -0x48(%rsp),%rsp
1178 $code.=<<___ if ($win64);
1179 lea -0xa0(%rsp), %rsp
1180 movaps %xmm6, 0x40(%rsp)
1181 movaps %xmm7, 0x50(%rsp)
1182 movaps %xmm8, 0x60(%rsp)
1183 movaps %xmm9, 0x70(%rsp)
1184 movaps %xmm10, 0x80(%rsp)
1185 movaps %xmm11, 0x90(%rsp)
1186 movaps %xmm12, 0xa0(%rsp)
1187 movaps %xmm13, 0xb0(%rsp)
1188 movaps %xmm14, 0xc0(%rsp)
1189 movaps %xmm15, 0xd0(%rsp)
1193 mov %rsp,%rbp # backup %rsp
1194 mov 240($arg4),%eax # rounds
1195 mov $arg1,$inp # backup arguments
1202 mov %eax,%ebx # backup rounds
1203 shl \$7,%rax # 128 bytes per inner round key
1204 sub \$`128-32`,%rax # size of bit-sliced key schedule
1206 mov %rsp,%rax # pass key schedule
1207 mov $key,%rcx # pass key
1208 mov %ebx,%r10d # pass rounds
1209 call _bsaes_key_convert
1210 pxor %xmm6,%xmm7 # fix up last round key
1211 movdqa %xmm7,(%rax) # save last round key
1215 movdqu 0x00($inp), @XMM[0] # load input
1216 movdqu 0x10($inp), @XMM[1]
1217 movdqu 0x20($inp), @XMM[2]
1218 movdqu 0x30($inp), @XMM[3]
1219 movdqu 0x40($inp), @XMM[4]
1220 movdqu 0x50($inp), @XMM[5]
1221 mov %rsp, %rax # pass key schedule
1222 movdqu 0x60($inp), @XMM[6]
1223 mov %ebx,%r10d # pass rounds
1224 movdqu 0x70($inp), @XMM[7]
1225 lea 0x80($inp), $inp
1227 call _bsaes_encrypt8
1229 movdqu @XMM[0], 0x00($out) # write output
1230 movdqu @XMM[1], 0x10($out)
1231 movdqu @XMM[4], 0x20($out)
1232 movdqu @XMM[6], 0x30($out)
1233 movdqu @XMM[3], 0x40($out)
1234 movdqu @XMM[7], 0x50($out)
1235 movdqu @XMM[2], 0x60($out)
1236 movdqu @XMM[5], 0x70($out)
1237 lea 0x80($out), $out
1244 movdqu 0x00($inp), @XMM[0] # load input
1245 mov %rsp, %rax # pass key schedule
1246 mov %ebx,%r10d # pass rounds
1249 movdqu 0x10($inp), @XMM[1]
1251 movdqu 0x20($inp), @XMM[2]
1254 movdqu 0x30($inp), @XMM[3]
1256 movdqu 0x40($inp), @XMM[4]
1259 movdqu 0x50($inp), @XMM[5]
1261 movdqu 0x60($inp), @XMM[6]
1262 call _bsaes_encrypt8
1263 movdqu @XMM[0], 0x00($out) # write output
1264 movdqu @XMM[1], 0x10($out)
1265 movdqu @XMM[4], 0x20($out)
1266 movdqu @XMM[6], 0x30($out)
1267 movdqu @XMM[3], 0x40($out)
1268 movdqu @XMM[7], 0x50($out)
1269 movdqu @XMM[2], 0x60($out)
1273 call _bsaes_encrypt8
1274 movdqu @XMM[0], 0x00($out) # write output
1275 movdqu @XMM[1], 0x10($out)
1276 movdqu @XMM[4], 0x20($out)
1277 movdqu @XMM[6], 0x30($out)
1278 movdqu @XMM[3], 0x40($out)
1279 movdqu @XMM[7], 0x50($out)
1283 call _bsaes_encrypt8
1284 movdqu @XMM[0], 0x00($out) # write output
1285 movdqu @XMM[1], 0x10($out)
1286 movdqu @XMM[4], 0x20($out)
1287 movdqu @XMM[6], 0x30($out)
1288 movdqu @XMM[3], 0x40($out)
1292 call _bsaes_encrypt8
1293 movdqu @XMM[0], 0x00($out) # write output
1294 movdqu @XMM[1], 0x10($out)
1295 movdqu @XMM[4], 0x20($out)
1296 movdqu @XMM[6], 0x30($out)
1300 call _bsaes_encrypt8
1301 movdqu @XMM[0], 0x00($out) # write output
1302 movdqu @XMM[1], 0x10($out)
1303 movdqu @XMM[4], 0x20($out)
1307 call _bsaes_encrypt8
1308 movdqu @XMM[0], 0x00($out) # write output
1309 movdqu @XMM[1], 0x10($out)
1313 call _bsaes_encrypt8
1314 movdqu @XMM[0], 0x00($out) # write output
1321 call asm_AES_encrypt
1330 .Lecb_enc_bzero: # wipe key schedule [if any]
1331 movdqa %xmm0, 0x00(%rax)
1332 movdqa %xmm0, 0x10(%rax)
1333 lea 0x20(%rax), %rax
1337 lea (%rbp),%rsp # restore %rsp
1339 $code.=<<___ if ($win64);
1340 movaps 0x40(%rbp), %xmm6
1341 movaps 0x50(%rbp), %xmm7
1342 movaps 0x60(%rbp), %xmm8
1343 movaps 0x70(%rbp), %xmm9
1344 movaps 0x80(%rbp), %xmm10
1345 movaps 0x90(%rbp), %xmm11
1346 movaps 0xa0(%rbp), %xmm12
1347 movaps 0xb0(%rbp), %xmm13
1348 movaps 0xc0(%rbp), %xmm14
1349 movaps 0xd0(%rbp), %xmm15
1350 lea 0xa0(%rbp), %rsp
1353 mov 0x48(%rsp), %r15
1354 mov 0x50(%rsp), %r14
1355 mov 0x58(%rsp), %r13
1356 mov 0x60(%rsp), %r12
1357 mov 0x68(%rsp), %rbx
1358 mov 0x70(%rsp), %rax
1359 lea 0x78(%rsp), %rsp
1363 .size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
1365 .globl bsaes_ecb_decrypt_blocks
1366 .type bsaes_ecb_decrypt_blocks,\@abi-omnipotent
1368 bsaes_ecb_decrypt_blocks:
1377 lea -0x48(%rsp),%rsp
1379 $code.=<<___ if ($win64);
1380 lea -0xa0(%rsp), %rsp
1381 movaps %xmm6, 0x40(%rsp)
1382 movaps %xmm7, 0x50(%rsp)
1383 movaps %xmm8, 0x60(%rsp)
1384 movaps %xmm9, 0x70(%rsp)
1385 movaps %xmm10, 0x80(%rsp)
1386 movaps %xmm11, 0x90(%rsp)
1387 movaps %xmm12, 0xa0(%rsp)
1388 movaps %xmm13, 0xb0(%rsp)
1389 movaps %xmm14, 0xc0(%rsp)
1390 movaps %xmm15, 0xd0(%rsp)
1394 mov %rsp,%rbp # backup %rsp
1395 mov 240($arg4),%eax # rounds
1396 mov $arg1,$inp # backup arguments
1403 mov %eax,%ebx # backup rounds
1404 shl \$7,%rax # 128 bytes per inner round key
1405 sub \$`128-32`,%rax # size of bit-sliced key schedule
1407 mov %rsp,%rax # pass key schedule
1408 mov $key,%rcx # pass key
1409 mov %ebx,%r10d # pass rounds
1410 call _bsaes_key_convert
1411 pxor (%rsp),%xmm7 # fix up 0 round key
1412 movdqa %xmm6,(%rax) # save last round key
1417 movdqu 0x00($inp), @XMM[0] # load input
1418 movdqu 0x10($inp), @XMM[1]
1419 movdqu 0x20($inp), @XMM[2]
1420 movdqu 0x30($inp), @XMM[3]
1421 movdqu 0x40($inp), @XMM[4]
1422 movdqu 0x50($inp), @XMM[5]
1423 mov %rsp, %rax # pass key schedule
1424 movdqu 0x60($inp), @XMM[6]
1425 mov %ebx,%r10d # pass rounds
1426 movdqu 0x70($inp), @XMM[7]
1427 lea 0x80($inp), $inp
1429 call _bsaes_decrypt8
1431 movdqu @XMM[0], 0x00($out) # write output
1432 movdqu @XMM[1], 0x10($out)
1433 movdqu @XMM[6], 0x20($out)
1434 movdqu @XMM[4], 0x30($out)
1435 movdqu @XMM[2], 0x40($out)
1436 movdqu @XMM[7], 0x50($out)
1437 movdqu @XMM[3], 0x60($out)
1438 movdqu @XMM[5], 0x70($out)
1439 lea 0x80($out), $out
1446 movdqu 0x00($inp), @XMM[0] # load input
1447 mov %rsp, %rax # pass key schedule
1448 mov %ebx,%r10d # pass rounds
1451 movdqu 0x10($inp), @XMM[1]
1453 movdqu 0x20($inp), @XMM[2]
1456 movdqu 0x30($inp), @XMM[3]
1458 movdqu 0x40($inp), @XMM[4]
1461 movdqu 0x50($inp), @XMM[5]
1463 movdqu 0x60($inp), @XMM[6]
1464 call _bsaes_decrypt8
1465 movdqu @XMM[0], 0x00($out) # write output
1466 movdqu @XMM[1], 0x10($out)
1467 movdqu @XMM[6], 0x20($out)
1468 movdqu @XMM[4], 0x30($out)
1469 movdqu @XMM[2], 0x40($out)
1470 movdqu @XMM[7], 0x50($out)
1471 movdqu @XMM[3], 0x60($out)
1475 call _bsaes_decrypt8
1476 movdqu @XMM[0], 0x00($out) # write output
1477 movdqu @XMM[1], 0x10($out)
1478 movdqu @XMM[6], 0x20($out)
1479 movdqu @XMM[4], 0x30($out)
1480 movdqu @XMM[2], 0x40($out)
1481 movdqu @XMM[7], 0x50($out)
1485 call _bsaes_decrypt8
1486 movdqu @XMM[0], 0x00($out) # write output
1487 movdqu @XMM[1], 0x10($out)
1488 movdqu @XMM[6], 0x20($out)
1489 movdqu @XMM[4], 0x30($out)
1490 movdqu @XMM[2], 0x40($out)
1494 call _bsaes_decrypt8
1495 movdqu @XMM[0], 0x00($out) # write output
1496 movdqu @XMM[1], 0x10($out)
1497 movdqu @XMM[6], 0x20($out)
1498 movdqu @XMM[4], 0x30($out)
1502 call _bsaes_decrypt8
1503 movdqu @XMM[0], 0x00($out) # write output
1504 movdqu @XMM[1], 0x10($out)
1505 movdqu @XMM[6], 0x20($out)
1509 call _bsaes_decrypt8
1510 movdqu @XMM[0], 0x00($out) # write output
1511 movdqu @XMM[1], 0x10($out)
1515 call _bsaes_decrypt8
1516 movdqu @XMM[0], 0x00($out) # write output
1523 call asm_AES_decrypt
1532 .Lecb_dec_bzero: # wipe key schedule [if any]
1533 movdqa %xmm0, 0x00(%rax)
1534 movdqa %xmm0, 0x10(%rax)
1535 lea 0x20(%rax), %rax
1539 lea (%rbp),%rsp # restore %rsp
1541 $code.=<<___ if ($win64);
1542 movaps 0x40(%rbp), %xmm6
1543 movaps 0x50(%rbp), %xmm7
1544 movaps 0x60(%rbp), %xmm8
1545 movaps 0x70(%rbp), %xmm9
1546 movaps 0x80(%rbp), %xmm10
1547 movaps 0x90(%rbp), %xmm11
1548 movaps 0xa0(%rbp), %xmm12
1549 movaps 0xb0(%rbp), %xmm13
1550 movaps 0xc0(%rbp), %xmm14
1551 movaps 0xd0(%rbp), %xmm15
1552 lea 0xa0(%rbp), %rsp
1555 mov 0x48(%rsp), %r15
1556 mov 0x50(%rsp), %r14
1557 mov 0x58(%rsp), %r13
1558 mov 0x60(%rsp), %r12
1559 mov 0x68(%rsp), %rbx
1560 mov 0x70(%rsp), %rax
1561 lea 0x78(%rsp), %rsp
1565 .size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
1569 .extern asm_AES_cbc_encrypt
1570 .globl bsaes_cbc_encrypt
1571 .type bsaes_cbc_encrypt,\@abi-omnipotent
1575 $code.=<<___ if ($win64);
1576 mov 48(%rsp),$arg6 # pull direction flag
1580 jne asm_AES_cbc_encrypt
1582 jb asm_AES_cbc_encrypt
1592 lea -0x48(%rsp), %rsp
1594 $code.=<<___ if ($win64);
1595 mov 0xa0(%rsp),$arg5 # pull ivp
1596 lea -0xa0(%rsp), %rsp
1597 movaps %xmm6, 0x40(%rsp)
1598 movaps %xmm7, 0x50(%rsp)
1599 movaps %xmm8, 0x60(%rsp)
1600 movaps %xmm9, 0x70(%rsp)
1601 movaps %xmm10, 0x80(%rsp)
1602 movaps %xmm11, 0x90(%rsp)
1603 movaps %xmm12, 0xa0(%rsp)
1604 movaps %xmm13, 0xb0(%rsp)
1605 movaps %xmm14, 0xc0(%rsp)
1606 movaps %xmm15, 0xd0(%rsp)
1610 mov %rsp, %rbp # backup %rsp
1611 mov 240($arg4), %eax # rounds
1612 mov $arg1, $inp # backup arguments
1617 shr \$4, $len # bytes to blocks
1619 mov %eax, %edx # rounds
1620 shl \$7, %rax # 128 bytes per inner round key
1621 sub \$`128-32`, %rax # size of bit-sliced key schedule
1624 mov %rsp, %rax # pass key schedule
1625 mov $key, %rcx # pass key
1626 mov %edx, %r10d # pass rounds
1627 call _bsaes_key_convert
1628 pxor (%rsp),%xmm7 # fix up 0 round key
1629 movdqa %xmm6,(%rax) # save last round key
1632 movdqu (%rbx), @XMM[15] # load IV
1635 movdqu 0x00($inp), @XMM[0] # load input
1636 movdqu 0x10($inp), @XMM[1]
1637 movdqu 0x20($inp), @XMM[2]
1638 movdqu 0x30($inp), @XMM[3]
1639 movdqu 0x40($inp), @XMM[4]
1640 movdqu 0x50($inp), @XMM[5]
1641 mov %rsp, %rax # pass key schedule
1642 movdqu 0x60($inp), @XMM[6]
1643 mov %edx,%r10d # pass rounds
1644 movdqu 0x70($inp), @XMM[7]
1645 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1647 call _bsaes_decrypt8
1649 pxor 0x20(%rbp), @XMM[0] # ^= IV
1650 movdqu 0x00($inp), @XMM[8] # re-load input
1651 movdqu 0x10($inp), @XMM[9]
1652 pxor @XMM[8], @XMM[1]
1653 movdqu 0x20($inp), @XMM[10]
1654 pxor @XMM[9], @XMM[6]
1655 movdqu 0x30($inp), @XMM[11]
1656 pxor @XMM[10], @XMM[4]
1657 movdqu 0x40($inp), @XMM[12]
1658 pxor @XMM[11], @XMM[2]
1659 movdqu 0x50($inp), @XMM[13]
1660 pxor @XMM[12], @XMM[7]
1661 movdqu 0x60($inp), @XMM[14]
1662 pxor @XMM[13], @XMM[3]
1663 movdqu 0x70($inp), @XMM[15] # IV
1664 pxor @XMM[14], @XMM[5]
1665 movdqu @XMM[0], 0x00($out) # write output
1666 lea 0x80($inp), $inp
1667 movdqu @XMM[1], 0x10($out)
1668 movdqu @XMM[6], 0x20($out)
1669 movdqu @XMM[4], 0x30($out)
1670 movdqu @XMM[2], 0x40($out)
1671 movdqu @XMM[7], 0x50($out)
1672 movdqu @XMM[3], 0x60($out)
1673 movdqu @XMM[5], 0x70($out)
1674 lea 0x80($out), $out
1681 movdqu 0x00($inp), @XMM[0] # load input
1682 mov %rsp, %rax # pass key schedule
1683 mov %edx, %r10d # pass rounds
1686 movdqu 0x10($inp), @XMM[1]
1688 movdqu 0x20($inp), @XMM[2]
1691 movdqu 0x30($inp), @XMM[3]
1693 movdqu 0x40($inp), @XMM[4]
1696 movdqu 0x50($inp), @XMM[5]
1698 movdqu 0x60($inp), @XMM[6]
1699 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1700 call _bsaes_decrypt8
1701 pxor 0x20(%rbp), @XMM[0] # ^= IV
1702 movdqu 0x00($inp), @XMM[8] # re-load input
1703 movdqu 0x10($inp), @XMM[9]
1704 pxor @XMM[8], @XMM[1]
1705 movdqu 0x20($inp), @XMM[10]
1706 pxor @XMM[9], @XMM[6]
1707 movdqu 0x30($inp), @XMM[11]
1708 pxor @XMM[10], @XMM[4]
1709 movdqu 0x40($inp), @XMM[12]
1710 pxor @XMM[11], @XMM[2]
1711 movdqu 0x50($inp), @XMM[13]
1712 pxor @XMM[12], @XMM[7]
1713 movdqu 0x60($inp), @XMM[15] # IV
1714 pxor @XMM[13], @XMM[3]
1715 movdqu @XMM[0], 0x00($out) # write output
1716 movdqu @XMM[1], 0x10($out)
1717 movdqu @XMM[6], 0x20($out)
1718 movdqu @XMM[4], 0x30($out)
1719 movdqu @XMM[2], 0x40($out)
1720 movdqu @XMM[7], 0x50($out)
1721 movdqu @XMM[3], 0x60($out)
1725 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1726 call _bsaes_decrypt8
1727 pxor 0x20(%rbp), @XMM[0] # ^= IV
1728 movdqu 0x00($inp), @XMM[8] # re-load input
1729 movdqu 0x10($inp), @XMM[9]
1730 pxor @XMM[8], @XMM[1]
1731 movdqu 0x20($inp), @XMM[10]
1732 pxor @XMM[9], @XMM[6]
1733 movdqu 0x30($inp), @XMM[11]
1734 pxor @XMM[10], @XMM[4]
1735 movdqu 0x40($inp), @XMM[12]
1736 pxor @XMM[11], @XMM[2]
1737 movdqu 0x50($inp), @XMM[15] # IV
1738 pxor @XMM[12], @XMM[7]
1739 movdqu @XMM[0], 0x00($out) # write output
1740 movdqu @XMM[1], 0x10($out)
1741 movdqu @XMM[6], 0x20($out)
1742 movdqu @XMM[4], 0x30($out)
1743 movdqu @XMM[2], 0x40($out)
1744 movdqu @XMM[7], 0x50($out)
1748 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1749 call _bsaes_decrypt8
1750 pxor 0x20(%rbp), @XMM[0] # ^= IV
1751 movdqu 0x00($inp), @XMM[8] # re-load input
1752 movdqu 0x10($inp), @XMM[9]
1753 pxor @XMM[8], @XMM[1]
1754 movdqu 0x20($inp), @XMM[10]
1755 pxor @XMM[9], @XMM[6]
1756 movdqu 0x30($inp), @XMM[11]
1757 pxor @XMM[10], @XMM[4]
1758 movdqu 0x40($inp), @XMM[15] # IV
1759 pxor @XMM[11], @XMM[2]
1760 movdqu @XMM[0], 0x00($out) # write output
1761 movdqu @XMM[1], 0x10($out)
1762 movdqu @XMM[6], 0x20($out)
1763 movdqu @XMM[4], 0x30($out)
1764 movdqu @XMM[2], 0x40($out)
1768 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1769 call _bsaes_decrypt8
1770 pxor 0x20(%rbp), @XMM[0] # ^= IV
1771 movdqu 0x00($inp), @XMM[8] # re-load input
1772 movdqu 0x10($inp), @XMM[9]
1773 pxor @XMM[8], @XMM[1]
1774 movdqu 0x20($inp), @XMM[10]
1775 pxor @XMM[9], @XMM[6]
1776 movdqu 0x30($inp), @XMM[15] # IV
1777 pxor @XMM[10], @XMM[4]
1778 movdqu @XMM[0], 0x00($out) # write output
1779 movdqu @XMM[1], 0x10($out)
1780 movdqu @XMM[6], 0x20($out)
1781 movdqu @XMM[4], 0x30($out)
1785 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1786 call _bsaes_decrypt8
1787 pxor 0x20(%rbp), @XMM[0] # ^= IV
1788 movdqu 0x00($inp), @XMM[8] # re-load input
1789 movdqu 0x10($inp), @XMM[9]
1790 pxor @XMM[8], @XMM[1]
1791 movdqu 0x20($inp), @XMM[15] # IV
1792 pxor @XMM[9], @XMM[6]
1793 movdqu @XMM[0], 0x00($out) # write output
1794 movdqu @XMM[1], 0x10($out)
1795 movdqu @XMM[6], 0x20($out)
1799 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1800 call _bsaes_decrypt8
1801 pxor 0x20(%rbp), @XMM[0] # ^= IV
1802 movdqu 0x00($inp), @XMM[8] # re-load input
1803 movdqu 0x10($inp), @XMM[15] # IV
1804 pxor @XMM[8], @XMM[1]
1805 movdqu @XMM[0], 0x00($out) # write output
1806 movdqu @XMM[1], 0x10($out)
1811 lea 0x20(%rbp), $arg2 # buffer output
1813 call asm_AES_decrypt # doesn't touch %xmm
1814 pxor 0x20(%rbp), @XMM[15] # ^= IV
1815 movdqu @XMM[15], ($out) # write output
1816 movdqa @XMM[0], @XMM[15] # IV
1819 movdqu @XMM[15], (%rbx) # return IV
1822 .Lcbc_dec_bzero: # wipe key schedule [if any]
1823 movdqa %xmm0, 0x00(%rax)
1824 movdqa %xmm0, 0x10(%rax)
1825 lea 0x20(%rax), %rax
1829 lea (%rbp),%rsp # restore %rsp
1831 $code.=<<___ if ($win64);
1832 movaps 0x40(%rbp), %xmm6
1833 movaps 0x50(%rbp), %xmm7
1834 movaps 0x60(%rbp), %xmm8
1835 movaps 0x70(%rbp), %xmm9
1836 movaps 0x80(%rbp), %xmm10
1837 movaps 0x90(%rbp), %xmm11
1838 movaps 0xa0(%rbp), %xmm12
1839 movaps 0xb0(%rbp), %xmm13
1840 movaps 0xc0(%rbp), %xmm14
1841 movaps 0xd0(%rbp), %xmm15
1842 lea 0xa0(%rbp), %rsp
1845 mov 0x48(%rsp), %r15
1846 mov 0x50(%rsp), %r14
1847 mov 0x58(%rsp), %r13
1848 mov 0x60(%rsp), %r12
1849 mov 0x68(%rsp), %rbx
1850 mov 0x70(%rsp), %rax
1851 lea 0x78(%rsp), %rsp
1855 .size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
1857 .globl bsaes_ctr32_encrypt_blocks
1858 .type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
1860 bsaes_ctr32_encrypt_blocks:
1869 lea -0x48(%rsp), %rsp
1871 $code.=<<___ if ($win64);
1872 mov 0xa0(%rsp),$arg5 # pull ivp
1873 lea -0xa0(%rsp), %rsp
1874 movaps %xmm6, 0x40(%rsp)
1875 movaps %xmm7, 0x50(%rsp)
1876 movaps %xmm8, 0x60(%rsp)
1877 movaps %xmm9, 0x70(%rsp)
1878 movaps %xmm10, 0x80(%rsp)
1879 movaps %xmm11, 0x90(%rsp)
1880 movaps %xmm12, 0xa0(%rsp)
1881 movaps %xmm13, 0xb0(%rsp)
1882 movaps %xmm14, 0xc0(%rsp)
1883 movaps %xmm15, 0xd0(%rsp)
1887 mov %rsp, %rbp # backup %rsp
1888 movdqu ($arg5), %xmm0 # load counter
1889 mov 240($arg4), %eax # rounds
1890 mov $arg1, $inp # backup arguments
1894 movdqa %xmm0, 0x20(%rbp) # copy counter
1898 mov %eax, %ebx # rounds
1899 shl \$7, %rax # 128 bytes per inner round key
1900 sub \$`128-32`, %rax # size of bit-sliced key schedule
1903 mov %rsp, %rax # pass key schedule
1904 mov $key, %rcx # pass key
1905 mov %ebx, %r10d # pass rounds
1906 call _bsaes_key_convert
1907 pxor %xmm6,%xmm7 # fix up last round key
1908 movdqa %xmm7,(%rax) # save last round key
1910 movdqa (%rsp), @XMM[9] # load round0 key
1911 lea .LADD1(%rip), %r11
1912 movdqa 0x20(%rbp), @XMM[0] # counter copy
1913 movdqa -0x20(%r11), @XMM[8] # .LSWPUP
1914 pshufb @XMM[8], @XMM[9] # byte swap upper part
1915 pshufb @XMM[8], @XMM[0]
1916 movdqa @XMM[9], (%rsp) # save adjusted round0 key
1920 movdqa @XMM[0], 0x20(%rbp) # save counter
1921 movdqa @XMM[0], @XMM[1] # prepare 8 counter values
1922 movdqa @XMM[0], @XMM[2]
1923 paddd 0x00(%r11), @XMM[1] # .LADD1
1924 movdqa @XMM[0], @XMM[3]
1925 paddd 0x10(%r11), @XMM[2] # .LADD2
1926 movdqa @XMM[0], @XMM[4]
1927 paddd 0x20(%r11), @XMM[3] # .LADD3
1928 movdqa @XMM[0], @XMM[5]
1929 paddd 0x30(%r11), @XMM[4] # .LADD4
1930 movdqa @XMM[0], @XMM[6]
1931 paddd 0x40(%r11), @XMM[5] # .LADD5
1932 movdqa @XMM[0], @XMM[7]
1933 paddd 0x50(%r11), @XMM[6] # .LADD6
1934 paddd 0x60(%r11), @XMM[7] # .LADD7
1936 # Borrow prologue from _bsaes_encrypt8 to use the opportunity
1937 # to flip byte order in 32-bit counter
1938 movdqa (%rsp), @XMM[9] # round 0 key
1939 lea 0x10(%rsp), %rax # pass key schedule
1940 movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR
1941 pxor @XMM[9], @XMM[0] # xor with round0 key
1942 pxor @XMM[9], @XMM[1]
1943 pxor @XMM[9], @XMM[2]
1944 pxor @XMM[9], @XMM[3]
1945 pshufb @XMM[8], @XMM[0]
1946 pshufb @XMM[8], @XMM[1]
1947 pxor @XMM[9], @XMM[4]
1948 pxor @XMM[9], @XMM[5]
1949 pshufb @XMM[8], @XMM[2]
1950 pshufb @XMM[8], @XMM[3]
1951 pxor @XMM[9], @XMM[6]
1952 pxor @XMM[9], @XMM[7]
1953 pshufb @XMM[8], @XMM[4]
1954 pshufb @XMM[8], @XMM[5]
1955 pshufb @XMM[8], @XMM[6]
1956 pshufb @XMM[8], @XMM[7]
1957 lea .LBS0(%rip), %r11 # constants table
1958 mov %ebx,%r10d # pass rounds
1960 call _bsaes_encrypt8_bitslice
1963 jc .Lctr_enc_loop_done
1965 movdqu 0x00($inp), @XMM[8] # load input
1966 movdqu 0x10($inp), @XMM[9]
1967 movdqu 0x20($inp), @XMM[10]
1968 movdqu 0x30($inp), @XMM[11]
1969 movdqu 0x40($inp), @XMM[12]
1970 movdqu 0x50($inp), @XMM[13]
1971 movdqu 0x60($inp), @XMM[14]
1972 movdqu 0x70($inp), @XMM[15]
1974 pxor @XMM[0], @XMM[8]
1975 movdqa 0x20(%rbp), @XMM[0] # load counter
1976 pxor @XMM[9], @XMM[1]
1977 movdqu @XMM[8], 0x00($out) # write output
1978 pxor @XMM[10], @XMM[4]
1979 movdqu @XMM[1], 0x10($out)
1980 pxor @XMM[11], @XMM[6]
1981 movdqu @XMM[4], 0x20($out)
1982 pxor @XMM[12], @XMM[3]
1983 movdqu @XMM[6], 0x30($out)
1984 pxor @XMM[13], @XMM[7]
1985 movdqu @XMM[3], 0x40($out)
1986 pxor @XMM[14], @XMM[2]
1987 movdqu @XMM[7], 0x50($out)
1988 pxor @XMM[15], @XMM[5]
1989 movdqu @XMM[2], 0x60($out)
1990 lea .LADD1(%rip), %r11
1991 movdqu @XMM[5], 0x70($out)
1992 lea 0x80($out), $out
1993 paddd 0x70(%r11), @XMM[0] # .LADD8
1998 .Lctr_enc_loop_done:
2000 movdqu 0x00($inp), @XMM[8] # load input
2001 pxor @XMM[8], @XMM[0]
2002 movdqu @XMM[0], 0x00($out) # write output
2005 movdqu 0x10($inp), @XMM[9]
2006 pxor @XMM[9], @XMM[1]
2007 movdqu @XMM[1], 0x10($out)
2009 movdqu 0x20($inp), @XMM[10]
2010 pxor @XMM[10], @XMM[4]
2011 movdqu @XMM[4], 0x20($out)
2014 movdqu 0x30($inp), @XMM[11]
2015 pxor @XMM[11], @XMM[6]
2016 movdqu @XMM[6], 0x30($out)
2018 movdqu 0x40($inp), @XMM[12]
2019 pxor @XMM[12], @XMM[3]
2020 movdqu @XMM[3], 0x40($out)
2023 movdqu 0x50($inp), @XMM[13]
2024 pxor @XMM[13], @XMM[7]
2025 movdqu @XMM[7], 0x50($out)
2027 movdqu 0x60($inp), @XMM[14]
2028 pxor @XMM[14], @XMM[2]
2029 movdqu @XMM[2], 0x60($out)
# CTR short/tail path: encrypt one counter block at a time with the
# table-based asm_AES_encrypt (counter block lives at 0x20(%rbp), its
# ciphertext at 0x30(%rbp)), then XOR the keystream into the input block.
2034 	lea	0x20(%rbp), $arg1
2035 	lea	0x30(%rbp), $arg2
2037 	call	asm_AES_encrypt
2038 	movdqu	($inp), @XMM[1]
2040 	mov	0x2c(%rbp), %eax	# load 32-bit counter
2042 	pxor	0x30(%rbp), @XMM[1]
2043 	inc	%eax			# increment
2044 	movdqu	@XMM[1], ($out)
# BUGFIX: the incremented counter must be written back to the same slot it
# was loaded from, 0x2c(%rbp) (last dword of the counter copy kept at
# 0x20(%rbp)).  The previous "0x2c(%rsp)" both lost the increment — causing
# keystream reuse on every iteration of this loop — and scribbled over the
# bit-sliced key schedule, which %rsp points at in this routine.
2047 	mov	%eax, 0x2c(%rbp)	# save 32-bit counter
2054 .Lctr_enc_bzero: # wipe key schedule [if any]
2055 movdqa %xmm0, 0x00(%rax)
2056 movdqa %xmm0, 0x10(%rax)
2057 lea 0x20(%rax), %rax
2061 lea (%rbp),%rsp # restore %rsp
2063 $code.=<<___ if ($win64);
2064 movaps 0x40(%rbp), %xmm6
2065 movaps 0x50(%rbp), %xmm7
2066 movaps 0x60(%rbp), %xmm8
2067 movaps 0x70(%rbp), %xmm9
2068 movaps 0x80(%rbp), %xmm10
2069 movaps 0x90(%rbp), %xmm11
2070 movaps 0xa0(%rbp), %xmm12
2071 movaps 0xb0(%rbp), %xmm13
2072 movaps 0xc0(%rbp), %xmm14
2073 movaps 0xd0(%rbp), %xmm15
2074 lea 0xa0(%rbp), %rsp
2077 mov 0x48(%rsp), %r15
2078 mov 0x50(%rsp), %r14
2079 mov 0x58(%rsp), %r13
2080 mov 0x60(%rsp), %r12
2081 mov 0x68(%rsp), %rbx
2082 mov 0x70(%rsp), %rax
2083 lea 0x78(%rsp), %rsp
2087 .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
2089 ######################################################################
2090 # void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
2091 # const AES_KEY *key1, const AES_KEY *key2,
2092 # const unsigned char iv[16]);
2094 my ($twmask,$twres,$twtmp)=@XMM[13..15];
2098 .globl bsaes_xts_encrypt
2099 .type bsaes_xts_encrypt,\@abi-omnipotent
2110 lea -0x48(%rsp), %rsp
2112 $code.=<<___ if ($win64);
2113 mov 0xa0(%rsp),$arg5 # pull key2
2114 mov 0xa8(%rsp),$arg6 # pull ivp
2115 lea -0xa0(%rsp), %rsp
2116 movaps %xmm6, 0x40(%rsp)
2117 movaps %xmm7, 0x50(%rsp)
2118 movaps %xmm8, 0x60(%rsp)
2119 movaps %xmm9, 0x70(%rsp)
2120 movaps %xmm10, 0x80(%rsp)
2121 movaps %xmm11, 0x90(%rsp)
2122 movaps %xmm12, 0xa0(%rsp)
2123 movaps %xmm13, 0xb0(%rsp)
2124 movaps %xmm14, 0xc0(%rsp)
2125 movaps %xmm15, 0xd0(%rsp)
2129 mov %rsp, %rbp # backup %rsp
2130 mov $arg1, $inp # backup arguments
2136 lea 0x20(%rbp), $arg2
2138 call asm_AES_encrypt # generate initial tweak
2140 mov 240($key), %eax # rounds
2141 mov $len, %rbx # backup $len
2143 mov %eax, %edx # rounds
2144 shl \$7, %rax # 128 bytes per inner round key
2145 sub \$`128-32`, %rax # size of bit-sliced key schedule
2148 mov %rsp, %rax # pass key schedule
2149 mov $key, %rcx # pass key
2150 mov %edx, %r10d # pass rounds
2151 call _bsaes_key_convert
2152 pxor %xmm6, %xmm7 # fix up last round key
2153 movdqa %xmm7, (%rax) # save last round key
2156 sub \$0x80, %rsp # place for tweak[8]
2157 movdqa 0x20(%rbp), @XMM[7] # initial tweak
2160 movdqa .Lxts_magic(%rip), $twmask
2161 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2170 for ($i=0;$i<7;$i++) {
# Unrolled generation of tweak[0..6]: each iteration copies the running
# tweak in @XMM[7] to a register and to the stack slot 0x10*$i(%rsp), then
# doubles it in GF(2^128).  The pcmpgtd executed before loop entry
# broadcast the sign bit of each lane; pshufd 0x13 routes that mask so
# that, after masking with $twmask (loaded from .Lxts_magic — presumably
# the standard XTS residue 0x87; confirm against the full table), the pxor
# folds carry and residue back into the left-shifted (paddq) tweak.
2172 	pshufd	\$0x13, $twtmp, $twres
2174 	movdqa	@XMM[7], @XMM[$i]
2175 	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2176 	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2177 	pand	$twmask, $twres		# isolate carry and residue
2178 	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2179 	pxor	$twres, @XMM[7]
2181 $code.=<<___ if ($i>=1);
2182 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2184 $code.=<<___ if ($i>=2);
2185 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2189 movdqu 0x60($inp), @XMM[8+6]
2190 pxor @XMM[8+5], @XMM[5]
2191 movdqu 0x70($inp), @XMM[8+7]
2192 lea 0x80($inp), $inp
2193 movdqa @XMM[7], 0x70(%rsp)
2194 pxor @XMM[8+6], @XMM[6]
2195 lea 0x80(%rsp), %rax # pass key schedule
2196 pxor @XMM[8+7], @XMM[7]
2197 mov %edx, %r10d # pass rounds
2199 call _bsaes_encrypt8
2201 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2202 pxor 0x10(%rsp), @XMM[1]
2203 movdqu @XMM[0], 0x00($out) # write output
2204 pxor 0x20(%rsp), @XMM[4]
2205 movdqu @XMM[1], 0x10($out)
2206 pxor 0x30(%rsp), @XMM[6]
2207 movdqu @XMM[4], 0x20($out)
2208 pxor 0x40(%rsp), @XMM[3]
2209 movdqu @XMM[6], 0x30($out)
2210 pxor 0x50(%rsp), @XMM[7]
2211 movdqu @XMM[3], 0x40($out)
2212 pxor 0x60(%rsp), @XMM[2]
2213 movdqu @XMM[7], 0x50($out)
2214 pxor 0x70(%rsp), @XMM[5]
2215 movdqu @XMM[2], 0x60($out)
2216 movdqu @XMM[5], 0x70($out)
2217 lea 0x80($out), $out
2219 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
2221 movdqa .Lxts_magic(%rip), $twmask
2222 pcmpgtd @XMM[7], $twtmp
2223 pshufd \$0x13, $twtmp, $twres
2225 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2226 pand $twmask, $twres # isolate carry and residue
2227 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2228 pxor $twres, @XMM[7]
2237 for ($i=0;$i<7;$i++) {
2239 pshufd \$0x13, $twtmp, $twres
2241 movdqa @XMM[7], @XMM[$i]
2242 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2243 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2244 pand $twmask, $twres # isolate carry and residue
2245 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2246 pxor $twres, @XMM[7]
2248 $code.=<<___ if ($i>=1);
2249 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2250 cmp \$`0x10*$i`,$len
2253 $code.=<<___ if ($i>=2);
2254 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2258 movdqu 0x60($inp), @XMM[8+6]
2259 pxor @XMM[8+5], @XMM[5]
2260 movdqa @XMM[7], 0x70(%rsp)
2261 lea 0x70($inp), $inp
2262 pxor @XMM[8+6], @XMM[6]
2263 lea 0x80(%rsp), %rax # pass key schedule
2264 mov %edx, %r10d # pass rounds
2266 call _bsaes_encrypt8
2268 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2269 pxor 0x10(%rsp), @XMM[1]
2270 movdqu @XMM[0], 0x00($out) # write output
2271 pxor 0x20(%rsp), @XMM[4]
2272 movdqu @XMM[1], 0x10($out)
2273 pxor 0x30(%rsp), @XMM[6]
2274 movdqu @XMM[4], 0x20($out)
2275 pxor 0x40(%rsp), @XMM[3]
2276 movdqu @XMM[6], 0x30($out)
2277 pxor 0x50(%rsp), @XMM[7]
2278 movdqu @XMM[3], 0x40($out)
2279 pxor 0x60(%rsp), @XMM[2]
2280 movdqu @XMM[7], 0x50($out)
2281 movdqu @XMM[2], 0x60($out)
2282 lea 0x70($out), $out
2284 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
2288 pxor @XMM[8+4], @XMM[4]
2289 lea 0x60($inp), $inp
2290 pxor @XMM[8+5], @XMM[5]
2291 lea 0x80(%rsp), %rax # pass key schedule
2292 mov %edx, %r10d # pass rounds
2294 call _bsaes_encrypt8
2296 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2297 pxor 0x10(%rsp), @XMM[1]
2298 movdqu @XMM[0], 0x00($out) # write output
2299 pxor 0x20(%rsp), @XMM[4]
2300 movdqu @XMM[1], 0x10($out)
2301 pxor 0x30(%rsp), @XMM[6]
2302 movdqu @XMM[4], 0x20($out)
2303 pxor 0x40(%rsp), @XMM[3]
2304 movdqu @XMM[6], 0x30($out)
2305 pxor 0x50(%rsp), @XMM[7]
2306 movdqu @XMM[3], 0x40($out)
2307 movdqu @XMM[7], 0x50($out)
2308 lea 0x60($out), $out
2310 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
2314 pxor @XMM[8+3], @XMM[3]
2315 lea 0x50($inp), $inp
2316 pxor @XMM[8+4], @XMM[4]
2317 lea 0x80(%rsp), %rax # pass key schedule
2318 mov %edx, %r10d # pass rounds
2320 call _bsaes_encrypt8
2322 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2323 pxor 0x10(%rsp), @XMM[1]
2324 movdqu @XMM[0], 0x00($out) # write output
2325 pxor 0x20(%rsp), @XMM[4]
2326 movdqu @XMM[1], 0x10($out)
2327 pxor 0x30(%rsp), @XMM[6]
2328 movdqu @XMM[4], 0x20($out)
2329 pxor 0x40(%rsp), @XMM[3]
2330 movdqu @XMM[6], 0x30($out)
2331 movdqu @XMM[3], 0x40($out)
2332 lea 0x50($out), $out
2334 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
2338 pxor @XMM[8+2], @XMM[2]
2339 lea 0x40($inp), $inp
2340 pxor @XMM[8+3], @XMM[3]
2341 lea 0x80(%rsp), %rax # pass key schedule
2342 mov %edx, %r10d # pass rounds
2344 call _bsaes_encrypt8
2346 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2347 pxor 0x10(%rsp), @XMM[1]
2348 movdqu @XMM[0], 0x00($out) # write output
2349 pxor 0x20(%rsp), @XMM[4]
2350 movdqu @XMM[1], 0x10($out)
2351 pxor 0x30(%rsp), @XMM[6]
2352 movdqu @XMM[4], 0x20($out)
2353 movdqu @XMM[6], 0x30($out)
2354 lea 0x40($out), $out
2356 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
2360 pxor @XMM[8+1], @XMM[1]
2361 lea 0x30($inp), $inp
2362 pxor @XMM[8+2], @XMM[2]
2363 lea 0x80(%rsp), %rax # pass key schedule
2364 mov %edx, %r10d # pass rounds
2366 call _bsaes_encrypt8
2368 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2369 pxor 0x10(%rsp), @XMM[1]
2370 movdqu @XMM[0], 0x00($out) # write output
2371 pxor 0x20(%rsp), @XMM[4]
2372 movdqu @XMM[1], 0x10($out)
2373 movdqu @XMM[4], 0x20($out)
2374 lea 0x30($out), $out
2376 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
2380 pxor @XMM[8+0], @XMM[0]
2381 lea 0x20($inp), $inp
2382 pxor @XMM[8+1], @XMM[1]
2383 lea 0x80(%rsp), %rax # pass key schedule
2384 mov %edx, %r10d # pass rounds
2386 call _bsaes_encrypt8
2388 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2389 pxor 0x10(%rsp), @XMM[1]
2390 movdqu @XMM[0], 0x00($out) # write output
2391 movdqu @XMM[1], 0x10($out)
2392 lea 0x20($out), $out
2394 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
2398 pxor @XMM[0], @XMM[8]
2399 lea 0x10($inp), $inp
2400 movdqa @XMM[8], 0x20(%rbp)
2401 lea 0x20(%rbp), $arg1
2402 lea 0x20(%rbp), $arg2
2404 call asm_AES_encrypt # doesn't touch %xmm
2405 pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
2406 #pxor @XMM[8], @XMM[0]
2407 #lea 0x80(%rsp), %rax # pass key schedule
2408 #mov %edx, %r10d # pass rounds
2409 #call _bsaes_encrypt8
2410 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2411 movdqu @XMM[0], 0x00($out) # write output
2412 lea 0x10($out), $out
2414 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
# XTS ciphertext-stealing tail: shuffle the stolen byte, then re-encrypt the
# combined final block with asm_AES_encrypt under tweak @XMM[7].
# BUGFIX: the byte load must use $out (the Perl-level output pointer, as in
# the movdqu/movdqu lines below), not a hard-coded %rdx — %rdx is merely an
# argument register and is not guaranteed to hold the output pointer here.
2423 	movzb	-16($out), %ecx
2431 	movdqu	-16($out), @XMM[0]
2432 	lea	0x20(%rbp), $arg1
2433 	pxor	@XMM[7], @XMM[0]	# whiten with tweak
2434 	lea	0x20(%rbp), $arg2
2435 	movdqa	@XMM[0], 0x20(%rbp)
2437 	call	asm_AES_encrypt		# doesn't touch %xmm
2438 	pxor	0x20(%rbp), @XMM[7]	# un-whiten: ciphertext of final block
2439 	movdqu	@XMM[7], -16($out)
2444 .Lxts_enc_bzero: # wipe key schedule [if any]
2445 movdqa %xmm0, 0x00(%rax)
2446 movdqa %xmm0, 0x10(%rax)
2447 lea 0x20(%rax), %rax
2451 lea (%rbp),%rsp # restore %rsp
2453 $code.=<<___ if ($win64);
2454 movaps 0x40(%rbp), %xmm6
2455 movaps 0x50(%rbp), %xmm7
2456 movaps 0x60(%rbp), %xmm8
2457 movaps 0x70(%rbp), %xmm9
2458 movaps 0x80(%rbp), %xmm10
2459 movaps 0x90(%rbp), %xmm11
2460 movaps 0xa0(%rbp), %xmm12
2461 movaps 0xb0(%rbp), %xmm13
2462 movaps 0xc0(%rbp), %xmm14
2463 movaps 0xd0(%rbp), %xmm15
2464 lea 0xa0(%rbp), %rsp
2467 mov 0x48(%rsp), %r15
2468 mov 0x50(%rsp), %r14
2469 mov 0x58(%rsp), %r13
2470 mov 0x60(%rsp), %r12
2471 mov 0x68(%rsp), %rbx
2472 mov 0x70(%rsp), %rax
2473 lea 0x78(%rsp), %rsp
2477 .size bsaes_xts_encrypt,.-bsaes_xts_encrypt
2479 .globl bsaes_xts_decrypt
2480 .type bsaes_xts_decrypt,\@abi-omnipotent
2491 lea -0x48(%rsp), %rsp
2493 $code.=<<___ if ($win64);
2494 mov 0xa0(%rsp),$arg5 # pull key2
2495 mov 0xa8(%rsp),$arg6 # pull ivp
2496 lea -0xa0(%rsp), %rsp
2497 movaps %xmm6, 0x40(%rsp)
2498 movaps %xmm7, 0x50(%rsp)
2499 movaps %xmm8, 0x60(%rsp)
2500 movaps %xmm9, 0x70(%rsp)
2501 movaps %xmm10, 0x80(%rsp)
2502 movaps %xmm11, 0x90(%rsp)
2503 movaps %xmm12, 0xa0(%rsp)
2504 movaps %xmm13, 0xb0(%rsp)
2505 movaps %xmm14, 0xc0(%rsp)
2506 movaps %xmm15, 0xd0(%rsp)
2510 mov %rsp, %rbp # backup %rsp
2511 mov $arg1, $inp # backup arguments
2517 lea 0x20(%rbp), $arg2
2519 call asm_AES_encrypt # generate initial tweak
2521 mov 240($key), %eax # rounds
2522 mov $len, %rbx # backup $len
2524 mov %eax, %edx # rounds
2525 shl \$7, %rax # 128 bytes per inner round key
2526 sub \$`128-32`, %rax # size of bit-sliced key schedule
2529 mov %rsp, %rax # pass key schedule
2530 mov $key, %rcx # pass key
2531 mov %edx, %r10d # pass rounds
2532 call _bsaes_key_convert
2533 pxor (%rsp), %xmm7 # fix up round 0 key
2534 movdqa %xmm6, (%rax) # save last round key
2535 movdqa %xmm7, (%rsp)
2537 xor %eax, %eax # if ($len%16) len-=16;
2544 sub \$0x80, %rsp # place for tweak[8]
2545 movdqa 0x20(%rbp), @XMM[7] # initial tweak
2548 movdqa .Lxts_magic(%rip), $twmask
2549 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2558 for ($i=0;$i<7;$i++) {
# Decrypt-side twin of the encrypt tweak generator: stash tweak[$i] in a
# register and at 0x10*$i(%rsp), then double @XMM[7] in GF(2^128) —
# shift left via paddq, and fold in carry/residue selected by the
# pshufd/pand pair from the sign-bit broadcast ($twtmp) of the previous
# round.  Residue mask comes from .Lxts_magic (not visible here).
2560 	pshufd	\$0x13, $twtmp, $twres
2562 	movdqa	@XMM[7], @XMM[$i]
2563 	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2564 	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2565 	pand	$twmask, $twres		# isolate carry and residue
2566 	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2567 	pxor	$twres, @XMM[7]
2569 $code.=<<___ if ($i>=1);
2570 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2572 $code.=<<___ if ($i>=2);
2573 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2577 movdqu 0x60($inp), @XMM[8+6]
2578 pxor @XMM[8+5], @XMM[5]
2579 movdqu 0x70($inp), @XMM[8+7]
2580 lea 0x80($inp), $inp
2581 movdqa @XMM[7], 0x70(%rsp)
2582 pxor @XMM[8+6], @XMM[6]
2583 lea 0x80(%rsp), %rax # pass key schedule
2584 pxor @XMM[8+7], @XMM[7]
2585 mov %edx, %r10d # pass rounds
2587 call _bsaes_decrypt8
2589 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2590 pxor 0x10(%rsp), @XMM[1]
2591 movdqu @XMM[0], 0x00($out) # write output
2592 pxor 0x20(%rsp), @XMM[6]
2593 movdqu @XMM[1], 0x10($out)
2594 pxor 0x30(%rsp), @XMM[4]
2595 movdqu @XMM[6], 0x20($out)
2596 pxor 0x40(%rsp), @XMM[2]
2597 movdqu @XMM[4], 0x30($out)
2598 pxor 0x50(%rsp), @XMM[7]
2599 movdqu @XMM[2], 0x40($out)
2600 pxor 0x60(%rsp), @XMM[3]
2601 movdqu @XMM[7], 0x50($out)
2602 pxor 0x70(%rsp), @XMM[5]
2603 movdqu @XMM[3], 0x60($out)
2604 movdqu @XMM[5], 0x70($out)
2605 lea 0x80($out), $out
2607 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
2609 movdqa .Lxts_magic(%rip), $twmask
2610 pcmpgtd @XMM[7], $twtmp
2611 pshufd \$0x13, $twtmp, $twres
2613 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2614 pand $twmask, $twres # isolate carry and residue
2615 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2616 pxor $twres, @XMM[7]
2625 for ($i=0;$i<7;$i++) {
2627 pshufd \$0x13, $twtmp, $twres
2629 movdqa @XMM[7], @XMM[$i]
2630 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2631 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2632 pand $twmask, $twres # isolate carry and residue
2633 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2634 pxor $twres, @XMM[7]
2636 $code.=<<___ if ($i>=1);
2637 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2638 cmp \$`0x10*$i`,$len
2641 $code.=<<___ if ($i>=2);
2642 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2646 movdqu 0x60($inp), @XMM[8+6]
2647 pxor @XMM[8+5], @XMM[5]
2648 movdqa @XMM[7], 0x70(%rsp)
2649 lea 0x70($inp), $inp
2650 pxor @XMM[8+6], @XMM[6]
2651 lea 0x80(%rsp), %rax # pass key schedule
2652 mov %edx, %r10d # pass rounds
2654 call _bsaes_decrypt8
2656 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2657 pxor 0x10(%rsp), @XMM[1]
2658 movdqu @XMM[0], 0x00($out) # write output
2659 pxor 0x20(%rsp), @XMM[6]
2660 movdqu @XMM[1], 0x10($out)
2661 pxor 0x30(%rsp), @XMM[4]
2662 movdqu @XMM[6], 0x20($out)
2663 pxor 0x40(%rsp), @XMM[2]
2664 movdqu @XMM[4], 0x30($out)
2665 pxor 0x50(%rsp), @XMM[7]
2666 movdqu @XMM[2], 0x40($out)
2667 pxor 0x60(%rsp), @XMM[3]
2668 movdqu @XMM[7], 0x50($out)
2669 movdqu @XMM[3], 0x60($out)
2670 lea 0x70($out), $out
2672 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
2676 pxor @XMM[8+4], @XMM[4]
2677 lea 0x60($inp), $inp
2678 pxor @XMM[8+5], @XMM[5]
2679 lea 0x80(%rsp), %rax # pass key schedule
2680 mov %edx, %r10d # pass rounds
2682 call _bsaes_decrypt8
2684 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2685 pxor 0x10(%rsp), @XMM[1]
2686 movdqu @XMM[0], 0x00($out) # write output
2687 pxor 0x20(%rsp), @XMM[6]
2688 movdqu @XMM[1], 0x10($out)
2689 pxor 0x30(%rsp), @XMM[4]
2690 movdqu @XMM[6], 0x20($out)
2691 pxor 0x40(%rsp), @XMM[2]
2692 movdqu @XMM[4], 0x30($out)
2693 pxor 0x50(%rsp), @XMM[7]
2694 movdqu @XMM[2], 0x40($out)
2695 movdqu @XMM[7], 0x50($out)
2696 lea 0x60($out), $out
2698 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
2702 pxor @XMM[8+3], @XMM[3]
2703 lea 0x50($inp), $inp
2704 pxor @XMM[8+4], @XMM[4]
2705 lea 0x80(%rsp), %rax # pass key schedule
2706 mov %edx, %r10d # pass rounds
2708 call _bsaes_decrypt8
2710 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2711 pxor 0x10(%rsp), @XMM[1]
2712 movdqu @XMM[0], 0x00($out) # write output
2713 pxor 0x20(%rsp), @XMM[6]
2714 movdqu @XMM[1], 0x10($out)
2715 pxor 0x30(%rsp), @XMM[4]
2716 movdqu @XMM[6], 0x20($out)
2717 pxor 0x40(%rsp), @XMM[2]
2718 movdqu @XMM[4], 0x30($out)
2719 movdqu @XMM[2], 0x40($out)
2720 lea 0x50($out), $out
2722 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
2726 pxor @XMM[8+2], @XMM[2]
2727 lea 0x40($inp), $inp
2728 pxor @XMM[8+3], @XMM[3]
2729 lea 0x80(%rsp), %rax # pass key schedule
2730 mov %edx, %r10d # pass rounds
2732 call _bsaes_decrypt8
2734 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2735 pxor 0x10(%rsp), @XMM[1]
2736 movdqu @XMM[0], 0x00($out) # write output
2737 pxor 0x20(%rsp), @XMM[6]
2738 movdqu @XMM[1], 0x10($out)
2739 pxor 0x30(%rsp), @XMM[4]
2740 movdqu @XMM[6], 0x20($out)
2741 movdqu @XMM[4], 0x30($out)
2742 lea 0x40($out), $out
2744 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
2748 pxor @XMM[8+1], @XMM[1]
2749 lea 0x30($inp), $inp
2750 pxor @XMM[8+2], @XMM[2]
2751 lea 0x80(%rsp), %rax # pass key schedule
2752 mov %edx, %r10d # pass rounds
2754 call _bsaes_decrypt8
2756 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2757 pxor 0x10(%rsp), @XMM[1]
2758 movdqu @XMM[0], 0x00($out) # write output
2759 pxor 0x20(%rsp), @XMM[6]
2760 movdqu @XMM[1], 0x10($out)
2761 movdqu @XMM[6], 0x20($out)
2762 lea 0x30($out), $out
2764 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
2768 pxor @XMM[8+0], @XMM[0]
2769 lea 0x20($inp), $inp
2770 pxor @XMM[8+1], @XMM[1]
2771 lea 0x80(%rsp), %rax # pass key schedule
2772 mov %edx, %r10d # pass rounds
2774 call _bsaes_decrypt8
2776 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2777 pxor 0x10(%rsp), @XMM[1]
2778 movdqu @XMM[0], 0x00($out) # write output
2779 movdqu @XMM[1], 0x10($out)
2780 lea 0x20($out), $out
2782 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
2786 pxor @XMM[0], @XMM[8]
2787 lea 0x10($inp), $inp
2788 movdqa @XMM[8], 0x20(%rbp)
2789 lea 0x20(%rbp), $arg1
2790 lea 0x20(%rbp), $arg2
2792 call asm_AES_decrypt # doesn't touch %xmm
2793 pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
2794 #pxor @XMM[8], @XMM[0]
2795 #lea 0x80(%rsp), %rax # pass key schedule
2796 #mov %edx, %r10d # pass rounds
2797 #call _bsaes_decrypt8
2798 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2799 movdqu @XMM[0], 0x00($out) # write output
2800 lea 0x10($out), $out
2802 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
2809 movdqa .Lxts_magic(%rip), $twmask
2810 pcmpgtd @XMM[7], $twtmp
2811 pshufd \$0x13, $twtmp, $twres
2812 movdqa @XMM[7], @XMM[6]
2813 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2814 pand $twmask, $twres # isolate carry and residue
2815 movdqu ($inp), @XMM[0]
2816 pxor $twres, @XMM[7]
2818 lea 0x20(%rbp), $arg1
2819 pxor @XMM[7], @XMM[0]
2820 lea 0x20(%rbp), $arg2
2821 movdqa @XMM[0], 0x20(%rbp)
2823 call asm_AES_decrypt # doesn't touch %xmm
2824 pxor 0x20(%rbp), @XMM[7]
2826 movdqu @XMM[7], ($out)
2829 movzb 16($inp), %eax
2838 movdqu ($out), @XMM[0]
2839 lea 0x20(%rbp), $arg1
2840 pxor @XMM[6], @XMM[0]
2841 lea 0x20(%rbp), $arg2
2842 movdqa @XMM[0], 0x20(%rbp)
2844 call asm_AES_decrypt # doesn't touch %xmm
2845 pxor 0x20(%rbp), @XMM[6]
2846 movdqu @XMM[6], ($out)
2851 .Lxts_dec_bzero: # wipe key schedule [if any]
2852 movdqa %xmm0, 0x00(%rax)
2853 movdqa %xmm0, 0x10(%rax)
2854 lea 0x20(%rax), %rax
2858 lea (%rbp),%rsp # restore %rsp
2860 $code.=<<___ if ($win64);
2861 movaps 0x40(%rbp), %xmm6
2862 movaps 0x50(%rbp), %xmm7
2863 movaps 0x60(%rbp), %xmm8
2864 movaps 0x70(%rbp), %xmm9
2865 movaps 0x80(%rbp), %xmm10
2866 movaps 0x90(%rbp), %xmm11
2867 movaps 0xa0(%rbp), %xmm12
2868 movaps 0xb0(%rbp), %xmm13
2869 movaps 0xc0(%rbp), %xmm14
2870 movaps 0xd0(%rbp), %xmm15
2871 lea 0xa0(%rbp), %rsp
2874 mov 0x48(%rsp), %r15
2875 mov 0x50(%rsp), %r14
2876 mov 0x58(%rsp), %r13
2877 mov 0x60(%rsp), %r12
2878 mov 0x68(%rsp), %rbx
2879 mov 0x70(%rsp), %rax
2880 lea 0x78(%rsp), %rsp
2884 .size bsaes_xts_decrypt,.-bsaes_xts_decrypt
2888 .type	_bsaes_const,\@object
# Constant pool for the bit-sliced AES code.  NOTE(review): several label
# lines between the .quad entries are not visible in this excerpt; the
# per-entry notes below are inferred from how the code addresses the table
# (e.g. L.ADD1-relative loads) and should be confirmed against the full file.
2891 .LM0ISR:	# InvShiftRows constants
2892 	.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
2894 	.quad	0x01040b0e0205080f, 0x0306090c00070a0d
2896 	.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
2897 .LBS0:		# bit-slice constants
2898 	.quad	0x5555555555555555, 0x5555555555555555
2900 	.quad	0x3333333333333333, 0x3333333333333333	# presumably .LBS1
2902 	.quad	0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f	# presumably .LBS2
2903 .LSR:		# shiftrows constants
2904 	.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
2906 	.quad	0x0304090e00050a0f, 0x01060b0c0207080d
2908 	.quad	0x0a0e02060f03070b, 0x0004080c05090d01
2909 .LSWPUP:	# byte-swap upper dword
2910 	.quad	0x0706050403020100, 0x0c0d0e0f0b0a0908
2912 	.quad	0x0a0d02060c03070b, 0x0004080f05090e01	# .LSWPUPM0SR: loaded as -0x10(.LADD1)
2913 .LADD1:	# counter increment constants
2914 	.quad	0x0000000000000000, 0x0000000100000000
2916 	.quad	0x0000000000000000, 0x0000000200000000	# .LADD2
2918 	.quad	0x0000000000000000, 0x0000000300000000	# .LADD3
2920 	.quad	0x0000000000000000, 0x0000000400000000	# .LADD4
2922 	.quad	0x0000000000000000, 0x0000000500000000	# .LADD5
2924 	.quad	0x0000000000000000, 0x0000000600000000	# .LADD6
2926 	.quad	0x0000000000000000, 0x0000000700000000	# .LADD7
2928 	.quad	0x0000000000000000, 0x0000000800000000	# .LADD8: bulk-loop +8, read at 0x70(.LADD1)
2932 	.quad	0x0101010101010101, 0x0101010101010101	# bit masks, one bit position per quad
2933 	.quad	0x0202020202020202, 0x0202020202020202
2934 	.quad	0x0404040404040404, 0x0404040404040404
2935 	.quad	0x0808080808080808, 0x0808080808080808
2937 	.quad	0x02060a0e03070b0f, 0x0004080c0105090d
2939 	.quad	0x6363636363636363, 0x6363636363636363	# 0x63: AES key-schedule affine constant
2940 .asciz	"Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
2942 .size	_bsaes_const,.-_bsaes_const
2945 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2946 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
2954 .extern __imp_RtlVirtualUnwind
2955 .type se_handler,\@abi-omnipotent
2969 mov 120($context),%rax # pull context->Rax
2970 mov 248($context),%rbx # pull context->Rip
2972 mov 8($disp),%rsi # disp->ImageBase
2973 mov 56($disp),%r11 # disp->HandlerData
2975 mov 0(%r11),%r10d # HandlerData[0]
2976 lea (%rsi,%r10),%r10 # prologue label
2977 cmp %r10,%rbx # context->Rip<prologue label
2980 mov 152($context),%rax # pull context->Rsp
2982 mov 4(%r11),%r10d # HandlerData[1]
2983 lea (%rsi,%r10),%r10 # epilogue label
2984 cmp %r10,%rbx # context->Rip>=epilogue label
2987 mov 160($context),%rax # pull context->Rbp
2989 lea 0x40(%rax),%rsi # %xmm save area
2990 lea 512($context),%rdi # &context.Xmm6
2991 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
2992 .long 0xa548f3fc # cld; rep movsq
2993 lea 0xa0(%rax),%rax # adjust stack pointer
3001 lea 0x78(%rax),%rax # adjust stack pointer
3002 mov %rbx,144($context) # restore context->Rbx
3003 mov %rbp,160($context) # restore context->Rbp
3004 mov %r12,216($context) # restore context->R12
3005 mov %r13,224($context) # restore context->R13
3006 mov %r14,232($context) # restore context->R14
3007 mov %r15,240($context) # restore context->R15
3010 mov %rax,152($context) # restore context->Rsp
3012 mov 40($disp),%rdi # disp->ContextRecord
3013 mov $context,%rsi # context
3014 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
3015 .long 0xa548f3fc # cld; rep movsq
3018 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
3019 mov 8(%rsi),%rdx # arg2, disp->ImageBase
3020 mov 0(%rsi),%r8 # arg3, disp->ControlPc
3021 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
3022 mov 40(%rsi),%r10 # disp->ContextRecord
3023 lea 56(%rsi),%r11 # &disp->HandlerData
3024 lea 24(%rsi),%r12 # &disp->EstablisherFrame
3025 mov %r10,32(%rsp) # arg5
3026 mov %r11,40(%rsp) # arg6
3027 mov %r12,48(%rsp) # arg7
3028 mov %rcx,56(%rsp) # arg8, (NULL)
3029 call *__imp_RtlVirtualUnwind(%rip)
3031 mov \$1,%eax # ExceptionContinueSearch
3043 .size se_handler,.-se_handler
3048 $code.=<<___ if ($ecb);
3049 .rva .Lecb_enc_prologue
3050 .rva .Lecb_enc_epilogue
3053 .rva .Lecb_dec_prologue
3054 .rva .Lecb_dec_epilogue
3058 .rva .Lcbc_dec_prologue
3059 .rva .Lcbc_dec_epilogue
3062 .rva .Lctr_enc_prologue
3063 .rva .Lctr_enc_epilogue
3066 .rva .Lxts_enc_prologue
3067 .rva .Lxts_enc_epilogue
3070 .rva .Lxts_dec_prologue
3071 .rva .Lxts_dec_epilogue
3077 $code.=<<___ if ($ecb);
3081 .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[]
3085 .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[]
3091 .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[]
3095 .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[]
3099 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
3103 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
3107 $code =~ s/\`([^\`]*)\`/eval($1)/gem;