2 # Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
19 # "Teaser" Montgomery multiplication module for ARMv8. Needs more
20 # work. While it does improve RSA sign performance by 20-30% (less for
21 # longer keys) on most processors, for some reason RSA2048 is not
22 # faster and RSA4096 goes 15-20% slower on Cortex-A57. Multiplication
23 # instruction issue rate is limited on processor in question, meaning
24 # that dedicated squaring procedure is a must. Well, actually all
25 # contemporary AArch64 processors seem to have limited multiplication
26 # issue rate, i.e. they can't issue multiplication every cycle, which
27 # explains moderate improvement coefficients in comparison to
28 # compiler-generated code. Recall that compiler is instructed to use
29 # umulh and therefore uses same amount of multiplication instructions
30 # to do the job. Assembly's edge is to minimize number of "collateral"
31 # instructions and of course instruction scheduling.
35 # Squaring procedure that handles lengths divisible by 8 improves
36 # RSA/DSA performance by 25-40-60% depending on processor and key
37 # length. Overall improvement coefficients are always positive in
38 # comparison to compiler-generated code. On Cortex-A57 improvement
39 # is still modest on longest key lengths, while others exhibit e.g.
40 # 50-70% improvement for RSA4096 sign. RSA2048 sign is ~25% faster
41 # on Cortex-A57 and ~60-100% faster on others.
# Locate the shared "perlasm" translator arm-xlate.pl: first next to this
# script, then in the sibling ../../perlasm directory.
46 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
47 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
48 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
49 die "can't locate arm-xlate.pl";
# Pipe all generated code through the translator.  The bareword handle OUT is
# kept unchanged because the rest of the module prints to it.
# Fix: the original did not check open()'s return value, so a missing perl
# interpreter or unrunnable $xlate would silently yield empty output; fail
# loudly instead.  ($flavour and $output are presumably taken from @ARGV
# earlier in the file — not visible in this excerpt.)
51 open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
# Scratch-register map for bn_mul_mont.  Registers are drawn from x6-x17 and
# the callee-saved x19-x24; note the range visibly skips x18 — presumably
# because it is the reserved platform register under AAPCS64 (TODO confirm).
54 ($lo0,$hi0,$aj,$m0,$alo,$ahi,
55 $lo1,$hi1,$nj,$m1,$nlo,$nhi,
56 $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24);
# Argument registers, matching the C prototype spelled out in the trailing
# comments: bn_mul_mont(rp, ap, bp, np, n0, num).
59 $rp="x0";	# BN_ULONG *rp,
60 $ap="x1";	# const BN_ULONG *ap,
61 $bp="x2";	# const BN_ULONG *bp,
62 $np="x3";	# const BN_ULONG *np,
63 $n0="x4";	# const BN_ULONG *n0,
64 $num="x5";	# int num);
# bn_mul_mont: word-by-word Montgomery multiplication.
# NOTE(review): this listing appears to be a sampled/partial extraction — the
# embedded source-line numbers are non-contiguous and labels referenced below
# (e.g. .Lcond_copy) have no visible definition.  Comments added here describe
# only the visible code; do not treat the span as complete.
70 .type	bn_mul_mont,%function
// Prologue: x29/x30 plus (per the epilogue loads below) x19-x24 are saved in
// a 64-byte frame.
78 	stp	x29,x30,[sp,#-64]!
// First outer iteration: multiply ap[] by b[0] and fold in the first
// reduction word m1 = tp[0]*n0.
84 	ldr	$m0,[$bp],#8		// bp[0]
86 	ldp	$hi0,$aj,[$ap],#16	// ap[0..1]
89 	and	$tp,$tp,#-16		// ABI says so
90 	ldp	$hi1,$nj,[$np],#16	// np[0..1]
92 	mul	$lo0,$hi0,$m0		// ap[0]*bp[0]
93 	sub	$j,$num,#16		// j=num-2
95 	mul	$alo,$aj,$m0		// ap[1]*bp[0]
98 	mul	$m1,$lo0,$n0		// "tp[0]"*n0
101 	// (*)	mul	$lo1,$hi1,$m1	// np[0]*m1
103 	mul	$nlo,$nj,$m1		// np[1]*m1
104 	// (*)	adds	$lo1,$lo1,$lo0	// discarded
105 	// (*)	As for removal of first multiplication and addition
106 	//	instructions. The outcome of first addition is
107 	//	guaranteed to be zero, which leaves two computationally
108 	//	significant outcomes: it either carries or not. Then
109 	//	question is when does it carry? Is there alternative
110 	//	way to deduce it? If you follow operations, you can
111 	//	observe that condition for carry is quite simple:
112 	//	$lo0 being non-zero. So that carry can be calculated
113 	//	by adding -1 to $lo0. That's what next instruction does.
114 	subs	xzr,$lo0,#1		// (*)
127 	mul	$alo,$aj,$m0		// ap[j]*bp[0]
132 	mul	$nlo,$nj,$m1		// np[j]*m1
135 	str	$lo1,[$tp],#8		// tp[j-1]
140 	sub	$ap,$ap,$num		// rewind $ap
144 	sub	$np,$np,$num		// rewind $np
148 	sub	$i,$num,#8		// i=num-1
151 	adc	$ovf,xzr,xzr		// upmost overflow bit
// Outer-loop fragment: pick up the next multiplier word b[i] and repeat the
// multiply-and-reduce pass over tp[].
155 	ldr	$m0,[$bp],#8		// bp[i]
156 	ldp	$hi0,$aj,[$ap],#16
157 	ldr	$tj,[sp]		// tp[0]
160 	mul	$lo0,$hi0,$m0		// ap[0]*bp[i]
161 	sub	$j,$num,#16		// j=num-2
163 	ldp	$hi1,$nj,[$np],#16
164 	mul	$alo,$aj,$m0		// ap[1]*bp[i]
// Same (*) trick as above: replace the discarded np[0]*m1 add with a
// subs-from-#1 that reconstructs its carry.
172 	// (*)	mul	$lo1,$hi1,$m1	// np[0]*m1
174 	mul	$nlo,$nj,$m1		// np[1]*m1
175 	// (*)	adds	$lo1,$lo1,$lo0
176 	subs	xzr,$lo0,#1		// (*)
183 	ldr	$tj,[$tp],#8		// tp[j]
192 	mul	$alo,$aj,$m0		// ap[j]*bp[i]
197 	mul	$nlo,$nj,$m1		// np[j]*m1
200 	str	$lo1,[$tp,#-16]		// tp[j-1]
204 	ldr	$tj,[$tp],#8		// tp[j]
207 	sub	$ap,$ap,$num		// rewind $ap
211 	sub	$np,$np,$num		// rewind $np
220 	adc	$ovf,$ovf,xzr		// upmost overflow bit
221 	stp	$lo1,$hi1,[$tp,#-16]
225 	// Final step. We see if result is larger than modulus, and
226 	// if it is, subtract the modulus. But comparison implies
227 	// subtraction. So we subtract modulus, see if it borrowed,
228 	// and conditionally copy original value.
229 	ldr	$tj,[sp]		// tp[0]
231 	ldr	$nj,[$np],#8		// np[0]
232 	subs	$j,$num,#8		// j=num-1 and clear borrow
235 	sbcs	$aj,$tj,$nj		// tp[j]-np[j]
239 	str	$aj,[$ap],#8		// rp[j]=tp[j]-np[j]
243 	sbcs	$ovf,$ovf,xzr		// did it borrow?
244 	str	$aj,[$ap],#8		// rp[num-1]
// Conditional-copy fragment: csel picks reduced vs. unreduced words on the
// borrow flag, while tp is wiped behind us (appears to be the .Lcond_copy
// loop body — its label definition is not visible in this excerpt).
246 	ldr	$tj,[sp]		// tp[0]
248 	ldr	$aj,[$rp],#8		// rp[0]
249 	sub	$num,$num,#8		// num--
252 	sub	$num,$num,#8		// num--
253 	csel	$nj,$tj,$aj,lo		// did it borrow?
256 	str	xzr,[$tp,#-16]		// wipe tp
258 	cbnz	$num,.Lcond_copy
261 	str	xzr,[$tp,#-8]		// wipe tp
// Epilogue: restore callee-saved x19-x24 from the frame.
264 	ldp	x19,x20,[x29,#16]
266 	ldp	x21,x22,[x29,#32]
268 	ldp	x23,x24,[x29,#48]
271 .size	bn_mul_mont,.-bn_mul_mont
274 ########################################################################
275 # Following is ARMv8 adaptation of sqrx8x_mont from x86_64-mont5 module.
# Register map for __bn_sqr8x_mont: eight a[] words in x6-x13, four temps in
# x14-x17, eight accumulators in callee-saved x19-x26.
277 my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(6..13));
278 my ($t0,$t1,$t2,$t3)=map("x$_",(14..17));
279 my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("x$_",(19..26));
280 my ($cnt,$carry,$topmost)=("x27","x28","x30");
# Aliases: $tp/$ap_end reuse the bp/np argument registers, and $na0 shares
# x28 with $carry — so those pairs are never live at the same time.
281 my ($tp,$ap_end,$na0)=($bp,$np,$carry);
# __bn_sqr8x_mont: dedicated squaring path (per the header commentary, used
# when the length is divisible by 8).
# NOTE(review): sampled/partial listing — embedded line numbers are
# non-contiguous and several label definitions (.Lsqr8x_zero, .Lsqr8x_mul,
# .Lsqr8x_outer_loop, .Lsqr4x_shift_n_add) are not visible.  Comments below
# annotate only the visible code.
284 .type	__bn_sqr8x_mont,%function
290 	stp	x29,x30,[sp,#-128]!
297 	stp	$rp,$np,[sp,#96]	// offload rp and np
299 	ldp	$a0,$a1,[$ap,#8*0]
300 	ldp	$a2,$a3,[$ap,#8*2]
301 	ldp	$a4,$a5,[$ap,#8*4]
302 	ldp	$a6,$a7,[$ap,#8*6]
304 	sub	$tp,sp,$num,lsl#4
// Zero the stack-allocated t[] window, 16 words per visible pass.
313 	stp	xzr,xzr,[$tp,#8*0]
314 	stp	xzr,xzr,[$tp,#8*2]
315 	stp	xzr,xzr,[$tp,#8*4]
316 	stp	xzr,xzr,[$tp,#8*6]
318 	stp	xzr,xzr,[$tp,#8*8]
319 	stp	xzr,xzr,[$tp,#8*10]
320 	stp	xzr,xzr,[$tp,#8*12]
321 	stp	xzr,xzr,[$tp,#8*14]
323 	cbnz	$cnt,.Lsqr8x_zero
336 	str	$n0,[x29,#112]		// offload n0
338 	// Multiply everything but a[i]*a[i]
// Off-diagonal products: passes (i)..(vii) accumulate a[j]*a[i] for j>i into
// t[1..14], spilling finished pairs to the t[] window as they complete.
370 	mul	$t0,$a1,$a0		// lo(a[1..7]*a[0])		(i)
374 	adds	$acc1,$acc1,$t0		// t[1]+lo(a[1]*a[0])
381 	umulh	$t3,$a1,$a0		// hi(a[1..7]*a[0])
388 	stp	$acc0,$acc1,[$tp],#8*2	// t[0..1]
389 	adc	$acc0,xzr,xzr		// t[8]
390 	adds	$acc2,$acc2,$t3		// t[2]+lo(a[1]*a[0])
397 	mul	$t2,$a2,$a1		// lo(a[2..7]*a[1])		(ii)
410 	umulh	$t0,$a2,$a1		// hi(a[2..7]*a[1])
417 	stp	$acc2,$acc3,[$tp],#8*2	// t[2..3]
418 	adc	$acc1,xzr,xzr		// t[9]
424 	mul	$t2,$a3,$a2		// lo(a[3..7]*a[2])		(iii)
435 	umulh	$t3,$a3,$a2		// hi(a[3..7]*a[2])
442 	stp	$acc4,$acc5,[$tp],#8*2	// t[4..5]
443 	adc	$acc2,xzr,xzr		// t[10]
447 	mul	$t0,$a4,$a3		// lo(a[4..7]*a[3])		(iv)
456 	umulh	$t0,$a4,$a3		// hi(a[4..7]*a[3])
463 	stp	$acc6,$acc7,[$tp],#8*2	// t[6..7]
464 	adc	$acc3,xzr,xzr		// t[11]
466 	mul	$t0,$a5,$a4		// lo(a[5..7]*a[4])		(v)
473 	umulh	$t3,$a5,$a4		// hi(a[5..7]*a[4])
479 	mul	$t2,$a6,$a5		// lo(a[6..7]*a[5])		(vi)
480 	adc	$acc4,xzr,xzr		// t[12]
484 	umulh	$t0,$a6,$a5		// hi(a[6..7]*a[5])
489 	mul	$t2,$a7,$a6		// lo(a[7]*a[6])		(vii)
491 	umulh	$t3,$a7,$a6		// hi(a[7]*a[6])
492 	adc	$acc5,xzr,xzr		// t[13]
494 	sub	$cnt,$ap_end,$ap	// done yet?
498 	sub	$t0,$ap_end,$num	// rewinded ap
499 	adc	$acc6,xzr,xzr		// t[14]
502 	cbz	$cnt,.Lsqr8x_outer_break
// Cross-block multiplication: reload the current t[] window and the next
// eight a[] words for the .Lsqr8x_mul pass.
505 	ldp	$a0,$a1,[$tp,#8*0]
506 	ldp	$a2,$a3,[$tp,#8*2]
507 	ldp	$a4,$a5,[$tp,#8*4]
508 	ldp	$a6,$a7,[$tp,#8*6]
511 	ldp	$a0,$a1,[$ap,#8*0]
514 	ldp	$a2,$a3,[$ap,#8*2]
517 	ldp	$a4,$a5,[$ap,#8*4]
521 	ldp	$a6,$a7,[$ap,#8*6]
523 	//adc	$carry,xzr,xzr		// moved below
535 	// a[f]a[1]........................
537 	// a[f]a[2]........................
539 	// a[f]a[3]........................
541 	// a[f]a[4]........................
543 	// a[f]a[5]........................
545 	// a[f]a[6]........................
547 	// a[f]a[7]........................
550 	adc	$carry,xzr,xzr		// carry bit, modulo-scheduled
571 	adc	$carry,$carry,xzr
585 	adcs	$acc7,$carry,$t3
586 	//adc	$carry,xzr,xzr		// moved above
587 	cbnz	$cnt,.Lsqr8x_mul
588 					// note that carry flag is guaranteed
589 					// to be zero at this point
590 	cmp	$ap,$ap_end		// done yet?
593 	ldp	$a0,$a1,[$tp,#8*0]
594 	ldp	$a2,$a3,[$tp,#8*2]
595 	ldp	$a4,$a5,[$tp,#8*4]
596 	ldp	$a6,$a7,[$tp,#8*6]
600 	ldp	$a0,$a1,[$ap,#8*0]
603 	ldp	$a2,$a3,[$ap,#8*2]
606 	ldp	$a4,$a5,[$ap,#8*4]
610 	ldp	$a6,$a7,[$ap,#8*6]
612 	//adc	$carry,xzr,xzr		// moved above
617 	ldp	$a0,$a1,[$rp,#8*0]
619 	ldp	$a2,$a3,[$rp,#8*2]
620 	sub	$t0,$ap_end,$ap		// is it last iteration?
621 	ldp	$a4,$a5,[$rp,#8*4]
623 	ldp	$a6,$a7,[$rp,#8*6]
624 	cbz	$t0,.Lsqr8x_outer_loop
626 	stp	$acc0,$acc1,[$tp,#8*0]
627 	ldp	$acc0,$acc1,[$t1,#8*0]
628 	stp	$acc2,$acc3,[$tp,#8*2]
629 	ldp	$acc2,$acc3,[$t1,#8*2]
630 	stp	$acc4,$acc5,[$tp,#8*4]
631 	ldp	$acc4,$acc5,[$t1,#8*4]
632 	stp	$acc6,$acc7,[$tp,#8*6]
634 	ldp	$acc6,$acc7,[$t1,#8*6]
639 	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
// Shift-n-add phase: each off-diagonal word is doubled (lsl#1) and the
// diagonal squares a[i]*a[i] are folded in.
640 	ldp	$a1,$a3,[$t0,#8*0]	// recall that $t0 is &a[0]
641 	ldp	$t1,$t2,[sp,#8*1]
642 	ldp	$a5,$a7,[$t0,#8*2]
644 	ldp	$t3,$t0,[sp,#8*3]
646 	stp	$acc0,$acc1,[$tp,#8*0]
648 	stp	$acc2,$acc3,[$tp,#8*2]
650 	stp	$acc4,$acc5,[$tp,#8*4]
652 	stp	$acc6,$acc7,[$tp,#8*6]
655 	adds	$acc1,$a1,$t1,lsl#1
664 	ldp	$t1,$t2,[$tp,#8*5]
666 	ldp	$a1,$a3,[$ap],#8*2
671 	stp	$acc0,$acc1,[$tp,#8*0]
674 	stp	$acc2,$acc3,[$tp,#8*2]
676 	ldp	$t3,$t0,[$tp,#8*7]
681 	ldp	$t1,$t2,[$tp,#8*9]
683 	ldp	$a5,$a7,[$ap],#8*2
687 	stp	$acc4,$acc5,[$tp,#8*4]
689 	stp	$acc6,$acc7,[$tp,#8*6]
694 	ldp	$t3,$t0,[$tp,#8*3]
696 	cbnz	$cnt,.Lsqr4x_shift_n_add
# Reduction phase of __bn_sqr8x_mont: Montgomery-reduce the double-width
# square 512 bits (8 words) per iteration (see the "Reduce by 512 bits"
# comment below).  $np/$np_end reuse the $ap/$ap_end registers — a[] is no
# longer needed once the square is in t[].
698 my ($np,$np_end)=($ap,$ap_end);
700 	ldp	$np,$n0,[x29,#104]	// pull np and n0
705 	ldp	$t1,$t2,[$tp,#8*5]
708 	stp	$acc0,$acc1,[$tp,#8*0]
711 	stp	$acc2,$acc3,[$tp,#8*2]
715 	ldp	$acc0,$acc1,[sp,#8*0]
718 	ldp	$a0,$a1,[$np,#8*0]
721 	ldp	$a2,$a3,[$np,#8*2]
723 	ldp	$a4,$a5,[$np,#8*4]
725 	// Reduce by 512 bits per iteration
726 	mul	$na0,$n0,$acc0		// t[0]*n0
727 	ldp	$a6,$a7,[$np,#8*6]
729 	ldp	$acc2,$acc3,[sp,#8*2]
730 	stp	$acc4,$acc5,[$tp,#8*4]
731 	ldp	$acc4,$acc5,[sp,#8*4]
732 	stp	$acc6,$acc7,[$tp,#8*6]
733 	ldp	$acc6,$acc7,[sp,#8*6]
735 	mov	$topmost,xzr		// initial top-most carry
// (*) trick again: the discarded n[0]*na0 low-word add is replaced by
// subs-from-#1, which reproduces its carry ($acc0 non-zero).
740 	// (*)	mul	$t0,$a0,$na0	// lo(n[0-7])*lo(t[0]*n0)
744 	str	$na0,[$tp],#8		// put aside t[0]*n0 for tail processing
746 	// (*)	adds	xzr,$acc0,$t0
747 	subs	xzr,$acc0,#1		// (*)
756 	umulh	$t0,$a0,$na0		// hi(n[0-7])*lo(t[0]*n0)
772 	mul	$na0,$n0,$acc0		// next t[0]*n0
777 	cbnz	$cnt,.Lsqr8x_reduction
779 	ldp	$t0,$t1,[$tp,#8*0]
780 	ldp	$t2,$t3,[$tp,#8*2]
782 	sub	$cnt,$np_end,$np	// done yet?
785 	ldp	$t0,$t1,[$tp,#8*4]
788 	ldp	$t2,$t3,[$tp,#8*6]
793 	//adc	$carry,xzr,xzr		// moved below
794 	cbz	$cnt,.Lsqr8x8_post_condition
// Tail: fold the hi(n[]*na0) contributions across the remaining t[] window
// (.Lsqr8x_tail loop fragment).
797 	ldp	$a0,$a1,[$np,#8*0]
798 	ldp	$a2,$a3,[$np,#8*2]
799 	ldp	$a4,$a5,[$np,#8*4]
801 	ldp	$a6,$a7,[$np,#8*6]
806 	adc	$carry,xzr,xzr		// carry bit, modulo-scheduled
827 	adc	$carry,$carry,xzr
841 	adcs	$acc7,$carry,$t3
842 	//adc	$carry,xzr,xzr		// moved above
843 	cbnz	$cnt,.Lsqr8x_tail
844 					// note that carry flag is guaranteed
845 					// to be zero at this point
846 	ldp	$a0,$a1,[$tp,#8*0]
847 	sub	$cnt,$np_end,$np	// done yet?
848 	sub	$t2,$np_end,$num	// rewinded np
849 	ldp	$a2,$a3,[$tp,#8*2]
850 	ldp	$a4,$a5,[$tp,#8*4]
851 	ldp	$a6,$a7,[$tp,#8*6]
852 	cbz	$cnt,.Lsqr8x_tail_break
857 	ldp	$a0,$a1,[$np,#8*0]
860 	ldp	$a2,$a3,[$np,#8*2]
863 	ldp	$a4,$a5,[$np,#8*4]
867 	ldp	$a6,$a7,[$np,#8*6]
869 	//adc	$carry,xzr,xzr		// moved above
874 	ldr	$n0,[x29,#112]		// pull n0
875 	add	$cnt,$tp,#8*8		// end of current t[num] window
// The saved top-most carry is re-materialized into the carry flag via the
// same subs-from-#1 trick, then the window slides down towards the frame.
877 	subs	xzr,$topmost,#1		// "move" top-most carry to carry bit
880 	ldp	$acc0,$acc1,[$rp,#8*0]
882 	ldp	$a0,$a1,[$t2,#8*0]	// recall that $t2 is &n[0]
884 	ldp	$a2,$a3,[$t2,#8*2]
887 	ldp	$a4,$a5,[$t2,#8*4]
890 	ldp	$a6,$a7,[$t2,#8*6]
892 	adc	$topmost,xzr,xzr	// top-most carry
894 	stp	$t0,$t1,[$tp,#8*0]
895 	stp	$acc2,$acc3,[$tp,#8*2]
896 	ldp	$acc2,$acc3,[$rp,#8*2]
897 	stp	$acc4,$acc5,[$tp,#8*4]
898 	ldp	$acc4,$acc5,[$rp,#8*4]
899 	cmp	$cnt,x29		// did we hit the bottom?
900 	stp	$acc6,$acc7,[$tp,#8*6]
901 	mov	$tp,$rp			// slide the window
902 	ldp	$acc6,$acc7,[$rp,#8*6]
904 	b.ne	.Lsqr8x_reduction
906 	// Final step. We see if result is larger than modulus, and
907 	// if it is, subtract the modulus. But comparison implies
908 	// subtraction. So we subtract modulus, see if it borrowed,
909 	// and conditionally copy original value.
910 	ldr	$rp,[x29,#96]		// pull rp
// Subtraction pass (.Lsqr8x_sub fragment): rp[] = tp[] - np[], borrow kept
// in the carry flag across iterations.
915 	mov	$ap_end,$rp		// $rp copy
919 	ldp	$a0,$a1,[$np,#8*0]
921 	stp	$t0,$t1,[$rp,#8*0]
923 	ldp	$a2,$a3,[$np,#8*2]
925 	stp	$t2,$t3,[$rp,#8*2]
927 	ldp	$a4,$a5,[$np,#8*4]
929 	ldp	$a6,$a7,[$np,#8*6]
931 	ldp	$acc0,$acc1,[$tp,#8*0]
933 	ldp	$acc2,$acc3,[$tp,#8*2]
934 	ldp	$acc4,$acc5,[$tp,#8*4]
935 	ldp	$acc6,$acc7,[$tp,#8*6]
937 	stp	$t0,$t1,[$rp,#8*4]
939 	stp	$t2,$t3,[$rp,#8*6]
942 	cbnz	$cnt,.Lsqr8x_sub
947 	ldp	$a0,$a1,[$ap_end,#8*0]
949 	stp	$t0,$t1,[$rp,#8*0]
951 	ldp	$a2,$a3,[$ap_end,#8*2]
953 	stp	$t2,$t3,[$rp,#8*2]
955 	ldp	$acc0,$acc1,[$ap,#8*0]
957 	ldp	$acc2,$acc3,[$ap,#8*2]
958 	sbcs	xzr,$topmost,xzr	// did it borrow?
959 	ldr	x30,[x29,#8]		// pull return address
960 	stp	$t0,$t1,[$rp,#8*4]
961 	stp	$t2,$t3,[$rp,#8*6]
// Conditional copy (.Lsqr4x_cond_copy fragment): csel on the borrow flag
// picks the reduced or unreduced words; the t[] scratch is wiped with xzr
// stores as we go.
966 	csel	$t0,$acc0,$a0,lo
967 	stp	xzr,xzr,[$tp,#8*0]
968 	csel	$t1,$acc1,$a1,lo
969 	ldp	$a0,$a1,[$ap_end,#8*4]
970 	ldp	$acc0,$acc1,[$ap,#8*4]
971 	csel	$t2,$acc2,$a2,lo
972 	stp	xzr,xzr,[$tp,#8*2]
974 	csel	$t3,$acc3,$a3,lo
975 	ldp	$a2,$a3,[$ap_end,#8*6]
976 	ldp	$acc2,$acc3,[$ap,#8*6]
978 	stp	$t0,$t1,[$ap_end,#8*0]
979 	stp	$t2,$t3,[$ap_end,#8*2]
980 	add	$ap_end,$ap_end,#8*4
981 	stp	xzr,xzr,[$ap,#8*0]
982 	stp	xzr,xzr,[$ap,#8*2]
983 	cbnz	$cnt,.Lsqr4x_cond_copy
985 	csel	$t0,$acc0,$a0,lo
986 	stp	xzr,xzr,[$tp,#8*0]
987 	csel	$t1,$acc1,$a1,lo
988 	stp	xzr,xzr,[$tp,#8*2]
989 	csel	$t2,$acc2,$a2,lo
990 	csel	$t3,$acc3,$a3,lo
991 	stp	$t0,$t1,[$ap_end,#8*0]
992 	stp	$t2,$t3,[$ap_end,#8*2]
// Short-circuit path for num==8: result fits one window, so subtract, wipe
// the stack scratch, and conditionally select in place.
997 .Lsqr8x8_post_condition:
999 	ldr	x30,[x29,#8]		// pull return address
1000 	// $acc0-7,$carry hold result, $a0-7 hold modulus
1002 	ldr	$ap,[x29,#96]		// pull rp
1004 	stp	xzr,xzr,[sp,#8*0]
1006 	stp	xzr,xzr,[sp,#8*2]
1008 	stp	xzr,xzr,[sp,#8*4]
1010 	stp	xzr,xzr,[sp,#8*6]
1012 	stp	xzr,xzr,[sp,#8*8]
1014 	stp	xzr,xzr,[sp,#8*10]
1016 	stp	xzr,xzr,[sp,#8*12]
1017 	sbcs	$carry,$carry,xzr	// did it borrow?
1018 	stp	xzr,xzr,[sp,#8*14]
1020 	// $a0-7 hold result-modulus
1021 	csel	$a0,$acc0,$a0,lo
1022 	csel	$a1,$acc1,$a1,lo
1023 	csel	$a2,$acc2,$a2,lo
1024 	csel	$a3,$acc3,$a3,lo
1025 	stp	$a0,$a1,[$ap,#8*0]
1026 	csel	$a4,$acc4,$a4,lo
1027 	csel	$a5,$acc5,$a5,lo
1028 	stp	$a2,$a3,[$ap,#8*2]
1029 	csel	$a6,$acc6,$a6,lo
1030 	csel	$a7,$acc7,$a7,lo
1031 	stp	$a4,$a5,[$ap,#8*4]
1032 	stp	$a6,$a7,[$ap,#8*6]
// Epilogue: restore callee-saved x19-x28.
1035 	ldp	x19,x20,[x29,#16]
1037 	ldp	x21,x22,[x29,#32]
1039 	ldp	x23,x24,[x29,#48]
1040 	ldp	x25,x26,[x29,#64]
1041 	ldp	x27,x28,[x29,#80]
1044 .size	__bn_sqr8x_mont,.-__bn_sqr8x_mont
1049 ########################################################################
1050 # Even though this might look as ARMv8 adaptation of mulx4x_mont from
1051 # x86_64-mont5 module, it's different in sense that it performs
1052 # reduction 256 bits at a time.
# Register map for __bn_mul4x_mont, drawn from x6-x17 and x19-x28.
# NOTE(review): this my() list is visibly truncated by the sampled listing —
# the $m0..$m3 (and $t0..$t3) entries used by the code below are presumably
# declared on missing intermediate lines; do not edit without the full file.
1054 my ($a0,$a1,$a2,$a3,
1057 $acc0,$acc1,$acc2,$acc3,$acc4,
1058 $bi,$mi,$tp,$ap_end,$cnt) = map("x$_",(6..17,19..28));
# $carry aliases the rp argument register; rp itself is offloaded to the
# stack frame (see "offload rp" below).
1060 my ($carry,$topmost) = ($rp,"x30");
# __bn_mul4x_mont: 4x interleaved Montgomery multiplication, reducing 256
# bits (4 words) at a time per the commentary above.
# NOTE(review): sampled/partial listing — non-contiguous embedded line
# numbers; several instructions between the visible ones are missing.
1063 .type	__bn_mul4x_mont,%function
// Prologue: 128-byte frame, callee-saved x19-x28 preserved.
1066 	stp	x29,x30,[sp,#-128]!
1068 	stp	x19,x20,[sp,#16]
1069 	stp	x21,x22,[sp,#32]
1070 	stp	x23,x24,[sp,#48]
1071 	stp	x25,x26,[sp,#64]
1072 	stp	x27,x28,[sp,#80]
1074 	sub	$tp,sp,$num,lsl#3
1076 	ldr	$n0,[$n0]		// *n0
1077 	sub	sp,$tp,#8*4		// alloca
1080 	add	$ap_end,$ap,$num
1081 	stp	$rp,$t0,[x29,#96]	// offload rp and &b[num]
1083 	ldr	$bi,[$bp,#8*0]		// b[0]
1084 	ldp	$a0,$a1,[$ap,#8*0]	// a[0..3]
1085 	ldp	$a2,$a3,[$ap,#8*2]
1091 	ldp	$m0,$m1,[$np,#8*0]	// n[0..3]
1092 	ldp	$m2,$m3,[$np,#8*2]
1093 	adds	$np,$np,#8*4		// clear carry bit
// First pass: multiply a[0..3] by b[0] and interleave the first reduction
// word mi = t[0]*n0, using the same (*) subs-from-#1 carry trick as the
// other routines.
1098 .Loop_mul4x_1st_reduction:
1099 	mul	$t0,$a0,$bi		// lo(a[0..3]*b[0])
1100 	adc	$carry,$carry,xzr	// modulo-scheduled
1106 	adds	$acc0,$acc0,$t0
1107 	umulh	$t0,$a0,$bi		// hi(a[0..3]*b[0])
1108 	adcs	$acc1,$acc1,$t1
1109 	mul	$mi,$acc0,$n0		// t[0]*n0
1110 	adcs	$acc2,$acc2,$t2
1112 	adcs	$acc3,$acc3,$t3
1116 	ldr	$bi,[$bp,$cnt]		// next b[i] (or b[0])
1117 	adds	$acc1,$acc1,$t0
1118 	// (*)	mul	$t0,$m0,$mi	// lo(n[0..3]*t[0]*n0)
1119 	str	$mi,[$tp],#8		// put aside t[0]*n0 for tail processing
1120 	adcs	$acc2,$acc2,$t1
1122 	adcs	$acc3,$acc3,$t2
1124 	adc	$acc4,$acc4,$t3		// can't overflow
1126 	// (*)	adds	xzr,$acc0,$t0
1127 	subs	xzr,$acc0,#1		// (*)
1128 	umulh	$t0,$m0,$mi		// hi(n[0..3]*t[0]*n0)
1129 	adcs	$acc0,$acc1,$t1
1131 	adcs	$acc1,$acc2,$t2
1133 	adcs	$acc2,$acc3,$t3
1135 	adcs	$acc3,$acc4,$carry
1137 	adds	$acc0,$acc0,$t0
1139 	adcs	$acc1,$acc1,$t1
1140 	adcs	$acc2,$acc2,$t2
1141 	adcs	$acc3,$acc3,$t3
1142 	//adc	$carry,$carry,xzr
1143 	cbnz	$cnt,.Loop_mul4x_1st_reduction
1145 	cbz	$t0,.Lmul4x4_post_condition
// First-pass tail: propagate hi(n[4..7]*t[0]*n0) across the remaining
// window, reusing the reduction words saved at [sp].
1147 	ldp	$a0,$a1,[$ap,#8*0]	// a[4..7]
1148 	ldp	$a2,$a3,[$ap,#8*2]
1150 	ldr	$mi,[sp]		// a[0]*n0
1151 	ldp	$m0,$m1,[$np,#8*0]	// n[4..7]
1152 	ldp	$m2,$m3,[$np,#8*2]
1155 .Loop_mul4x_1st_tail:
1156 	mul	$t0,$a0,$bi		// lo(a[4..7]*b[i])
1157 	adc	$carry,$carry,xzr	// modulo-scheduled
1163 	adds	$acc0,$acc0,$t0
1164 	umulh	$t0,$a0,$bi		// hi(a[4..7]*b[i])
1165 	adcs	$acc1,$acc1,$t1
1167 	adcs	$acc2,$acc2,$t2
1169 	adcs	$acc3,$acc3,$t3
1172 	ldr	$bi,[$bp,$cnt]		// next b[i] (or b[0])
1173 	adds	$acc1,$acc1,$t0
1174 	mul	$t0,$m0,$mi		// lo(n[4..7]*a[0]*n0)
1175 	adcs	$acc2,$acc2,$t1
1177 	adcs	$acc3,$acc3,$t2
1179 	adc	$acc4,$acc4,$t3		// can't overflow
1181 	adds	$acc0,$acc0,$t0
1182 	umulh	$t0,$m0,$mi		// hi(n[4..7]*a[0]*n0)
1183 	adcs	$acc1,$acc1,$t1
1185 	adcs	$acc2,$acc2,$t2
1187 	adcs	$acc3,$acc3,$t3
1188 	adcs	$acc4,$acc4,$carry
1191 	ldr	$mi,[sp,$cnt]		// next t[0]*n0
1192 	str	$acc0,[$tp],#8		// result!!!
1193 	adds	$acc0,$acc1,$t0
1194 	sub	$t0,$ap_end,$ap		// done yet?
1195 	adcs	$acc1,$acc2,$t1
1196 	adcs	$acc2,$acc3,$t2
1197 	adcs	$acc3,$acc4,$t3
1198 	//adc	$carry,$carry,xzr
1199 	cbnz	$cnt,.Loop_mul4x_1st_tail
1201 	sub	$t1,$ap_end,$num	// rewinded $ap
1202 	cbz	$t0,.Lmul4x_proceed
1204 	ldp	$a0,$a1,[$ap,#8*0]
1205 	ldp	$a2,$a3,[$ap,#8*2]
1207 	ldp	$m0,$m1,[$np,#8*0]
1208 	ldp	$m2,$m3,[$np,#8*2]
1210 	b	.Loop_mul4x_1st_tail
// Outer loop: advance to the next four b[] words, rewind a[] and n[], and
// repeat the interleaved multiply/reduce passes (fragmentary — intermediate
// lines missing from this listing).
1214 	ldr	$bi,[$bp,#8*4]!		// *++b
1215 	adc	$topmost,$carry,xzr
1216 	ldp	$a0,$a1,[$t1,#8*0]	// a[0..3]
1217 	sub	$np,$np,$num		// rewind np
1218 	ldp	$a2,$a3,[$t1,#8*2]
1221 	stp	$acc0,$acc1,[$tp,#8*0]	// result!!!
1222 	ldp	$acc0,$acc1,[sp,#8*4]	// t[0..3]
1223 	stp	$acc2,$acc3,[$tp,#8*2]	// result!!!
1224 	ldp	$acc2,$acc3,[sp,#8*6]
1226 	ldp	$m0,$m1,[$np,#8*0]	// n[0..3]
1228 	ldp	$m2,$m3,[$np,#8*2]
1229 	adds	$np,$np,#8*4		// clear carry bit
// Interleaved multiply + 256-bit reduction over the low four words; the
// discarded lo(n[0]*mi) add is reconstructed with the subs-from-#1 trick.
1233 .Loop_mul4x_reduction:
1234 	mul	$t0,$a0,$bi		// lo(a[0..3]*b[4])
1235 	adc	$carry,$carry,xzr	// modulo-scheduled
1241 	adds	$acc0,$acc0,$t0
1242 	umulh	$t0,$a0,$bi		// hi(a[0..3]*b[4])
1243 	adcs	$acc1,$acc1,$t1
1244 	mul	$mi,$acc0,$n0		// t[0]*n0
1245 	adcs	$acc2,$acc2,$t2
1247 	adcs	$acc3,$acc3,$t3
1251 	ldr	$bi,[$bp,$cnt]		// next b[i]
1252 	adds	$acc1,$acc1,$t0
1253 	// (*)	mul	$t0,$m0,$mi
1254 	str	$mi,[$tp],#8		// put aside t[0]*n0 for tail processing
1255 	adcs	$acc2,$acc2,$t1
1256 	mul	$t1,$m1,$mi		// lo(n[0..3]*t[0]*n0
1257 	adcs	$acc3,$acc3,$t2
1259 	adc	$acc4,$acc4,$t3		// can't overflow
1261 	// (*)	adds	xzr,$acc0,$t0
1262 	subs	xzr,$acc0,#1		// (*)
1263 	umulh	$t0,$m0,$mi		// hi(n[0..3]*t[0]*n0
1264 	adcs	$acc0,$acc1,$t1
1266 	adcs	$acc1,$acc2,$t2
1268 	adcs	$acc2,$acc3,$t3
1270 	adcs	$acc3,$acc4,$carry
1272 	adds	$acc0,$acc0,$t0
1273 	adcs	$acc1,$acc1,$t1
1274 	adcs	$acc2,$acc2,$t2
1275 	adcs	$acc3,$acc3,$t3
1276 	//adc	$carry,$carry,xzr
1277 	cbnz	$cnt,.Loop_mul4x_reduction
1279 	adc	$carry,$carry,xzr
1280 	ldp	$t0,$t1,[$tp,#8*4]	// t[4..7]
1281 	ldp	$t2,$t3,[$tp,#8*6]
1282 	ldp	$a0,$a1,[$ap,#8*0]	// a[4..7]
1283 	ldp	$a2,$a3,[$ap,#8*2]
1285 	adds	$acc0,$acc0,$t0
1286 	adcs	$acc1,$acc1,$t1
1287 	adcs	$acc2,$acc2,$t2
1288 	adcs	$acc3,$acc3,$t3
1289 	//adc	$carry,$carry,xzr
1291 	ldr	$mi,[sp]		// t[0]*n0
1292 	ldp	$m0,$m1,[$np,#8*0]	// n[4..7]
1293 	ldp	$m2,$m3,[$np,#8*2]
// Tail over the upper words (.Loop_mul4x_tail fragment), replaying the
// saved t[0]*n0 words from [sp,$cnt].
1298 	mul	$t0,$a0,$bi		// lo(a[4..7]*b[4])
1299 	adc	$carry,$carry,xzr	// modulo-scheduled
1305 	adds	$acc0,$acc0,$t0
1306 	umulh	$t0,$a0,$bi		// hi(a[4..7]*b[4])
1307 	adcs	$acc1,$acc1,$t1
1309 	adcs	$acc2,$acc2,$t2
1311 	adcs	$acc3,$acc3,$t3
1314 	ldr	$bi,[$bp,$cnt]		// next b[i]
1315 	adds	$acc1,$acc1,$t0
1316 	mul	$t0,$m0,$mi		// lo(n[4..7]*t[0]*n0)
1317 	adcs	$acc2,$acc2,$t1
1319 	adcs	$acc3,$acc3,$t2
1321 	adc	$acc4,$acc4,$t3		// can't overflow
1323 	adds	$acc0,$acc0,$t0
1324 	umulh	$t0,$m0,$mi		// hi(n[4..7]*t[0]*n0)
1325 	adcs	$acc1,$acc1,$t1
1327 	adcs	$acc2,$acc2,$t2
1329 	adcs	$acc3,$acc3,$t3
1331 	adcs	$acc4,$acc4,$carry
1332 	ldr	$mi,[sp,$cnt]		// next a[0]*n0
1334 	str	$acc0,[$tp],#8		// result!!!
1335 	adds	$acc0,$acc1,$t0
1336 	sub	$t0,$ap_end,$ap		// done yet?
1337 	adcs	$acc1,$acc2,$t1
1338 	adcs	$acc2,$acc3,$t2
1339 	adcs	$acc3,$acc4,$t3
1340 	//adc	$carry,$carry,xzr
1341 	cbnz	$cnt,.Loop_mul4x_tail
1343 	sub	$t1,$np,$num		// rewinded np?
1344 	adc	$carry,$carry,xzr
1345 	cbz	$t0,.Loop_mul4x_break
1347 	ldp	$t0,$t1,[$tp,#8*4]
1348 	ldp	$t2,$t3,[$tp,#8*6]
1349 	ldp	$a0,$a1,[$ap,#8*0]
1350 	ldp	$a2,$a3,[$ap,#8*2]
1352 	adds	$acc0,$acc0,$t0
1353 	adcs	$acc1,$acc1,$t1
1354 	adcs	$acc2,$acc2,$t2
1355 	adcs	$acc3,$acc3,$t3
1356 	//adc	$carry,$carry,xzr
1357 	ldp	$m0,$m1,[$np,#8*0]
1358 	ldp	$m2,$m3,[$np,#8*2]
// Outer-loop bookkeeping: fold in the carried top word, advance bp by four
// words, and compare against the offloaded &b[num] to detect completion.
1364 	ldp	$t2,$t3,[x29,#96]	// pull rp and &b[num]
1365 	adds	$acc0,$acc0,$topmost
1366 	add	$bp,$bp,#8*4		// bp++
1367 	adcs	$acc1,$acc1,xzr
1368 	sub	$ap,$ap,$num		// rewind ap
1369 	adcs	$acc2,$acc2,xzr
1370 	stp	$acc0,$acc1,[$tp,#8*0]	// result!!!
1371 	adcs	$acc3,$acc3,xzr
1372 	ldp	$acc0,$acc1,[sp,#8*4]	// t[0..3]
1373 	adc	$topmost,$carry,xzr
1374 	stp	$acc2,$acc3,[$tp,#8*2]	// result!!!
1375 	cmp	$bp,$t3			// done yet?
1376 	ldp	$acc2,$acc3,[sp,#8*6]
1377 	ldp	$m0,$m1,[$t1,#8*0]	// n[0..3]
1378 	ldp	$m2,$m3,[$t1,#8*2]
1383 	ldp	$a0,$a1,[$ap,#8*0]	// a[0..3]
1384 	ldp	$a2,$a3,[$ap,#8*2]
1385 	adds	$ap,$ap,#8*4		// clear carry bit
1388 	b	.Loop_mul4x_reduction
1392 	// Final step. We see if result is larger than modulus, and
1393 	// if it is, subtract the modulus. But comparison implies
1394 	// subtraction. So we subtract modulus, see if it borrowed,
1395 	// and conditionally copy original value.
// Subtraction pass (.Lmul4x_sub fragment): rp[] = t[] - n[], borrow carried
// in the flags across iterations.
1397 	mov	$ap_end,$t2		// $rp copy
1405 	ldp	$m0,$m1,[$np,#8*0]
1407 	ldp	$acc0,$acc1,[$tp,#8*0]
1409 	ldp	$m2,$m3,[$np,#8*2]
1411 	ldp	$acc2,$acc3,[$tp,#8*2]
1413 	stp	$t0,$t1,[$rp,#8*0]
1415 	stp	$t2,$t3,[$rp,#8*2]
1418 	cbnz	$cnt,.Lmul4x_sub
1423 	ldp	$a0,$a1,[$ap_end,#8*0]
1425 	stp	$t0,$t1,[$rp,#8*0]
1426 	ldp	$a2,$a3,[$ap_end,#8*2]
1427 	stp	$t2,$t3,[$rp,#8*2]
1428 	ldp	$acc0,$acc1,[$ap,#8*0]
1429 	ldp	$acc2,$acc3,[$ap,#8*2]
1430 	sbcs	xzr,$topmost,xzr	// did it borrow?
1431 	ldr	x30,[x29,#8]		// pull return address
// Conditional copy (.Lmul4x_cond_copy fragment): csel on the borrow flag
// selects reduced vs. unreduced words while the scratch area is wiped.
1436 	csel	$t0,$acc0,$a0,lo
1437 	stp	xzr,xzr,[$tp,#8*0]
1438 	csel	$t1,$acc1,$a1,lo
1439 	ldp	$a0,$a1,[$ap_end,#8*4]
1440 	ldp	$acc0,$acc1,[$ap,#8*4]
1441 	csel	$t2,$acc2,$a2,lo
1442 	stp	xzr,xzr,[$tp,#8*2]
1444 	csel	$t3,$acc3,$a3,lo
1445 	ldp	$a2,$a3,[$ap_end,#8*6]
1446 	ldp	$acc2,$acc3,[$ap,#8*6]
1448 	stp	$t0,$t1,[$ap_end,#8*0]
1449 	stp	$t2,$t3,[$ap_end,#8*2]
1450 	add	$ap_end,$ap_end,#8*4
1451 	cbnz	$cnt,.Lmul4x_cond_copy
1453 	csel	$t0,$acc0,$a0,lo
1454 	stp	xzr,xzr,[$tp,#8*0]
1455 	csel	$t1,$acc1,$a1,lo
1456 	stp	xzr,xzr,[$tp,#8*2]
1457 	csel	$t2,$acc2,$a2,lo
1458 	stp	xzr,xzr,[$tp,#8*3]
1459 	csel	$t3,$acc3,$a3,lo
1460 	stp	xzr,xzr,[$tp,#8*4]
1461 	stp	$t0,$t1,[$ap_end,#8*0]
1462 	stp	$t2,$t3,[$ap_end,#8*2]
// Short-circuit path for num==4: subtract, wipe stack scratch, select the
// result in place.
1467 .Lmul4x4_post_condition:
1468 	adc	$carry,$carry,xzr
1469 	ldr	$ap,[x29,#96]		// pull rp
1470 	// $acc0-3,$carry hold result, $m0-7 hold modulus
1472 	ldr	x30,[x29,#8]		// pull return address
1474 	stp	xzr,xzr,[sp,#8*0]
1476 	stp	xzr,xzr,[sp,#8*2]
1478 	stp	xzr,xzr,[sp,#8*4]
1479 	sbcs	xzr,$carry,xzr		// did it borrow?
1480 	stp	xzr,xzr,[sp,#8*6]
1482 	// $a0-3 hold result-modulus
1483 	csel	$a0,$acc0,$a0,lo
1484 	csel	$a1,$acc1,$a1,lo
1485 	csel	$a2,$acc2,$a2,lo
1486 	csel	$a3,$acc3,$a3,lo
1487 	stp	$a0,$a1,[$ap,#8*0]
1488 	stp	$a2,$a3,[$ap,#8*2]
// Epilogue: restore callee-saved x19-x28.
1491 	ldp	x19,x20,[x29,#16]
1493 	ldp	x21,x22,[x29,#32]
1495 	ldp	x23,x24,[x29,#48]
1496 	ldp	x25,x26,[x29,#64]
1497 	ldp	x27,x28,[x29,#80]
1500 .size	__bn_mul4x_mont,.-__bn_mul4x_mont
1504 .asciz	"Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"