2 # Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
17 # ECP_NISTZ256 module for ARMv8.
21 # Original ECP_NISTZ256 submission targeting x86_64 is detailed in
22 # http://eprint.iacr.org/2013/816.
24 # with/without -DECP_NISTZ256_ASM
26 # Cortex-A53 +120-400%
27 # Cortex-A57 +120-350%
31 # Ranges denote minimum and maximum improvement coefficients depending
32 # on benchmark. Lower coefficients are for ECDSA sign, server-side
33 # operation. Keep in mind that +400% means 5x improvement.
36 while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
38 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
39 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
40 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
41 die "can't locate arm-xlate.pl";
43 open OUT,"| \"$^X\" $xlate $flavour $output";
47 my ($rp,$ap,$bp,$bi,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3,$poly1,$poly3,
48 $acc0,$acc1,$acc2,$acc3,$acc4,$acc5) =
49 map("x$_",(0..17,19,20));
51 my ($acc6,$acc7)=($ap,$bp); # used in __ecp_nistz256_sqr_mont
58 ########################################################################
59 # Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
61 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
62 open TABLE,"<ecp_nistz256_table.c" or
63 open TABLE,"<${dir}../ecp_nistz256_table.c" or
64 die "failed to open ecp_nistz256_table.c:",$!;
69 s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
73 # See ecp_nistz256_table.c for explanation for why it's 64*16*37.
74 # 64*16*37-1 is because $#arr returns last valid index of @arr, not
76 die "insane number of elements" if ($#arr != 64*16*37-1);
79 .globl ecp_nistz256_precomputed
80 .type ecp_nistz256_precomputed,%object
82 ecp_nistz256_precomputed:
84 ########################################################################
85 # this conversion smashes P256_POINT_AFFINE by individual bytes with
86 # 64 byte interval, similar to
90 @tbl = splice(@arr,0,64*16);
91 for($i=0;$i<64;$i++) {
93 for($j=0;$j<64;$j++) {
94 push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff;
97 $code.=join(',',map { sprintf "0x%02x",$_} @line);
102 .size ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
105 .quad 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
106 .LRR: // 2^512 mod P precomputed for NIST P256 polynomial
107 .quad 0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd
109 .quad 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
112 .asciz "ECP_NISTZ256 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
114 // void ecp_nistz256_to_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
115 .globl ecp_nistz256_to_mont
116 .type ecp_nistz256_to_mont,%function
118 ecp_nistz256_to_mont:
119 stp x29,x30,[sp,#-32]!
123 ldr $bi,.LRR // bp[0]
125 ldp $a2,$a3,[$ap,#16]
128 adr $bp,.LRR // &bp[0]
130 bl __ecp_nistz256_mul_mont
135 .size ecp_nistz256_to_mont,.-ecp_nistz256_to_mont
137 // void ecp_nistz256_from_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
138 .globl ecp_nistz256_from_mont
139 .type ecp_nistz256_from_mont,%function
141 ecp_nistz256_from_mont:
142 stp x29,x30,[sp,#-32]!
148 ldp $a2,$a3,[$ap,#16]
151 adr $bp,.Lone // &bp[0]
153 bl __ecp_nistz256_mul_mont
158 .size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
160 // void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
161 // const BN_ULONG x2[4]);
162 .globl ecp_nistz256_mul_mont
163 .type ecp_nistz256_mul_mont,%function
165 ecp_nistz256_mul_mont:
166 stp x29,x30,[sp,#-32]!
170 ldr $bi,[$bp] // bp[0]
172 ldp $a2,$a3,[$ap,#16]
176 bl __ecp_nistz256_mul_mont
181 .size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
183 // void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
184 .globl ecp_nistz256_sqr_mont
185 .type ecp_nistz256_sqr_mont,%function
187 ecp_nistz256_sqr_mont:
188 stp x29,x30,[sp,#-32]!
193 ldp $a2,$a3,[$ap,#16]
197 bl __ecp_nistz256_sqr_mont
202 .size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
204 // void ecp_nistz256_add(BN_ULONG x0[4],const BN_ULONG x1[4],
205 // const BN_ULONG x2[4]);
206 .globl ecp_nistz256_add
207 .type ecp_nistz256_add,%function
210 stp x29,x30,[sp,#-16]!
213 ldp $acc0,$acc1,[$ap]
215 ldp $acc2,$acc3,[$ap,#16]
216 ldp $t2,$t3,[$bp,#16]
220 bl __ecp_nistz256_add
224 .size ecp_nistz256_add,.-ecp_nistz256_add
226 // void ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
227 .globl ecp_nistz256_div_by_2
228 .type ecp_nistz256_div_by_2,%function
230 ecp_nistz256_div_by_2:
231 stp x29,x30,[sp,#-16]!
234 ldp $acc0,$acc1,[$ap]
235 ldp $acc2,$acc3,[$ap,#16]
239 bl __ecp_nistz256_div_by_2
243 .size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
245 // void ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
246 .globl ecp_nistz256_mul_by_2
247 .type ecp_nistz256_mul_by_2,%function
249 ecp_nistz256_mul_by_2:
250 stp x29,x30,[sp,#-16]!
253 ldp $acc0,$acc1,[$ap]
254 ldp $acc2,$acc3,[$ap,#16]
262 bl __ecp_nistz256_add // ret = a+a // 2*a
266 .size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
268 // void ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]);
269 .globl ecp_nistz256_mul_by_3
270 .type ecp_nistz256_mul_by_3,%function
272 ecp_nistz256_mul_by_3:
273 stp x29,x30,[sp,#-16]!
276 ldp $acc0,$acc1,[$ap]
277 ldp $acc2,$acc3,[$ap,#16]
289 bl __ecp_nistz256_add // ret = a+a // 2*a
296 bl __ecp_nistz256_add // ret += a // 2*a+a=3*a
300 .size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
302 // void ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4],
303 // const BN_ULONG x2[4]);
304 .globl ecp_nistz256_sub
305 .type ecp_nistz256_sub,%function
308 stp x29,x30,[sp,#-16]!
311 ldp $acc0,$acc1,[$ap]
312 ldp $acc2,$acc3,[$ap,#16]
316 bl __ecp_nistz256_sub_from
320 .size ecp_nistz256_sub,.-ecp_nistz256_sub
322 // void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
323 .globl ecp_nistz256_neg
324 .type ecp_nistz256_neg,%function
327 stp x29,x30,[sp,#-16]!
331 mov $acc0,xzr // a = 0
338 bl __ecp_nistz256_sub_from
342 .size ecp_nistz256_neg,.-ecp_nistz256_neg
344 // note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded
345 // to $a0-$a3 and b[0] - to $bi
346 .type __ecp_nistz256_mul_mont,%function
348 __ecp_nistz256_mul_mont:
349 mul $acc0,$a0,$bi // a[0]*b[0]
352 mul $acc1,$a1,$bi // a[1]*b[0]
355 mul $acc2,$a2,$bi // a[2]*b[0]
358 mul $acc3,$a3,$bi // a[3]*b[0]
360 ldr $bi,[$bp,#8] // b[1]
362 adds $acc1,$acc1,$t0 // accumulate high parts of multiplication
370 for($i=1;$i<4;$i++) {
371 # Reduction iteration is normally performed by accumulating
372 # result of multiplication of modulus by "magic" digit [and
373 # omitting least significant word, which is guaranteed to
374 # be 0], but thanks to special form of modulus and "magic"
375 # digit being equal to least significant word, it can be
376 # performed with additions and subtractions alone. Indeed:
378 # ffff0001.00000000.0000ffff.ffffffff
380 # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
382 # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
385 # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
386 # + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000
387 # - 0000abcd.efgh0000.00000000.00000000.abcdefgh
389 # or marking redundant operations:
391 # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.--------
392 # + abcdefgh.abcdefgh.0000abcd.efgh0000.--------
393 # - 0000abcd.efgh0000.--------.--------.--------
396 subs $t2,$acc0,$t0 // "*0xffff0001"
398 adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0]
399 mul $t0,$a0,$bi // lo(a[0]*b[i])
401 mul $t1,$a1,$bi // lo(a[1]*b[i])
402 adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001
403 mul $t2,$a2,$bi // lo(a[2]*b[i])
405 mul $t3,$a3,$bi // lo(a[3]*b[i])
408 adds $acc0,$acc0,$t0 // accumulate low parts of multiplication
409 umulh $t0,$a0,$bi // hi(a[0]*b[i])
411 umulh $t1,$a1,$bi // hi(a[1]*b[i])
413 umulh $t2,$a2,$bi // hi(a[2]*b[i])
415 umulh $t3,$a3,$bi // hi(a[3]*b[i])
418 $code.=<<___ if ($i<3);
419 ldr $bi,[$bp,#8*($i+1)] // b[$i+1]
422 adds $acc1,$acc1,$t0 // accumulate high parts of multiplication
433 subs $t2,$acc0,$t0 // "*0xffff0001"
435 adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0]
437 adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001
441 adds $t0,$acc0,#1 // subs $t0,$acc0,#-1 // tmp = ret-modulus
442 sbcs $t1,$acc1,$poly1
444 sbcs $t3,$acc3,$poly3
445 sbcs xzr,$acc4,xzr // did it borrow?
447 csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus
448 csel $acc1,$acc1,$t1,lo
449 csel $acc2,$acc2,$t2,lo
450 stp $acc0,$acc1,[$rp]
451 csel $acc3,$acc3,$t3,lo
452 stp $acc2,$acc3,[$rp,#16]
455 .size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
457 // note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded
459 .type __ecp_nistz256_sqr_mont,%function
461 __ecp_nistz256_sqr_mont:
462 // | | | | | |a1*a0| |
463 // | | | | |a2*a0| | |
464 // | |a3*a2|a3*a0| | | |
465 // | | | |a2*a1| | | |
466 // | | |a3*a1| | | | |
467 // *| | | | | | | | 2|
468 // +|a3*a3|a2*a2|a1*a1|a0*a0|
469 // |--+--+--+--+--+--+--+--|
470 // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
472 // "can't overflow" below mark carrying into high part of
473 // multiplication result, which can't overflow, because it
474 // can never be all ones.
476 mul $acc1,$a1,$a0 // a[1]*a[0]
478 mul $acc2,$a2,$a0 // a[2]*a[0]
480 mul $acc3,$a3,$a0 // a[3]*a[0]
483 adds $acc2,$acc2,$t1 // accumulate high parts of multiplication
484 mul $t0,$a2,$a1 // a[2]*a[1]
487 mul $t2,$a3,$a1 // a[3]*a[1]
489 adc $acc4,$acc4,xzr // can't overflow
491 mul $acc5,$a3,$a2 // a[3]*a[2]
494 adds $t1,$t1,$t2 // accumulate high parts of multiplication
495 mul $acc0,$a0,$a0 // a[0]*a[0]
496 adc $t2,$t3,xzr // can't overflow
498 adds $acc3,$acc3,$t0 // accumulate low parts of multiplication
501 mul $t1,$a1,$a1 // a[1]*a[1]
504 adc $acc6,$acc6,xzr // can't overflow
506 adds $acc1,$acc1,$acc1 // acc[1-6]*=2
507 mul $t2,$a2,$a2 // a[2]*a[2]
508 adcs $acc2,$acc2,$acc2
510 adcs $acc3,$acc3,$acc3
511 mul $t3,$a3,$a3 // a[3]*a[3]
512 adcs $acc4,$acc4,$acc4
514 adcs $acc5,$acc5,$acc5
515 adcs $acc6,$acc6,$acc6
518 adds $acc1,$acc1,$a0 // +a[i]*a[i]
528 for($i=0;$i<3;$i++) { # reductions, see commentary in
529 # multiplication for details
531 subs $t2,$acc0,$t0 // "*0xffff0001"
533 adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0]
536 adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001
538 adc $acc3,$t3,xzr // can't overflow
542 subs $t2,$acc0,$t0 // "*0xffff0001"
544 adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0]
546 adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001
547 adc $acc3,$t3,xzr // can't overflow
549 adds $acc0,$acc0,$acc4 // accumulate upper half
550 adcs $acc1,$acc1,$acc5
551 adcs $acc2,$acc2,$acc6
552 adcs $acc3,$acc3,$acc7
555 adds $t0,$acc0,#1 // subs $t0,$acc0,#-1 // tmp = ret-modulus
556 sbcs $t1,$acc1,$poly1
558 sbcs $t3,$acc3,$poly3
559 sbcs xzr,$acc4,xzr // did it borrow?
561 csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus
562 csel $acc1,$acc1,$t1,lo
563 csel $acc2,$acc2,$t2,lo
564 stp $acc0,$acc1,[$rp]
565 csel $acc3,$acc3,$t3,lo
566 stp $acc2,$acc3,[$rp,#16]
569 .size __ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont
571 // Note that __ecp_nistz256_add expects both input vectors pre-loaded to
572 // $a0-$a3 and $t0-$t3. This is done because it's used in multiple
573 // contexts, e.g. in multiplication by 2 and 3...
574 .type __ecp_nistz256_add,%function
577 adds $acc0,$acc0,$t0 // ret = a+b
581 adc $ap,xzr,xzr // zap $ap
583 adds $t0,$acc0,#1 // subs $t0,$a0,#-1 // tmp = ret-modulus
584 sbcs $t1,$acc1,$poly1
586 sbcs $t3,$acc3,$poly3
587 sbcs xzr,$ap,xzr // did subtraction borrow?
589 csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus
590 csel $acc1,$acc1,$t1,lo
591 csel $acc2,$acc2,$t2,lo
592 stp $acc0,$acc1,[$rp]
593 csel $acc3,$acc3,$t3,lo
594 stp $acc2,$acc3,[$rp,#16]
597 .size __ecp_nistz256_add,.-__ecp_nistz256_add
599 .type __ecp_nistz256_sub_from,%function
601 __ecp_nistz256_sub_from:
603 ldp $t2,$t3,[$bp,#16]
604 subs $acc0,$acc0,$t0 // ret = a-b
608 sbc $ap,xzr,xzr // zap $ap
610 subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = ret+modulus
611 adcs $t1,$acc1,$poly1
614 cmp $ap,xzr // did subtraction borrow?
616 csel $acc0,$acc0,$t0,eq // ret = borrow ? ret+modulus : ret
617 csel $acc1,$acc1,$t1,eq
618 csel $acc2,$acc2,$t2,eq
619 stp $acc0,$acc1,[$rp]
620 csel $acc3,$acc3,$t3,eq
621 stp $acc2,$acc3,[$rp,#16]
624 .size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from
626 .type __ecp_nistz256_sub_morf,%function
628 __ecp_nistz256_sub_morf:
630 ldp $t2,$t3,[$bp,#16]
631 subs $acc0,$t0,$acc0 // ret = b-a
635 sbc $ap,xzr,xzr // zap $ap
637 subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = ret+modulus
638 adcs $t1,$acc1,$poly1
641 cmp $ap,xzr // did subtraction borrow?
643 csel $acc0,$acc0,$t0,eq // ret = borrow ? ret+modulus : ret
644 csel $acc1,$acc1,$t1,eq
645 csel $acc2,$acc2,$t2,eq
646 stp $acc0,$acc1,[$rp]
647 csel $acc3,$acc3,$t3,eq
648 stp $acc2,$acc3,[$rp,#16]
651 .size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf
653 .type __ecp_nistz256_div_by_2,%function
655 __ecp_nistz256_div_by_2:
656 subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = a+modulus
657 adcs $t1,$acc1,$poly1
659 adcs $t3,$acc3,$poly3
660 adc $ap,xzr,xzr // zap $ap
661 tst $acc0,#1 // is a even?
663 csel $acc0,$acc0,$t0,eq // ret = even ? a : a+modulus
664 csel $acc1,$acc1,$t1,eq
665 csel $acc2,$acc2,$t2,eq
666 csel $acc3,$acc3,$t3,eq
669 lsr $acc0,$acc0,#1 // ret >>= 1
670 orr $acc0,$acc0,$acc1,lsl#63
672 orr $acc1,$acc1,$acc2,lsl#63
674 orr $acc2,$acc2,$acc3,lsl#63
676 stp $acc0,$acc1,[$rp]
677 orr $acc3,$acc3,$ap,lsl#63
678 stp $acc2,$acc3,[$rp,#16]
681 .size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
683 ########################################################################
684 # following subroutines are "literal" implementation of those found in
687 ########################################################################
688 # void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
691 my ($S,$M,$Zsqr,$tmp0)=map(32*$_,(0..3));
692 # above map() describes stack layout with 4 temporary
693 # 256-bit vectors on top.
694 my ($rp_real,$ap_real) = map("x$_",(21,22));
697 .globl ecp_nistz256_point_double
698 .type ecp_nistz256_point_double,%function
700 ecp_nistz256_point_double:
701 stp x29,x30,[sp,#-80]!
708 ldp $acc0,$acc1,[$ap,#32]
710 ldp $acc2,$acc3,[$ap,#48]
716 ldp $a0,$a1,[$ap_real,#64] // forward load for p256_sqr_mont
719 ldp $a2,$a3,[$ap_real,#64+16]
721 bl __ecp_nistz256_add // p256_mul_by_2(S, in_y);
724 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Zsqr, in_z);
726 ldp $t0,$t1,[$ap_real]
727 ldp $t2,$t3,[$ap_real,#16]
728 mov $a0,$acc0 // put Zsqr aside for p256_sub
733 bl __ecp_nistz256_add // p256_add(M, Zsqr, in_x);
736 mov $acc0,$a0 // restore Zsqr
738 ldp $a0,$a1,[sp,#$S] // forward load for p256_sqr_mont
741 ldp $a2,$a3,[sp,#$S+16]
743 bl __ecp_nistz256_sub_morf // p256_sub(Zsqr, in_x, Zsqr);
746 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(S, S);
748 ldr $bi,[$ap_real,#32]
749 ldp $a0,$a1,[$ap_real,#64]
750 ldp $a2,$a3,[$ap_real,#64+16]
753 bl __ecp_nistz256_mul_mont // p256_mul_mont(tmp0, in_z, in_y);
757 ldp $a0,$a1,[sp,#$S] // forward load for p256_sqr_mont
760 ldp $a2,$a3,[sp,#$S+16]
762 bl __ecp_nistz256_add // p256_mul_by_2(res_z, tmp0);
765 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(tmp0, S);
767 ldr $bi,[sp,#$Zsqr] // forward load for p256_mul_mont
769 ldp $a2,$a3,[sp,#$M+16]
771 bl __ecp_nistz256_div_by_2 // p256_div_by_2(res_y, tmp0);
775 bl __ecp_nistz256_mul_mont // p256_mul_mont(M, M, Zsqr);
777 mov $t0,$acc0 // duplicate M
781 mov $a0,$acc0 // put M aside
786 bl __ecp_nistz256_add
787 mov $t0,$a0 // restore M
789 ldr $bi,[$ap_real] // forward load for p256_mul_mont
793 ldp $a2,$a3,[sp,#$S+16]
794 bl __ecp_nistz256_add // p256_mul_by_3(M, M);
798 bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, in_x);
802 ldp $a0,$a1,[sp,#$M] // forward load for p256_sqr_mont
805 ldp $a2,$a3,[sp,#$M+16]
807 bl __ecp_nistz256_add // p256_mul_by_2(tmp0, S);
810 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(res_x, M);
813 bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, tmp0);
817 bl __ecp_nistz256_sub_morf // p256_sub(S, S, res_x);
820 mov $a0,$acc0 // copy S
825 bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, M);
829 bl __ecp_nistz256_sub_from // p256_sub(res_y, S, res_y);
831 add sp,x29,#0 // destroy frame
832 ldp x19,x20,[x29,#16]
833 ldp x21,x22,[x29,#32]
836 .size ecp_nistz256_point_double,.-ecp_nistz256_point_double
840 ########################################################################
841 # void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
842 # const P256_POINT *in2);
844 my ($res_x,$res_y,$res_z,
845 $H,$Hsqr,$R,$Rsqr,$Hcub,
846 $U1,$U2,$S1,$S2)=map(32*$_,(0..11));
847 my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
848 # above map() describes stack layout with 12 temporary
849 # 256-bit vectors on top.
850 my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("x$_",(21..26));
853 .globl ecp_nistz256_point_add
854 .type ecp_nistz256_point_add,%function
856 ecp_nistz256_point_add:
857 stp x29,x30,[sp,#-80]!
865 ldp $a0,$a1,[$bp,#64] // in2_z
866 ldp $a2,$a3,[$bp,#64+16]
874 orr $in2infty,$t0,$t2
876 csetm $in2infty,ne // !in2infty
878 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z2sqr, in2_z);
880 ldp $a0,$a1,[$ap_real,#64] // in1_z
881 ldp $a2,$a3,[$ap_real,#64+16]
884 orr $in1infty,$t0,$t2
886 csetm $in1infty,ne // !in1infty
888 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z);
890 ldr $bi,[$bp_real,#64]
891 ldp $a0,$a1,[sp,#$Z2sqr]
892 ldp $a2,$a3,[sp,#$Z2sqr+16]
895 bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, Z2sqr, in2_z);
897 ldr $bi,[$ap_real,#64]
898 ldp $a0,$a1,[sp,#$Z1sqr]
899 ldp $a2,$a3,[sp,#$Z1sqr+16]
902 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z);
904 ldr $bi,[$ap_real,#32]
905 ldp $a0,$a1,[sp,#$S1]
906 ldp $a2,$a3,[sp,#$S1+16]
909 bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, S1, in1_y);
911 ldr $bi,[$bp_real,#32]
912 ldp $a0,$a1,[sp,#$S2]
913 ldp $a2,$a3,[sp,#$S2+16]
916 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y);
919 ldr $bi,[sp,#$Z2sqr] // forward load for p256_mul_mont
920 ldp $a0,$a1,[$ap_real]
921 ldp $a2,$a3,[$ap_real,#16]
923 bl __ecp_nistz256_sub_from // p256_sub(R, S2, S1);
925 orr $acc0,$acc0,$acc1 // see if result is zero
926 orr $acc2,$acc2,$acc3
927 orr $temp,$acc0,$acc2
931 bl __ecp_nistz256_mul_mont // p256_mul_mont(U1, in1_x, Z2sqr);
934 ldp $a0,$a1,[$bp_real]
935 ldp $a2,$a3,[$bp_real,#16]
938 bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in2_x, Z1sqr);
941 ldp $a0,$a1,[sp,#$R] // forward load for p256_sqr_mont
942 ldp $a2,$a3,[sp,#$R+16]
944 bl __ecp_nistz256_sub_from // p256_sub(H, U2, U1);
946 orr $acc0,$acc0,$acc1 // see if result is zero
947 orr $acc2,$acc2,$acc3
948 orr $acc0,$acc0,$acc2
950 b.ne .Ladd_proceed // is_equal(U1,U2)?
952 tst $in1infty,$in2infty
953 b.eq .Ladd_proceed // (in1infty || in2infty)?
956 b.eq .Ladd_double // is_equal(S1,S2)?
960 stp $a0,$a1,[$rp_real]
961 stp $a0,$a1,[$rp_real,#16]
962 stp $a0,$a1,[$rp_real,#32]
963 stp $a0,$a1,[$rp_real,#48]
964 stp $a0,$a1,[$rp_real,#64]
965 stp $a0,$a1,[$rp_real,#80]
972 ldp x23,x24,[x29,#48]
973 ldp x25,x26,[x29,#64]
974 add sp,sp,#32*(12-4) // difference in stack frames
980 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R);
982 ldr $bi,[$ap_real,#64]
984 ldp $a2,$a3,[sp,#$H+16]
987 bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z);
990 ldp $a2,$a3,[sp,#$H+16]
992 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H);
994 ldr $bi,[$bp_real,#64]
995 ldp $a0,$a1,[sp,#$res_z]
996 ldp $a2,$a3,[sp,#$res_z+16]
999 bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, res_z, in2_z);
1002 ldp $a0,$a1,[sp,#$Hsqr]
1003 ldp $a2,$a3,[sp,#$Hsqr+16]
1006 bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H);
1009 ldp $a0,$a1,[sp,#$U1]
1010 ldp $a2,$a3,[sp,#$U1+16]
1013 bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, U1, Hsqr);
1020 bl __ecp_nistz256_add // p256_mul_by_2(Hsqr, U2);
1024 bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr);
1027 bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub);
1030 ldr $bi,[sp,#$Hcub] // forward load for p256_mul_mont
1031 ldp $a0,$a1,[sp,#$S1]
1032 ldp $a2,$a3,[sp,#$S1+16]
1034 bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x);
1038 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S1, Hcub);
1041 ldp $a0,$a1,[sp,#$res_y]
1042 ldp $a2,$a3,[sp,#$res_y+16]
1045 bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R);
1048 bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2);
1050 ldp $a0,$a1,[sp,#$res_x] // res
1051 ldp $a2,$a3,[sp,#$res_x+16]
1052 ldp $t0,$t1,[$bp_real] // in2
1053 ldp $t2,$t3,[$bp_real,#16]
1055 for($i=0;$i<64;$i+=32) { # conditional moves
1057 ldp $acc0,$acc1,[$ap_real,#$i] // in1
1058 cmp $in1infty,#0 // !$in1infty, remember?
1059 ldp $acc2,$acc3,[$ap_real,#$i+16]
1062 ldp $a0,$a1,[sp,#$res_x+$i+32] // res
1065 cmp $in2infty,#0 // !$in2infty, remember?
1066 ldp $a2,$a3,[sp,#$res_x+$i+48]
1067 csel $acc0,$t0,$acc0,ne
1068 csel $acc1,$t1,$acc1,ne
1069 ldp $t0,$t1,[$bp_real,#$i+32] // in2
1070 csel $acc2,$t2,$acc2,ne
1071 csel $acc3,$t3,$acc3,ne
1072 ldp $t2,$t3,[$bp_real,#$i+48]
1073 stp $acc0,$acc1,[$rp_real,#$i]
1074 stp $acc2,$acc3,[$rp_real,#$i+16]
1078 ldp $acc0,$acc1,[$ap_real,#$i] // in1
1079 cmp $in1infty,#0 // !$in1infty, remember?
1080 ldp $acc2,$acc3,[$ap_real,#$i+16]
1085 cmp $in2infty,#0 // !$in2infty, remember?
1086 csel $acc0,$t0,$acc0,ne
1087 csel $acc1,$t1,$acc1,ne
1088 csel $acc2,$t2,$acc2,ne
1089 csel $acc3,$t3,$acc3,ne
1090 stp $acc0,$acc1,[$rp_real,#$i]
1091 stp $acc2,$acc3,[$rp_real,#$i+16]
1094 add sp,x29,#0 // destroy frame
1095 ldp x19,x20,[x29,#16]
1096 ldp x21,x22,[x29,#32]
1097 ldp x23,x24,[x29,#48]
1098 ldp x25,x26,[x29,#64]
1099 ldp x29,x30,[sp],#80
1101 .size ecp_nistz256_point_add,.-ecp_nistz256_point_add
1105 ########################################################################
1106 # void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
1107 # const P256_POINT_AFFINE *in2);
1109 my ($res_x,$res_y,$res_z,
1110 $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..9));
1112 # above map() describes stack layout with 10 temporary
1113 # 256-bit vectors on top.
1114 my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("x$_",(21..26));
1117 .globl ecp_nistz256_point_add_affine
1118 .type ecp_nistz256_point_add_affine,%function
1120 ecp_nistz256_point_add_affine:
1121 stp x29,x30,[sp,#-80]!
1123 stp x19,x20,[sp,#16]
1124 stp x21,x22,[sp,#32]
1125 stp x23,x24,[sp,#48]
1126 stp x25,x26,[sp,#64]
1133 ldr $poly3,.Lpoly+24
1135 ldp $a0,$a1,[$ap,#64] // in1_z
1136 ldp $a2,$a3,[$ap,#64+16]
1139 orr $in1infty,$t0,$t2
1141 csetm $in1infty,ne // !in1infty
1143 ldp $acc0,$acc1,[$bp] // in2_x
1144 ldp $acc2,$acc3,[$bp,#16]
1145 ldp $t0,$t1,[$bp,#32] // in2_y
1146 ldp $t2,$t3,[$bp,#48]
1147 orr $acc0,$acc0,$acc1
1148 orr $acc2,$acc2,$acc3
1151 orr $acc0,$acc0,$acc2
1153 orr $in2infty,$acc0,$t0
1155 csetm $in2infty,ne // !in2infty
1158 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z);
1167 bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, Z1sqr, in2_x);
1170 ldr $bi,[$ap_real,#64] // forward load for p256_mul_mont
1171 ldp $a0,$a1,[sp,#$Z1sqr]
1172 ldp $a2,$a3,[sp,#$Z1sqr+16]
1174 bl __ecp_nistz256_sub_from // p256_sub(H, U2, in1_x);
1176 add $bp,$ap_real,#64
1178 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z);
1180 ldr $bi,[$ap_real,#64]
1181 ldp $a0,$a1,[sp,#$H]
1182 ldp $a2,$a3,[sp,#$H+16]
1183 add $bp,$ap_real,#64
1185 bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z);
1187 ldr $bi,[$bp_real,#32]
1188 ldp $a0,$a1,[sp,#$S2]
1189 ldp $a2,$a3,[sp,#$S2+16]
1190 add $bp,$bp_real,#32
1192 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y);
1194 add $bp,$ap_real,#32
1195 ldp $a0,$a1,[sp,#$H] // forward load for p256_sqr_mont
1196 ldp $a2,$a3,[sp,#$H+16]
1198 bl __ecp_nistz256_sub_from // p256_sub(R, S2, in1_y);
1201 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H);
1203 ldp $a0,$a1,[sp,#$R]
1204 ldp $a2,$a3,[sp,#$R+16]
1206 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R);
1209 ldp $a0,$a1,[sp,#$Hsqr]
1210 ldp $a2,$a3,[sp,#$Hsqr+16]
1213 bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H);
1216 ldp $a0,$a1,[sp,#$Hsqr]
1217 ldp $a2,$a3,[sp,#$Hsqr+16]
1220 bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in1_x, Hsqr);
1227 bl __ecp_nistz256_add // p256_mul_by_2(Hsqr, U2);
1231 bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr);
1234 bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub);
1237 ldr $bi,[$ap_real,#32] // forward load for p256_mul_mont
1238 ldp $a0,$a1,[sp,#$Hcub]
1239 ldp $a2,$a3,[sp,#$Hcub+16]
1241 bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x);
1243 add $bp,$ap_real,#32
1245 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, in1_y, Hcub);
1248 ldp $a0,$a1,[sp,#$res_y]
1249 ldp $a2,$a3,[sp,#$res_y+16]
1252 bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R);
1255 bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2);
1257 ldp $a0,$a1,[sp,#$res_x] // res
1258 ldp $a2,$a3,[sp,#$res_x+16]
1259 ldp $t0,$t1,[$bp_real] // in2
1260 ldp $t2,$t3,[$bp_real,#16]
1262 for($i=0;$i<64;$i+=32) { # conditional moves
1264 ldp $acc0,$acc1,[$ap_real,#$i] // in1
1265 cmp $in1infty,#0 // !$in1infty, remember?
1266 ldp $acc2,$acc3,[$ap_real,#$i+16]
1269 ldp $a0,$a1,[sp,#$res_x+$i+32] // res
1272 cmp $in2infty,#0 // !$in2infty, remember?
1273 ldp $a2,$a3,[sp,#$res_x+$i+48]
1274 csel $acc0,$t0,$acc0,ne
1275 csel $acc1,$t1,$acc1,ne
1276 ldp $t0,$t1,[$bp_real,#$i+32] // in2
1277 csel $acc2,$t2,$acc2,ne
1278 csel $acc3,$t3,$acc3,ne
1279 ldp $t2,$t3,[$bp_real,#$i+48]
1280 stp $acc0,$acc1,[$rp_real,#$i]
1281 stp $acc2,$acc3,[$rp_real,#$i+16]
1283 $code.=<<___ if ($i == 0);
1284 adr $bp_real,.Lone_mont-64
1288 ldp $acc0,$acc1,[$ap_real,#$i] // in1
1289 cmp $in1infty,#0 // !$in1infty, remember?
1290 ldp $acc2,$acc3,[$ap_real,#$i+16]
1295 cmp $in2infty,#0 // !$in2infty, remember?
1296 csel $acc0,$t0,$acc0,ne
1297 csel $acc1,$t1,$acc1,ne
1298 csel $acc2,$t2,$acc2,ne
1299 csel $acc3,$t3,$acc3,ne
1300 stp $acc0,$acc1,[$rp_real,#$i]
1301 stp $acc2,$acc3,[$rp_real,#$i+16]
1303 add sp,x29,#0 // destroy frame
1304 ldp x19,x20,[x29,#16]
1305 ldp x21,x22,[x29,#32]
1306 ldp x23,x24,[x29,#48]
1307 ldp x25,x26,[x29,#64]
1308 ldp x29,x30,[sp],#80
1310 .size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
1314 ########################################################################
1315 # scatter-gather subroutines
1317 my ($out,$inp,$index,$mask)=map("x$_",(0..3));
1319 // void ecp_nistz256_scatter_w5(void *x0,const P256_POINT *x1,
1321 .globl ecp_nistz256_scatter_w5
1322 .type ecp_nistz256_scatter_w5,%function
1324 ecp_nistz256_scatter_w5:
1325 stp x29,x30,[sp,#-16]!
1328 add $out,$out,$index,lsl#2
1330 ldp x4,x5,[$inp] // X
1331 ldp x6,x7,[$inp,#16]
1332 str w4,[$out,#64*0-4]
1334 str w5,[$out,#64*1-4]
1336 str w6,[$out,#64*2-4]
1338 str w7,[$out,#64*3-4]
1340 str w4,[$out,#64*4-4]
1341 str w5,[$out,#64*5-4]
1342 str w6,[$out,#64*6-4]
1343 str w7,[$out,#64*7-4]
1346 ldp x4,x5,[$inp,#32] // Y
1347 ldp x6,x7,[$inp,#48]
1348 str w4,[$out,#64*0-4]
1350 str w5,[$out,#64*1-4]
1352 str w6,[$out,#64*2-4]
1354 str w7,[$out,#64*3-4]
1356 str w4,[$out,#64*4-4]
1357 str w5,[$out,#64*5-4]
1358 str w6,[$out,#64*6-4]
1359 str w7,[$out,#64*7-4]
1362 ldp x4,x5,[$inp,#64] // Z
1363 ldp x6,x7,[$inp,#80]
1364 str w4,[$out,#64*0-4]
1366 str w5,[$out,#64*1-4]
1368 str w6,[$out,#64*2-4]
1370 str w7,[$out,#64*3-4]
1372 str w4,[$out,#64*4-4]
1373 str w5,[$out,#64*5-4]
1374 str w6,[$out,#64*6-4]
1375 str w7,[$out,#64*7-4]
1379 .size ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5
1381 // void ecp_nistz256_gather_w5(P256_POINT *x0,const void *x1,
1383 .globl ecp_nistz256_gather_w5
1384 .type ecp_nistz256_gather_w5,%function
1386 ecp_nistz256_gather_w5:
1387 stp x29,x30,[sp,#-16]!
1392 add $index,$index,x3
1393 add $inp,$inp,$index,lsl#2
1401 ldr w10,[$inp,#64*6]
1402 ldr w11,[$inp,#64*7]
1406 orr x6,x6,x10,lsl#32
1407 orr x7,x7,x11,lsl#32
1412 stp x4,x5,[$out] // X
1413 stp x6,x7,[$out,#16]
1421 ldr w10,[$inp,#64*6]
1422 ldr w11,[$inp,#64*7]
1426 orr x6,x6,x10,lsl#32
1427 orr x7,x7,x11,lsl#32
1432 stp x4,x5,[$out,#32] // Y
1433 stp x6,x7,[$out,#48]
1441 ldr w10,[$inp,#64*6]
1442 ldr w11,[$inp,#64*7]
1445 orr x6,x6,x10,lsl#32
1446 orr x7,x7,x11,lsl#32
1451 stp x4,x5,[$out,#64] // Z
1452 stp x6,x7,[$out,#80]
1456 .size ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5
1458 // void ecp_nistz256_scatter_w7(void *x0,const P256_POINT_AFFINE *x1,
1460 .globl ecp_nistz256_scatter_w7
1461 .type ecp_nistz256_scatter_w7,%function
1463 ecp_nistz256_scatter_w7:
1464 stp x29,x30,[sp,#-16]!
1467 add $out,$out,$index
1471 subs $index,$index,#1
1472 prfm pstl1strm,[$out,#4096+64*0]
1473 prfm pstl1strm,[$out,#4096+64*1]
1474 prfm pstl1strm,[$out,#4096+64*2]
1475 prfm pstl1strm,[$out,#4096+64*3]
1476 prfm pstl1strm,[$out,#4096+64*4]
1477 prfm pstl1strm,[$out,#4096+64*5]
1478 prfm pstl1strm,[$out,#4096+64*6]
1479 prfm pstl1strm,[$out,#4096+64*7]
1480 strb w3,[$out,#64*0-1]
1482 strb w3,[$out,#64*1-1]
1484 strb w3,[$out,#64*2-1]
1486 strb w3,[$out,#64*3-1]
1488 strb w3,[$out,#64*4-1]
1490 strb w3,[$out,#64*5-1]
1492 strb w3,[$out,#64*6-1]
1494 strb w3,[$out,#64*7-1]
1496 b.ne .Loop_scatter_w7
1500 .size ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7
1502 // void ecp_nistz256_gather_w7(P256_POINT_AFFINE *x0,const void *x1,
1504 .globl ecp_nistz256_gather_w7
1505 .type ecp_nistz256_gather_w7,%function
1507 ecp_nistz256_gather_w7:
1508 stp x29,x30,[sp,#-16]!
1513 add $index,$index,x3
1514 add $inp,$inp,$index
1518 ldrb w4,[$inp,#64*0]
1519 prfm pldl1strm,[$inp,#4096+64*0]
1520 subs $index,$index,#1
1521 ldrb w5,[$inp,#64*1]
1522 prfm pldl1strm,[$inp,#4096+64*1]
1523 ldrb w6,[$inp,#64*2]
1524 prfm pldl1strm,[$inp,#4096+64*2]
1525 ldrb w7,[$inp,#64*3]
1526 prfm pldl1strm,[$inp,#4096+64*3]
1527 ldrb w8,[$inp,#64*4]
1528 prfm pldl1strm,[$inp,#4096+64*4]
1529 ldrb w9,[$inp,#64*5]
1530 prfm pldl1strm,[$inp,#4096+64*5]
1531 ldrb w10,[$inp,#64*6]
1532 prfm pldl1strm,[$inp,#4096+64*6]
1533 ldrb w11,[$inp,#64*7]
1534 prfm pldl1strm,[$inp,#4096+64*7]
1540 orr x10,x10,x11,lsl#8
1542 orr x4,x4,x10,lsl#48
1545 b.ne .Loop_gather_w7
1549 .size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
1553 foreach (split("\n",$code)) {
1554 s/\`([^\`]*)\`/eval $1/ge;
1558 close STDOUT; # enforce flush