2 # Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
19 # Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
20 # for undertaking this effort are multiple. First of all, UltraSPARC is not
21 # the whole SPARCv9 universe and other VIS-free implementations deserve
22 # optimized code as much. Secondly, newly introduced UltraSPARC T1,
23 # a.k.a. Niagara, has shared FPU and concurrent FPU-intensive paths,
24 # such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
25 # several integrated RSA/DSA accelerator circuits accessible through
26 # kernel driver [only(*)], but having decent user-land software
27 # implementation is important too. Finally, there was the desire to
28 # experiment with a dedicated squaring procedure. Yes, this module
29 # implements one, because it was easiest to draft it in SPARCv9
32 # (*) Engine accessing the driver in question is on my TODO list.
33 # For reference, the accelerator is estimated to give 6 to 10 times
34 # improvement on single-threaded RSA sign. It should be noted
35 # that 6-10x improvement coefficient does not actually mean
36 # something extraordinary in terms of absolute [single-threaded]
37 # performance, as SPARCv9 instruction set is by all means least
38 # suitable for high performance crypto among other 64 bit
39 # platforms. 6-10x factor simply places T1 in same performance
40 # domain as say AMD64 and IA-64. Improvement of RSA verify doesn't
41 # appear impressive at all, but it's the sign operation which is
42 # far more critical/interesting.
44 # You might notice that inner loops are modulo-scheduled:-) This has
45 # essentially negligible impact on UltraSPARC performance, it's
46 # Fujitsu SPARC64 V users who should notice and hopefully appreciate
47 # the advantage... Currently this module surpasses sparcv9a-mont.pl
48 # by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
49 # module still has hidden potential [see TODO list there], which is
50 # estimated to be larger than 20%...
# Perl-side setup: redirect output and bind symbolic names to the SPARC registers used by the generated code.
53 open STDOUT,">$output"; # reopen STDOUT onto the file named in $output; NOTE(review): the result of open is not checked
56 $rp="%i0"; # BN_ULONG *rp,
57 $ap="%i1"; # const BN_ULONG *ap,
58 $bp="%i2"; # const BN_ULONG *bp,
59 $np="%i3"; # const BN_ULONG *np,
60 $n0="%i4"; # const BN_ULONG *n0,
61 $num="%i5"; # int num);
71 $mask="%g1"; # 32 bits, what a waste... (the generated code loads 0xffffffff into it)
84 $fname="bn_mul_mont_int"; # symbol name substituted into the .type and .size directives
87 #include "sparc_arch.h"
89 .section ".text",#alloc,#execinstr
! Entry sequence: size check, 32-bit mask setup, stack adjustment, and the first loads from bp, ap and np.
94 cmp %o5,4 ! 128 bits minimum
96 sethi %hi(0xffffffff),$mask ! upper part of the constant 0xffffffff
102 sll $num,2,$num ! num*=4
103 or $mask,%lo(0xffffffff),$mask ! low part, mask is now 0xffffffff
107 ld [$bp],$mul0 ! bp[0]
110 add %sp,$bias,%o7 ! real top of stack
111 ld [$ap],$car0 ! ap[0] ! redundant in squaring context
113 ld [$ap+4],$apj ! ap[1]
115 ld [$np],$car1 ! np[0]
116 sub %o7,$bias,%sp ! alloca
117 ld [$np+4],$npj ! np[1]
118 be,pt SIZE_T_CC,.Lbn_sqr_mont ! on equal, take the dedicated squaring path
! First pass: ap[] words times bp[0] and np[] words times the mul1 factor, summed in the car0 and car1 chains.
121 mulx $car0,$mul0,$car0 ! ap[0]*bp[0]
122 mulx $apj,$mul0,$tmp0 !prologue! ap[1]*bp[0]
123 and $car0,$mask,$acc0 ! low 32 bits of ap[0]*bp[0]
124 add %sp,$bias+$frame,$tp ! address of tp[0]
125 ld [$ap+8],$apj !prologue! ap[2]
127 mulx $n0,$acc0,$mul1 ! "t[0]"*n0
128 and $mul1,$mask,$mul1 ! truncate the factor to 32 bits
130 mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
131 mulx $npj,$mul1,$acc1 !prologue! np[1]*"t[0]"*n0
133 add $acc0,$car1,$car1 ! add the low word of ap[0]*bp[0]
134 ld [$np+8],$npj !prologue! np[2]
136 mov $tmp0,$acc0 !prologue!
139 mulx $apj,$mul0,$tmp0 ! next ap word times bp[0]
140 mulx $npj,$mul1,$tmp1 ! next np word times the mul1 factor
141 add $acc0,$car0,$car0 ! add the pending ap product
142 ld [$ap+$j],$apj ! ap[j]
143 and $car0,$mask,$acc0 ! low 32 bits of the ap chain
144 add $acc1,$car1,$car1 ! add the pending np product
145 ld [$np+$j],$npj ! np[j]
147 add $acc0,$car1,$car1 ! add the low word of the ap chain
158 mulx $apj,$mul0,$tmp0 !epilogue!
159 mulx $npj,$mul1,$tmp1
160 add $acc0,$car0,$car0 ! add the pending ap product
161 and $car0,$mask,$acc0 ! low 32 bits of the ap chain
162 add $acc1,$car1,$car1 ! add the pending np product
164 add $acc0,$car1,$car1 ! add the low word of the ap chain
168 add $tmp0,$car0,$car0 ! add the last ap product
169 and $car0,$mask,$acc0 ! low 32 bits of the ap chain
170 add $tmp1,$car1,$car1 ! add the last np product
172 add $acc0,$car1,$car1 ! add the low word of the ap chain
176 add $car0,$car1,$car1 ! merge the two chains
! Passes for bp[1] onward: the same two multiply-accumulate chains, now reading tp[] back and storing it in place.
181 ld [$bp+4],$mul0 ! bp[1]
183 add %sp,$bias+$frame,$tp ! address of tp[0]
184 ld [$ap],$car0 ! ap[0]
185 ld [$ap+4],$apj ! ap[1]
186 ld [$np],$car1 ! np[0]
187 ld [$np+4],$npj ! np[1]
188 ld [$tp],$tmp1 ! tp[0]
189 ld [$tp+4],$tpj ! tp[1]
192 mulx $car0,$mul0,$car0 ! ap[0]*bp[i]
193 mulx $apj,$mul0,$tmp0 !prologue!
194 add $tmp1,$car0,$car0 ! add tp[0]
195 ld [$ap+8],$apj !prologue! ap[2]
196 and $car0,$mask,$acc0 ! low 32 bits of the ap chain
199 and $mul1,$mask,$mul1 ! truncate the factor to 32 bits
201 mulx $car1,$mul1,$car1 ! np[0] times the mul1 factor
202 mulx $npj,$mul1,$acc1 !prologue!
204 add $acc0,$car1,$car1 ! add the low word of the ap chain
205 ld [$np+8],$npj !prologue! np[2]
207 mov $tmp0,$acc0 !prologue!
210 mulx $apj,$mul0,$tmp0 ! next ap word times bp[i]
211 mulx $npj,$mul1,$tmp1 ! next np word times the mul1 factor
213 ld [$ap+$j],$apj ! ap[j]
214 add $acc0,$car0,$car0 ! add the pending ap product
215 add $acc1,$car1,$car1 ! add the pending np product
216 ld [$np+$j],$npj ! np[j]
217 and $car0,$mask,$acc0 ! low 32 bits of the ap chain
218 ld [$tp+8],$tpj ! tp[j]
220 add $acc0,$car1,$car1 ! add the low word of the ap chain
223 st $car1,[$tp] ! tp[j-1]
231 mulx $apj,$mul0,$tmp0 !epilogue!
232 mulx $npj,$mul1,$tmp1
234 add $acc0,$car0,$car0 ! add the pending ap product
235 ld [$tp+8],$tpj ! tp[j]
236 and $car0,$mask,$acc0 ! low 32 bits of the ap chain
237 add $acc1,$car1,$car1 ! add the pending np product
239 add $acc0,$car1,$car1 ! add the low word of the ap chain
240 st $car1,[$tp] ! tp[j-1]
244 add $tmp0,$car0,$car0 ! add the last ap product
245 and $car0,$mask,$acc0 ! low 32 bits of the ap chain
246 add $tmp1,$car1,$car1 ! add the last np product
247 add $acc0,$car1,$car1 ! add the low word of the ap chain
248 st $car1,[$tp+4] ! tp[j-1]
253 add $car0,$car1,$car1 ! merge the two chains
255 add $car2,$car1,$car1 ! plus the upmost overflow bit kept in car2
260 ld [$bp+$i],$mul0 ! bp[i]
! Tail: subtraction tp[j]-np[j] with carry propagation, then the copy/refresh step and zeroing of tp.
269 sub %g0,$num,%o7 ! k=-num
271 subcc %g0,%g0,%g0 ! clear %icc.c
276 subccc %o0,%o1,%o1 ! tp[j]-np[j]
281 subc $car2,0,$car2 ! handle upmost overflow bit
288 ld [$ap+%o7],%o0 ! copy or in-place refresh
289 st %g0,[$tp+%o7] ! zap tp (stores zero from %g0)
300 ######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
301 ######## code without following dedicated squaring procedure.
######## The first squaring pass below multiplies by ap[0] and doubles the low word of each ap chain value.
303 $sbit="%i2"; # re-use $bp!
308 mulx $mul0,$mul0,$car0 ! ap[0]*ap[0]
309 mulx $apj,$mul0,$tmp0 !prologue!
310 and $car0,$mask,$acc0 ! low 32 bits of ap[0]*ap[0]
311 add %sp,$bias+$frame,$tp ! address of tp[0]
312 ld [$ap+8],$apj !prologue! ap[2]
314 mulx $n0,$acc0,$mul1 ! "t[0]"*n0
316 and $mul1,$mask,$mul1 ! truncate the factor to 32 bits
318 mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
319 mulx $npj,$mul1,$acc1 !prologue!
321 ld [$np+8],$npj !prologue! np[2]
323 add $acc0,$car1,$car1 ! add the low word of ap[0]*ap[0]
325 mov $tmp0,$acc0 !prologue!
328 mulx $apj,$mul0,$tmp0 ! next ap word times ap[0]
329 mulx $npj,$mul1,$tmp1 ! next np word times the mul1 factor
330 add $acc0,$car0,$car0 ! ap[j]*a0+c0
331 add $acc1,$car1,$car1 ! add the pending np product
332 ld [$ap+$j],$apj ! ap[j]
333 and $car0,$mask,$acc0 ! low 32 bits of the ap chain
334 ld [$np+$j],$npj ! np[j]
336 add $acc0,$acc0,$acc0 ! 2*acc0
341 and $acc0,$mask,$acc0 ! back to 32 bits after doubling
343 add $acc0,$car1,$car1 ! add the doubled low word
351 mulx $apj,$mul0,$tmp0 ! epilogue
352 mulx $npj,$mul1,$tmp1
353 add $acc0,$car0,$car0 ! ap[j]*a0+c0
354 add $acc1,$car1,$car1 ! add the pending np product
355 and $car0,$mask,$acc0 ! low 32 bits of the ap chain
357 add $acc0,$acc0,$acc0 ! 2*acc0
360 and $acc0,$mask,$acc0 ! back to 32 bits after doubling
361 add $acc0,$car1,$car1 ! add the doubled low word
365 add $tmp0,$car0,$car0 ! ap[j]*a0+c0
366 add $tmp1,$car1,$car1 ! add the last np product
367 and $car0,$mask,$acc0 ! low 32 bits of the ap chain
369 add $acc0,$acc0,$acc0 ! 2*acc0
372 and $acc0,$mask,$acc0 ! back to 32 bits after doubling
373 add $acc0,$car1,$car1 ! add the doubled low word
377 add $car0,$car0,$car0 ! 2*car0
379 add $car0,$car1,$car1 ! merge the two chains
! Squaring pass with ap[1] as the multiplier: tp[0..2] are read back and tp[] is rewritten in place.
383 ld [%sp+$bias+$frame],$tmp0 ! tp[0]
384 ld [%sp+$bias+$frame+4],$tmp1 ! tp[1]
385 ld [%sp+$bias+$frame+8],$tpj ! tp[2]
386 ld [$ap+4],$mul0 ! ap[1]
387 ld [$ap+8],$apj ! ap[2]
388 ld [$np],$car1 ! np[0]
389 ld [$np+4],$npj ! np[1]
392 mulx $mul0,$mul0,$car0 ! ap[1]*ap[1]
393 and $mul1,$mask,$mul1 ! truncate the factor to 32 bits
395 mulx $car1,$mul1,$car1 ! np[0] times the mul1 factor
396 mulx $npj,$mul1,$acc1 ! np[1] times the mul1 factor
397 add $tmp0,$car1,$car1 ! add tp[0]
398 and $car0,$mask,$acc0 ! low 32 bits of ap[1]*ap[1]
399 ld [$np+8],$npj ! np[2]
401 add $tmp1,$car1,$car1 ! add tp[1]
403 add $acc0,$car1,$car1 ! add the low word of the square
405 add $acc1,$car1,$car1 ! add the np[1] product
408 st $car1,[%sp+$bias+$frame] ! tp[0]=
410 add %sp,$bias+$frame+4,$tp ! address of tp[1]
413 mulx $apj,$mul0,$acc0 ! next ap word times ap[1]
414 mulx $npj,$mul1,$acc1 ! next np word times the mul1 factor
415 add $acc0,$car0,$car0 ! add the ap product to the ap chain
417 ld [$ap+$j],$apj ! ap[j]
418 and $car0,$mask,$acc0 ! low 32 bits of the ap chain
419 ld [$np+$j],$npj ! np[j]
421 add $acc1,$car1,$car1 ! add the np product
422 ld [$tp+8],$tpj ! tp[j]
423 add $acc0,$acc0,$acc0 ! 2*acc0
427 and $acc0,$mask,$acc0 ! back to 32 bits after doubling
429 add $acc0,$car1,$car1 ! add the doubled low word
430 st $car1,[$tp] ! tp[j-1]
436 mulx $apj,$mul0,$acc0 ! last ap word times ap[1]
437 mulx $npj,$mul1,$acc1 ! last np word times the mul1 factor
438 add $acc0,$car0,$car0 ! add the ap product to the ap chain
440 and $car0,$mask,$acc0 ! low 32 bits of the ap chain
442 add $acc1,$car1,$car1 ! add the np product
443 add $acc0,$acc0,$acc0 ! 2*acc0
446 and $acc0,$mask,$acc0 ! back to 32 bits after doubling
447 add $acc0,$car1,$car1 ! add the doubled low word
448 st $car1,[$tp] ! tp[j-1]
451 add $car0,$car0,$car0 ! 2*car0
453 add $car0,$car1,$car1 ! merge the two chains
454 add $car2,$car1,$car1 ! plus the value kept in car2
! Squaring pass with ap[2] as the multiplier; may bypass the inner loop via .Lsqr_no_inner2.
458 ld [%sp+$bias+$frame],$tmp1 ! tp[0]
459 ld [%sp+$bias+$frame+4],$tpj ! tp[1]
460 ld [$ap+8],$mul0 ! ap[2]
461 ld [$np],$car1 ! np[0]
462 ld [$np+4],$npj ! np[1]
464 and $mul1,$mask,$mul1 ! truncate the factor to 32 bits
467 mulx $mul0,$mul0,$car0 ! ap[2]*ap[2]
468 mulx $car1,$mul1,$car1 ! np[0] times the mul1 factor
469 and $car0,$mask,$acc0 ! low 32 bits of ap[2]*ap[2]
470 add $tmp1,$car1,$car1 ! add tp[0]
472 add %sp,$bias+$frame,$tp ! address of tp[0]
480 mulx $npj,$mul1,$acc1 ! np word times the mul1 factor
485 add $acc1,$car1,$car1 ! add the np product
494 ld [$ap+$j],$apj ! ap[j]
495 mulx $npj,$mul1,$acc1 ! np word times the mul1 factor
497 ld [$np+$j],$npj ! np[j]
498 add $acc0,$car1,$car1 ! add the low word held in acc0
499 ld [$tp+8],$tpj ! tp[j]
500 add $acc1,$car1,$car1 ! add the np product
506 be,pn %icc,.Lsqr_no_inner2 ! on equal, skip the inner loop
510 mulx $apj,$mul0,$acc0 ! next ap word times ap[2]
511 mulx $npj,$mul1,$acc1 ! next np word times the mul1 factor
513 add $acc0,$car0,$car0 ! add the ap product to the ap chain
514 ld [$ap+$j],$apj ! ap[j]
515 and $car0,$mask,$acc0 ! low 32 bits of the ap chain
516 ld [$np+$j],$npj ! np[j]
518 add $acc0,$acc0,$acc0 ! 2*acc0
519 ld [$tp+8],$tpj ! tp[j]
523 and $acc0,$mask,$acc0 ! back to 32 bits after doubling
525 add $acc0,$car1,$car1 ! add the doubled low word
526 add $acc1,$car1,$car1 ! add the np product
527 st $car1,[$tp] ! tp[j-1]
533 mulx $apj,$mul0,$acc0 ! last ap word times ap[2]
534 mulx $npj,$mul1,$acc1 ! last np word times the mul1 factor
536 add $acc0,$car0,$car0 ! add the ap product to the ap chain
537 and $car0,$mask,$acc0 ! low 32 bits of the ap chain
539 add $acc0,$acc0,$acc0 ! 2*acc0
542 and $acc0,$mask,$acc0 ! back to 32 bits after doubling
543 add $acc0,$car1,$car1 ! add the doubled low word
544 add $acc1,$car1,$car1 ! add the np product
545 st $car1,[$tp] ! tp[j-1]
548 add $car0,$car0,$car0 ! 2*car0
550 add $car0,$car1,$car1 ! merge the two chains
551 add $car2,$car1,$car1 ! plus the value kept in car2
! Generic squaring pass: the multiplier is the ap word at offset i.
556 ld [%sp+$bias+$frame],$tmp1 ! tp[0]
557 ld [%sp+$bias+$frame+4],$tpj ! tp[1]
558 ld [$ap+$i],$mul0 ! ap[i]
559 ld [$np],$car1 ! np[0]
560 ld [$np+4],$npj ! np[1]
562 and $mul1,$mask,$mul1 ! truncate the factor to 32 bits
565 mulx $mul0,$mul0,$car0 ! ap[i]*ap[i]
566 mulx $car1,$mul1,$car1 ! np[0] times the mul1 factor
567 and $car0,$mask,$acc0 ! low 32 bits of the square
568 add $tmp1,$car1,$car1 ! add tp[0]
570 add %sp,$bias+$frame,$tp ! address of tp[0]
575 cmp $tmp0,$num ! i<num-1
580 mulx $npj,$mul1,$acc1 ! np word times the mul1 factor
585 add $acc1,$car1,$car1 ! add the np product
593 mulx $npj,$mul1,$acc1 ! np word times the mul1 factor
595 add $acc0,$car1,$car1 ! add the low word held in acc0
596 add $acc1,$car1,$car1 ! add the np product
600 add $car0,$car0,$car0 ! recover $car0
602 add $car0,$car1,$car1 ! merge the two chains
603 add $car2,$car1,$car1 ! plus the value kept in car2
! Symbol bookkeeping for the routine and the identification string emitted after it.
609 .type $fname,#function ! mark the symbol as a function
610 .size $fname,(.-$fname) ! size is the distance from the symbol to this point
611 .asciz "Montgomery Multipltication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>" ! NOTE(review): Multiplication is misspelled, left as is because the string is emitted output
614 $code =~ s/\`([^\`]*)\`/eval($1)/gem; # replace each backtick-quoted expression in $code with the result of eval on it