3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
10 # SHA512 block procedure for ARMv4. September 2007.
12 # This code is ~4.5 (four and a half) times faster than code generated
13 # by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
14 # Xscale PXA250 core].
18 # Rescheduling for dual-issue pipeline resulted in 6% improvement on
19 # Cortex A8 core and ~40 cycles per processed byte.
23 # Profiler-assisted and platform-specific optimization resulted in 7%
24 # improvement on Cortex A8 core and ~38 cycles per byte.
28 # Add NEON implementation. On Cortex A8 it was measured to process
29 # one byte in 25.5 cycles or 47% faster than integer-only code.
31 # Byte order [in]dependence. =========================================
33 # Originally caller was expected to maintain specific *dword* order in
34 # h[0-7], namely with most significant dword at *lower* address, which
35 # was reflected in below two parameters as 0 and 4. Now caller is
36 # expected to maintain native byte order for whole 64-bit values.
39 # ====================================================================
# Consume leading flavour/option arguments; the first argument that looks
# like a plain filename (word.ext) stops the scan and becomes $output.
41 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
# Redirect STDOUT to the requested output file.  Use three-argument open
# (the filename cannot inject an open mode) and die on failure instead of
# silently spilling the generated assembly to the terminal; when no
# filename was supplied, keep writing to the existing STDOUT.
42 $output and (open STDOUT,'>',$output or die "can't open $output: $!");
44 $ctx="r0"; # parameter block
58 ############ r13 is stack pointer
60 ############ r15 is program counter
75 @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
76 @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
77 @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
79 str $Tlo,[sp,#$Xoff+0]
81 str $Thi,[sp,#$Xoff+4]
82 eor $t0,$t0,$Ehi,lsl#18
83 ldr $t2,[sp,#$Hoff+0] @ h.lo
84 eor $t1,$t1,$Elo,lsl#18
85 ldr $t3,[sp,#$Hoff+4] @ h.hi
86 eor $t0,$t0,$Elo,lsr#18
87 eor $t1,$t1,$Ehi,lsr#18
88 eor $t0,$t0,$Ehi,lsl#14
89 eor $t1,$t1,$Elo,lsl#14
90 eor $t0,$t0,$Ehi,lsr#9
91 eor $t1,$t1,$Elo,lsr#9
92 eor $t0,$t0,$Elo,lsl#23
93 eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e)
95 ldr $t0,[sp,#$Foff+0] @ f.lo
96 adc $Thi,$Thi,$t1 @ T += Sigma1(e)
97 ldr $t1,[sp,#$Foff+4] @ f.hi
99 ldr $t2,[sp,#$Goff+0] @ g.lo
100 adc $Thi,$Thi,$t3 @ T += h
101 ldr $t3,[sp,#$Goff+4] @ g.hi
104 str $Elo,[sp,#$Eoff+0]
106 str $Ehi,[sp,#$Eoff+4]
108 str $Alo,[sp,#$Aoff+0]
110 str $Ahi,[sp,#$Aoff+4]
112 ldr $t2,[$Ktbl,#$lo] @ K[i].lo
113 eor $t1,$t1,$t3 @ Ch(e,f,g)
114 ldr $t3,[$Ktbl,#$hi] @ K[i].hi
117 ldr $Elo,[sp,#$Doff+0] @ d.lo
118 adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
119 ldr $Ehi,[sp,#$Doff+4] @ d.hi
122 adc $Thi,$Thi,$t3 @ T += K[i]
124 ldr $t2,[sp,#$Boff+0] @ b.lo
125 adc $Ehi,$Ehi,$Thi @ d += T
128 ldr $t3,[sp,#$Coff+0] @ c.lo
130 @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
131 @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
132 @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
135 eor $t0,$t0,$Ahi,lsl#4
136 eor $t1,$t1,$Alo,lsl#4
137 eor $t0,$t0,$Ahi,lsr#2
138 eor $t1,$t1,$Alo,lsr#2
139 eor $t0,$t0,$Alo,lsl#30
140 eor $t1,$t1,$Ahi,lsl#30
141 eor $t0,$t0,$Ahi,lsr#7
142 eor $t1,$t1,$Alo,lsr#7
143 eor $t0,$t0,$Alo,lsl#25
144 eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a)
147 adc $Thi,$Thi,$t1 @ T += Sigma0(a)
149 ldr $t1,[sp,#$Boff+4] @ b.hi
151 ldr $t2,[sp,#$Coff+4] @ c.hi
155 orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
158 orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
160 adc $Ahi,$Ahi,$Thi @ h += T
166 #include "arm_arch.h"
170 # define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
174 # define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
182 WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
183 WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
184 WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
185 WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
186 WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
187 WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
188 WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
189 WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
190 WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
191 WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
192 WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
193 WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
194 WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
195 WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
196 WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
197 WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
198 WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
199 WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
200 WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
201 WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
202 WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
203 WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
204 WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
205 WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
206 WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
207 WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
208 WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
209 WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
210 WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
211 WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
212 WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
213 WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
214 WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
215 WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
216 WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
217 WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
218 WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
219 WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
220 WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
221 WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
224 .word OPENSSL_armcap_P-sha512_block_data_order
227 .global sha512_block_data_order
228 .type sha512_block_data_order,%function
229 sha512_block_data_order:
230 sub r3,pc,#8 @ sha512_block_data_order
231 add $len,$inp,$len,lsl#7 @ len to point at the end of inp
233 ldr r12,.LOPENSSL_armcap
234 ldr r12,[r3,r12] @ OPENSSL_armcap_P
238 stmdb sp!,{r4-r12,lr}
239 sub $Ktbl,r3,#672 @ K512
242 ldr $Elo,[$ctx,#$Eoff+$lo]
243 ldr $Ehi,[$ctx,#$Eoff+$hi]
244 ldr $t0, [$ctx,#$Goff+$lo]
245 ldr $t1, [$ctx,#$Goff+$hi]
246 ldr $t2, [$ctx,#$Hoff+$lo]
247 ldr $t3, [$ctx,#$Hoff+$hi]
249 str $t0, [sp,#$Goff+0]
250 str $t1, [sp,#$Goff+4]
251 str $t2, [sp,#$Hoff+0]
252 str $t3, [sp,#$Hoff+4]
253 ldr $Alo,[$ctx,#$Aoff+$lo]
254 ldr $Ahi,[$ctx,#$Aoff+$hi]
255 ldr $Tlo,[$ctx,#$Boff+$lo]
256 ldr $Thi,[$ctx,#$Boff+$hi]
257 ldr $t0, [$ctx,#$Coff+$lo]
258 ldr $t1, [$ctx,#$Coff+$hi]
259 ldr $t2, [$ctx,#$Doff+$lo]
260 ldr $t3, [$ctx,#$Doff+$hi]
261 str $Tlo,[sp,#$Boff+0]
262 str $Thi,[sp,#$Boff+4]
263 str $t0, [sp,#$Coff+0]
264 str $t1, [sp,#$Coff+4]
265 str $t2, [sp,#$Doff+0]
266 str $t3, [sp,#$Doff+4]
267 ldr $Tlo,[$ctx,#$Foff+$lo]
268 ldr $Thi,[$ctx,#$Foff+$hi]
269 str $Tlo,[sp,#$Foff+0]
270 str $Thi,[sp,#$Foff+4]
280 orr $Tlo,$Tlo,$t0,lsl#8
282 orr $Tlo,$Tlo,$t1,lsl#16
284 orr $Tlo,$Tlo,$t2,lsl#24
285 orr $Thi,$Thi,$t3,lsl#8
286 orr $Thi,$Thi,$t0,lsl#16
287 orr $Thi,$Thi,$t1,lsl#24
301 ldr $t0,[sp,#`$Xoff+8*(16-1)`+0]
302 ldr $t1,[sp,#`$Xoff+8*(16-1)`+4]
305 @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
306 @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
307 @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
309 ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
311 ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
312 eor $Tlo,$Tlo,$t1,lsl#31
313 eor $Thi,$Thi,$t0,lsl#31
314 eor $Tlo,$Tlo,$t0,lsr#8
315 eor $Thi,$Thi,$t1,lsr#8
316 eor $Tlo,$Tlo,$t1,lsl#24
317 eor $Thi,$Thi,$t0,lsl#24
318 eor $Tlo,$Tlo,$t0,lsr#7
319 eor $Thi,$Thi,$t1,lsr#7
320 eor $Tlo,$Tlo,$t1,lsl#25
322 @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
323 @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
324 @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
327 eor $t0,$t0,$t3,lsl#13
328 eor $t1,$t1,$t2,lsl#13
329 eor $t0,$t0,$t3,lsr#29
330 eor $t1,$t1,$t2,lsr#29
331 eor $t0,$t0,$t2,lsl#3
332 eor $t1,$t1,$t3,lsl#3
333 eor $t0,$t0,$t2,lsr#6
334 eor $t1,$t1,$t3,lsr#6
335 ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
336 eor $t0,$t0,$t3,lsl#26
338 ldr $t3,[sp,#`$Xoff+8*(16-9)`+4]
340 ldr $t0,[sp,#`$Xoff+8*16`+0]
343 ldr $t1,[sp,#`$Xoff+8*16`+4]
351 ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0]
352 ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4]
356 ldr $Tlo,[sp,#$Boff+0]
357 ldr $Thi,[sp,#$Boff+4]
358 ldr $t0, [$ctx,#$Aoff+$lo]
359 ldr $t1, [$ctx,#$Aoff+$hi]
360 ldr $t2, [$ctx,#$Boff+$lo]
361 ldr $t3, [$ctx,#$Boff+$hi]
363 str $t0, [$ctx,#$Aoff+$lo]
365 str $t1, [$ctx,#$Aoff+$hi]
367 str $t2, [$ctx,#$Boff+$lo]
369 str $t3, [$ctx,#$Boff+$hi]
371 ldr $Alo,[sp,#$Coff+0]
372 ldr $Ahi,[sp,#$Coff+4]
373 ldr $Tlo,[sp,#$Doff+0]
374 ldr $Thi,[sp,#$Doff+4]
375 ldr $t0, [$ctx,#$Coff+$lo]
376 ldr $t1, [$ctx,#$Coff+$hi]
377 ldr $t2, [$ctx,#$Doff+$lo]
378 ldr $t3, [$ctx,#$Doff+$hi]
380 str $t0, [$ctx,#$Coff+$lo]
382 str $t1, [$ctx,#$Coff+$hi]
384 str $t2, [$ctx,#$Doff+$lo]
386 str $t3, [$ctx,#$Doff+$hi]
388 ldr $Tlo,[sp,#$Foff+0]
389 ldr $Thi,[sp,#$Foff+4]
390 ldr $t0, [$ctx,#$Eoff+$lo]
391 ldr $t1, [$ctx,#$Eoff+$hi]
392 ldr $t2, [$ctx,#$Foff+$lo]
393 ldr $t3, [$ctx,#$Foff+$hi]
395 str $Elo,[$ctx,#$Eoff+$lo]
397 str $Ehi,[$ctx,#$Eoff+$hi]
399 str $t2, [$ctx,#$Foff+$lo]
401 str $t3, [$ctx,#$Foff+$hi]
403 ldr $Alo,[sp,#$Goff+0]
404 ldr $Ahi,[sp,#$Goff+4]
405 ldr $Tlo,[sp,#$Hoff+0]
406 ldr $Thi,[sp,#$Hoff+4]
407 ldr $t0, [$ctx,#$Goff+$lo]
408 ldr $t1, [$ctx,#$Goff+$hi]
409 ldr $t2, [$ctx,#$Hoff+$lo]
410 ldr $t3, [$ctx,#$Hoff+$hi]
412 str $t0, [$ctx,#$Goff+$lo]
414 str $t1, [$ctx,#$Goff+$hi]
416 str $t2, [$ctx,#$Hoff+$lo]
418 str $t3, [$ctx,#$Hoff+$hi]
426 add sp,sp,#8*9 @ destroy frame
428 ldmia sp!,{r4-r12,pc}
430 ldmia sp!,{r4-r12,lr}
432 moveq pc,lr @ be binary compatible with V4, yet
433 bx lr @ interoperable with Thumb ISA:-)
# Rotate amounts for the four SHA-512 mixing functions; they match the
# ROTR constants spelled out in the Sigma0/Sigma1/sigma0/sigma1 comments
# earlier in this file (e.g. Sigma0(x) = ROTR(x,28)^ROTR(x,34)^ROTR(x,39)).
438 my @Sigma0=(28,34,39);
439 my @Sigma1=(14,18,41);
440 my @sigma0=(1, 8, 7);
441 my @sigma1=(19,61,6);
# NEON register assignment for the vectorized code path.
444 my $cnt="r12";	# volatile register known as ip, intra-procedure-call scratch
446 my @X=map("d$_",(0..15));	# d0-d15: 16x64-bit message schedule, loaded via vld1.64
447 my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));	# d16-d23: working variables a..h (context loaded with vldmia)
451 my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
452 my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps
454 $code.=<<___ if ($i<16 || $i&1);
455 vshr.u64 $t0,$e,#@Sigma1[0] @ $i
457 vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned
459 vshr.u64 $t1,$e,#@Sigma1[1]
460 vshr.u64 $t2,$e,#@Sigma1[2]
463 vld1.64 {$K},[$Ktbl,:64]! @ K[i++]
464 vsli.64 $t0,$e,#`64-@Sigma1[0]`
465 vsli.64 $t1,$e,#`64-@Sigma1[1]`
466 vsli.64 $t2,$e,#`64-@Sigma1[2]`
467 #if $i<16 && defined(__ARMEL__)
468 vrev64.8 @X[$i],@X[$i]
474 veor $t0,$t2 @ Sigma1(e)
475 veor $Ch,$g @ Ch(e,f,g)
477 vshr.u64 $t0,$a,#@Sigma0[0]
479 vshr.u64 $t1,$a,#@Sigma0[1]
480 vshr.u64 $t2,$a,#@Sigma0[2]
481 vsli.64 $t0,$a,#`64-@Sigma0[0]`
482 vsli.64 $t1,$a,#`64-@Sigma0[1]`
483 vsli.64 $t2,$a,#`64-@Sigma0[2]`
484 vadd.i64 $T1,@X[$i%16]
489 veor $h,$t2 @ Sigma0(a)
490 vorr $Maj,$Ch @ Maj(a,b,c)
500 if ($i&1) { &NEON_00_15($i,@_); return; }
502 # 2x-vectorized, therefore runs every 2nd round
503 my @X=map("q$_",(0..7)); # view @X as 128-bit vector
504 my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps
505 my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15
506 my $e=@_[4]; # $e from NEON_00_15
509 vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0]
510 vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1]
511 vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2]
512 vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]`
513 vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1]
514 vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]`
516 vshr.u64 $t0,$s0,#@sigma0[0]
517 veor $s1,$t1 @ sigma1(X[i+14])
518 vshr.u64 $t1,$s0,#@sigma0[1]
519 vadd.i64 @X[$i%8],$s1
520 vshr.u64 $s1,$s0,#@sigma0[2]
521 vsli.64 $t0,$s0,#`64-@sigma0[0]`
522 vsli.64 $t1,$s0,#`64-@sigma0[1]`
523 vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9]
525 vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15
526 vadd.i64 @X[$i%8],$s0
527 vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15
528 veor $s1,$t1 @ sigma0(X[i+1])
529 vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15
530 vadd.i64 @X[$i%8],$s1
532 &NEON_00_15(2*$i,@_);
541 dmb @ errata #451034 on early Cortex A8
542 vstmdb sp!,{d8-d15} @ ABI specification says so
543 sub $Ktbl,r3,#672 @ K512
544 vldmia $ctx,{$A-$H} @ load context
547 for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
553 for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
557 vldmia $ctx,{d24-d31} @ load context to temp
558 vadd.i64 q8,q12 @ vectorized accumulate
562 vstmia $ctx,{$A-$H} @ save context
564 sub $Ktbl,#640 @ rewind K512
567 vldmia sp!,{d8-d15} @ epilogue
573 .size sha512_block_data_order,.-sha512_block_data_order
574 .asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
576 .comm OPENSSL_armcap_P,4,4
579 $code =~ s/\`([^\`]*)\`/eval $1/gem;
580 $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
582 close STDOUT; # enforce flush