2 # Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
16 # Permission to use under GPL terms is granted.
17 # ====================================================================
19 # SHA512 block procedure for ARMv4. September 2007.
21 # This code is ~4.5 (four and a half) times faster than code generated
22 # by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
23 # Xscale PXA250 core].
27 # Rescheduling for dual-issue pipeline resulted in 6% improvement on
28 # Cortex A8 core and ~40 cycles per processed byte.
32 # Profiler-assisted and platform-specific optimization resulted in 7%
33 # improvement on Cortex A8 core and ~38 cycles per byte.
37 # Add NEON implementation. On Cortex A8 it was measured to process
38 # one byte in 23.3 cycles or ~60% faster than integer-only code.
42 # Improve NEON performance by 12% on Snapdragon S4. In absolute
43 # terms it's 22.6 cycles per byte, which is a disappointing result.
44 # Technical writers asserted that the 3-way S4 pipeline can sustain
45 # multiple NEON instructions per cycle, but dual NEON issue could
46 # not be observed; see http://www.openssl.org/~appro/Snapdragon-S4.html
47 # for further details. As a side note, Cortex-A15 processes one byte in
50 # Byte order [in]dependence. =========================================
52 # Originally the caller was expected to maintain a specific *dword* order
53 # in h[0-7], namely with the most significant dword at the *lower* address,
54 # which was reflected in the two parameters below as 0 and 4. Now the caller
55 # is expected to maintain native byte order for whole 64-bit values.
58 # ====================================================================
# Command-line handling: work out which argument is the output file and
# which (if any) is the assembler "flavour". $flavour is assigned before
# this point (not visible here).
#
# A flavour that looks like a file name (word characters, a dot and an
# extension) is really the output path, so it is moved to $output and the
# flavour is cleared.
61 if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
# Otherwise keep consuming arguments until one looks like a file name or
# the argument list runs out; the last value shifted ends up in $output.
62 else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
# With a real flavour (anything other than "void") the generated text is
# passed through arm-xlate.pl, which is looked up next to this script first
# and then in ../../perlasm relative to it.
64 if ($flavour && $flavour ne "void") {
65 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
66 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
67 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
68 die "can't locate arm-xlate.pl";
# Pipe STDOUT into the translator, run under the same perl interpreter
# ($^X). Abort if the pipe cannot be set up rather than carrying on
# without the translator.
70 open STDOUT,"| \"$^X\" $xlate $flavour $output"
    or die "can't call $xlate: $!";
# Redirect STDOUT straight into $output (no translator involved).
72 open STDOUT,">$output";
75 $ctx="r0"; # parameter block
89 ############ r13 is stack pointer
91 ############ r15 is program counter
106 @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
107 @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
108 @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
110 str $Tlo,[sp,#$Xoff+0]
112 str $Thi,[sp,#$Xoff+4]
113 eor $t0,$t0,$Ehi,lsl#18
114 ldr $t2,[sp,#$Hoff+0] @ h.lo
115 eor $t1,$t1,$Elo,lsl#18
116 ldr $t3,[sp,#$Hoff+4] @ h.hi
117 eor $t0,$t0,$Elo,lsr#18
118 eor $t1,$t1,$Ehi,lsr#18
119 eor $t0,$t0,$Ehi,lsl#14
120 eor $t1,$t1,$Elo,lsl#14
121 eor $t0,$t0,$Ehi,lsr#9
122 eor $t1,$t1,$Elo,lsr#9
123 eor $t0,$t0,$Elo,lsl#23
124 eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e)
126 ldr $t0,[sp,#$Foff+0] @ f.lo
127 adc $Thi,$Thi,$t1 @ T += Sigma1(e)
128 ldr $t1,[sp,#$Foff+4] @ f.hi
130 ldr $t2,[sp,#$Goff+0] @ g.lo
131 adc $Thi,$Thi,$t3 @ T += h
132 ldr $t3,[sp,#$Goff+4] @ g.hi
135 str $Elo,[sp,#$Eoff+0]
137 str $Ehi,[sp,#$Eoff+4]
139 str $Alo,[sp,#$Aoff+0]
141 str $Ahi,[sp,#$Aoff+4]
143 ldr $t2,[$Ktbl,#$lo] @ K[i].lo
144 eor $t1,$t1,$t3 @ Ch(e,f,g)
145 ldr $t3,[$Ktbl,#$hi] @ K[i].hi
148 ldr $Elo,[sp,#$Doff+0] @ d.lo
149 adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
150 ldr $Ehi,[sp,#$Doff+4] @ d.hi
153 adc $Thi,$Thi,$t3 @ T += K[i]
155 ldr $t2,[sp,#$Boff+0] @ b.lo
156 adc $Ehi,$Ehi,$Thi @ d += T
159 ldr $t3,[sp,#$Coff+0] @ c.lo
161 it eq @ Thumb2 thing, sanity check in ARM
164 @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
165 @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
166 @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
169 eor $t0,$t0,$Ahi,lsl#4
170 eor $t1,$t1,$Alo,lsl#4
171 eor $t0,$t0,$Ahi,lsr#2
172 eor $t1,$t1,$Alo,lsr#2
173 eor $t0,$t0,$Alo,lsl#30
174 eor $t1,$t1,$Ahi,lsl#30
175 eor $t0,$t0,$Ahi,lsr#7
176 eor $t1,$t1,$Alo,lsr#7
177 eor $t0,$t0,$Alo,lsl#25
178 eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a)
181 adc $Thi,$Thi,$t1 @ T += Sigma0(a)
183 ldr $t1,[sp,#$Boff+4] @ b.hi
185 ldr $t2,[sp,#$Coff+4] @ c.hi
189 orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
192 orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
194 adc $Ahi,$Ahi,$Thi @ h += T
201 # include "arm_arch.h"
202 # define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
203 # define VFP_ABI_POP vldmia sp!,{d8-d15}
205 # define __ARM_ARCH__ __LINUX_ARM_ARCH__
206 # define __ARM_MAX_ARCH__ 7
207 # define VFP_ABI_PUSH
214 # define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
218 # define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
222 #if defined(__thumb2__)
233 WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
234 WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
235 WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
236 WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
237 WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
238 WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
239 WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
240 WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
241 WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
242 WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
243 WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
244 WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
245 WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
246 WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
247 WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
248 WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
249 WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
250 WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
251 WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
252 WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
253 WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
254 WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
255 WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
256 WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
257 WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
258 WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
259 WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
260 WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
261 WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
262 WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
263 WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
264 WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
265 WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
266 WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
267 WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
268 WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
269 WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
270 WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
271 WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
272 WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
274 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
276 .word OPENSSL_armcap_P-.Lsha512_block_data_order
282 .global sha512_block_data_order
283 .type sha512_block_data_order,%function
284 sha512_block_data_order:
285 .Lsha512_block_data_order:
286 #if __ARM_ARCH__<7 && !defined(__thumb2__)
287 sub r3,pc,#8 @ sha512_block_data_order
289 adr r3,.Lsha512_block_data_order
291 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
292 ldr r12,.LOPENSSL_armcap
293 ldr r12,[r3,r12] @ OPENSSL_armcap_P
300 add $len,$inp,$len,lsl#7 @ len to point at the end of inp
301 stmdb sp!,{r4-r12,lr}
302 sub $Ktbl,r3,#672 @ K512
305 ldr $Elo,[$ctx,#$Eoff+$lo]
306 ldr $Ehi,[$ctx,#$Eoff+$hi]
307 ldr $t0, [$ctx,#$Goff+$lo]
308 ldr $t1, [$ctx,#$Goff+$hi]
309 ldr $t2, [$ctx,#$Hoff+$lo]
310 ldr $t3, [$ctx,#$Hoff+$hi]
312 str $t0, [sp,#$Goff+0]
313 str $t1, [sp,#$Goff+4]
314 str $t2, [sp,#$Hoff+0]
315 str $t3, [sp,#$Hoff+4]
316 ldr $Alo,[$ctx,#$Aoff+$lo]
317 ldr $Ahi,[$ctx,#$Aoff+$hi]
318 ldr $Tlo,[$ctx,#$Boff+$lo]
319 ldr $Thi,[$ctx,#$Boff+$hi]
320 ldr $t0, [$ctx,#$Coff+$lo]
321 ldr $t1, [$ctx,#$Coff+$hi]
322 ldr $t2, [$ctx,#$Doff+$lo]
323 ldr $t3, [$ctx,#$Doff+$hi]
324 str $Tlo,[sp,#$Boff+0]
325 str $Thi,[sp,#$Boff+4]
326 str $t0, [sp,#$Coff+0]
327 str $t1, [sp,#$Coff+4]
328 str $t2, [sp,#$Doff+0]
329 str $t3, [sp,#$Doff+4]
330 ldr $Tlo,[$ctx,#$Foff+$lo]
331 ldr $Thi,[$ctx,#$Foff+$hi]
332 str $Tlo,[sp,#$Foff+0]
333 str $Thi,[sp,#$Foff+4]
343 orr $Tlo,$Tlo,$t0,lsl#8
345 orr $Tlo,$Tlo,$t1,lsl#16
347 orr $Tlo,$Tlo,$t2,lsl#24
348 orr $Thi,$Thi,$t3,lsl#8
349 orr $Thi,$Thi,$t0,lsl#16
350 orr $Thi,$Thi,$t1,lsl#24
364 ldr $t0,[sp,#`$Xoff+8*(16-1)`+0]
365 ldr $t1,[sp,#`$Xoff+8*(16-1)`+4]
368 @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
369 @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
370 @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
372 ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
374 ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
375 eor $Tlo,$Tlo,$t1,lsl#31
376 eor $Thi,$Thi,$t0,lsl#31
377 eor $Tlo,$Tlo,$t0,lsr#8
378 eor $Thi,$Thi,$t1,lsr#8
379 eor $Tlo,$Tlo,$t1,lsl#24
380 eor $Thi,$Thi,$t0,lsl#24
381 eor $Tlo,$Tlo,$t0,lsr#7
382 eor $Thi,$Thi,$t1,lsr#7
383 eor $Tlo,$Tlo,$t1,lsl#25
385 @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
386 @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
387 @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
390 eor $t0,$t0,$t3,lsl#13
391 eor $t1,$t1,$t2,lsl#13
392 eor $t0,$t0,$t3,lsr#29
393 eor $t1,$t1,$t2,lsr#29
394 eor $t0,$t0,$t2,lsl#3
395 eor $t1,$t1,$t3,lsl#3
396 eor $t0,$t0,$t2,lsr#6
397 eor $t1,$t1,$t3,lsr#6
398 ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
399 eor $t0,$t0,$t3,lsl#26
401 ldr $t3,[sp,#`$Xoff+8*(16-9)`+4]
403 ldr $t0,[sp,#`$Xoff+8*16`+0]
406 ldr $t1,[sp,#`$Xoff+8*16`+4]
415 ittt eq @ Thumb2 thing, sanity check in ARM
417 ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0]
418 ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4]
422 ldr $Tlo,[sp,#$Boff+0]
423 ldr $Thi,[sp,#$Boff+4]
424 ldr $t0, [$ctx,#$Aoff+$lo]
425 ldr $t1, [$ctx,#$Aoff+$hi]
426 ldr $t2, [$ctx,#$Boff+$lo]
427 ldr $t3, [$ctx,#$Boff+$hi]
429 str $t0, [$ctx,#$Aoff+$lo]
431 str $t1, [$ctx,#$Aoff+$hi]
433 str $t2, [$ctx,#$Boff+$lo]
435 str $t3, [$ctx,#$Boff+$hi]
437 ldr $Alo,[sp,#$Coff+0]
438 ldr $Ahi,[sp,#$Coff+4]
439 ldr $Tlo,[sp,#$Doff+0]
440 ldr $Thi,[sp,#$Doff+4]
441 ldr $t0, [$ctx,#$Coff+$lo]
442 ldr $t1, [$ctx,#$Coff+$hi]
443 ldr $t2, [$ctx,#$Doff+$lo]
444 ldr $t3, [$ctx,#$Doff+$hi]
446 str $t0, [$ctx,#$Coff+$lo]
448 str $t1, [$ctx,#$Coff+$hi]
450 str $t2, [$ctx,#$Doff+$lo]
452 str $t3, [$ctx,#$Doff+$hi]
454 ldr $Tlo,[sp,#$Foff+0]
455 ldr $Thi,[sp,#$Foff+4]
456 ldr $t0, [$ctx,#$Eoff+$lo]
457 ldr $t1, [$ctx,#$Eoff+$hi]
458 ldr $t2, [$ctx,#$Foff+$lo]
459 ldr $t3, [$ctx,#$Foff+$hi]
461 str $Elo,[$ctx,#$Eoff+$lo]
463 str $Ehi,[$ctx,#$Eoff+$hi]
465 str $t2, [$ctx,#$Foff+$lo]
467 str $t3, [$ctx,#$Foff+$hi]
469 ldr $Alo,[sp,#$Goff+0]
470 ldr $Ahi,[sp,#$Goff+4]
471 ldr $Tlo,[sp,#$Hoff+0]
472 ldr $Thi,[sp,#$Hoff+4]
473 ldr $t0, [$ctx,#$Goff+$lo]
474 ldr $t1, [$ctx,#$Goff+$hi]
475 ldr $t2, [$ctx,#$Hoff+$lo]
476 ldr $t3, [$ctx,#$Hoff+$hi]
478 str $t0, [$ctx,#$Goff+$lo]
480 str $t1, [$ctx,#$Goff+$hi]
482 str $t2, [$ctx,#$Hoff+$lo]
484 str $t3, [$ctx,#$Hoff+$hi]
492 add sp,sp,#8*9 @ destroy frame
494 ldmia sp!,{r4-r12,pc}
496 ldmia sp!,{r4-r12,lr}
498 moveq pc,lr @ be binary compatible with V4, yet
499 bx lr @ interoperable with Thumb ISA:-)
501 .size sha512_block_data_order,.-sha512_block_data_order
# Shift/rotate amounts for the four SHA-512 functions, as used by the NEON
# code generators that follow (vshr.u64 by the amount, vsli.64 by 64-amount).
#   Sigma0(x) = ROTR(x,28) ^ ROTR(x,34) ^ ROTR(x,39)
#   Sigma1(x) = ROTR(x,14) ^ ROTR(x,18) ^ ROTR(x,41)
#   sigma0(x) = ROTR(x,1)  ^ ROTR(x,8)  ^ (x>>7)   (third entry is a plain shift)
#   sigma1(x) = ROTR(x,19) ^ ROTR(x,61) ^ (x>>6)   (third entry is a plain shift)
505 my @Sigma0=(28,34,39);
506 my @Sigma1=(14,18,41);
507 my @sigma0=(1, 8, 7);
508 my @sigma1=(19,61,6);
511 my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch
# d0-d15: message words X[], indexed by round number modulo 16; they are
# loaded from the input with vld1.64 and added to K[i].
513 my @X=map("d$_",(0..15));
# d16-d23: the eight working variables. @V is the list that gets rotated
# after every round; $A..$H name the same registers individually (e.g. for
# the vldmia/vstmia of the context).
514 my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
518 my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
519 my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps
521 $code.=<<___ if ($i<16 || $i&1);
522 vshr.u64 $t0,$e,#@Sigma1[0] @ $i
524 vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned
526 vshr.u64 $t1,$e,#@Sigma1[1]
528 vadd.i64 $a,$Maj @ h+=Maj from the past
530 vshr.u64 $t2,$e,#@Sigma1[2]
533 vld1.64 {$K},[$Ktbl,:64]! @ K[i++]
534 vsli.64 $t0,$e,#`64-@Sigma1[0]`
535 vsli.64 $t1,$e,#`64-@Sigma1[1]`
537 vsli.64 $t2,$e,#`64-@Sigma1[2]`
538 #if $i<16 && defined(__ARMEL__)
539 vrev64.8 @X[$i],@X[$i]
542 vbsl $Ch,$f,$g @ Ch(e,f,g)
543 vshr.u64 $t0,$a,#@Sigma0[0]
544 veor $t2,$t1 @ Sigma1(e)
546 vshr.u64 $t1,$a,#@Sigma0[1]
547 vsli.64 $t0,$a,#`64-@Sigma0[0]`
549 vshr.u64 $t2,$a,#@Sigma0[2]
550 vadd.i64 $K,@X[$i%16]
551 vsli.64 $t1,$a,#`64-@Sigma0[1]`
553 vsli.64 $t2,$a,#`64-@Sigma0[2]`
556 vbsl $Maj,$c,$b @ Maj(a,b,c)
557 veor $h,$t2 @ Sigma0(a)
567 if ($i&1) { &NEON_00_15($i,@_); return; }
569 # 2x-vectorized, therefore runs every 2nd round
570 my @X=map("q$_",(0..7)); # view @X as 128-bit vector
571 my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps
572 my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15
573 my $e=@_[4]; # $e from NEON_00_15
576 vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0]
577 vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1]
578 vadd.i64 @_[0],d30 @ h+=Maj from the past
579 vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2]
580 vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]`
581 vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1]
582 vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]`
584 vshr.u64 $t0,$s0,#@sigma0[0]
585 veor $s1,$t1 @ sigma1(X[i+14])
586 vshr.u64 $t1,$s0,#@sigma0[1]
587 vadd.i64 @X[$i%8],$s1
588 vshr.u64 $s1,$s0,#@sigma0[2]
589 vsli.64 $t0,$s0,#`64-@sigma0[0]`
590 vsli.64 $t1,$s0,#`64-@sigma0[1]`
591 vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9]
593 vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15
594 vadd.i64 @X[$i%8],$s0
595 vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15
596 veor $s1,$t1 @ sigma0(X[i+1])
597 vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15
598 vadd.i64 @X[$i%8],$s1
600 &NEON_00_15(2*$i,@_);
604 #if __ARM_MAX_ARCH__>=7
608 .global sha512_block_data_order_neon
609 .type sha512_block_data_order_neon,%function
611 sha512_block_data_order_neon:
613 dmb @ errata #451034 on early Cortex A8
614 add $len,$inp,$len,lsl#7 @ len to point at the end of inp
617 vldmia $ctx,{$A-$H} @ load context
620 for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
626 for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
630 vadd.i64 $A,d30 @ h+=Maj from the past
631 vldmia $ctx,{d24-d31} @ load context to temp
632 vadd.i64 q8,q12 @ vectorized accumulate
636 vstmia $ctx,{$A-$H} @ save context
638 sub $Ktbl,#640 @ rewind K512
643 .size sha512_block_data_order_neon,.-sha512_block_data_order_neon
648 .asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
650 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
651 .comm OPENSSL_armcap_P,4,4
# Post-process the accumulated assembly text in $code.
# Replace every `...` span with the result of evaluating its contents as
# Perl; this resolves computed operands such as `$Xoff+8*(16-1)`.
655 $code =~ s/\`([^\`]*)\`/eval $1/gem;
656 $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
# Runs after the substitution above, so a "ret" ends up as a literal
# "bx lr" mnemonic rather than the .word encoding.
657 $code =~ s/\bret\b/bx lr/gm;
# Loop-body fragment (the enclosing loop is not visible here): rewrite a
# leading '#' on the current line to '@', and leave the loop at the first
# line that neither started with '#' nor is empty.
662 last if (!s/^#/@/ and !/^$/);
# Closing flushes buffered output. When STDOUT is a pipe to the translator,
# close also waits for it and returns false on a non-zero exit status, so a
# failed write or failed translation now aborts instead of passing silently.
668 close STDOUT or die "error closing STDOUT: $!"; # enforce flush