#! /usr/bin/env perl
# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# December 2014

# ChaCha20 for ARMv4.
#
# Performance in cycles per byte out of large buffer.
#
#			IALU/gcc-4.4	1xNEON		3xNEON+1xIALU
#
# Cortex-A5		19.3(*)/+95%	21.8		14.1
# Cortex-A8		10.5(*)/+160%	13.9		6.35
# Cortex-A9		12.9(**)/+110%	14.3		6.50
# Cortex-A15		11.0/+40%	16.0		5.00
# Snapdragon S4		11.5/+125%	13.6		4.90
#
# (*)	most "favourable" result for aligned data on little-endian
#	processor, result for misaligned data is 10-15% lower;
# (**)	this result is a trade-off: it can be improved by 20%,
#	but then Snapdragon S4 and Cortex-A8 results get
#	20-25% worse;

$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}

sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
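
# The AUTOLOAD thunk above lets the round generators call arbitrary
# instructions as if they were Perl functions: an unresolved call such
# as &vshr_u32("q1","q12",20) is appended to $code as the assembly line
# "vshr.u32 q1,q12,#20" -- the underscore in the name becomes the dot
# in the mnemonic, and a trailing numeric argument gets the '#'
# immediate-operand prefix.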

my @x=map("r$_",(0..7,"x","x","x","x",12,"x",14,"x"));
my @t=map("r$_",(8..11));

sub ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my $odd = $d0&1;
my ($xc,$xc_) = (@t[0..1]);
my ($xd,$xd_) = $odd ? (@t[2],@x[$d1]) : (@x[$d0],@t[2]);
my @ret;

	# Consider the order in which the variables are addressed by
	# their index:
	#
	#	a   b   c   d
	#
	#	0   4   8  12 < even round
	#	1   5   9  13
	#	2   6  10  14
	#	3   7  11  15
	#	0   5  10  15 < odd round
	#	1   6  11  12
	#	2   7   8  13
	#	3   4   9  14
	#
	# 'a' and 'b' are permanently allocated in registers, @x[0..7],
	# while the 'c's and a pair of 'd's are maintained in memory. If
	# you observe the 'c' column, you'll notice that the pair of 'c's
	# is invariant between rounds. This means that we have to reload
	# them only once per round, in the middle. This is why you'll see
	# a bunch of 'c' stores and loads in the middle, but none at the
	# beginning or end. If you observe the 'd' column, you'll notice
	# that 15 and 13 are reused in the next pair of rounds. This is
	# why these two are chosen for offloading to memory, to make the
	# loads count more.
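	#
	# For example, ROUND(0,4,8,12) therefore expands to the four
	# column quarter-rounds (0,4,8,12)..(3,7,11,15), and
	# ROUND(0,5,10,15) to the four diagonal quarter-rounds
	# (0,5,10,15)..(3,4,9,14), matching the table above.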
							push @ret,(
	"&add	(@x[$a0],@x[$a0],@x[$b0])",
	"&mov	($xd,$xd,'ror#16')",
	 "&add	(@x[$a1],@x[$a1],@x[$b1])",
	 "&mov	($xd_,$xd_,'ror#16')",
	"&eor	($xd,$xd,@x[$a0],'ror#16')",
	 "&eor	($xd_,$xd_,@x[$a1],'ror#16')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b0],@x[$b0],'ror#20')",
	 "&add	($xc_,$xc_,$xd_)",
	 "&mov	(@x[$b1],@x[$b1],'ror#20')",
	"&eor	(@x[$b0],@x[$b0],$xc,'ror#20')",
	 "&eor	(@x[$b1],@x[$b1],$xc_,'ror#20')",

	"&add	(@x[$a0],@x[$a0],@x[$b0])",
	"&mov	($xd,$xd,'ror#24')",
	 "&add	(@x[$a1],@x[$a1],@x[$b1])",
	 "&mov	($xd_,$xd_,'ror#24')",
	"&eor	($xd,$xd,@x[$a0],'ror#24')",
	 "&eor	($xd_,$xd_,@x[$a1],'ror#24')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b0],@x[$b0],'ror#25')"		);
							push @ret,(
	"&str	($xd,'[sp,#4*(16+$d0)]')",
	"&ldr	($xd,'[sp,#4*(16+$d2)]')"		) if ($odd);
							push @ret,(
	 "&add	($xc_,$xc_,$xd_)",
	 "&mov	(@x[$b1],@x[$b1],'ror#25')"		);
							push @ret,(
	 "&str	($xd_,'[sp,#4*(16+$d1)]')",
	 "&ldr	($xd_,'[sp,#4*(16+$d3)]')"		) if (!$odd);
							push @ret,(
	"&eor	(@x[$b0],@x[$b0],$xc,'ror#25')",
	 "&eor	(@x[$b1],@x[$b1],$xc_,'ror#25')"	);

	$xd=@x[$d2]					if (!$odd);
	$xd_=@x[$d3]					if ($odd);
							push @ret,(
	"&str	($xc,'[sp,#4*(16+$c0)]')",
	"&ldr	($xc,'[sp,#4*(16+$c2)]')",
	"&add	(@x[$a2],@x[$a2],@x[$b2])",
	"&mov	($xd,$xd,'ror#16')",
	 "&str	($xc_,'[sp,#4*(16+$c1)]')",
	 "&ldr	($xc_,'[sp,#4*(16+$c3)]')",
	 "&add	(@x[$a3],@x[$a3],@x[$b3])",
	 "&mov	($xd_,$xd_,'ror#16')",
	"&eor	($xd,$xd,@x[$a2],'ror#16')",
	 "&eor	($xd_,$xd_,@x[$a3],'ror#16')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b2],@x[$b2],'ror#20')",
	 "&add	($xc_,$xc_,$xd_)",
	 "&mov	(@x[$b3],@x[$b3],'ror#20')",
	"&eor	(@x[$b2],@x[$b2],$xc,'ror#20')",
	 "&eor	(@x[$b3],@x[$b3],$xc_,'ror#20')",

	"&add	(@x[$a2],@x[$a2],@x[$b2])",
	"&mov	($xd,$xd,'ror#24')",
	 "&add	(@x[$a3],@x[$a3],@x[$b3])",
	 "&mov	($xd_,$xd_,'ror#24')",
	"&eor	($xd,$xd,@x[$a2],'ror#24')",
	 "&eor	($xd_,$xd_,@x[$a3],'ror#24')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b2],@x[$b2],'ror#25')",
	 "&add	($xc_,$xc_,$xd_)",
	 "&mov	(@x[$b3],@x[$b3],'ror#25')",
	"&eor	(@x[$b2],@x[$b2],$xc,'ror#25')",
	 "&eor	(@x[$b3],@x[$b3],$xc_,'ror#25')"	);

	@ret;
}

$code.=<<___;
#include "arm_arch.h"

.text
#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

#if defined(__thumb2__) || defined(__clang__)
#define	ldrhsb	ldrbhs
#endif

.align	5
.Lsigma:
.long	0x61707865,0x3320646e,0x79622d32,0x6b206574	@ endian-neutral
.Lone:
.long	1,0,0,0
#if __ARM_MAX_ARCH__>=7
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-.LChaCha20_ctr32
#else
.word	-1
#endif

.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,%function
.align	5
ChaCha20_ctr32:
.LChaCha20_ctr32:
	ldr	r12,[sp,#0]		@ pull pointer to counter and nonce
	stmdb	sp!,{r0-r2,r4-r11,lr}
#if __ARM_ARCH__<7 && !defined(__thumb2__)
	sub	r14,pc,#16		@ ChaCha20_ctr32
#else
	adr	r14,.LChaCha20_ctr32
#endif
	cmp	r2,#0			@ len==0?
#ifdef	__thumb2__
	itt	eq
#endif
	addeq	sp,sp,#4*3
	beq	.Lno_data
#if __ARM_MAX_ARCH__>=7
	cmp	r2,#192			@ test len
	bls	.Lshort
	ldr	r4,[r14,#-32]
	ldr	r4,[r14,r4]
# ifdef	__APPLE__
	ldr	r4,[r4]
# endif
	tst	r4,#ARMV7_NEON
	bne	.LChaCha20_neon
.Lshort:
#endif
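	@ NEON dispatch above: buffers longer than 192 bytes take the
	@ .LChaCha20_neon path when OPENSSL_armcap_P reports ARMV7_NEON;
	@ everything else falls through to this integer-only path.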
	ldmia	r12,{r4-r7}		@ load counter and nonce
	sub	sp,sp,#4*(16)		@ off-load area
	sub	r14,r14,#64		@ .Lsigma
	stmdb	sp!,{r4-r7}		@ copy counter and nonce
	ldmia	r3,{r4-r11}		@ load key
	ldmia	r14,{r0-r3}		@ load sigma
	stmdb	sp!,{r4-r11}		@ copy key
	stmdb	sp!,{r0-r3}		@ copy sigma
	str	r10,[sp,#4*(16+10)]	@ off-load "@x[10]"
	str	r11,[sp,#4*(16+11)]	@ off-load "@x[11]"
	b	.Loop_outer_enter

.align	4
.Loop_outer:
	ldmia	sp,{r0-r9}		@ load key material
	str	@t[3],[sp,#4*(32+2)]	@ save len
	str	r12,  [sp,#4*(32+1)]	@ save inp
	str	r14,  [sp,#4*(32+0)]	@ save out
.Loop_outer_enter:
	ldr	@t[3], [sp,#4*(15)]
	ldr	@x[12],[sp,#4*(12)]	@ modulo-scheduled load
	ldr	@t[2], [sp,#4*(13)]
	ldr	@x[14],[sp,#4*(14)]
	str	@t[3], [sp,#4*(16+15)]
	mov	@t[3],#10
	b	.Loop

.align	4
.Loop:
	subs	@t[3],@t[3],#1
___
	foreach (&ROUND(0, 4, 8,12)) { eval; }
	foreach (&ROUND(0, 5,10,15)) { eval; }
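# One pass through .Loop is a ChaCha "double round": a column round
# followed by a diagonal round. With the loop counter preset to 10
# this yields the 20 rounds of ChaCha20.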
$code.=<<___;
	bne	.Loop

	ldr	@t[3],[sp,#4*(32+2)]	@ load len

	str	@t[0], [sp,#4*(16+8)]	@ modulo-scheduled store
	str	@t[1], [sp,#4*(16+9)]
	str	@x[12],[sp,#4*(16+12)]
	str	@t[2], [sp,#4*(16+13)]
	str	@x[14],[sp,#4*(16+14)]

	@ at this point we have first half of 512-bit result in
	@ @x[0-7] and second half at sp+4*(16+8)

	cmp	@t[3],#64		@ done yet?
#ifdef	__thumb2__
	itete	lo
#endif
	addlo	r12,sp,#4*(0)		@ shortcut or ...
	ldrhs	r12,[sp,#4*(32+1)]	@ ... load inp
	addlo	r14,sp,#4*(0)		@ shortcut or ...
	ldrhs	r14,[sp,#4*(32+0)]	@ ... load out

	ldr	@t[0],[sp,#4*(0)]	@ load key material
	ldr	@t[1],[sp,#4*(1)]

#if __ARM_ARCH__>=6 || !defined(__ARMEB__)
# if __ARM_ARCH__<7
	orr	@t[2],r12,r14
	tst	@t[2],#3		@ are input and output aligned?
	ldr	@t[2],[sp,#4*(2)]
	bne	.Lunaligned
	cmp	@t[3],#64		@ restore flags
# else
	ldr	@t[2],[sp,#4*(2)]
# endif
	ldr	@t[3],[sp,#4*(3)]

	add	@x[0],@x[0],@t[0]	@ accumulate key material
	add	@x[1],@x[1],@t[1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]

	add	@x[2],@x[2],@t[2]
	add	@x[3],@x[3],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[0],@x[0],@t[0]	@ xor with input
	eorhs	@x[1],@x[1],@t[1]
	 add	@t[0],sp,#4*(4)
	str	@x[0],[r14],#16		@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[2],@x[2],@t[2]
	eorhs	@x[3],@x[3],@t[3]
	 ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[1],[r14,#-12]
	str	@x[2],[r14,#-8]
	str	@x[3],[r14,#-4]

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	add	@x[5],@x[5],@t[1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]
	add	@x[6],@x[6],@t[2]
	add	@x[7],@x[7],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[4],@x[4],@t[0]
	eorhs	@x[5],@x[5],@t[1]
	 add	@t[0],sp,#4*(8)
	str	@x[4],[r14],#16		@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[6],@x[6],@t[2]
	eorhs	@x[7],@x[7],@t[3]
	str	@x[5],[r14,#-12]
	 ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[6],[r14,#-8]
	 add	@x[0],sp,#4*(16+8)
	str	@x[7],[r14,#-4]

	ldmia	@x[0],{@x[0]-@x[7]}	@ load second half

	add	@x[0],@x[0],@t[0]	@ accumulate key material
	add	@x[1],@x[1],@t[1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]
# ifdef	__thumb2__
	itt	hi
# endif
	 strhi	@t[2],[sp,#4*(16+10)]	@ copy "@x[10]" while at it
	 strhi	@t[3],[sp,#4*(16+11)]	@ copy "@x[11]" while at it
	add	@x[2],@x[2],@t[2]
	add	@x[3],@x[3],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[0],@x[0],@t[0]
	eorhs	@x[1],@x[1],@t[1]
	 add	@t[0],sp,#4*(12)
	str	@x[0],[r14],#16		@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[2],@x[2],@t[2]
	eorhs	@x[3],@x[3],@t[3]
	str	@x[1],[r14,#-12]
	 ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[2],[r14,#-8]
	str	@x[3],[r14,#-4]

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	add	@x[5],@x[5],@t[1]
# ifdef	__thumb2__
	itt	hi
# endif
	 addhi	@t[0],@t[0],#1		@ next counter value
	 strhi	@t[0],[sp,#4*(12)]	@ save next counter value
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]
	add	@x[6],@x[6],@t[2]
	add	@x[7],@x[7],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[4],@x[4],@t[0]
	eorhs	@x[5],@x[5],@t[1]
# ifdef	__thumb2__
	 it	ne
# endif
	 ldrne	@t[0],[sp,#4*(32+2)]	@ re-load len
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[6],@x[6],@t[2]
	eorhs	@x[7],@x[7],@t[3]
	str	@x[4],[r14],#16		@ store output
	str	@x[5],[r14,#-12]
# ifdef	__thumb2__
	it	hs
# endif
	 subhs	@t[3],@t[0],#64		@ len-=64
	str	@x[6],[r14,#-8]
	str	@x[7],[r14,#-4]
	bhi	.Loop_outer

	beq	.Ldone
# if __ARM_ARCH__<7
	b	.Ltail

.align	4
.Lunaligned:				@ unaligned endian-neutral path
	cmp	@t[3],#64		@ restore flags
# endif
#endif
#if __ARM_ARCH__<7
	ldr	@t[3],[sp,#4*(3)]
___
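# The loop below emits the byte-oriented path used when __ARM_ARCH__<7
# and input or output is misaligned: four state words are accumulated
# per iteration, and input bytes are loaded with conditional ldrhsb
# ('hs' holds while len>=64). When fewer than 64 bytes remain ('lo'),
# the input registers are zeroed instead, so the raw keystream lands
# in the stack buffer for .Ltail to consume.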
for ($i=0;$i<16;$i+=4) {
my $j=$i&0x7;

$code.=<<___	if ($i==4);
	add	@x[0],sp,#4*(16+8)
___
$code.=<<___	if ($i==8);
	ldmia	@x[0],{@x[0]-@x[7]}		@ load second half
# ifdef	__thumb2__
	itt	hi
# endif
	strhi	@t[2],[sp,#4*(16+10)]		@ copy "@x[10]"
	strhi	@t[3],[sp,#4*(16+11)]		@ copy "@x[11]"
___
$code.=<<___;
	add	@x[$j+0],@x[$j+0],@t[0]		@ accumulate key material
___
$code.=<<___	if ($i==12);
# ifdef	__thumb2__
	itt	hi
# endif
	addhi	@t[0],@t[0],#1			@ next counter value
	strhi	@t[0],[sp,#4*(12)]		@ save next counter value
___
$code.=<<___;
	add	@x[$j+1],@x[$j+1],@t[1]
	add	@x[$j+2],@x[$j+2],@t[2]
# ifdef	__thumb2__
	itete	lo
# endif
	eorlo	@t[0],@t[0],@t[0]		@ zero or ...
	ldrhsb	@t[0],[r12],#16			@ ... load input
	eorlo	@t[1],@t[1],@t[1]
	ldrhsb	@t[1],[r12,#-12]

	add	@x[$j+3],@x[$j+3],@t[3]
# ifdef	__thumb2__
	itete	lo
# endif
	eorlo	@t[2],@t[2],@t[2]
	ldrhsb	@t[2],[r12,#-8]
	eorlo	@t[3],@t[3],@t[3]
	ldrhsb	@t[3],[r12,#-4]

	eor	@x[$j+0],@t[0],@x[$j+0]		@ xor with input (or zero)
	eor	@x[$j+1],@t[1],@x[$j+1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[0],[r12,#-15]		@ load more input
	ldrhsb	@t[1],[r12,#-11]
	eor	@x[$j+2],@t[2],@x[$j+2]
	 strb	@x[$j+0],[r14],#16		@ store output
	eor	@x[$j+3],@t[3],@x[$j+3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[2],[r12,#-7]
	ldrhsb	@t[3],[r12,#-3]
	 strb	@x[$j+1],[r14,#-12]
	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
	 strb	@x[$j+2],[r14,#-8]
	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[0],[r12,#-14]		@ load more input
	ldrhsb	@t[1],[r12,#-10]
	 strb	@x[$j+3],[r14,#-4]
	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
	 strb	@x[$j+0],[r14,#-15]
	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[2],[r12,#-6]
	ldrhsb	@t[3],[r12,#-2]
	 strb	@x[$j+1],[r14,#-11]
	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
	 strb	@x[$j+2],[r14,#-7]
	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[0],[r12,#-13]		@ load more input
	ldrhsb	@t[1],[r12,#-9]
	 strb	@x[$j+3],[r14,#-3]
	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
	 strb	@x[$j+0],[r14,#-14]
	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[2],[r12,#-5]
	ldrhsb	@t[3],[r12,#-1]
	 strb	@x[$j+1],[r14,#-10]
	 strb	@x[$j+2],[r14,#-6]
	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
	 strb	@x[$j+3],[r14,#-2]
	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
	 strb	@x[$j+0],[r14,#-13]
	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
	 strb	@x[$j+1],[r14,#-9]
	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
	 strb	@x[$j+2],[r14,#-5]
	 strb	@x[$j+3],[r14,#-1]
___
$code.=<<___	if ($i<12);
	add	@t[0],sp,#4*(4+$i)
	ldmia	@t[0],{@t[0]-@t[3]}		@ load key material
___
}
$code.=<<___;
# ifdef	__thumb2__
	it	ne
# endif
	ldrne	@t[0],[sp,#4*(32+2)]		@ re-load len
# ifdef	__thumb2__
	it	hs
# endif
	subhs	@t[3],@t[0],#64			@ len-=64
	bhi	.Loop_outer

	beq	.Ldone
#endif

.Ltail:
	ldr	r12,[sp,#4*(32+1)]	@ load inp
	add	@t[1],sp,#4*(0)
	ldr	r14,[sp,#4*(32+0)]	@ load out

.Loop_tail:
	ldrb	@t[2],[@t[1]],#1	@ read buffer on stack
	ldrb	@t[3],[r12],#1		@ read input
	subs	@t[0],@t[0],#1
	eor	@t[3],@t[3],@t[2]
	strb	@t[3],[r14],#1		@ store output
	bne	.Loop_tail

.Ldone:
	add	sp,sp,#4*(32+3)
.Lno_data:
	ldmia	sp!,{r4-r11,pc}
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
___

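# The section below is the 3xNEON+1xIALU path from the performance
# table at the top: three 64-byte blocks are processed in the q
# registers while a fourth is processed concurrently in the integer
# registers by re-using the scalar ROUND generator.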
{{{
my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$t0,$t1,$t2,$t3) =
    map("q$_",(0..15));

sub NEONROUND {
my $odd = pop;
my ($a,$b,$c,$d,$t)=@_;

	(
	"&vadd_i32	($a,$a,$b)",
	"&veor		($d,$d,$a)",
	"&vrev32_16	($d,$d)",	# vrot ($d,16)

	"&vadd_i32	($c,$c,$d)",
	"&veor		($t,$b,$c)",
	"&vshr_u32	($b,$t,20)",
	"&vsli_32	($b,$t,12)",

	"&vadd_i32	($a,$a,$b)",
	"&veor		($t,$d,$a)",
	"&vshr_u32	($d,$t,24)",
	"&vsli_32	($d,$t,8)",

	"&vadd_i32	($c,$c,$d)",
	"&veor		($t,$b,$c)",
	"&vshr_u32	($b,$t,25)",
	"&vsli_32	($b,$t,7)",

	"&vext_8	($c,$c,$c,8)",
	"&vext_8	($b,$b,$b,$odd?12:4)",
	"&vext_8	($d,$d,$d,$odd?4:12)"
	);
}
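
# NEON has no word-rotate instruction, so each "vshr.u32 b,t,#32-n"
# plus "vsli.32 b,t,#n" pair above implements a rotate left by n
# (12, 8 and 7; the 16-bit rotate uses vrev32.16 directly). The
# trailing vext.8 shuffles rotate the b/c/d lanes so that column
# rounds and diagonal rounds can share the same quarter-round code.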

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.type	ChaCha20_neon,%function
.align	5
ChaCha20_neon:
	ldr		r12,[sp,#0]		@ pull pointer to counter and nonce
	stmdb		sp!,{r0-r2,r4-r11,lr}
.LChaCha20_neon:
	adr		r14,.Lsigma
	vstmdb		sp!,{d8-d15}		@ ABI spec says so
	stmdb		sp!,{r0-r3}

	vld1.32		{$b0-$c0},[r3]		@ load key
	ldmia		r3,{r4-r11}		@ load key

	sub		sp,sp,#4*(16+16)
	vld1.32		{$d0},[r12]		@ load counter and nonce
	add		r12,sp,#4*8
	ldmia		r14,{r0-r3}		@ load sigma
	vld1.32		{$a0},[r14]!		@ load sigma
	vld1.32		{$t0},[r14]		@ one
	vst1.32		{$c0-$d0},[r12]		@ copy 1/2key|counter|nonce
	vst1.32		{$a0-$b0},[sp]		@ copy sigma|1/2key

	str		r10,[sp,#4*(16+10)]	@ off-load "@x[10]"
	str		r11,[sp,#4*(16+11)]	@ off-load "@x[11]"
	vshl.i32	$t1#lo,$t0#lo,#1	@ two
	vstr		$t0#lo,[sp,#4*(16+0)]
	vshl.i32	$t2#lo,$t0#lo,#2	@ four
	vstr		$t1#lo,[sp,#4*(16+2)]
	vmov		$a1,$a0
	vstr		$t2#lo,[sp,#4*(16+4)]
	vmov		$a2,$a0
	vmov		$b1,$b0
	vmov		$b2,$b0
	b		.Loop_neon_enter

.align	4
.Loop_neon_outer:
	ldmia		sp,{r0-r9}		@ load key material
	cmp		@t[3],#64*2		@ if len<=64*2
	bls		.Lbreak_neon		@ switch to integer-only
	vmov		$a1,$a0
	str		@t[3],[sp,#4*(32+2)]	@ save len
	vmov		$a2,$a0
	str		r12,  [sp,#4*(32+1)]	@ save inp
	vmov		$b1,$b0
	str		r14,  [sp,#4*(32+0)]	@ save out
	vmov		$b2,$b0
.Loop_neon_enter:
	ldr		@t[3], [sp,#4*(15)]
	vadd.i32	$d1,$d0,$t0		@ counter+1
	ldr		@x[12],[sp,#4*(12)]	@ modulo-scheduled load
	vmov		$c1,$c0
	ldr		@t[2], [sp,#4*(13)]
	vmov		$c2,$c0
	ldr		@x[14],[sp,#4*(14)]
	vadd.i32	$d2,$d1,$t0		@ counter+2
	str		@t[3], [sp,#4*(16+15)]
	mov		@t[3],#10
	add		@x[12],@x[12],#3	@ counter+3
	b		.Loop_neon

.align	4
.Loop_neon:
	subs		@t[3],@t[3],#1
___
	my @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,0);
	my @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,0);
	my @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,0);
	my @thread3=&ROUND(0,4,8,12);

	foreach (@thread0) {
		eval;			eval(shift(@thread3));
		eval(shift(@thread1));	eval(shift(@thread3));
		eval(shift(@thread2));	eval(shift(@thread3));
	}

	@thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,1);
	@thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,1);
	@thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,1);
	@thread3=&ROUND(0,5,10,15);

	foreach (@thread0) {
		eval;			eval(shift(@thread3));
		eval(shift(@thread1));	eval(shift(@thread3));
		eval(shift(@thread2));	eval(shift(@thread3));
	}
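# Emission order interleaves the three NEON "threads" with the integer
# thread instruction by instruction, the intent being that scalar ALU
# work fills NEON latency bubbles rather than the four streams running
# back to back.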
$code.=<<___;
	bne		.Loop_neon

	add		@t[3],sp,#32
	vld1.32		{$t0-$t1},[sp]		@ load key material
	vld1.32		{$t2-$t3},[@t[3]]

	ldr		@t[3],[sp,#4*(32+2)]	@ load len

	str		@t[0], [sp,#4*(16+8)]	@ modulo-scheduled store
	str		@t[1], [sp,#4*(16+9)]
	str		@x[12],[sp,#4*(16+12)]
	str		@t[2], [sp,#4*(16+13)]
	str		@x[14],[sp,#4*(16+14)]

	@ at this point we have first half of 512-bit result in
	@ @x[0-7] and second half at sp+4*(16+8)

	ldr		r12,[sp,#4*(32+1)]	@ load inp
	ldr		r14,[sp,#4*(32+0)]	@ load out

	vadd.i32	$a0,$a0,$t0		@ accumulate key material
	vadd.i32	$a1,$a1,$t0
	vadd.i32	$a2,$a2,$t0
	vldr		$t0#lo,[sp,#4*(16+0)]	@ one

	vadd.i32	$b0,$b0,$t1
	vadd.i32	$b1,$b1,$t1
	vadd.i32	$b2,$b2,$t1
	vldr		$t1#lo,[sp,#4*(16+2)]	@ two

	vadd.i32	$c0,$c0,$t2
	vadd.i32	$c1,$c1,$t2
	vadd.i32	$c2,$c2,$t2
	vadd.i32	$d1#lo,$d1#lo,$t0#lo	@ counter+1
	vadd.i32	$d2#lo,$d2#lo,$t1#lo	@ counter+2

	vadd.i32	$d0,$d0,$t3
	vadd.i32	$d1,$d1,$t3
	vadd.i32	$d2,$d2,$t3

	cmp		@t[3],#64*4
	blo		.Ltail_neon

	vld1.8		{$t0-$t1},[r12]!	@ load input
	 mov		@t[3],sp
	vld1.8		{$t2-$t3},[r12]!
	veor		$a0,$a0,$t0		@ xor with input
	veor		$b0,$b0,$t1
	vld1.8		{$t0-$t1},[r12]!
	veor		$c0,$c0,$t2
	veor		$d0,$d0,$t3
	vld1.8		{$t2-$t3},[r12]!

	veor		$a1,$a1,$t0
	 vst1.8		{$a0-$b0},[r14]!	@ store output
	veor		$b1,$b1,$t1
	vld1.8		{$t0-$t1},[r12]!
	veor		$c1,$c1,$t2
	 vst1.8		{$c0-$d0},[r14]!
	veor		$d1,$d1,$t3
	vld1.8		{$t2-$t3},[r12]!

	veor		$a2,$a2,$t0
	 vld1.32	{$a0-$b0},[@t[3]]!	@ load for next iteration
	 veor		$t0#hi,$t0#hi,$t0#hi
	 vldr		$t0#lo,[sp,#4*(16+4)]	@ four
	veor		$b2,$b2,$t1
	 vld1.32	{$c0-$d0},[@t[3]]
	veor		$c2,$c2,$t2
	 vst1.8		{$a1-$b1},[r14]!
	veor		$d2,$d2,$t3
	 vst1.8		{$c1-$d1},[r14]!

	vadd.i32	$d0#lo,$d0#lo,$t0#lo	@ next counter value
	vldr		$t0#lo,[sp,#4*(16+0)]	@ one

	ldmia		sp,{@t[0]-@t[3]}	@ load key material
	add		@x[0],@x[0],@t[0]	@ accumulate key material
	ldr		@t[0],[r12],#16		@ load input
	 vst1.8		{$a2-$b2},[r14]!
	add		@x[1],@x[1],@t[1]
	ldr		@t[1],[r12,#-12]
	 vst1.8		{$c2-$d2},[r14]!
	add		@x[2],@x[2],@t[2]
	ldr		@t[2],[r12,#-8]
	add		@x[3],@x[3],@t[3]
	ldr		@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev		@x[0],@x[0]
	rev		@x[1],@x[1]
	rev		@x[2],@x[2]
	rev		@x[3],@x[3]
# endif
	eor		@x[0],@x[0],@t[0]	@ xor with input
	 add		@t[0],sp,#4*(4)
	eor		@x[1],@x[1],@t[1]
	str		@x[0],[r14],#16		@ store output
	eor		@x[2],@x[2],@t[2]
	str		@x[1],[r14,#-12]
	eor		@x[3],@x[3],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material
	str		@x[2],[r14,#-8]
	str		@x[3],[r14,#-4]

	add		@x[4],@x[4],@t[0]	@ accumulate key material
	ldr		@t[0],[r12],#16		@ load input
	add		@x[5],@x[5],@t[1]
	ldr		@t[1],[r12,#-12]
	add		@x[6],@x[6],@t[2]
	ldr		@t[2],[r12,#-8]
	add		@x[7],@x[7],@t[3]
	ldr		@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev		@x[4],@x[4]
	rev		@x[5],@x[5]
	rev		@x[6],@x[6]
	rev		@x[7],@x[7]
# endif
	eor		@x[4],@x[4],@t[0]
	 add		@t[0],sp,#4*(8)
	eor		@x[5],@x[5],@t[1]
	str		@x[4],[r14],#16		@ store output
	eor		@x[6],@x[6],@t[2]
	str		@x[5],[r14,#-12]
	eor		@x[7],@x[7],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material
	str		@x[6],[r14,#-8]
	 add		@x[0],sp,#4*(16+8)
	str		@x[7],[r14,#-4]

	ldmia		@x[0],{@x[0]-@x[7]}	@ load second half

	add		@x[0],@x[0],@t[0]	@ accumulate key material
	ldr		@t[0],[r12],#16		@ load input
	add		@x[1],@x[1],@t[1]
	ldr		@t[1],[r12,#-12]
# ifdef	__thumb2__
	it	hi
# endif
	 strhi		@t[2],[sp,#4*(16+10)]	@ copy "@x[10]" while at it
	add		@x[2],@x[2],@t[2]
	ldr		@t[2],[r12,#-8]
# ifdef	__thumb2__
	it	hi
# endif
	 strhi		@t[3],[sp,#4*(16+11)]	@ copy "@x[11]" while at it
	add		@x[3],@x[3],@t[3]
	ldr		@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev		@x[0],@x[0]
	rev		@x[1],@x[1]
	rev		@x[2],@x[2]
	rev		@x[3],@x[3]
# endif
	eor		@x[0],@x[0],@t[0]
	 add		@t[0],sp,#4*(12)
	eor		@x[1],@x[1],@t[1]
	str		@x[0],[r14],#16		@ store output
	eor		@x[2],@x[2],@t[2]
	str		@x[1],[r14,#-12]
	eor		@x[3],@x[3],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material
	str		@x[2],[r14,#-8]
	str		@x[3],[r14,#-4]

	add		@x[4],@x[4],@t[0]	@ accumulate key material
	 add		@t[0],@t[0],#4		@ next counter value
	add		@x[5],@x[5],@t[1]
	 str		@t[0],[sp,#4*(12)]	@ save next counter value
	ldr		@t[0],[r12],#16		@ load input
	add		@x[6],@x[6],@t[2]
	 add		@x[4],@x[4],#3		@ counter+3
	ldr		@t[1],[r12,#-12]
	add		@x[7],@x[7],@t[3]
	ldr		@t[2],[r12,#-8]
	ldr		@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev		@x[4],@x[4]
	rev		@x[5],@x[5]
	rev		@x[6],@x[6]
	rev		@x[7],@x[7]
# endif
	eor		@x[4],@x[4],@t[0]
# ifdef	__thumb2__
	it	hi
# endif
	 ldrhi		@t[0],[sp,#4*(32+2)]	@ re-load len
	eor		@x[5],@x[5],@t[1]
	eor		@x[6],@x[6],@t[2]
	str		@x[4],[r14],#16		@ store output
	eor		@x[7],@x[7],@t[3]
	str		@x[5],[r14,#-12]
	 sub		@t[3],@t[0],#64*4	@ len-=64*4
	str		@x[6],[r14,#-8]
	str		@x[7],[r14,#-4]
	bhi		.Loop_neon_outer

	b		.Ldone_neon

.align	4
.Lbreak_neon:
	@ harmonize NEON and integer-only stack frames: load data
	@ from NEON frame, but save to integer-only one; distance
	@ between the two is 4*(32+4+16-32)=4*(20).

	str		@t[3], [sp,#4*(20+32+2)]	@ save len
	 add		@t[3],sp,#4*(32+4)
	str		r12,   [sp,#4*(20+32+1)]	@ save inp
	str		r14,   [sp,#4*(20+32+0)]	@ save out

	ldr		@x[12],[sp,#4*(16+10)]
	ldr		@x[14],[sp,#4*(16+11)]
	 vldmia		@t[3],{d8-d15}			@ fulfill ABI requirement
	str		@x[12],[sp,#4*(20+16+10)]	@ copy "@x[10]"
	str		@x[14],[sp,#4*(20+16+11)]	@ copy "@x[11]"

	ldr		@t[3], [sp,#4*(15)]
	ldr		@x[12],[sp,#4*(12)]		@ modulo-scheduled load
	ldr		@t[2], [sp,#4*(13)]
	ldr		@x[14],[sp,#4*(14)]
	str		@t[3], [sp,#4*(20+16+15)]
	add		@t[3],sp,#4*(20)
	vst1.32		{$a0-$b0},[@t[3]]!		@ copy key
	add		sp,sp,#4*(20)			@ switch frame
	vst1.32		{$c0-$d0},[@t[3]]
	mov		@t[3],#10
	b		.Loop				@ go integer-only

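	@ .Ltail_neon below disposes of the final 1..255 bytes: whole
	@ 64-byte blocks are xor'ed directly with vld1.8/vst1.8, and any
	@ leftover partial block is written as raw keystream to the
	@ stack, to be consumed byte by byte in .Loop_tail_neon.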
.align	4
.Ltail_neon:
	cmp		@t[3],#64*3
	bhs		.L192_or_more_neon
	cmp		@t[3],#64*2
	bhs		.L128_or_more_neon
	cmp		@t[3],#64*1
	bhs		.L64_or_more_neon

	add		@t[0],sp,#4*(8)
	vst1.8		{$a0-$b0},[sp]
	add		@t[2],sp,#4*(0)
	vst1.8		{$c0-$d0},[@t[0]]
	b		.Loop_tail_neon

.align	4
.L64_or_more_neon:
	vld1.8		{$t0-$t1},[r12]!
	vld1.8		{$t2-$t3},[r12]!
	veor		$a0,$a0,$t0
	veor		$b0,$b0,$t1
	veor		$c0,$c0,$t2
	veor		$d0,$d0,$t3
	vst1.8		{$a0-$b0},[r14]!
	vst1.8		{$c0-$d0},[r14]!

	beq		.Ldone_neon

	add		@t[0],sp,#4*(8)
	vst1.8		{$a1-$b1},[sp]
	add		@t[2],sp,#4*(0)
	vst1.8		{$c1-$d1},[@t[0]]
	sub		@t[3],@t[3],#64*1	@ len-=64*1
	b		.Loop_tail_neon

.align	4
.L128_or_more_neon:
	vld1.8		{$t0-$t1},[r12]!
	vld1.8		{$t2-$t3},[r12]!
	veor		$a0,$a0,$t0
	veor		$b0,$b0,$t1
	vld1.8		{$t0-$t1},[r12]!
	veor		$c0,$c0,$t2
	veor		$d0,$d0,$t3
	vld1.8		{$t2-$t3},[r12]!

	veor		$a1,$a1,$t0
	veor		$b1,$b1,$t1
	 vst1.8		{$a0-$b0},[r14]!
	veor		$c1,$c1,$t2
	 vst1.8		{$c0-$d0},[r14]!
	veor		$d1,$d1,$t3
	vst1.8		{$a1-$b1},[r14]!
	vst1.8		{$c1-$d1},[r14]!

	beq		.Ldone_neon

	add		@t[0],sp,#4*(8)
	vst1.8		{$a2-$b2},[sp]
	add		@t[2],sp,#4*(0)
	vst1.8		{$c2-$d2},[@t[0]]
	sub		@t[3],@t[3],#64*2	@ len-=64*2
	b		.Loop_tail_neon

.align	4
.L192_or_more_neon:
	vld1.8		{$t0-$t1},[r12]!
	vld1.8		{$t2-$t3},[r12]!
	veor		$a0,$a0,$t0
	veor		$b0,$b0,$t1
	vld1.8		{$t0-$t1},[r12]!
	veor		$c0,$c0,$t2
	veor		$d0,$d0,$t3
	vld1.8		{$t2-$t3},[r12]!

	veor		$a1,$a1,$t0
	veor		$b1,$b1,$t1
	vld1.8		{$t0-$t1},[r12]!
	veor		$c1,$c1,$t2
	 vst1.8		{$a0-$b0},[r14]!
	veor		$d1,$d1,$t3
	vld1.8		{$t2-$t3},[r12]!

	veor		$a2,$a2,$t0
	 vst1.8		{$c0-$d0},[r14]!
	veor		$b2,$b2,$t1
	 vst1.8		{$a1-$b1},[r14]!
	veor		$c2,$c2,$t2
	 vst1.8		{$c1-$d1},[r14]!
	veor		$d2,$d2,$t3
	vst1.8		{$a2-$b2},[r14]!
	vst1.8		{$c2-$d2},[r14]!

	beq		.Ldone_neon

	ldmia		sp,{@t[0]-@t[3]}	@ load key material
	add		@x[0],@x[0],@t[0]	@ accumulate key material
	 add		@t[0],sp,#4*(4)
	add		@x[1],@x[1],@t[1]
	add		@x[2],@x[2],@t[2]
	add		@x[3],@x[3],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material

	add		@x[4],@x[4],@t[0]	@ accumulate key material
	 add		@t[0],sp,#4*(8)
	add		@x[5],@x[5],@t[1]
	add		@x[6],@x[6],@t[2]
	add		@x[7],@x[7],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material
# ifdef	__ARMEB__
	rev		@x[0],@x[0]
	rev		@x[1],@x[1]
	rev		@x[2],@x[2]
	rev		@x[3],@x[3]
	rev		@x[4],@x[4]
	rev		@x[5],@x[5]
	rev		@x[6],@x[6]
	rev		@x[7],@x[7]
# endif
	stmia		sp,{@x[0]-@x[7]}
	 add		@x[0],sp,#4*(16+8)

	ldmia		@x[0],{@x[0]-@x[7]}	@ load second half

	add		@x[0],@x[0],@t[0]	@ accumulate key material
	 add		@t[0],sp,#4*(12)
	add		@x[1],@x[1],@t[1]
	add		@x[2],@x[2],@t[2]
	add		@x[3],@x[3],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material

	add		@x[4],@x[4],@t[0]	@ accumulate key material
	 add		@t[0],sp,#4*(8)
	add		@x[5],@x[5],@t[1]
	 add		@x[4],@x[4],#3		@ counter+3
	add		@x[6],@x[6],@t[2]
	add		@x[7],@x[7],@t[3]
	 ldr		@t[3],[sp,#4*(32+2)]	@ re-load len
# ifdef	__ARMEB__
	rev		@x[0],@x[0]
	rev		@x[1],@x[1]
	rev		@x[2],@x[2]
	rev		@x[3],@x[3]
	rev		@x[4],@x[4]
	rev		@x[5],@x[5]
	rev		@x[6],@x[6]
	rev		@x[7],@x[7]
# endif
	stmia		@t[0],{@x[0]-@x[7]}
	 add		@t[2],sp,#4*(0)
	 sub		@t[3],@t[3],#64*3	@ len-=64*3

.Loop_tail_neon:
	ldrb		@t[0],[@t[2]],#1	@ read buffer on stack
	ldrb		@t[1],[r12],#1		@ read input
	subs		@t[3],@t[3],#1
	eor		@t[0],@t[0],@t[1]
	strb		@t[0],[r14],#1		@ store output
	bne		.Loop_tail_neon

.Ldone_neon:
	add		sp,sp,#4*(32+4)
	vldmia		sp,{d8-d15}
	add		sp,sp,#4*(16+3)
	ldmia		sp!,{r4-r11,pc}
.size	ChaCha20_neon,.-ChaCha20_neon
.comm	OPENSSL_armcap_P,4,4
#endif
___
}}}

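# Post-processing: expand any `...` expressions and map the qN#lo/qN#hi
# notation used above onto the underlying d registers (d2N and d2N+1).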
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;

	print $_,"\n";
}
close STDOUT;