# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# If compared to compiler-generated code with similar characteristics,
# i.e. compiled with OPENSSL_SMALL_FOOTPRINT and utilizing SPLOOPs,
# this implementation is 25% smaller and >2x faster. In absolute terms
# performance is (quite impressive) ~6.5 cycles per processed byte.
# Fully unrolled assembler would be ~5x larger and is likely to be
# ~15% faster. It would be free from references to an intermediate ring
# buffer, but would put more pressure on L1P [both because the code would
# be larger and would not be using the SPLOOP buffer]. There are no plans
# to realize a fully unrolled variant though...
# !!! Note that this module uses AMR, which means that all interrupt
# service routines are expected to preserve it and, for own well-being,
35 while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
36 open STDOUT,">$output";
38 ($CTX,$INP,$NUM) = ("A4","B4","A6"); # arguments
40 ($A,$B,$C,$D,$E, $Arot,$F,$F0,$T,$K) = map("A$_",(16..20, 21..25));
41 ($X0,$X2,$X8,$X13) = ("A26","B26","A27","B27");
42 ($TX0,$TX1,$TX2,$TX3) = map("B$_",(28..31));
43 ($XPA,$XPB) = ("A5","B5"); # X circular buffer
44 ($Actx,$Bctx,$Cctx,$Dctx,$Ectx) = map("A$_",(3,6..9)); # zaps $NUM
49 .if .ASSEMBLER_VERSION<7000000
53 .asg sha1_block_data_order,_sha1_block_data_order
65 .global _sha1_block_data_order
66 _sha1_block_data_order:
67 .asmfunc stack_usage(64)
68 MV $NUM,A0 ; reassign $NUM
70 [!A0] BNOP RA ; if ($NUM==0) return;
71 || [A0] STW FP,*SP--[16] ; save frame pointer and alloca(64)
73 [A0] LDW *${CTX}[0],$A ; load A-E...
74 || [A0] AND B0,SP,SP ; align stack at 64 bytes
75 [A0] LDW *${CTX}[1],$B
76 || [A0] SUBAW SP,2,SP ; reserve two words above buffer
77 [A0] LDW *${CTX}[2],$C
78 || [A0] MVK 0x00404,B0
79 [A0] LDW *${CTX}[3],$D
80 || [A0] MVKH 0x50000,B0 ; 0x050404, 64 bytes for $XP[AB]
81 [A0] LDW *${CTX}[4],$E
82 || [A0] MVC B0,AMR ; setup circular addressing
83 LDNW *${INP}++,$TX1 ; pre-fetch input
91 MVKH 0x5a820000,$K ; K_00_19
95 ;;==================================================
96 SPLOOPD 5 ; BODY_00_13
105 || ADD $K,$E,$T ; T=E+K
107 XOR $F0,$F,$F ; F_00_19(B,C,D)
111 || LDNW *${INP}++,$TX1
113 ADD $F,$T,$T ; T+=F_00_19(B,C,D)
114 || ROTL $B,30,$C ; C=ROL(B,30)
115 || SWAP4 $TX2,$TX3 ; byte swap
117 ADD $Arot,$T,$T ; T+=ROL(A,5)
120 ADD $TX3,$T,$A ; A=T+Xi
121 || STW $TX3,*${XPB}++
123 ;;==================================================
124 ROTL $A,5,$Arot ; BODY_14
127 || ADD $K,$E,$T ; T=E+K
129 XOR $F0,$F,$F ; F_00_19(B,C,D)
133 || LDNW *${INP}++,$TX1
135 ADD $F,$T,$T ; T+=F_00_19(B,C,D)
136 || ROTL $B,30,$C ; C=ROL(B,30)
137 || SWAP4 $TX2,$TX2 ; byte swap
138 || LDW *${XPA}++,$X0 ; fetches from X ring buffer are
139 || LDW *${XPB}[4],$X2 ; 2 iterations ahead
141 ADD $Arot,$T,$T ; T+=ROL(A,5)
143 || LDW *${XPA}[7],$X8
144 || MV $TX3,$X13 ; || LDW *${XPB}[15],$X13
147 ADD $TX2,$T,$A ; A=T+Xi
148 || STW $TX2,*${XPB}++
149 ;;==================================================
150 ROTL $A,5,$Arot ; BODY_15
153 || ADD $K,$E,$T ; T=E+K
155 XOR $F0,$F,$F ; F_00_19(B,C,D)
160 ADD $F,$T,$T ; T+=F_00_19(B,C,D)
161 || ROTL $B,30,$C ; C=ROL(B,30)
162 || SWAP4 $TX2,$TX2 ; byte swap
163 || XOR $X0,$X2,$TX0 ; Xupdate XORs are 1 iteration ahead
165 || LDW *${XPB}[4],$X2
167 ADD $Arot,$T,$T ; T+=ROL(A,5)
170 || LDW *${XPA}[7],$X8
171 || MV $TX3,$X13 ; || LDW *${XPB}[15],$X13
174 ADD $TX2,$T,$A ; A=T+Xi
175 || STW $TX2,*${XPB}++
176 || XOR $TX0,$TX1,$TX1
178 ;;==================================================
179 SPLOOPD 5 ; BODY_16_19
185 || ADD $K,$E,$T ; T=E+K
186 || ROTL $TX1,1,$TX2 ; Xupdate output
188 XOR $F0,$F,$F ; F_00_19(B,C,D)
192 ADD $F,$T,$T ; T+=F_00_19(B,C,D)
193 || ROTL $B,30,$C ; C=ROL(B,30)
196 || LDW *${XPB}[4],$X2
198 ADD $Arot,$T,$T ; T+=ROL(A,5)
201 || LDW *${XPA}[7],$X8
202 || MV $TX3,$X13 ; || LDW *${XPB}[15],$X13
205 ADD $TX2,$T,$A ; A=T+Xi
206 || STW $TX2,*${XPB}++
207 || XOR $TX0,$TX1,$TX1
212 MVKH 0x6ed90000,$K ; K_20_39
216 ;;==================================================
217 SPLOOPD 5 ; BODY_20_39
222 || ADD $K,$E,$T ; T=E+K
223 || ROTL $TX1,1,$TX2 ; Xupdate output
225 XOR $D,$F,$F ; F_20_39(B,C,D)
229 ADD $F,$T,$T ; T+=F_20_39(B,C,D)
230 || ROTL $B,30,$C ; C=ROL(B,30)
233 || LDW *${XPB}[4],$X2
235 ADD $Arot,$T,$T ; T+=ROL(A,5)
238 || LDW *${XPA}[7],$X8
239 || MV $TX3,$X13 ; || LDW *${XPB}[15],$X13
242 ADD $TX2,$T,$A ; A=T+Xi
243 || STW $TX2,*${XPB}++ ; last one is redundant
244 || XOR $TX0,$TX1,$TX1
247 $code.=<<___ if (!shift);
249 MVKH 0x8f1b0000,$K ; K_40_59
253 ;;==================================================
254 SPLOOPD 5 ; BODY_40_59
262 || ADD $K,$E,$T ; T=E+K
263 || ROTL $TX1,1,$TX2 ; Xupdate output
265 XOR $F0,$F,$F ; F_40_59(B,C,D)
269 ADD $F,$T,$T ; T+=F_40_59(B,C,D)
270 || ROTL $B,30,$C ; C=ROL(B,30)
273 || LDW *${XPB}[4],$X2
275 ADD $Arot,$T,$T ; T+=ROL(A,5)
278 || LDW *${XPA}[7],$X8
279 || MV $TX3,$X13 ; || LDW *${XPB}[15],$X13
282 ADD $TX2,$T,$A ; A=T+Xi
283 || STW $TX2,*${XPB}++
284 || XOR $TX0,$TX1,$TX1
291 MVKH 0xca620000,$K ; K_60_79
293 &BODY_20_39(-1); # BODY_60_78
295 ;;==================================================
297 || ROTL $A,5,$Arot ; BODY_79
299 || ROTL $TX1,1,$TX2 ; Xupdate output
301 [A0] LDNW *${INP}++,$TX1 ; pre-fetch input
302 || ADD $K,$E,$T ; T=E+K
303 || XOR $D,$F,$F ; F_20_39(B,C,D)
305 ADD $F,$T,$T ; T+=F_20_39(B,C,D)
306 || ADD $Ectx,$D,$E ; E=D,E+=Ectx
307 || ADD $Dctx,$C,$D ; D=C,D+=Dctx
308 || ROTL $B,30,$C ; C=ROL(B,30)
310 ADD $Arot,$T,$T ; T+=ROL(A,5)
311 || ADD $Bctx,$A,$B ; B=A,B+=Bctx
313 ADD $TX2,$T,$A ; A=T+Xi
315 ADD $Actx,$A,$A ; A+=Actx
316 || ADD $Cctx,$C,$C ; C+=Cctx
320 || MV FP,SP ; restore stack pointer
321 || LDW *FP[0],FP ; restore frame pointer
322 STW $A,*${CTX}[0] ; emit A-E...
325 || MVC B0,AMR ; clear AMR
332 .cstring "SHA1 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"