2 # Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
19 # Assembler helpers for Padlock engine. See even e_padlock-x86.pl for
# --- Build/environment setup ---------------------------------------------
# $flavour selects the assembler dialect (elf, macosx, mingw64, nasm, ...).
# If the first argument contains a dot it is actually the output file name,
# not a flavour.
24 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
# Win64 targets use a different calling convention (see $arg1..$arg4 below).
26 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# Locate the perlasm x86_64-xlate.pl translator relative to this script,
# first next to it, then in the in-tree crypto/perlasm directory.
28 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
29 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
30 ( $xlate="${dir}../../crypto/perlasm/x86_64-xlate.pl" and -f $xlate) or
31 die "can't locate x86_64-xlate.pl";
# Pipe all generated code through the translator, which produces $output
# in the requested assembler dialect.
33 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
# Per-mode prefetch distances (bytes) used to work around PadLock hardware
# prefetch errata near page boundaries; modes absent here need no workaround.
38 %PADLOCK_PREFETCH=(ecb=>128, cbc=>64, ctr32=>32); # prefetch errata
39 $PADLOCK_CHUNK=512;	# Must be a power of 2 between 32 and 2^20
# First four integer argument registers per ABI.
47 ($arg1,$arg2,$arg3,$arg4)=$win64?("%rcx","%rdx","%r8", "%r9") :	# Win64 order
48 				("%rdi","%rsi","%rdx","%rcx");	# Unix order
# padlock_capability: probe CPUID for the Centaur/VIA vendor string and
# return the PadLock feature bits in %eax.
51 .globl	padlock_capability
52 .type	padlock_capability,\@abi-omnipotent
# Vendor id check: the backticked expressions expand at generation time to
# the little-endian dwords of "CentaurHauls" ('tneC'/'Hrua'/'slua' are the
# byte-reversed 4-char chunks "Cent","aurH","auls").
59 	cmp	\$`"0x".unpack("H*",'tneC')`,%ebx
61 	cmp	\$`"0x".unpack("H*",'Hrua')`,%edx
63 	cmp	\$`"0x".unpack("H*",'slua')`,%ecx
75 	or	\$0x10,%eax		# set Nano bit#4
79 .size	padlock_capability,.-padlock_capability
# padlock_key_bswap: presumably byte-swaps the AES key schedule to the
# endianness PadLock expects -- body not visible in this excerpt.
81 .globl	padlock_key_bswap
# NOTE(review): trailing ",0" after \@abi-omnipotent is anomalous compared
# with the other .type lines in this file -- verify against upstream.
82 .type	padlock_key_bswap,\@abi-omnipotent,0
94 .size	padlock_key_bswap,.-padlock_key_bswap
# padlock_verify_context: public wrapper that loads the address of the
# module-local .Lpadlock_saved_context slot and defers to the internal
# _padlock_verify_ctx helper.
96 .globl	padlock_verify_context
97 .type	padlock_verify_context,\@abi-omnipotent
99 padlock_verify_context:
102 	lea	.Lpadlock_saved_context(%rip),%rax
103 	call	_padlock_verify_ctx
106 .size	padlock_verify_context,.-padlock_verify_context
# Internal helper shared with the bulk-encrypt entry points (they also
# `call _padlock_verify_ctx`); body not visible in this excerpt.
108 .type	_padlock_verify_ctx,\@abi-omnipotent
121 .size	_padlock_verify_ctx,.-_padlock_verify_ctx
123 .globl	padlock_reload_key
124 .type	padlock_reload_key,\@abi-omnipotent
130 .size	padlock_reload_key,.-padlock_reload_key
# padlock_aes_block: encrypt/decrypt a single AES block via the PadLock
# xcrypt-ecb instruction. Context layout (relative to $ctx): control word
# at +16, key schedule at +32, as set up below.
132 .globl	padlock_aes_block
133 .type	padlock_aes_block,\@function,3
138 	lea	32($ctx),%rbx		# key
139 	lea	16($ctx),$ctx		# control word
140 	.byte	0xf3,0x0f,0xa7,0xc8	# rep xcryptecb
143 .size	padlock_aes_block,.-padlock_aes_block
# padlock_xstore: PadLock hardware RNG store instruction.
145 .globl	padlock_xstore
146 .type	padlock_xstore,\@function,2
150 	.byte	0x0f,0xa7,0xc0		# xstore
152 .size	padlock_xstore,.-padlock_xstore
# The SHA entry points below share one pattern: load the hash state into
# xmm registers, spill it to an aligned stack slot around the xsha*
# instruction (which updates state in memory), then copy the updated
# state back out through the saved context pointer in %rdx.
154 .globl	padlock_sha1_oneshot
155 .type	padlock_sha1_oneshot,\@function,3
157 padlock_sha1_oneshot:
159 	mov	%rdi,%rdx		# put aside %rdi
160 	movups	(%rdi),%xmm0		# copy-in context
167 	.byte	0xf3,0x0f,0xa6,0xc8	# rep xsha1
171 	movups	%xmm0,(%rdx)		# copy-out context
174 .size	padlock_sha1_oneshot,.-padlock_sha1_oneshot
176 .globl	padlock_sha1_blocks
177 .type	padlock_sha1_blocks,\@function,3
181 	mov	%rdi,%rdx		# put aside %rdi
182 	movups	(%rdi),%xmm0		# copy-in context
189 	.byte	0xf3,0x0f,0xa6,0xc8	# rep xsha1
193 	movups	%xmm0,(%rdx)		# copy-out context
196 .size	padlock_sha1_blocks,.-padlock_sha1_blocks
# SHA-256 state is 32 bytes, hence two xmm registers.
198 .globl	padlock_sha256_oneshot
199 .type	padlock_sha256_oneshot,\@function,3
201 padlock_sha256_oneshot:
203 	mov	%rdi,%rdx		# put aside %rdi
204 	movups	(%rdi),%xmm0		# copy-in context
206 	movups	16(%rdi),%xmm1
209 	movaps	%xmm1,16(%rsp)
211 	.byte	0xf3,0x0f,0xa6,0xd0	# rep xsha256
213 	movaps	16(%rsp),%xmm1
215 	movups	%xmm0,(%rdx)		# copy-out context
216 	movups	%xmm1,16(%rdx)
218 .size	padlock_sha256_oneshot,.-padlock_sha256_oneshot
220 .globl	padlock_sha256_blocks
221 .type	padlock_sha256_blocks,\@function,3
223 padlock_sha256_blocks:
225 	mov	%rdi,%rdx		# put aside %rdi
226 	movups	(%rdi),%xmm0		# copy-in context
228 	movups	16(%rdi),%xmm1
231 	movaps	%xmm1,16(%rsp)
233 	.byte	0xf3,0x0f,0xa6,0xd0	# rep xsha256
235 	movaps	16(%rsp),%xmm1
237 	movups	%xmm0,(%rdx)		# copy-out context
238 	movups	%xmm1,16(%rdx)
240 .size	padlock_sha256_blocks,.-padlock_sha256_blocks
# SHA-512 state is 64 bytes, hence four xmm registers.
242 .globl	padlock_sha512_blocks
243 .type	padlock_sha512_blocks,\@function,3
245 padlock_sha512_blocks:
247 	mov	%rdi,%rdx		# put aside %rdi
248 	movups	(%rdi),%xmm0		# copy-in context
250 	movups	16(%rdi),%xmm1
251 	movups	32(%rdi),%xmm2
252 	movups	48(%rdi),%xmm3
255 	movaps	%xmm1,16(%rsp)
256 	movaps	%xmm2,32(%rsp)
257 	movaps	%xmm3,48(%rsp)
258 	.byte	0xf3,0x0f,0xa6,0xe0	# rep xsha512
260 	movaps	16(%rsp),%xmm1
261 	movaps	32(%rsp),%xmm2
262 	movaps	48(%rsp),%xmm3
264 	movups	%xmm0,(%rdx)		# copy-out context
265 	movups	%xmm1,16(%rdx)
266 	movups	%xmm2,32(%rdx)
267 	movups	%xmm3,48(%rdx)
269 .size	padlock_sha512_blocks,.-padlock_sha512_blocks
# Body of generate_mode($mode,$opcode): emits a padlock_${mode}_encrypt
# bulk routine for one cipher mode. $opcode is the xcrypt opcode byte for
# that mode. NOTE(review): the enclosing `sub generate_mode {` header and
# closing brace fall outside this excerpt.
273 my ($mode,$opcode) = @_;
274 # int padlock_$mode_encrypt(void *out, const void *inp,
275 #		struct padlock_cipher_data *ctx, size_t len);
277 .globl	padlock_${mode}_encrypt
278 .type	padlock_${mode}_encrypt,\@function,4
280 padlock_${mode}_encrypt:
# Ensure this context is the last one the PadLock unit used (reload if not),
# then point $ctx at the control word (offset 16 in the cipher data).
289 	lea	.Lpadlock_saved_context(%rip),%rax
292 	call	_padlock_verify_ctx
293 	lea	16($ctx),$ctx		# control word
296 	testl	\$`1<<5`,($ctx)		# align bit in control word
297 	jnz	.L${mode}_aligned
# Misalignment handling: if either buffer is not 16-byte aligned, process
# the data in $chunk-sized pieces bounced through an aligned stack buffer.
299 	setz	%al			# !out_misaligned
301 	setz	%bl			# !inp_misaligned
303 	jnz	.L${mode}_aligned
305 	mov	\$$PADLOCK_CHUNK,$chunk
306 	not	%rax			# out_misaligned?-1:0
# First pass uses len%PADLOCK_CHUNK (or a full chunk if that is zero) so the
# remaining iterations are whole chunks.
309 	cmovc	$len,$chunk		# chunk=len>PADLOCK_CHUNK?PADLOCK_CHUNK:len
310 	and	$chunk,%rax		# out_misaligned?chunk:0
313 	and	\$$PADLOCK_CHUNK-1,$chunk	# chunk%=PADLOCK_CHUNK
315 	mov	\$$PADLOCK_CHUNK,%rax
316 	cmovz	%rax,$chunk		# chunk=chunk?:PADLOCK_CHUNK
# ctr32 only: shrink the chunk so the 32-bit big-endian counter at ctx-4
# cannot wrap within one xcrypt invocation.
318 $code.=<<___ if ($mode eq "ctr32");
320 	mov	-4($ctx),%eax		# pull 32-bit counter
323 	and	\$`$PADLOCK_CHUNK/16-1`,%eax
324 	mov	\$$PADLOCK_CHUNK,$chunk
328 	cmova	%rax,$chunk		# don't let counter cross PADLOCK_CHUNK
# Prefetch-errata workaround: if the hardware prefetch window would cross a
# page boundary at the end of input, divert the last piece to the
# bounce-buffer tail path.
331 $code.=<<___ if ($PADLOCK_PREFETCH{$mode});
334 	mov	$inp,%rax		# check if prefetch crosses page
339 	and	\$0xfff,%rax		# distance to page boundary
340 	cmp	\$$PADLOCK_PREFETCH{$mode},%rax
341 	mov	\$-$PADLOCK_PREFETCH{$mode},%rax
342 	cmovae	$chunk,%rax		# mask=distance<prefetch?-prefetch:-1
344 	jz	.L${mode}_unaligned_tail
# Main misaligned loop: stage input on the stack if needed, run xcrypt,
# then stage output back if needed.
350 	cmp	$len,$chunk		# ctr32 artefact
351 	cmova	$len,$chunk		# ctr32 artefact
352 	mov	$out,%r8		# save parameters
357 	test	\$0x0f,$out		# out_misaligned
359 	test	\$0x0f,$inp		# inp_misaligned
360 	jz	.L${mode}_inp_aligned
362 	.byte	0xf3,0x48,0xa5		# rep movsq
366 .L${mode}_inp_aligned:
# xcrypt register contract: %rax=ivp, %rbx=key, $ctx=control word.
367 	lea	-16($ctx),%rax		# ivp
368 	lea	16($ctx),%rbx		# key
370 	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*
# Chaining modes (cbc/cfb/ofb) must write the updated IV back to the context.
372 $code.=<<___ if ($mode !~ /ecb|ctr/);
374 	movdqa	%xmm0,-16($ctx)		# copy [or refresh] iv
# ctr32 only: detect low-16-bit counter overflow and propagate the carry.
376 $code.=<<___ if ($mode eq "ctr32");
377 	mov	-4($ctx),%eax		# pull 32-bit counter
378 	test	\$0xffff0000,%eax
379 	jnz	.L${mode}_no_carry
387 	mov	%r8,$out		# restore parameters
390 	jz	.L${mode}_out_aligned
394 	.byte	0xf3,0x48,0xa5		# rep movsq
396 .L${mode}_out_aligned:
402 	mov	\$$PADLOCK_CHUNK,$chunk
404 if (!$PADLOCK_PREFETCH{$mode}) {
# Re-check the prefetch/page-boundary condition for the next iteration.
414 $code.=<<___ if ($mode eq "ctr32");
416 	mov	$inp,%rax		# check if prefetch crosses page
421 	and	\$0xfff,%rax		# distance to page boundary
422 	cmp	\$$PADLOCK_PREFETCH{$mode},%rax
423 	mov	\$-$PADLOCK_PREFETCH{$mode},%rax
# Tail path: copy the final piece into a stack buffer (alloca) so the
# prefetcher never reads past the caller's input buffer.
429 .L${mode}_unaligned_tail:
433 	mov	$out,%r8		# save parameters
435 	sub	%rax,%rsp		# alloca
438 	.byte	0xf3,0x48,0xa5		# rep movsq
440 	mov	%r8, $out		# restore parameters
# Aligned fast path. ctr32 additionally caps each pass at 16*2^16 bytes so
# the 16-bit counter portion cannot wrap mid-instruction.
466 $code.=<<___ if ($mode eq "ctr32");
467 	mov	-4($ctx),%eax		# pull 32-bit counter
471 	mov	\$`16*0x10000`,$chunk
475 	cmova	%rax,$chunk		# don't let counter cross 2^16
477 	jbe	.L${mode}_aligned_skip
479 .L${mode}_aligned_loop:
480 	mov	$len,%r10		# save parameters
484 	lea	-16($ctx),%rax		# ivp
485 	lea	16($ctx),%rbx		# key
486 	shr	\$4,$len		# len/=AES_BLOCK_SIZE
487 	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*
489 	mov	-4($ctx),%eax		# pull 32-bit counter
495 	mov	%r10,$len		# restore parameters
497 	mov	\$`16*0x10000`,$chunk
500 	jae	.L${mode}_aligned_loop
502 .L${mode}_aligned_skip:
# Aligned path still honours the prefetch errata: split off a remainder
# whenever input ends too close to a page boundary.
504 $code.=<<___ if ($PADLOCK_PREFETCH{$mode});
507 	and	\$0xfff,%rbp		# distance to page boundary
509 	cmp	\$$PADLOCK_PREFETCH{$mode},%rbp
510 	mov	\$$PADLOCK_PREFETCH{$mode}-1,%rbp
512 	and	$len,%rbp		# remainder
514 	jz	.L${mode}_aligned_tail
517 	lea	-16($ctx),%rax		# ivp
518 	lea	16($ctx),%rbx		# key
519 	shr	\$4,$len		# len/=AES_BLOCK_SIZE
520 	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*
522 $code.=<<___ if ($mode !~ /ecb|ctr/);
524 	movdqa	%xmm0,-16($ctx)		# copy [or refresh] iv
526 $code.=<<___ if ($PADLOCK_PREFETCH{$mode});
527 	test	%rbp,%rbp		# check remainder
530 .L${mode}_aligned_tail:
538 	.byte	0xf3,0x48,0xa5		# rep movsq
552 .size	padlock_${mode}_encrypt,.-padlock_${mode}_encrypt
# Instantiate one bulk-encrypt entry point per mode; the second argument is
# the mode-specific opcode byte emitted after the 0xf3,0x0f,0xa7 xcrypt
# prefix inside generate_mode.
556 &generate_mode("ecb",0xc8);
557 &generate_mode("cbc",0xd0);
558 &generate_mode("cfb",0xe0);
559 &generate_mode("ofb",0xe8);
560 &generate_mode("ctr32",0xd8);	# all 64-bit CPUs have working CTR...
563 .asciz	"VIA Padlock x86_64 module, CRYPTOGAMS by <appro\@openssl.org>"
# Slot holding the last context handed to the PadLock unit; consulted by
# _padlock_verify_ctx (see padlock_verify_context above).
567 .Lpadlock_saved_context:
# Expand compile-time `...` expressions (e.g. the unpack()ed vendor-string
# constants) before the code is piped to the translator.
570 $code =~ s/\`([^\`]*)\`/eval($1)/gem;