2 # Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 ##############################################################################
12 # Copyright 2014 Intel Corporation #
14 # Licensed under the Apache License, Version 2.0 (the "License"); #
15 # you may not use this file except in compliance with the License. #
16 # You may obtain a copy of the License at #
18 # http://www.apache.org/licenses/LICENSE-2.0 #
20 # Unless required by applicable law or agreed to in writing, software #
21 # distributed under the License is distributed on an "AS IS" BASIS, #
22 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
23 # See the License for the specific language governing permissions and #
24 # limitations under the License. #
26 ##############################################################################
28 # Developers and authors: #
29 # Shay Gueron (1, 2), and Vlad Krasnov (1) #
30 # (1) Intel Corporation, Israel Development Center #
31 # (2) University of Haifa #
33 # S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with 256 Bit Primes" #
36 ##############################################################################
40 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
42 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
44 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
45 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
46 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
47 die "can't locate x86_64-xlate.pl";
49 open OUT,"| \"$^X\" $xlate $flavour $output";
52 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
53 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
54 $avx = ($1>=2.19) + ($1>=2.22);
58 if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
59 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
60 $avx = ($1>=2.09) + ($1>=2.10);
64 if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
65 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
66 $avx = ($1>=10) + ($1>=11);
70 if (!$addx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) {
71 my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
72 $avx = ($ver>=3.0) + ($ver>=3.01);
86 .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
87 .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
88 .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
89 .quad 0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff
90 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
91 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
92 .quad 0x00040000, 0x00040000, 0x00040000, 0x00040000
93 .quad 0x1fe00000, 0x1fe00000, 0x1fe00000, 0x1fe00000
94 .quad 0x00ffffff, 0x00ffffff, 0x00ffffff, 0x00ffffff
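# (The nine .quad rows above are the P-256 prime
#  p = 2^256 - 2^224 + 2^192 + 2^96 - 1 split into nine 29-bit digits,
#  each digit replicated across the four 64-bit lanes.)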
97 .quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
98 .quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
99 .quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
100 .quad 0x400007FC, 0x400007FC, 0x400007FC, 0x400007FC
101 .quad 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE
102 .quad 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE
103 .quad 0x400FFFFE, 0x400FFFFE, 0x400FFFFE, 0x400FFFFE
104 .quad 0x7F7FFFFE, 0x7F7FFFFE, 0x7F7FFFFE, 0x7F7FFFFE
105 .quad 0x03FFFFFC, 0x03FFFFFC, 0x03FFFFFC, 0x03FFFFFC
108 .quad 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8
109 .quad 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8
110 .quad 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8
111 .quad 0x80000FF8, 0x80000FF8, 0x80000FF8, 0x80000FF8
112 .quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
113 .quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
114 .quad 0x801FFFFC, 0x801FFFFC, 0x801FFFFC, 0x801FFFFC
115 .quad 0xFEFFFFFC, 0xFEFFFFFC, 0xFEFFFFFC, 0xFEFFFFFC
116 .quad 0x07FFFFF8, 0x07FFFFF8, 0x07FFFFF8, 0x07FFFFF8
119 .quad 0x00000020, 0x00000020, 0x00000020, 0x00000020
120 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
121 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
122 .quad 0x1fffc000, 0x1fffc000, 0x1fffc000, 0x1fffc000
123 .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
124 .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
125 .quad 0x1f7fffff, 0x1f7fffff, 0x1f7fffff, 0x1f7fffff
126 .quad 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff
127 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
129 # RR = 2^266 mod p in AVX2 format, to transform from the native OpenSSL
130 # Montgomery form (*2^256) to our format (*2^261)
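# With nine 29-bit digits the multiplier reduces by 2^(9*29) = 2^261, so a
# single multiply-and-reduce by RR maps a*2^256 to
# a*2^256 * 2^266 / 2^261 = a*2^261 (mod p).  The table below is 2^266 mod p
# in the same nine-digit, four-lane layout.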
133 .quad 0x00000400, 0x00000400, 0x00000400, 0x00000400
134 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
135 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
136 .quad 0x1ff80000, 0x1ff80000, 0x1ff80000, 0x1ff80000
137 .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
138 .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
139 .quad 0x0fffffff, 0x0fffffff, 0x0fffffff, 0x0fffffff
140 .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
141 .quad 0x00000003, 0x00000003, 0x00000003, 0x00000003
144 .quad 0x00000001, 0x00000001, 0x00000001, 0x00000001
145 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
146 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
147 .quad 0x1ffffe00, 0x1ffffe00, 0x1ffffe00, 0x1ffffe00
148 .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
149 .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
150 .quad 0x1ffbffff, 0x1ffbffff, 0x1ffbffff, 0x1ffbffff
151 .quad 0x001fffff, 0x001fffff, 0x001fffff, 0x001fffff
152 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
155 .long 1,1,1,1,1,1,1,1
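# (eight 32-bit ones, used to step the candidate index in the
#  multi_gather_w7 selection loop further below)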
159 # This function receives a pointer to an array of four affine points
160 # (X, Y, <1>) and rearranges the data for AVX2 execution, while
161 # converting it to 2^29 radix redundant form
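#
# In scalar terms each 256-bit coordinate (four 64-bit words in[0..3]) is
# split into nine redundant 29-bit digits; an illustrative sketch, with
# shift = 29 and mask = 2^29-1:
#
#	for (i = 0; i < 9; i++)
#		out[i] = (value_as_256bit_int >> (shift*i)) & mask;
#
# The code below performs this split for four points at once, one point per
# 64-bit lane of the %ymm registers, after transposing the input.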
163 my ($X0,$X1,$X2,$X3, $Y0,$Y1,$Y2,$Y3,
164 $T0,$T1,$T2,$T3, $T4,$T5,$T6,$T7)=map("%ymm$_",(0..15));
167 .globl ecp_nistz256_avx2_transpose_convert
168 .type ecp_nistz256_avx2_transpose_convert,\@function,2
170 ecp_nistz256_avx2_transpose_convert:
173 $code.=<<___ if ($win64);
174 lea -8-16*10(%rsp), %rsp
175 vmovaps %xmm6, -8-16*10(%rax)
176 vmovaps %xmm7, -8-16*9(%rax)
177 vmovaps %xmm8, -8-16*8(%rax)
178 vmovaps %xmm9, -8-16*7(%rax)
179 vmovaps %xmm10, -8-16*6(%rax)
180 vmovaps %xmm11, -8-16*5(%rax)
181 vmovaps %xmm12, -8-16*4(%rax)
182 vmovaps %xmm13, -8-16*3(%rax)
183 vmovaps %xmm14, -8-16*2(%rax)
184 vmovaps %xmm15, -8-16*1(%rax)
188 vmovdqa 32*0(%rsi), $X0
189 lea 112(%rsi), %rax # size optimization
190 vmovdqa 32*1(%rsi), $Y0
191 lea .LAVX2_AND_MASK(%rip), %rdx
192 vmovdqa 32*2(%rsi), $X1
193 vmovdqa 32*3(%rsi), $Y1
194 vmovdqa 32*4-112(%rax), $X2
195 vmovdqa 32*5-112(%rax), $Y2
196 vmovdqa 32*6-112(%rax), $X3
197 vmovdqa 32*7-112(%rax), $Y3
199 # Transpose X and Y independently
200 vpunpcklqdq $X1, $X0, $T0 # T0 = [B2 A2 B0 A0]
201 vpunpcklqdq $X3, $X2, $T1 # T1 = [D2 C2 D0 C0]
202 vpunpckhqdq $X1, $X0, $T2 # T2 = [B3 A3 B1 A1]
203 vpunpckhqdq $X3, $X2, $T3 # T3 = [D3 C3 D1 C1]
205 vpunpcklqdq $Y1, $Y0, $T4
206 vpunpcklqdq $Y3, $Y2, $T5
207 vpunpckhqdq $Y1, $Y0, $T6
208 vpunpckhqdq $Y3, $Y2, $T7
210 vperm2i128 \$0x20, $T1, $T0, $X0 # X0 = [D0 C0 B0 A0]
211 vperm2i128 \$0x20, $T3, $T2, $X1 # X1 = [D1 C1 B1 A1]
212 vperm2i128 \$0x31, $T1, $T0, $X2 # X2 = [D2 C2 B2 A2]
213 vperm2i128 \$0x31, $T3, $T2, $X3 # X3 = [D3 C3 B3 A3]
215 vperm2i128 \$0x20, $T5, $T4, $Y0
216 vperm2i128 \$0x20, $T7, $T6, $Y1
217 vperm2i128 \$0x31, $T5, $T4, $Y2
218 vperm2i128 \$0x31, $T7, $T6, $Y3
221 vpand (%rdx), $X0, $T0 # out[0] = in[0] & mask;
222 vpsrlq \$29, $X0, $X0
223 vpand $T7, $X0, $T1 # out[1] = (in[0] >> shift) & mask;
224 vpsrlq \$29, $X0, $X0
227 vpand $T7, $T2, $T2 # out[2] = ((in[0] >> (shift*2)) ^ (in[1] << (64-shift*2))) & mask;
228 vpsrlq \$23, $X1, $X1
229 vpand $T7, $X1, $T3 # out[3] = (in[1] >> ((shift*3)%64)) & mask;
230 vpsrlq \$29, $X1, $X1
231 vpsllq \$12, $X2, $T4
233 vpand $T7, $T4, $T4 # out[4] = ((in[1] >> ((shift*4)%64)) ^ (in[2] << (64*2-shift*4))) & mask;
234 vpsrlq \$17, $X2, $X2
235 vpand $T7, $X2, $T5 # out[5] = (in[2] >> ((shift*5)%64)) & mask;
236 vpsrlq \$29, $X2, $X2
237 vpsllq \$18, $X3, $T6
239 vpand $T7, $T6, $T6 # out[6] = ((in[2] >> ((shift*6)%64)) ^ (in[3] << (64*3-shift*6))) & mask;
240 vpsrlq \$11, $X3, $X3
241 vmovdqa $T0, 32*0(%rdi)
242 lea 112(%rdi), %rax # size optimization
243 vpand $T7, $X3, $T0 # out[7] = (in[3] >> ((shift*7)%64)) & mask;
244 vpsrlq \$29, $X3, $X3 # out[8] = (in[3] >> ((shift*8)%64)) & mask;
246 vmovdqa $T1, 32*1(%rdi)
247 vmovdqa $T2, 32*2(%rdi)
248 vmovdqa $T3, 32*3(%rdi)
249 vmovdqa $T4, 32*4-112(%rax)
250 vmovdqa $T5, 32*5-112(%rax)
251 vmovdqa $T6, 32*6-112(%rax)
252 vmovdqa $T0, 32*7-112(%rax)
253 vmovdqa $X3, 32*8-112(%rax)
254 lea 448(%rdi), %rax # size optimization
256 vpand $T7, $Y0, $T0 # out[0] = in[0] & mask;
257 vpsrlq \$29, $Y0, $Y0
258 vpand $T7, $Y0, $T1 # out[1] = (in[0] >> shift) & mask;
259 vpsrlq \$29, $Y0, $Y0
262 vpand $T7, $T2, $T2 # out[2] = ((in[0] >> (shift*2)) ^ (in[1] << (64-shift*2))) & mask;
263 vpsrlq \$23, $Y1, $Y1
264 vpand $T7, $Y1, $T3 # out[3] = (in[1] >> ((shift*3)%64)) & mask;
265 vpsrlq \$29, $Y1, $Y1
266 vpsllq \$12, $Y2, $T4
268 vpand $T7, $T4, $T4 # out[4] = ((in[1] >> ((shift*4)%64)) ^ (in[2] << (64*2-shift*4))) & mask;
269 vpsrlq \$17, $Y2, $Y2
270 vpand $T7, $Y2, $T5 # out[5] = (in[2] >> ((shift*5)%64)) & mask;
271 vpsrlq \$29, $Y2, $Y2
272 vpsllq \$18, $Y3, $T6
274 vpand $T7, $T6, $T6 # out[6] = ((in[2] >> ((shift*6)%64)) ^ (in[3] << (64*3-shift*6))) & mask;
275 vpsrlq \$11, $Y3, $Y3
276 vmovdqa $T0, 32*9-448(%rax)
277 vpand $T7, $Y3, $T0 # out[7] = (in[3] >> ((shift*7)%64)) & mask;
278 vpsrlq \$29, $Y3, $Y3 # out[8] = (in[3] >> ((shift*8)%64)) & mask;
280 vmovdqa $T1, 32*10-448(%rax)
281 vmovdqa $T2, 32*11-448(%rax)
282 vmovdqa $T3, 32*12-448(%rax)
283 vmovdqa $T4, 32*13-448(%rax)
284 vmovdqa $T5, 32*14-448(%rax)
285 vmovdqa $T6, 32*15-448(%rax)
286 vmovdqa $T0, 32*16-448(%rax)
287 vmovdqa $Y3, 32*17-448(%rax)
291 $code.=<<___ if ($win64);
292 movaps 16*0(%rsp), %xmm6
293 movaps 16*1(%rsp), %xmm7
294 movaps 16*2(%rsp), %xmm8
295 movaps 16*3(%rsp), %xmm9
296 movaps 16*4(%rsp), %xmm10
297 movaps 16*5(%rsp), %xmm11
298 movaps 16*6(%rsp), %xmm12
299 movaps 16*7(%rsp), %xmm13
300 movaps 16*8(%rsp), %xmm14
301 movaps 16*9(%rsp), %xmm15
302 lea 8+16*10(%rsp), %rsp
306 .size ecp_nistz256_avx2_transpose_convert,.-ecp_nistz256_avx2_transpose_convert
310 ################################################################################
311 # This function receives a pointer to an array of four AVX2 formatted points
312 # (X, Y, Z), converts the data back to normal representation, and rearranges it
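#
# This is the inverse of the split performed by transpose_convert above: the
# nine 29-bit digits are recombined, roughly as
#
#	res = sum over i = 0..8 of (in[i] << (29*i))
#
# using shifts and additions, and the lanes are then transposed back to
# per-point order.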
314 my ($D0,$D1,$D2,$D3, $D4,$D5,$D6,$D7, $D8)=map("%ymm$_",(0..8));
315 my ($T0,$T1,$T2,$T3, $T4,$T5,$T6)=map("%ymm$_",(9..15));
319 .globl ecp_nistz256_avx2_convert_transpose_back
320 .type ecp_nistz256_avx2_convert_transpose_back,\@function,2
322 ecp_nistz256_avx2_convert_transpose_back:
325 $code.=<<___ if ($win64);
326 lea -8-16*10(%rsp), %rsp
327 vmovaps %xmm6, -8-16*10(%rax)
328 vmovaps %xmm7, -8-16*9(%rax)
329 vmovaps %xmm8, -8-16*8(%rax)
330 vmovaps %xmm9, -8-16*7(%rax)
331 vmovaps %xmm10, -8-16*6(%rax)
332 vmovaps %xmm11, -8-16*5(%rax)
333 vmovaps %xmm12, -8-16*4(%rax)
334 vmovaps %xmm13, -8-16*3(%rax)
335 vmovaps %xmm14, -8-16*2(%rax)
336 vmovaps %xmm15, -8-16*1(%rax)
342 vmovdqa 32*0(%rsi), $D0
343 lea 160(%rsi), %rax # size optimization
344 vmovdqa 32*1(%rsi), $D1
345 vmovdqa 32*2(%rsi), $D2
346 vmovdqa 32*3(%rsi), $D3
347 vmovdqa 32*4-160(%rax), $D4
348 vmovdqa 32*5-160(%rax), $D5
349 vmovdqa 32*6-160(%rax), $D6
350 vmovdqa 32*7-160(%rax), $D7
351 vmovdqa 32*8-160(%rax), $D8
353 vpsllq \$29, $D1, $D1
354 vpsllq \$58, $D2, $T0
356 vpaddq $T0, $D0, $D0 # out[0] = (in[0]) ^ (in[1] << shift*1) ^ (in[2] << shift*2);
359 vpsllq \$23, $D3, $D3
360 vpsllq \$52, $D4, $T1
362 vpaddq $D3, $T1, $D1 # out[1] = (in[2] >> (64*1-shift*2)) ^ (in[3] << shift*3%64) ^ (in[4] << shift*4%64);
364 vpsrlq \$12, $D4, $D4
365 vpsllq \$17, $D5, $D5
366 vpsllq \$46, $D6, $T2
368 vpaddq $D5, $T2, $D2 # out[2] = (in[4] >> (64*2-shift*4)) ^ (in[5] << shift*5%64) ^ (in[6] << shift*6%64);
370 vpsrlq \$18, $D6, $D6
371 vpsllq \$11, $D7, $D7
372 vpsllq \$40, $D8, $T3
374 vpaddq $D7, $T3, $D3 # out[3] = (in[6] >> (64*3-shift*6)) ^ (in[7] << shift*7%64) ^ (in[8] << shift*8%64);
376 vpunpcklqdq $D1, $D0, $T0 # T0 = [B2 A2 B0 A0]
377 vpunpcklqdq $D3, $D2, $T1 # T1 = [D2 C2 D0 C0]
378 vpunpckhqdq $D1, $D0, $T2 # T2 = [B3 A3 B1 A1]
379 vpunpckhqdq $D3, $D2, $T3 # T3 = [D3 C3 D1 C1]
381 vperm2i128 \$0x20, $T1, $T0, $D0 # X0 = [D0 C0 B0 A0]
382 vperm2i128 \$0x20, $T3, $T2, $D1 # X1 = [D1 C1 B1 A1]
383 vperm2i128 \$0x31, $T1, $T0, $D2 # X2 = [D2 C2 B2 A2]
384 vperm2i128 \$0x31, $T3, $T2, $D3 # X3 = [D3 C3 B3 A3]
386 vmovdqa $D0, 32*0(%rdi)
387 vmovdqa $D1, 32*3(%rdi)
388 vmovdqa $D2, 32*6(%rdi)
389 vmovdqa $D3, 32*9(%rdi)
399 $code.=<<___ if ($win64);
400 movaps 16*0(%rsp), %xmm6
401 movaps 16*1(%rsp), %xmm7
402 movaps 16*2(%rsp), %xmm8
403 movaps 16*3(%rsp), %xmm9
404 movaps 16*4(%rsp), %xmm10
405 movaps 16*5(%rsp), %xmm11
406 movaps 16*6(%rsp), %xmm12
407 movaps 16*7(%rsp), %xmm13
408 movaps 16*8(%rsp), %xmm14
409 movaps 16*9(%rsp), %xmm15
410 lea 8+16*10(%rsp), %rsp
414 .size ecp_nistz256_avx2_convert_transpose_back,.-ecp_nistz256_avx2_convert_transpose_back
418 my ($r_ptr,$a_ptr,$b_ptr,$itr)=("%rdi","%rsi","%rdx","%ecx");
419 my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4,$ACC5,$ACC6,$ACC7,$ACC8)=map("%ymm$_",(0..8));
420 my ($B,$Y,$T0,$AND_MASK,$OVERFLOW)=map("%ymm$_",(9..13));
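# The normalization helpers below all use the same carry-propagation pattern
# (illustrative pseudo-code, digit_size = 29, AND_MASK = 2^29-1):
#
#	carry     = acc[i] >> digit_size;
#	acc[i]   &= AND_MASK;
#	acc[i+1] += carry;
#
# for i = 0..7; the top digit acc[8] is deliberately left unmasked and
# absorbs the final carry.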
424 vpsrlq $digit_size, $ACC0, $T0
425 vpand $AND_MASK, $ACC0, $ACC0
426 vpaddq $T0, $ACC1, $ACC1
428 vpsrlq $digit_size, $ACC1, $T0
429 vpand $AND_MASK, $ACC1, $ACC1
430 vpaddq $T0, $ACC2, $ACC2
432 vpsrlq $digit_size, $ACC2, $T0
433 vpand $AND_MASK, $ACC2, $ACC2
434 vpaddq $T0, $ACC3, $ACC3
436 vpsrlq $digit_size, $ACC3, $T0
437 vpand $AND_MASK, $ACC3, $ACC3
438 vpaddq $T0, $ACC4, $ACC4
440 vpsrlq $digit_size, $ACC4, $T0
441 vpand $AND_MASK, $ACC4, $ACC4
442 vpaddq $T0, $ACC5, $ACC5
444 vpsrlq $digit_size, $ACC5, $T0
445 vpand $AND_MASK, $ACC5, $ACC5
446 vpaddq $T0, $ACC6, $ACC6
448 vpsrlq $digit_size, $ACC6, $T0
449 vpand $AND_MASK, $ACC6, $ACC6
450 vpaddq $T0, $ACC7, $ACC7
452 vpsrlq $digit_size, $ACC7, $T0
453 vpand $AND_MASK, $ACC7, $ACC7
454 vpaddq $T0, $ACC8, $ACC8
455 #vpand $AND_MASK, $ACC8, $ACC8
462 vmovdqa $ACC0, 32*0(%rdi)
463 lea 160(%rdi), %rax # size optimization
464 vmovdqa $ACC1, 32*1(%rdi)
465 vmovdqa $ACC2, 32*2(%rdi)
466 vmovdqa $ACC3, 32*3(%rdi)
467 vmovdqa $ACC4, 32*4-160(%rax)
468 vmovdqa $ACC5, 32*5-160(%rax)
469 vmovdqa $ACC6, 32*6-160(%rax)
470 vmovdqa $ACC7, 32*7-160(%rax)
471 vmovdqa $ACC8, 32*8-160(%rax)
477 .type avx2_normalize,\@abi-omnipotent
480 vpsrlq $digit_size, $ACC0, $T0
481 vpand $AND_MASK, $ACC0, $ACC0
482 vpaddq $T0, $ACC1, $ACC1
484 vpsrlq $digit_size, $ACC1, $T0
485 vpand $AND_MASK, $ACC1, $ACC1
486 vpaddq $T0, $ACC2, $ACC2
488 vpsrlq $digit_size, $ACC2, $T0
489 vpand $AND_MASK, $ACC2, $ACC2
490 vpaddq $T0, $ACC3, $ACC3
492 vpsrlq $digit_size, $ACC3, $T0
493 vpand $AND_MASK, $ACC3, $ACC3
494 vpaddq $T0, $ACC4, $ACC4
496 vpsrlq $digit_size, $ACC4, $T0
497 vpand $AND_MASK, $ACC4, $ACC4
498 vpaddq $T0, $ACC5, $ACC5
500 vpsrlq $digit_size, $ACC5, $T0
501 vpand $AND_MASK, $ACC5, $ACC5
502 vpaddq $T0, $ACC6, $ACC6
504 vpsrlq $digit_size, $ACC6, $T0
505 vpand $AND_MASK, $ACC6, $ACC6
506 vpaddq $T0, $ACC7, $ACC7
508 vpsrlq $digit_size, $ACC7, $T0
509 vpand $AND_MASK, $ACC7, $ACC7
510 vpaddq $T0, $ACC8, $ACC8
511 #vpand $AND_MASK, $ACC8, $ACC8
514 .size avx2_normalize,.-avx2_normalize
516 .type avx2_normalize_n_store,\@abi-omnipotent
518 avx2_normalize_n_store:
519 vpsrlq $digit_size, $ACC0, $T0
520 vpand $AND_MASK, $ACC0, $ACC0
521 vpaddq $T0, $ACC1, $ACC1
523 vpsrlq $digit_size, $ACC1, $T0
524 vpand $AND_MASK, $ACC1, $ACC1
525 vmovdqa $ACC0, 32*0(%rdi)
526 lea 160(%rdi), %rax # size optimization
527 vpaddq $T0, $ACC2, $ACC2
529 vpsrlq $digit_size, $ACC2, $T0
530 vpand $AND_MASK, $ACC2, $ACC2
531 vmovdqa $ACC1, 32*1(%rdi)
532 vpaddq $T0, $ACC3, $ACC3
534 vpsrlq $digit_size, $ACC3, $T0
535 vpand $AND_MASK, $ACC3, $ACC3
536 vmovdqa $ACC2, 32*2(%rdi)
537 vpaddq $T0, $ACC4, $ACC4
539 vpsrlq $digit_size, $ACC4, $T0
540 vpand $AND_MASK, $ACC4, $ACC4
541 vmovdqa $ACC3, 32*3(%rdi)
542 vpaddq $T0, $ACC5, $ACC5
544 vpsrlq $digit_size, $ACC5, $T0
545 vpand $AND_MASK, $ACC5, $ACC5
546 vmovdqa $ACC4, 32*4-160(%rax)
547 vpaddq $T0, $ACC6, $ACC6
549 vpsrlq $digit_size, $ACC6, $T0
550 vpand $AND_MASK, $ACC6, $ACC6
551 vmovdqa $ACC5, 32*5-160(%rax)
552 vpaddq $T0, $ACC7, $ACC7
554 vpsrlq $digit_size, $ACC7, $T0
555 vpand $AND_MASK, $ACC7, $ACC7
556 vmovdqa $ACC6, 32*6-160(%rax)
557 vpaddq $T0, $ACC8, $ACC8
558 #vpand $AND_MASK, $ACC8, $ACC8
559 vmovdqa $ACC7, 32*7-160(%rax)
560 vmovdqa $ACC8, 32*8-160(%rax)
563 .size avx2_normalize_n_store,.-avx2_normalize_n_store
565 ################################################################################
566 # void avx2_mul_x4(void* RESULTx4, void *Ax4, void *Bx4);
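# Four independent multiplications are computed in parallel, one in each
# 64-bit lane.  Every loop iteration multiplies A by one 29-bit digit of B and
# immediately performs one step of Montgomery reduction modulo the P-256
# prime; roughly, for each digit j of B:
#
#	acc[0..8] += A[0..8] * B[j];
#	Y          = acc[0] & AND_MASK;		# reduction digit
#	acc[0..8] += POLY[0..8] * Y;		# low digit becomes 0 mod 2^29
#	acc        = acc >> 29;			# drop the eliminated digit
#
# (illustrative pseudo-code; the real loop skips the POLY digits that are
# zero and replaces the single-bit digit with a shift)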
567 .type avx2_mul_x4,\@abi-omnipotent
570 lea .LAVX2_POLY(%rip), %rax
572 vpxor $ACC0, $ACC0, $ACC0
573 vpxor $ACC1, $ACC1, $ACC1
574 vpxor $ACC2, $ACC2, $ACC2
575 vpxor $ACC3, $ACC3, $ACC3
576 vpxor $ACC4, $ACC4, $ACC4
577 vpxor $ACC5, $ACC5, $ACC5
578 vpxor $ACC6, $ACC6, $ACC6
579 vpxor $ACC7, $ACC7, $ACC7
581 vmovdqa 32*7(%rax), %ymm14
582 vmovdqa 32*8(%rax), %ymm15
585 lea -512($a_ptr), $a_ptr # strategic bias to control u-op density
586 jmp .Lavx2_mul_x4_loop
590 vmovdqa 32*0($b_ptr), $B
591 lea 32*1($b_ptr), $b_ptr
593 vpmuludq 32*0+512($a_ptr), $B, $T0
594 vpmuludq 32*1+512($a_ptr), $B, $OVERFLOW # borrow $OVERFLOW
595 vpaddq $T0, $ACC0, $ACC0
596 vpmuludq 32*2+512($a_ptr), $B, $T0
597 vpaddq $OVERFLOW, $ACC1, $ACC1
598 vpand $AND_MASK, $ACC0, $Y
599 vpmuludq 32*3+512($a_ptr), $B, $OVERFLOW
600 vpaddq $T0, $ACC2, $ACC2
601 vpmuludq 32*4+512($a_ptr), $B, $T0
602 vpaddq $OVERFLOW, $ACC3, $ACC3
603 vpmuludq 32*5+512($a_ptr), $B, $OVERFLOW
604 vpaddq $T0, $ACC4, $ACC4
605 vpmuludq 32*6+512($a_ptr), $B, $T0
606 vpaddq $OVERFLOW, $ACC5, $ACC5
607 vpmuludq 32*7+512($a_ptr), $B, $OVERFLOW
608 vpaddq $T0, $ACC6, $ACC6
610 # Skip some multiplications, optimizing for the constant poly
611 vpmuludq $AND_MASK, $Y, $T0
612 vpaddq $OVERFLOW, $ACC7, $ACC7
613 vpmuludq 32*8+512($a_ptr), $B, $ACC8
614 vpaddq $T0, $ACC0, $OVERFLOW
615 vpaddq $T0, $ACC1, $ACC0
616 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
617 vpaddq $T0, $ACC2, $ACC1
618 vpmuludq 32*3(%rax), $Y, $T0
619 vpaddq $OVERFLOW, $ACC0, $ACC0
620 vpaddq $T0, $ACC3, $ACC2
623 vpsllq \$18, $Y, $OVERFLOW
626 vpmuludq %ymm14, $Y, $T0
627 vpaddq $OVERFLOW, $ACC6, $ACC5
628 vpmuludq %ymm15, $Y, $OVERFLOW
629 vpaddq $T0, $ACC7, $ACC6
630 vpaddq $OVERFLOW, $ACC8, $ACC7
633 jnz .Lavx2_mul_x4_loop
635 vpxor $ACC8, $ACC8, $ACC8
638 .size avx2_mul_x4,.-avx2_mul_x4
640 # Function optimized for the constant 1
641 ################################################################################
642 # void avx2_mul_by1_x4(void* RESULTx4, void *Ax4);
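# Montgomery multiplication of the input by the constant .LONE (the value 1
# in the internal format, i.e. 2^261 mod p), which leaves the value unchanged
# mod p: products with the zero digits of .LONE are dropped and its
# single-bit digits become shifts, leaving mostly the reduction steps.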
643 .type avx2_mul_by1_x4,\@abi-omnipotent
646 lea .LAVX2_POLY(%rip), %rax
648 vpxor $ACC0, $ACC0, $ACC0
649 vpxor $ACC1, $ACC1, $ACC1
650 vpxor $ACC2, $ACC2, $ACC2
651 vpxor $ACC3, $ACC3, $ACC3
652 vpxor $ACC4, $ACC4, $ACC4
653 vpxor $ACC5, $ACC5, $ACC5
654 vpxor $ACC6, $ACC6, $ACC6
655 vpxor $ACC7, $ACC7, $ACC7
656 vpxor $ACC8, $ACC8, $ACC8
658 vmovdqa 32*3+.LONE(%rip), %ymm14
659 vmovdqa 32*7+.LONE(%rip), %ymm15
662 jmp .Lavx2_mul_by1_x4_loop
665 .Lavx2_mul_by1_x4_loop:
666 vmovdqa 32*0($a_ptr), $B
667 .byte 0x48,0x8d,0xb6,0x20,0,0,0 # lea 32*1($a_ptr), $a_ptr
669 vpsllq \$5, $B, $OVERFLOW
670 vpmuludq %ymm14, $B, $T0
671 vpaddq $OVERFLOW, $ACC0, $ACC0
672 vpaddq $T0, $ACC3, $ACC3
674 vpmuludq $AND_MASK, $B, $T0
675 vpand $AND_MASK, $ACC0, $Y
676 vpaddq $T0, $ACC4, $ACC4
677 vpaddq $T0, $ACC5, $ACC5
678 vpaddq $T0, $ACC6, $ACC6
682 vpmuludq %ymm15, $B, $OVERFLOW
683 vpsubq $T0, $ACC6, $ACC6
685 vpmuludq $AND_MASK, $Y, $T0
686 vpaddq $OVERFLOW, $ACC7, $ACC7
687 vpaddq $T0, $ACC0, $OVERFLOW
688 vpaddq $T0, $ACC1, $ACC0
690 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
691 vpaddq $T0, $ACC2, $ACC1
692 vpmuludq 32*3(%rax), $Y, $T0
693 vpaddq $OVERFLOW, $ACC0, $ACC0
694 vpaddq $T0, $ACC3, $ACC2
696 vpsllq \$18, $Y, $OVERFLOW
698 vpmuludq 32*7(%rax), $Y, $T0
699 vpaddq $OVERFLOW, $ACC6, $ACC5
700 vpaddq $T0, $ACC7, $ACC6
701 vpmuludq 32*8(%rax), $Y, $ACC7
704 jnz .Lavx2_mul_by1_x4_loop
707 .size avx2_mul_by1_x4,.-avx2_mul_by1_x4
709 ################################################################################
710 # void avx2_sqr_x4(void* RESULTx4, void *Ax4, void *Bx4);
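# Same overall structure as avx2_mul_x4.  The doubled digits 2*A[1..8] are
# written to the scratch area the caller passes in %rcx, so each cross
# product A[i]*A[j], i < j, is computed only once; the reduction steps are
# identical to the multiplication routine.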
711 .type avx2_sqr_x4,\@abi-omnipotent
714 lea .LAVX2_POLY(%rip), %rax
716 vmovdqa 32*7(%rax), %ymm14
717 vmovdqa 32*8(%rax), %ymm15
719 vmovdqa 32*0($a_ptr), $B
720 vmovdqa 32*1($a_ptr), $ACC1
721 vmovdqa 32*2($a_ptr), $ACC2
722 vmovdqa 32*3($a_ptr), $ACC3
723 vmovdqa 32*4($a_ptr), $ACC4
724 vmovdqa 32*5($a_ptr), $ACC5
725 vmovdqa 32*6($a_ptr), $ACC6
726 vmovdqa 32*7($a_ptr), $ACC7
727 vpaddq $ACC1, $ACC1, $ACC1 # 2*$ACC0..7
728 vmovdqa 32*8($a_ptr), $ACC8
729 vpaddq $ACC2, $ACC2, $ACC2
730 vmovdqa $ACC1, 32*0(%rcx)
731 vpaddq $ACC3, $ACC3, $ACC3
732 vmovdqa $ACC2, 32*1(%rcx)
733 vpaddq $ACC4, $ACC4, $ACC4
734 vmovdqa $ACC3, 32*2(%rcx)
735 vpaddq $ACC5, $ACC5, $ACC5
736 vmovdqa $ACC4, 32*3(%rcx)
737 vpaddq $ACC6, $ACC6, $ACC6
738 vmovdqa $ACC5, 32*4(%rcx)
739 vpaddq $ACC7, $ACC7, $ACC7
740 vmovdqa $ACC6, 32*5(%rcx)
741 vpaddq $ACC8, $ACC8, $ACC8
742 vmovdqa $ACC7, 32*6(%rcx)
743 vmovdqa $ACC8, 32*7(%rcx)
746 vpmuludq $B, $B, $ACC0
747 vpmuludq $B, $ACC1, $ACC1
748 vpand $AND_MASK, $ACC0, $Y
749 vpmuludq $B, $ACC2, $ACC2
750 vpmuludq $B, $ACC3, $ACC3
751 vpmuludq $B, $ACC4, $ACC4
752 vpmuludq $B, $ACC5, $ACC5
753 vpmuludq $B, $ACC6, $ACC6
754 vpmuludq $AND_MASK, $Y, $T0
755 vpmuludq $B, $ACC7, $ACC7
756 vpmuludq $B, $ACC8, $ACC8
757 vmovdqa 32*1($a_ptr), $B
759 vpaddq $T0, $ACC0, $OVERFLOW
760 vpaddq $T0, $ACC1, $ACC0
761 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
762 vpaddq $T0, $ACC2, $ACC1
763 vpmuludq 32*3(%rax), $Y, $T0
764 vpaddq $OVERFLOW, $ACC0, $ACC0
765 vpaddq $T0, $ACC3, $ACC2
769 vpmuludq %ymm14, $Y, $OVERFLOW
770 vpaddq $T0, $ACC6, $ACC5
771 vpmuludq %ymm15, $Y, $T0
772 vpaddq $OVERFLOW, $ACC7, $ACC6
773 vpaddq $T0, $ACC8, $ACC7
776 vpmuludq $B, $B, $OVERFLOW
777 vpand $AND_MASK, $ACC0, $Y
778 vpmuludq 32*1(%rcx), $B, $T0
779 vpaddq $OVERFLOW, $ACC1, $ACC1
780 vpmuludq 32*2(%rcx), $B, $OVERFLOW
781 vpaddq $T0, $ACC2, $ACC2
782 vpmuludq 32*3(%rcx), $B, $T0
783 vpaddq $OVERFLOW, $ACC3, $ACC3
784 vpmuludq 32*4(%rcx), $B, $OVERFLOW
785 vpaddq $T0, $ACC4, $ACC4
786 vpmuludq 32*5(%rcx), $B, $T0
787 vpaddq $OVERFLOW, $ACC5, $ACC5
788 vpmuludq 32*6(%rcx), $B, $OVERFLOW
789 vpaddq $T0, $ACC6, $ACC6
791 vpmuludq $AND_MASK, $Y, $T0
792 vpaddq $OVERFLOW, $ACC7, $ACC7
793 vpmuludq 32*7(%rcx), $B, $ACC8
794 vmovdqa 32*2($a_ptr), $B
795 vpaddq $T0, $ACC0, $OVERFLOW
796 vpaddq $T0, $ACC1, $ACC0
797 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
798 vpaddq $T0, $ACC2, $ACC1
799 vpmuludq 32*3(%rax), $Y, $T0
800 vpaddq $OVERFLOW, $ACC0, $ACC0
801 vpaddq $T0, $ACC3, $ACC2
805 vpmuludq %ymm14, $Y, $OVERFLOW
806 vpaddq $T0, $ACC6, $ACC5
807 vpmuludq %ymm15, $Y, $T0
808 vpaddq $OVERFLOW, $ACC7, $ACC6
809 vpaddq $T0, $ACC8, $ACC7
813 vpand $AND_MASK, $ACC0, $Y
814 vpmuludq 32*2(%rcx), $B, $OVERFLOW
815 vpaddq $T0, $ACC2, $ACC2
816 vpmuludq 32*3(%rcx), $B, $T0
817 vpaddq $OVERFLOW, $ACC3, $ACC3
818 vpmuludq 32*4(%rcx), $B, $OVERFLOW
819 vpaddq $T0, $ACC4, $ACC4
820 vpmuludq 32*5(%rcx), $B, $T0
821 vpaddq $OVERFLOW, $ACC5, $ACC5
822 vpmuludq 32*6(%rcx), $B, $OVERFLOW
823 vpaddq $T0, $ACC6, $ACC6
825 vpmuludq $AND_MASK, $Y, $T0
826 vpaddq $OVERFLOW, $ACC7, $ACC7
827 vpmuludq 32*7(%rcx), $B, $ACC8
828 vmovdqa 32*3($a_ptr), $B
829 vpaddq $T0, $ACC0, $OVERFLOW
830 vpaddq $T0, $ACC1, $ACC0
831 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
832 vpaddq $T0, $ACC2, $ACC1
833 vpmuludq 32*3(%rax), $Y, $T0
834 vpaddq $OVERFLOW, $ACC0, $ACC0
835 vpaddq $T0, $ACC3, $ACC2
839 vpmuludq %ymm14, $Y, $OVERFLOW
840 vpaddq $T0, $ACC6, $ACC5
841 vpmuludq %ymm15, $Y, $T0
842 vpand $AND_MASK, $ACC0, $Y
843 vpaddq $OVERFLOW, $ACC7, $ACC6
844 vpaddq $T0, $ACC8, $ACC7
847 vpmuludq $B, $B, $OVERFLOW
848 vpmuludq 32*3(%rcx), $B, $T0
849 vpaddq $OVERFLOW, $ACC3, $ACC3
850 vpmuludq 32*4(%rcx), $B, $OVERFLOW
851 vpaddq $T0, $ACC4, $ACC4
852 vpmuludq 32*5(%rcx), $B, $T0
853 vpaddq $OVERFLOW, $ACC5, $ACC5
854 vpmuludq 32*6(%rcx), $B, $OVERFLOW
855 vpaddq $T0, $ACC6, $ACC6
857 vpmuludq $AND_MASK, $Y, $T0
858 vpaddq $OVERFLOW, $ACC7, $ACC7
859 vpmuludq 32*7(%rcx), $B, $ACC8
860 vmovdqa 32*4($a_ptr), $B
861 vpaddq $T0, $ACC0, $OVERFLOW
862 vpaddq $T0, $ACC1, $ACC0
863 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
864 vpaddq $T0, $ACC2, $ACC1
865 vpmuludq 32*3(%rax), $Y, $T0
866 vpaddq $OVERFLOW, $ACC0, $ACC0
867 vpaddq $T0, $ACC3, $ACC2
871 vpmuludq %ymm14, $Y, $OVERFLOW
872 vpaddq $T0, $ACC6, $ACC5
873 vpmuludq %ymm15, $Y, $T0
874 vpand $AND_MASK, $ACC0, $Y
875 vpaddq $OVERFLOW, $ACC7, $ACC6
876 vpaddq $T0, $ACC8, $ACC7
880 vpmuludq 32*4(%rcx), $B, $OVERFLOW
881 vpaddq $T0, $ACC4, $ACC4
882 vpmuludq 32*5(%rcx), $B, $T0
883 vpaddq $OVERFLOW, $ACC5, $ACC5
884 vpmuludq 32*6(%rcx), $B, $OVERFLOW
885 vpaddq $T0, $ACC6, $ACC6
887 vpmuludq $AND_MASK, $Y, $T0
888 vpaddq $OVERFLOW, $ACC7, $ACC7
889 vpmuludq 32*7(%rcx), $B, $ACC8
890 vmovdqa 32*5($a_ptr), $B
891 vpaddq $T0, $ACC0, $OVERFLOW
892 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
893 vpaddq $T0, $ACC1, $ACC0
894 vpaddq $T0, $ACC2, $ACC1
895 vpmuludq 32*3+.LAVX2_POLY(%rip), $Y, $T0
896 vpaddq $OVERFLOW, $ACC0, $ACC0
897 vpaddq $T0, $ACC3, $ACC2
901 vpmuludq %ymm14, $Y, $OVERFLOW
902 vpaddq $T0, $ACC6, $ACC5
903 vpmuludq %ymm15, $Y, $T0
904 vpand $AND_MASK, $ACC0, $Y
905 vpaddq $OVERFLOW, $ACC7, $ACC6
906 vpaddq $T0, $ACC8, $ACC7
909 vpmuludq $B, $B, $OVERFLOW
910 vpmuludq 32*5(%rcx), $B, $T0
911 vpaddq $OVERFLOW, $ACC5, $ACC5
912 vpmuludq 32*6(%rcx), $B, $OVERFLOW
913 vpaddq $T0, $ACC6, $ACC6
915 vpmuludq $AND_MASK, $Y, $T0
916 vpaddq $OVERFLOW, $ACC7, $ACC7
917 vpmuludq 32*7(%rcx), $B, $ACC8
918 vmovdqa 32*6($a_ptr), $B
919 vpaddq $T0, $ACC0, $OVERFLOW
920 vpaddq $T0, $ACC1, $ACC0
921 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
922 vpaddq $T0, $ACC2, $ACC1
923 vpmuludq 32*3(%rax), $Y, $T0
924 vpaddq $OVERFLOW, $ACC0, $ACC0
925 vpaddq $T0, $ACC3, $ACC2
929 vpmuludq %ymm14, $Y, $OVERFLOW
930 vpaddq $T0, $ACC6, $ACC5
931 vpmuludq %ymm15, $Y, $T0
932 vpand $AND_MASK, $ACC0, $Y
933 vpaddq $OVERFLOW, $ACC7, $ACC6
934 vpaddq $T0, $ACC8, $ACC7
938 vpmuludq 32*6(%rcx), $B, $OVERFLOW
939 vpaddq $T0, $ACC6, $ACC6
941 vpmuludq $AND_MASK, $Y, $T0
942 vpaddq $OVERFLOW, $ACC7, $ACC7
943 vpmuludq 32*7(%rcx), $B, $ACC8
944 vmovdqa 32*7($a_ptr), $B
945 vpaddq $T0, $ACC0, $OVERFLOW
946 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
947 vpaddq $T0, $ACC1, $ACC0
948 vpaddq $T0, $ACC2, $ACC1
949 vpmuludq 32*3(%rax), $Y, $T0
950 vpaddq $OVERFLOW, $ACC0, $ACC0
951 vpaddq $T0, $ACC3, $ACC2
955 vpmuludq %ymm14, $Y, $OVERFLOW
956 vpaddq $T0, $ACC6, $ACC5
957 vpmuludq %ymm15, $Y, $T0
958 vpand $AND_MASK, $ACC0, $Y
959 vpaddq $OVERFLOW, $ACC7, $ACC6
960 vpaddq $T0, $ACC8, $ACC7
963 vpmuludq $B, $B, $OVERFLOW
965 vpmuludq $AND_MASK, $Y, $T0
966 vpaddq $OVERFLOW, $ACC7, $ACC7
967 vpmuludq 32*7(%rcx), $B, $ACC8
968 vmovdqa 32*8($a_ptr), $B
969 vpaddq $T0, $ACC0, $OVERFLOW
970 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
971 vpaddq $T0, $ACC1, $ACC0
972 vpaddq $T0, $ACC2, $ACC1
973 vpmuludq 32*3(%rax), $Y, $T0
974 vpaddq $OVERFLOW, $ACC0, $ACC0
975 vpaddq $T0, $ACC3, $ACC2
979 vpmuludq %ymm14, $Y, $OVERFLOW
980 vpaddq $T0, $ACC6, $ACC5
981 vpmuludq %ymm15, $Y, $T0
982 vpand $AND_MASK, $ACC0, $Y
983 vpaddq $OVERFLOW, $ACC7, $ACC6
984 vpaddq $T0, $ACC8, $ACC7
987 vpmuludq $B, $B, $ACC8
989 vpmuludq $AND_MASK, $Y, $T0
990 vpaddq $T0, $ACC0, $OVERFLOW
991 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW
992 vpaddq $T0, $ACC1, $ACC0
993 vpaddq $T0, $ACC2, $ACC1
994 vpmuludq 32*3(%rax), $Y, $T0
995 vpaddq $OVERFLOW, $ACC0, $ACC0
996 vpaddq $T0, $ACC3, $ACC2
1000 vpmuludq %ymm14, $Y, $OVERFLOW
1001 vpaddq $T0, $ACC6, $ACC5
1002 vpmuludq %ymm15, $Y, $T0
1003 vpaddq $OVERFLOW, $ACC7, $ACC6
1004 vpaddq $T0, $ACC8, $ACC7
1006 vpxor $ACC8, $ACC8, $ACC8
1009 .size avx2_sqr_x4,.-avx2_sqr_x4
1011 ################################################################################
1012 # void avx2_sub_x4(void* RESULTx4, void *Ax4, void *Bx4);
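# To keep all 29-bit digits non-negative the subtraction is computed as
# (A + 8*P) - B, adding the precomputed .LAVX2_POLY_x8 table before the
# per-digit subtraction; the result is congruent to A - B mod p and is left
# un-normalized for the caller.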
1013 .type avx2_sub_x4,\@abi-omnipotent
1016 vmovdqa 32*0($a_ptr), $ACC0
1017 lea 160($a_ptr), $a_ptr
1018 lea .LAVX2_POLY_x8+128(%rip), %rax
1019 lea 128($b_ptr), $b_ptr
1020 vmovdqa 32*1-160($a_ptr), $ACC1
1021 vmovdqa 32*2-160($a_ptr), $ACC2
1022 vmovdqa 32*3-160($a_ptr), $ACC3
1023 vmovdqa 32*4-160($a_ptr), $ACC4
1024 vmovdqa 32*5-160($a_ptr), $ACC5
1025 vmovdqa 32*6-160($a_ptr), $ACC6
1026 vmovdqa 32*7-160($a_ptr), $ACC7
1027 vmovdqa 32*8-160($a_ptr), $ACC8
1029 vpaddq 32*0-128(%rax), $ACC0, $ACC0
1030 vpaddq 32*1-128(%rax), $ACC1, $ACC1
1031 vpaddq 32*2-128(%rax), $ACC2, $ACC2
1032 vpaddq 32*3-128(%rax), $ACC3, $ACC3
1033 vpaddq 32*4-128(%rax), $ACC4, $ACC4
1034 vpaddq 32*5-128(%rax), $ACC5, $ACC5
1035 vpaddq 32*6-128(%rax), $ACC6, $ACC6
1036 vpaddq 32*7-128(%rax), $ACC7, $ACC7
1037 vpaddq 32*8-128(%rax), $ACC8, $ACC8
1039 vpsubq 32*0-128($b_ptr), $ACC0, $ACC0
1040 vpsubq 32*1-128($b_ptr), $ACC1, $ACC1
1041 vpsubq 32*2-128($b_ptr), $ACC2, $ACC2
1042 vpsubq 32*3-128($b_ptr), $ACC3, $ACC3
1043 vpsubq 32*4-128($b_ptr), $ACC4, $ACC4
1044 vpsubq 32*5-128($b_ptr), $ACC5, $ACC5
1045 vpsubq 32*6-128($b_ptr), $ACC6, $ACC6
1046 vpsubq 32*7-128($b_ptr), $ACC7, $ACC7
1047 vpsubq 32*8-128($b_ptr), $ACC8, $ACC8
1050 .size avx2_sub_x4,.-avx2_sub_x4
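# Branch-free selection used by the point-addition code.  Two masks saved on
# the stack (at frame offsets 8+32*9*8 and 8+32*9*8+32) are all-ones when the
# corresponding input point was the point at infinity.  Using only
# AND/ANDN/XOR the result is, roughly: ACC0..ACC8 if neither mask is set, the
# values at (%rsi) if only the first mask is set, and the values at (%rdx) if
# the second mask is set, so no secret-dependent branches or addresses occur.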
1052 .type avx2_select_n_store,\@abi-omnipotent
1054 avx2_select_n_store:
1055 vmovdqa `8+32*9*8`(%rsp), $Y
1056 vpor `8+32*9*8+32`(%rsp), $Y, $Y
1058 vpandn $ACC0, $Y, $ACC0
1059 vpandn $ACC1, $Y, $ACC1
1060 vpandn $ACC2, $Y, $ACC2
1061 vpandn $ACC3, $Y, $ACC3
1062 vpandn $ACC4, $Y, $ACC4
1063 vpandn $ACC5, $Y, $ACC5
1064 vpandn $ACC6, $Y, $ACC6
1065 vmovdqa `8+32*9*8+32`(%rsp), $B
1066 vpandn $ACC7, $Y, $ACC7
1067 vpandn `8+32*9*8`(%rsp), $B, $B
1068 vpandn $ACC8, $Y, $ACC8
1070 vpand 32*0(%rsi), $B, $T0
1072 vpand 32*1(%rsi), $B, $Y
1073 vpxor $T0, $ACC0, $ACC0
1074 vpand 32*2(%rsi), $B, $T0
1075 vpxor $Y, $ACC1, $ACC1
1076 vpand 32*3(%rsi), $B, $Y
1077 vpxor $T0, $ACC2, $ACC2
1078 vpand 32*4-160(%rax), $B, $T0
1079 vpxor $Y, $ACC3, $ACC3
1080 vpand 32*5-160(%rax), $B, $Y
1081 vpxor $T0, $ACC4, $ACC4
1082 vpand 32*6-160(%rax), $B, $T0
1083 vpxor $Y, $ACC5, $ACC5
1084 vpand 32*7-160(%rax), $B, $Y
1085 vpxor $T0, $ACC6, $ACC6
1086 vpand 32*8-160(%rax), $B, $T0
1087 vmovdqa `8+32*9*8+32`(%rsp), $B
1088 vpxor $Y, $ACC7, $ACC7
1090 vpand 32*0(%rdx), $B, $Y
1092 vpxor $T0, $ACC8, $ACC8
1093 vpand 32*1(%rdx), $B, $T0
1094 vpxor $Y, $ACC0, $ACC0
1095 vpand 32*2(%rdx), $B, $Y
1096 vpxor $T0, $ACC1, $ACC1
1097 vpand 32*3(%rdx), $B, $T0
1098 vpxor $Y, $ACC2, $ACC2
1099 vpand 32*4-160(%rax), $B, $Y
1100 vpxor $T0, $ACC3, $ACC3
1101 vpand 32*5-160(%rax), $B, $T0
1102 vpxor $Y, $ACC4, $ACC4
1103 vpand 32*6-160(%rax), $B, $Y
1104 vpxor $T0, $ACC5, $ACC5
1105 vpand 32*7-160(%rax), $B, $T0
1106 vpxor $Y, $ACC6, $ACC6
1107 vpand 32*8-160(%rax), $B, $Y
1108 vpxor $T0, $ACC7, $ACC7
1109 vpxor $Y, $ACC8, $ACC8
1113 .size avx2_select_n_store,.-avx2_select_n_store
1115 $code.=<<___ if (0); # inlined
1116 ################################################################################
1117 # void avx2_mul_by2_x4(void* RESULTx4, void *Ax4);
1118 .type avx2_mul_by2_x4,\@abi-omnipotent
1121 vmovdqa 32*0($a_ptr), $ACC0
1122 lea 160($a_ptr), %rax
1123 vmovdqa 32*1($a_ptr), $ACC1
1124 vmovdqa 32*2($a_ptr), $ACC2
1125 vmovdqa 32*3($a_ptr), $ACC3
1126 vmovdqa 32*4-160(%rax), $ACC4
1127 vmovdqa 32*5-160(%rax), $ACC5
1128 vmovdqa 32*6-160(%rax), $ACC6
1129 vmovdqa 32*7-160(%rax), $ACC7
1130 vmovdqa 32*8-160(%rax), $ACC8
1132 vpaddq $ACC0, $ACC0, $ACC0
1133 vpaddq $ACC1, $ACC1, $ACC1
1134 vpaddq $ACC2, $ACC2, $ACC2
1135 vpaddq $ACC3, $ACC3, $ACC3
1136 vpaddq $ACC4, $ACC4, $ACC4
1137 vpaddq $ACC5, $ACC5, $ACC5
1138 vpaddq $ACC6, $ACC6, $ACC6
1139 vpaddq $ACC7, $ACC7, $ACC7
1140 vpaddq $ACC8, $ACC8, $ACC8
1143 .size avx2_mul_by2_x4,.-avx2_mul_by2_x4
1145 my ($r_ptr_in,$a_ptr_in,$b_ptr_in)=("%rdi","%rsi","%rdx");
1146 my ($r_ptr,$a_ptr,$b_ptr)=("%r8","%r9","%r10");
1149 ################################################################################
1150 # void ecp_nistz256_avx2_point_add_affine_x4(void* RESULTx4, void *Ax4, void *Bx4);
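# Four point additions are processed in parallel, one per 64-bit lane: a
# Jacobian point (X1,Y1,Z1) from Ax4 plus an affine point (X2,Y2) from Bx4,
# using the usual mixed-addition formulas (sketch):
#
#	U2 = X2*Z1^2,  S2 = Y2*Z1^3,  H = U2 - X1,  R = S2 - Y1
#	X3 = R^2 - H^3 - 2*X1*H^2
#	Y3 = R*(X1*H^2 - X3) - Y1*H^3
#	Z3 = Z1*H
#
# The infinity masks computed below are used to substitute the surviving
# operand (with Z = 1 for the affine input) when either point is at infinity.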
1151 .globl ecp_nistz256_avx2_point_add_affine_x4
1152 .type ecp_nistz256_avx2_point_add_affine_x4,\@function,3
1154 ecp_nistz256_avx2_point_add_affine_x4:
1159 $code.=<<___ if ($win64);
1160 lea -16*10(%rsp), %rsp
1161 vmovaps %xmm6, -8-16*10(%rax)
1162 vmovaps %xmm7, -8-16*9(%rax)
1163 vmovaps %xmm8, -8-16*8(%rax)
1164 vmovaps %xmm9, -8-16*7(%rax)
1165 vmovaps %xmm10, -8-16*6(%rax)
1166 vmovaps %xmm11, -8-16*5(%rax)
1167 vmovaps %xmm12, -8-16*4(%rax)
1168 vmovaps %xmm13, -8-16*3(%rax)
1169 vmovaps %xmm14, -8-16*2(%rax)
1170 vmovaps %xmm15, -8-16*1(%rax)
1175 # Result + 32*0 = Result.X
1176 # Result + 32*9 = Result.Y
1177 # Result + 32*18 = Result.Z
1186 sub \$`32*9*8+32*2+32*8`, %rsp
1189 mov $r_ptr_in, $r_ptr
1190 mov $a_ptr_in, $a_ptr
1191 mov $b_ptr_in, $b_ptr
1193 vmovdqa 32*0($a_ptr_in), %ymm0
1194 vmovdqa .LAVX2_AND_MASK(%rip), $AND_MASK
1195 vpxor %ymm1, %ymm1, %ymm1
1196 lea 256($a_ptr_in), %rax # size optimization
1197 vpor 32*1($a_ptr_in), %ymm0, %ymm0
1198 vpor 32*2($a_ptr_in), %ymm0, %ymm0
1199 vpor 32*3($a_ptr_in), %ymm0, %ymm0
1200 vpor 32*4-256(%rax), %ymm0, %ymm0
1201 lea 256(%rax), %rcx # size optimization
1202 vpor 32*5-256(%rax), %ymm0, %ymm0
1203 vpor 32*6-256(%rax), %ymm0, %ymm0
1204 vpor 32*7-256(%rax), %ymm0, %ymm0
1205 vpor 32*8-256(%rax), %ymm0, %ymm0
1206 vpor 32*9-256(%rax), %ymm0, %ymm0
1207 vpor 32*10-256(%rax), %ymm0, %ymm0
1208 vpor 32*11-256(%rax), %ymm0, %ymm0
1209 vpor 32*12-512(%rcx), %ymm0, %ymm0
1210 vpor 32*13-512(%rcx), %ymm0, %ymm0
1211 vpor 32*14-512(%rcx), %ymm0, %ymm0
1212 vpor 32*15-512(%rcx), %ymm0, %ymm0
1213 vpor 32*16-512(%rcx), %ymm0, %ymm0
1214 vpor 32*17-512(%rcx), %ymm0, %ymm0
1215 vpcmpeqq %ymm1, %ymm0, %ymm0
1216 vmovdqa %ymm0, `32*9*8`(%rsp)
1218 vpxor %ymm1, %ymm1, %ymm1
1219 vmovdqa 32*0($b_ptr), %ymm0
1220 lea 256($b_ptr), %rax # size optimization
1221 vpor 32*1($b_ptr), %ymm0, %ymm0
1222 vpor 32*2($b_ptr), %ymm0, %ymm0
1223 vpor 32*3($b_ptr), %ymm0, %ymm0
1224 vpor 32*4-256(%rax), %ymm0, %ymm0
1225 lea 256(%rax), %rcx # size optimization
1226 vpor 32*5-256(%rax), %ymm0, %ymm0
1227 vpor 32*6-256(%rax), %ymm0, %ymm0
1228 vpor 32*7-256(%rax), %ymm0, %ymm0
1229 vpor 32*8-256(%rax), %ymm0, %ymm0
1230 vpor 32*9-256(%rax), %ymm0, %ymm0
1231 vpor 32*10-256(%rax), %ymm0, %ymm0
1232 vpor 32*11-256(%rax), %ymm0, %ymm0
1233 vpor 32*12-512(%rcx), %ymm0, %ymm0
1234 vpor 32*13-512(%rcx), %ymm0, %ymm0
1235 vpor 32*14-512(%rcx), %ymm0, %ymm0
1236 vpor 32*15-512(%rcx), %ymm0, %ymm0
1237 vpor 32*16-512(%rcx), %ymm0, %ymm0
1238 vpor 32*17-512(%rcx), %ymm0, %ymm0
1239 vpcmpeqq %ymm1, %ymm0, %ymm0
1240 vmovdqa %ymm0, `32*9*8+32`(%rsp)
1243 lea `32*9*2`($a_ptr), %rsi
1244 lea `32*9*2`(%rsp), %rdi
1245 lea `32*9*8+32*2`(%rsp), %rcx # temporary vector
1247 call avx2_normalize_n_store
1250 lea `32*9*0`($b_ptr), %rsi
1251 lea `32*9*2`(%rsp), %rdx
1252 lea `32*9*0`(%rsp), %rdi
1254 #call avx2_normalize
1257 # S2 = Z1*Z1^2 = Z1^3
1258 lea `32*9*2`($a_ptr), %rsi
1259 lea `32*9*2`(%rsp), %rdx
1260 lea `32*9*1`(%rsp), %rdi
1262 call avx2_normalize_n_store
1264 # S2 = S2*Y2 = Y2*Z1^3
1265 lea `32*9*1`($b_ptr), %rsi
1266 lea `32*9*1`(%rsp), %rdx
1267 lea `32*9*1`(%rsp), %rdi
1269 call avx2_normalize_n_store
1271 # H = U2 - U1 = U2 - X1
1272 lea `32*9*0`(%rsp), %rsi
1273 lea `32*9*0`($a_ptr), %rdx
1274 lea `32*9*3`(%rsp), %rdi
1276 call avx2_normalize_n_store
1278 # R = S2 - S1 = S2 - Y1
1279 lea `32*9*1`(%rsp), %rsi
1280 lea `32*9*1`($a_ptr), %rdx
1281 lea `32*9*4`(%rsp), %rdi
1283 call avx2_normalize_n_store
1286 lea `32*9*3`(%rsp), %rsi
1287 lea `32*9*2`($a_ptr), %rdx
1288 lea `32*9*2`($r_ptr), %rdi
1292 lea .LONE(%rip), %rsi
1293 lea `32*9*2`($a_ptr), %rdx
1294 call avx2_select_n_store
1297 lea `32*9*4`(%rsp), %rsi
1298 lea `32*9*6`(%rsp), %rdi
1299 lea `32*9*8+32*2`(%rsp), %rcx # temporary vector
1301 call avx2_normalize_n_store
1304 lea `32*9*3`(%rsp), %rsi
1305 lea `32*9*5`(%rsp), %rdi
1307 call avx2_normalize_n_store
1310 lea `32*9*3`(%rsp), %rsi
1311 lea `32*9*5`(%rsp), %rdx
1312 lea `32*9*7`(%rsp), %rdi
1314 call avx2_normalize_n_store
1317 lea `32*9*0`($a_ptr), %rsi
1318 lea `32*9*5`(%rsp), %rdx
1319 lea `32*9*0`(%rsp), %rdi
1321 #call avx2_normalize
1325 #lea 32*9*0(%rsp), %rsi
1326 #lea 32*9*5(%rsp), %rdi
1327 #call avx2_mul_by2_x4
1329 vpaddq $ACC0, $ACC0, $ACC0 # inlined avx2_mul_by2_x4
1330 lea `32*9*5`(%rsp), %rdi
1331 vpaddq $ACC1, $ACC1, $ACC1
1332 vpaddq $ACC2, $ACC2, $ACC2
1333 vpaddq $ACC3, $ACC3, $ACC3
1334 vpaddq $ACC4, $ACC4, $ACC4
1335 vpaddq $ACC5, $ACC5, $ACC5
1336 vpaddq $ACC6, $ACC6, $ACC6
1337 vpaddq $ACC7, $ACC7, $ACC7
1338 vpaddq $ACC8, $ACC8, $ACC8
1339 call avx2_normalize_n_store
1342 #lea 32*9*6(%rsp), %rsi
1343 #lea 32*9*7(%rsp), %rdx
1344 #lea 32*9*5(%rsp), %rcx
1345 #lea 32*9*0($r_ptr), %rdi
1351 #lea 32*9*0($r_ptr), %rsi
1352 #lea 32*9*0($r_ptr), %rdi
1357 lea `32*9*6+128`(%rsp), %rsi
1358 lea .LAVX2_POLY_x2+128(%rip), %rax
1359 lea `32*9*7+128`(%rsp), %rdx
1360 lea `32*9*5+128`(%rsp), %rcx
1361 lea `32*9*0`($r_ptr), %rdi
1363 vmovdqa 32*0-128(%rsi), $ACC0
1364 vmovdqa 32*1-128(%rsi), $ACC1
1365 vmovdqa 32*2-128(%rsi), $ACC2
1366 vmovdqa 32*3-128(%rsi), $ACC3
1367 vmovdqa 32*4-128(%rsi), $ACC4
1368 vmovdqa 32*5-128(%rsi), $ACC5
1369 vmovdqa 32*6-128(%rsi), $ACC6
1370 vmovdqa 32*7-128(%rsi), $ACC7
1371 vmovdqa 32*8-128(%rsi), $ACC8
1373 vpaddq 32*0-128(%rax), $ACC0, $ACC0
1374 vpaddq 32*1-128(%rax), $ACC1, $ACC1
1375 vpaddq 32*2-128(%rax), $ACC2, $ACC2
1376 vpaddq 32*3-128(%rax), $ACC3, $ACC3
1377 vpaddq 32*4-128(%rax), $ACC4, $ACC4
1378 vpaddq 32*5-128(%rax), $ACC5, $ACC5
1379 vpaddq 32*6-128(%rax), $ACC6, $ACC6
1380 vpaddq 32*7-128(%rax), $ACC7, $ACC7
1381 vpaddq 32*8-128(%rax), $ACC8, $ACC8
1383 vpsubq 32*0-128(%rdx), $ACC0, $ACC0
1384 vpsubq 32*1-128(%rdx), $ACC1, $ACC1
1385 vpsubq 32*2-128(%rdx), $ACC2, $ACC2
1386 vpsubq 32*3-128(%rdx), $ACC3, $ACC3
1387 vpsubq 32*4-128(%rdx), $ACC4, $ACC4
1388 vpsubq 32*5-128(%rdx), $ACC5, $ACC5
1389 vpsubq 32*6-128(%rdx), $ACC6, $ACC6
1390 vpsubq 32*7-128(%rdx), $ACC7, $ACC7
1391 vpsubq 32*8-128(%rdx), $ACC8, $ACC8
1393 vpsubq 32*0-128(%rcx), $ACC0, $ACC0
1394 vpsubq 32*1-128(%rcx), $ACC1, $ACC1
1395 vpsubq 32*2-128(%rcx), $ACC2, $ACC2
1396 vpsubq 32*3-128(%rcx), $ACC3, $ACC3
1397 vpsubq 32*4-128(%rcx), $ACC4, $ACC4
1398 vpsubq 32*5-128(%rcx), $ACC5, $ACC5
1399 vpsubq 32*6-128(%rcx), $ACC6, $ACC6
1400 vpsubq 32*7-128(%rcx), $ACC7, $ACC7
1401 vpsubq 32*8-128(%rcx), $ACC8, $ACC8
1404 lea 32*0($b_ptr), %rsi
1405 lea 32*0($a_ptr), %rdx
1406 call avx2_select_n_store
1409 lea `32*9*0`(%rsp), %rsi
1410 lea `32*9*0`($r_ptr), %rdx
1411 lea `32*9*3`(%rsp), %rdi
1413 call avx2_normalize_n_store
1416 lea `32*9*3`(%rsp), %rsi
1417 lea `32*9*4`(%rsp), %rdx
1418 lea `32*9*3`(%rsp), %rdi
1420 call avx2_normalize_n_store
1423 lea `32*9*7`(%rsp), %rsi
1424 lea `32*9*1`($a_ptr), %rdx
1425 lea `32*9*1`(%rsp), %rdi
1427 call avx2_normalize_n_store
1430 lea `32*9*3`(%rsp), %rsi
1431 lea `32*9*1`(%rsp), %rdx
1432 lea `32*9*1`($r_ptr), %rdi
1436 lea 32*9($b_ptr), %rsi
1437 lea 32*9($a_ptr), %rdx
1438 call avx2_select_n_store
1440 #lea 32*9*0($r_ptr), %rsi
1441 #lea 32*9*0($r_ptr), %rdi
1442 #call avx2_mul_by1_x4
1446 lea `32*9*1`($r_ptr), %rsi
1447 lea `32*9*1`($r_ptr), %rdi
1448 call avx2_mul_by1_x4
1449 call avx2_normalize_n_store
1453 $code.=<<___ if ($win64);
1454 movaps %xmm6, -16*10(%rbp)
1455 movaps %xmm7, -16*9(%rbp)
1456 movaps %xmm8, -16*8(%rbp)
1457 movaps %xmm9, -16*7(%rbp)
1458 movaps %xmm10, -16*6(%rbp)
1459 movaps %xmm11, -16*5(%rbp)
1460 movaps %xmm12, -16*4(%rbp)
1461 movaps %xmm13, -16*3(%rbp)
1462 movaps %xmm14, -16*2(%rbp)
1463 movaps %xmm15, -16*1(%rbp)
1469 .size ecp_nistz256_avx2_point_add_affine_x4,.-ecp_nistz256_avx2_point_add_affine_x4
1471 ################################################################################
1472 # void ecp_nistz256_avx2_point_add_affines_x4(void* RESULTx4, void *Ax4, void *Bx4);
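# As above, but both inputs are affine (Z implicitly 1), so the sketch
# simplifies to:
#
#	H  = X2 - X1,  R = Y2 - Y1
#	X3 = R^2 - H^3 - 2*X1*H^2
#	Y3 = R*(X1*H^2 - X3) - Y1*H^3
#	Z3 = H
#
# with the same infinity-mask substitution as in the _affine_ variant.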
1473 .globl ecp_nistz256_avx2_point_add_affines_x4
1474 .type ecp_nistz256_avx2_point_add_affines_x4,\@function,3
1476 ecp_nistz256_avx2_point_add_affines_x4:
1481 $code.=<<___ if ($win64);
1482 lea -16*10(%rsp), %rsp
1483 vmovaps %xmm6, -8-16*10(%rax)
1484 vmovaps %xmm7, -8-16*9(%rax)
1485 vmovaps %xmm8, -8-16*8(%rax)
1486 vmovaps %xmm9, -8-16*7(%rax)
1487 vmovaps %xmm10, -8-16*6(%rax)
1488 vmovaps %xmm11, -8-16*5(%rax)
1489 vmovaps %xmm12, -8-16*4(%rax)
1490 vmovaps %xmm13, -8-16*3(%rax)
1491 vmovaps %xmm14, -8-16*2(%rax)
1492 vmovaps %xmm15, -8-16*1(%rax)
1497 # Result + 32*0 = Result.X
1498 # Result + 32*9 = Result.Y
1499 # Result + 32*18 = Result.Z
1507 sub \$`32*9*8+32*2+32*8`, %rsp
1510 mov $r_ptr_in, $r_ptr
1511 mov $a_ptr_in, $a_ptr
1512 mov $b_ptr_in, $b_ptr
1514 vmovdqa 32*0($a_ptr_in), %ymm0
1515 vmovdqa .LAVX2_AND_MASK(%rip), $AND_MASK
1516 vpxor %ymm1, %ymm1, %ymm1
1517 lea 256($a_ptr_in), %rax # size optimization
1518 vpor 32*1($a_ptr_in), %ymm0, %ymm0
1519 vpor 32*2($a_ptr_in), %ymm0, %ymm0
1520 vpor 32*3($a_ptr_in), %ymm0, %ymm0
1521 vpor 32*4-256(%rax), %ymm0, %ymm0
1522 lea 256(%rax), %rcx # size optimization
1523 vpor 32*5-256(%rax), %ymm0, %ymm0
1524 vpor 32*6-256(%rax), %ymm0, %ymm0
1525 vpor 32*7-256(%rax), %ymm0, %ymm0
1526 vpor 32*8-256(%rax), %ymm0, %ymm0
1527 vpor 32*9-256(%rax), %ymm0, %ymm0
1528 vpor 32*10-256(%rax), %ymm0, %ymm0
1529 vpor 32*11-256(%rax), %ymm0, %ymm0
1530 vpor 32*12-512(%rcx), %ymm0, %ymm0
1531 vpor 32*13-512(%rcx), %ymm0, %ymm0
1532 vpor 32*14-512(%rcx), %ymm0, %ymm0
1533 vpor 32*15-512(%rcx), %ymm0, %ymm0
1534 vpor 32*16-512(%rcx), %ymm0, %ymm0
1535 vpor 32*17-512(%rcx), %ymm0, %ymm0
1536 vpcmpeqq %ymm1, %ymm0, %ymm0
1537 vmovdqa %ymm0, `32*9*8`(%rsp)
1539 vpxor %ymm1, %ymm1, %ymm1
1540 vmovdqa 32*0($b_ptr), %ymm0
1541 lea 256($b_ptr), %rax # size optimization
1542 vpor 32*1($b_ptr), %ymm0, %ymm0
1543 vpor 32*2($b_ptr), %ymm0, %ymm0
1544 vpor 32*3($b_ptr), %ymm0, %ymm0
1545 vpor 32*4-256(%rax), %ymm0, %ymm0
1546 lea 256(%rax), %rcx # size optimization
1547 vpor 32*5-256(%rax), %ymm0, %ymm0
1548 vpor 32*6-256(%rax), %ymm0, %ymm0
1549 vpor 32*7-256(%rax), %ymm0, %ymm0
1550 vpor 32*8-256(%rax), %ymm0, %ymm0
1551 vpor 32*9-256(%rax), %ymm0, %ymm0
1552 vpor 32*10-256(%rax), %ymm0, %ymm0
1553 vpor 32*11-256(%rax), %ymm0, %ymm0
1554 vpor 32*12-512(%rcx), %ymm0, %ymm0
1555 vpor 32*13-512(%rcx), %ymm0, %ymm0
1556 vpor 32*14-512(%rcx), %ymm0, %ymm0
1557 vpor 32*15-512(%rcx), %ymm0, %ymm0
1558 vpor 32*16-512(%rcx), %ymm0, %ymm0
1559 vpor 32*17-512(%rcx), %ymm0, %ymm0
1560 vpcmpeqq %ymm1, %ymm0, %ymm0
1561 vmovdqa %ymm0, `32*9*8+32`(%rsp)
1563 # H = U2 - U1 = X2 - X1
1564 lea `32*9*0`($b_ptr), %rsi
1565 lea `32*9*0`($a_ptr), %rdx
1566 lea `32*9*3`(%rsp), %rdi
1568 call avx2_normalize_n_store
1570 # R = S2 - S1 = Y2 - Y1
1571 lea `32*9*1`($b_ptr), %rsi
1572 lea `32*9*1`($a_ptr), %rdx
1573 lea `32*9*4`(%rsp), %rdi
1575 call avx2_normalize_n_store
1578 lea `32*9*3`(%rsp), %rsi
1579 lea `32*9*2`($r_ptr), %rdi
1580 call avx2_mul_by1_x4
1583 vmovdqa `32*9*8`(%rsp), $B
1584 vpor `32*9*8+32`(%rsp), $B, $B
1586 vpandn $ACC0, $B, $ACC0
1587 lea .LONE+128(%rip), %rax
1588 vpandn $ACC1, $B, $ACC1
1589 vpandn $ACC2, $B, $ACC2
1590 vpandn $ACC3, $B, $ACC3
1591 vpandn $ACC4, $B, $ACC4
1592 vpandn $ACC5, $B, $ACC5
1593 vpandn $ACC6, $B, $ACC6
1594 vpandn $ACC7, $B, $ACC7
1596 vpand 32*0-128(%rax), $B, $T0
1597 vpandn $ACC8, $B, $ACC8
1598 vpand 32*1-128(%rax), $B, $Y
1599 vpxor $T0, $ACC0, $ACC0
1600 vpand 32*2-128(%rax), $B, $T0
1601 vpxor $Y, $ACC1, $ACC1
1602 vpand 32*3-128(%rax), $B, $Y
1603 vpxor $T0, $ACC2, $ACC2
1604 vpand 32*4-128(%rax), $B, $T0
1605 vpxor $Y, $ACC3, $ACC3
1606 vpand 32*5-128(%rax), $B, $Y
1607 vpxor $T0, $ACC4, $ACC4
1608 vpand 32*6-128(%rax), $B, $T0
1609 vpxor $Y, $ACC5, $ACC5
1610 vpand 32*7-128(%rax), $B, $Y
1611 vpxor $T0, $ACC6, $ACC6
1612 vpand 32*8-128(%rax), $B, $T0
1613 vpxor $Y, $ACC7, $ACC7
1614 vpxor $T0, $ACC8, $ACC8
1618 lea `32*9*4`(%rsp), %rsi
1619 lea `32*9*6`(%rsp), %rdi
1620 lea `32*9*8+32*2`(%rsp), %rcx # temporary vector
1622 call avx2_normalize_n_store
1625 lea `32*9*3`(%rsp), %rsi
1626 lea `32*9*5`(%rsp), %rdi
1628 call avx2_normalize_n_store
1631 lea `32*9*3`(%rsp), %rsi
1632 lea `32*9*5`(%rsp), %rdx
1633 lea `32*9*7`(%rsp), %rdi
1635 call avx2_normalize_n_store
1638 lea `32*9*0`($a_ptr), %rsi
1639 lea `32*9*5`(%rsp), %rdx
1640 lea `32*9*0`(%rsp), %rdi
1642 #call avx2_normalize
1646 #lea 32*9*0(%rsp), %rsi
1647 #lea 32*9*5(%rsp), %rdi
1648 #call avx2_mul_by2_x4
1650 vpaddq $ACC0, $ACC0, $ACC0 # inlined avx2_mul_by2_x4
1651 lea `32*9*5`(%rsp), %rdi
1652 vpaddq $ACC1, $ACC1, $ACC1
1653 vpaddq $ACC2, $ACC2, $ACC2
1654 vpaddq $ACC3, $ACC3, $ACC3
1655 vpaddq $ACC4, $ACC4, $ACC4
1656 vpaddq $ACC5, $ACC5, $ACC5
1657 vpaddq $ACC6, $ACC6, $ACC6
1658 vpaddq $ACC7, $ACC7, $ACC7
1659 vpaddq $ACC8, $ACC8, $ACC8
1660 call avx2_normalize_n_store
1663 #lea 32*9*6(%rsp), %rsi
1664 #lea 32*9*7(%rsp), %rdx
1665 #lea 32*9*5(%rsp), %rcx
1666 #lea 32*9*0($r_ptr), %rdi
1672 #lea 32*9*0($r_ptr), %rsi
1673 #lea 32*9*0($r_ptr), %rdi
1678 lea `32*9*6+128`(%rsp), %rsi
1679 lea .LAVX2_POLY_x2+128(%rip), %rax
1680 lea `32*9*7+128`(%rsp), %rdx
1681 lea `32*9*5+128`(%rsp), %rcx
1682 lea `32*9*0`($r_ptr), %rdi
1684 vmovdqa 32*0-128(%rsi), $ACC0
1685 vmovdqa 32*1-128(%rsi), $ACC1
1686 vmovdqa 32*2-128(%rsi), $ACC2
1687 vmovdqa 32*3-128(%rsi), $ACC3
1688 vmovdqa 32*4-128(%rsi), $ACC4
1689 vmovdqa 32*5-128(%rsi), $ACC5
1690 vmovdqa 32*6-128(%rsi), $ACC6
1691 vmovdqa 32*7-128(%rsi), $ACC7
1692 vmovdqa 32*8-128(%rsi), $ACC8
1694 vpaddq 32*0-128(%rax), $ACC0, $ACC0
1695 vpaddq 32*1-128(%rax), $ACC1, $ACC1
1696 vpaddq 32*2-128(%rax), $ACC2, $ACC2
1697 vpaddq 32*3-128(%rax), $ACC3, $ACC3
1698 vpaddq 32*4-128(%rax), $ACC4, $ACC4
1699 vpaddq 32*5-128(%rax), $ACC5, $ACC5
1700 vpaddq 32*6-128(%rax), $ACC6, $ACC6
1701 vpaddq 32*7-128(%rax), $ACC7, $ACC7
1702 vpaddq 32*8-128(%rax), $ACC8, $ACC8
1704 vpsubq 32*0-128(%rdx), $ACC0, $ACC0
1705 vpsubq 32*1-128(%rdx), $ACC1, $ACC1
1706 vpsubq 32*2-128(%rdx), $ACC2, $ACC2
1707 vpsubq 32*3-128(%rdx), $ACC3, $ACC3
1708 vpsubq 32*4-128(%rdx), $ACC4, $ACC4
1709 vpsubq 32*5-128(%rdx), $ACC5, $ACC5
1710 vpsubq 32*6-128(%rdx), $ACC6, $ACC6
1711 vpsubq 32*7-128(%rdx), $ACC7, $ACC7
1712 vpsubq 32*8-128(%rdx), $ACC8, $ACC8
1714 vpsubq 32*0-128(%rcx), $ACC0, $ACC0
1715 vpsubq 32*1-128(%rcx), $ACC1, $ACC1
1716 vpsubq 32*2-128(%rcx), $ACC2, $ACC2
1717 vpsubq 32*3-128(%rcx), $ACC3, $ACC3
1718 vpsubq 32*4-128(%rcx), $ACC4, $ACC4
1719 vpsubq 32*5-128(%rcx), $ACC5, $ACC5
1720 vpsubq 32*6-128(%rcx), $ACC6, $ACC6
1721 vpsubq 32*7-128(%rcx), $ACC7, $ACC7
1722 vpsubq 32*8-128(%rcx), $ACC8, $ACC8
1725 lea 32*0($b_ptr), %rsi
1726 lea 32*0($a_ptr), %rdx
1727 call avx2_select_n_store
1730 lea `32*9*0`(%rsp), %rsi
1731 lea `32*9*0`($r_ptr), %rdx
1732 lea `32*9*3`(%rsp), %rdi
1734 call avx2_normalize_n_store
1737 lea `32*9*3`(%rsp), %rsi
1738 lea `32*9*4`(%rsp), %rdx
1739 lea `32*9*3`(%rsp), %rdi
1741 call avx2_normalize_n_store
1744 lea `32*9*7`(%rsp), %rsi
1745 lea `32*9*1`($a_ptr), %rdx
1746 lea `32*9*1`(%rsp), %rdi
1748 call avx2_normalize_n_store
1751 lea `32*9*3`(%rsp), %rsi
1752 lea `32*9*1`(%rsp), %rdx
1753 lea `32*9*1`($r_ptr), %rdi
1757 lea 32*9($b_ptr), %rsi
1758 lea 32*9($a_ptr), %rdx
1759 call avx2_select_n_store
1761 #lea 32*9*0($r_ptr), %rsi
1762 #lea 32*9*0($r_ptr), %rdi
1763 #call avx2_mul_by1_x4
1767 lea `32*9*1`($r_ptr), %rsi
1768 lea `32*9*1`($r_ptr), %rdi
1769 call avx2_mul_by1_x4
1770 call avx2_normalize_n_store
1774 $code.=<<___ if ($win64);
1775 movaps %xmm6, -16*10(%rbp)
1776 movaps %xmm7, -16*9(%rbp)
1777 movaps %xmm8, -16*8(%rbp)
1778 movaps %xmm9, -16*7(%rbp)
1779 movaps %xmm10, -16*6(%rbp)
1780 movaps %xmm11, -16*5(%rbp)
1781 movaps %xmm12, -16*4(%rbp)
1782 movaps %xmm13, -16*3(%rbp)
1783 movaps %xmm14, -16*2(%rbp)
1784 movaps %xmm15, -16*1(%rbp)
1790 .size ecp_nistz256_avx2_point_add_affines_x4,.-ecp_nistz256_avx2_point_add_affines_x4
1792 ################################################################################
1793 # void ecp_nistz256_avx2_to_mont(void* RESULTx4, void *Ax4);
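# One Montgomery multiplication by .LTO_MONT_AVX2 = 2^266 mod p: the input in
# native Montgomery form a*2^256 becomes a*2^261, the redundant form used by
# the routines above.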
1794 .globl ecp_nistz256_avx2_to_mont
1795 .type ecp_nistz256_avx2_to_mont,\@function,2
1797 ecp_nistz256_avx2_to_mont:
1800 $code.=<<___ if ($win64);
1801 lea -8-16*10(%rsp), %rsp
1802 vmovaps %xmm6, -8-16*10(%rax)
1803 vmovaps %xmm7, -8-16*9(%rax)
1804 vmovaps %xmm8, -8-16*8(%rax)
1805 vmovaps %xmm9, -8-16*7(%rax)
1806 vmovaps %xmm10, -8-16*6(%rax)
1807 vmovaps %xmm11, -8-16*5(%rax)
1808 vmovaps %xmm12, -8-16*4(%rax)
1809 vmovaps %xmm13, -8-16*3(%rax)
1810 vmovaps %xmm14, -8-16*2(%rax)
1811 vmovaps %xmm15, -8-16*1(%rax)
1814 vmovdqa .LAVX2_AND_MASK(%rip), $AND_MASK
1815 lea .LTO_MONT_AVX2(%rip), %rdx
1817 call avx2_normalize_n_store
1821 $code.=<<___ if ($win64);
1822 movaps 16*0(%rsp), %xmm6
1823 movaps 16*1(%rsp), %xmm7
1824 movaps 16*2(%rsp), %xmm8
1825 movaps 16*3(%rsp), %xmm9
1826 movaps 16*4(%rsp), %xmm10
1827 movaps 16*5(%rsp), %xmm11
1828 movaps 16*6(%rsp), %xmm12
1829 movaps 16*7(%rsp), %xmm13
1830 movaps 16*8(%rsp), %xmm14
1831 movaps 16*9(%rsp), %xmm15
1832 lea 8+16*10(%rsp), %rsp
1836 .size ecp_nistz256_avx2_to_mont,.-ecp_nistz256_avx2_to_mont
1838 ################################################################################
1839 # void ecp_nistz256_avx2_from_mont(void* RESULTx4, void *Ax4);
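# The inverse of ecp_nistz256_avx2_to_mont: one multiplication by
# .LFROM_MONT_AVX2 = 2^256 mod p maps a*2^261 back to the native Montgomery
# form a*2^256.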
1840 .globl ecp_nistz256_avx2_from_mont
1841 .type ecp_nistz256_avx2_from_mont,\@function,2
1843 ecp_nistz256_avx2_from_mont:
1846 $code.=<<___ if ($win64);
1847 lea -8-16*10(%rsp), %rsp
1848 vmovaps %xmm6, -8-16*10(%rax)
1849 vmovaps %xmm7, -8-16*9(%rax)
1850 vmovaps %xmm8, -8-16*8(%rax)
1851 vmovaps %xmm9, -8-16*7(%rax)
1852 vmovaps %xmm10, -8-16*6(%rax)
1853 vmovaps %xmm11, -8-16*5(%rax)
1854 vmovaps %xmm12, -8-16*4(%rax)
1855 vmovaps %xmm13, -8-16*3(%rax)
1856 vmovaps %xmm14, -8-16*2(%rax)
1857 vmovaps %xmm15, -8-16*1(%rax)
1860 vmovdqa .LAVX2_AND_MASK(%rip), $AND_MASK
1861 lea .LFROM_MONT_AVX2(%rip), %rdx
1863 call avx2_normalize_n_store
1867 $code.=<<___ if ($win64);
1868 movaps 16*0(%rsp), %xmm6
1869 movaps 16*1(%rsp), %xmm7
1870 movaps 16*2(%rsp), %xmm8
1871 movaps 16*3(%rsp), %xmm9
1872 movaps 16*4(%rsp), %xmm10
1873 movaps 16*5(%rsp), %xmm11
1874 movaps 16*6(%rsp), %xmm12
1875 movaps 16*7(%rsp), %xmm13
1876 movaps 16*8(%rsp), %xmm14
1877 movaps 16*9(%rsp), %xmm15
1878 lea 8+16*10(%rsp), %rsp
1882 .size ecp_nistz256_avx2_from_mont,.-ecp_nistz256_avx2_from_mont
1884 ################################################################################
1885 # void ecp_nistz256_avx2_set1(void* RESULTx4);
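# Writes the constant .LONE (the value 1 in the internal 2^261 format,
# i.e. 2^261 mod p) to all four lanes of the destination.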
1886 .globl ecp_nistz256_avx2_set1
1887 .type ecp_nistz256_avx2_set1,\@function,1
1889 ecp_nistz256_avx2_set1:
1890 lea .LONE+128(%rip), %rax
1893 vmovdqa 32*0-128(%rax), %ymm0
1894 vmovdqa 32*1-128(%rax), %ymm1
1895 vmovdqa 32*2-128(%rax), %ymm2
1896 vmovdqa 32*3-128(%rax), %ymm3
1897 vmovdqa 32*4-128(%rax), %ymm4
1898 vmovdqa 32*5-128(%rax), %ymm5
1899 vmovdqa %ymm0, 32*0-128(%rdi)
1900 vmovdqa 32*6-128(%rax), %ymm0
1901 vmovdqa %ymm1, 32*1-128(%rdi)
1902 vmovdqa 32*7-128(%rax), %ymm1
1903 vmovdqa %ymm2, 32*2-128(%rdi)
1904 vmovdqa 32*8-128(%rax), %ymm2
1905 vmovdqa %ymm3, 32*3-128(%rdi)
1906 vmovdqa %ymm4, 32*4-128(%rdi)
1907 vmovdqa %ymm5, 32*5-128(%rdi)
1908 vmovdqa %ymm0, 32*6-128(%rdi)
1909 vmovdqa %ymm1, 32*7-128(%rdi)
1910 vmovdqa %ymm2, 32*8-128(%rdi)
1914 .size ecp_nistz256_avx2_set1,.-ecp_nistz256_avx2_set1
1918 ################################################################################
1919 # void ecp_nistz256_avx2_multi_gather_w7(void* RESULT, void *in,
1920 # int index0, int index1, int index2, int index3);
1921 ################################################################################
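# Constant-time gather: for each of the four indices every entry of the
# corresponding 64-entry table is read, compared against the index with
# VPCMPEQD, and accumulated with AND/XOR, so the memory access pattern is
# independent of the secret index values.  An index of 0 selects the implicit
# (0,0) entry, i.e. the point at infinity.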
1923 my ($val,$in_t,$index0,$index1,$index2,$index3)=("%rdi","%rsi","%edx","%ecx","%r8d","%r9d");
1924 my ($INDEX0,$INDEX1,$INDEX2,$INDEX3)=map("%ymm$_",(0..3));
1925 my ($R0a,$R0b,$R1a,$R1b,$R2a,$R2b,$R3a,$R3b)=map("%ymm$_",(4..11));
1926 my ($M0,$T0,$T1,$TMP0)=map("%ymm$_",(12..15));
1929 .globl ecp_nistz256_avx2_multi_gather_w7
1930 .type ecp_nistz256_avx2_multi_gather_w7,\@function,6
1932 ecp_nistz256_avx2_multi_gather_w7:
1935 $code.=<<___ if ($win64);
1936 lea -8-16*10(%rsp), %rsp
1937 vmovaps %xmm6, -8-16*10(%rax)
1938 vmovaps %xmm7, -8-16*9(%rax)
1939 vmovaps %xmm8, -8-16*8(%rax)
1940 vmovaps %xmm9, -8-16*7(%rax)
1941 vmovaps %xmm10, -8-16*6(%rax)
1942 vmovaps %xmm11, -8-16*5(%rax)
1943 vmovaps %xmm12, -8-16*4(%rax)
1944 vmovaps %xmm13, -8-16*3(%rax)
1945 vmovaps %xmm14, -8-16*2(%rax)
1946 vmovaps %xmm15, -8-16*1(%rax)
1949 lea .LIntOne(%rip), %rax
1951 vmovd $index0, %xmm0
1952 vmovd $index1, %xmm1
1953 vmovd $index2, %xmm2
1954 vmovd $index3, %xmm3
1956 vpxor $R0a, $R0a, $R0a
1957 vpxor $R0b, $R0b, $R0b
1958 vpxor $R1a, $R1a, $R1a
1959 vpxor $R1b, $R1b, $R1b
1960 vpxor $R2a, $R2a, $R2a
1961 vpxor $R2b, $R2b, $R2b
1962 vpxor $R3a, $R3a, $R3a
1963 vpxor $R3b, $R3b, $R3b
1966 vpermd $INDEX0, $R0a, $INDEX0
1967 vpermd $INDEX1, $R0a, $INDEX1
1968 vpermd $INDEX2, $R0a, $INDEX2
1969 vpermd $INDEX3, $R0a, $INDEX3
1972 lea 112($val), $val # size optimization
1973 jmp .Lmulti_select_loop_avx2
1975 # INDEX=0 corresponds to the point at infinity (0,0)
1977 .Lmulti_select_loop_avx2:
1978 vpcmpeqd $INDEX0, $M0, $TMP0
1980 vmovdqa `32*0+32*64*2*0`($in_t), $T0
1981 vmovdqa `32*1+32*64*2*0`($in_t), $T1
1982 vpand $TMP0, $T0, $T0
1983 vpand $TMP0, $T1, $T1
1984 vpxor $T0, $R0a, $R0a
1985 vpxor $T1, $R0b, $R0b
1987 vpcmpeqd $INDEX1, $M0, $TMP0
1989 vmovdqa `32*0+32*64*2*1`($in_t), $T0
1990 vmovdqa `32*1+32*64*2*1`($in_t), $T1
1991 vpand $TMP0, $T0, $T0
1992 vpand $TMP0, $T1, $T1
1993 vpxor $T0, $R1a, $R1a
1994 vpxor $T1, $R1b, $R1b
1996 vpcmpeqd $INDEX2, $M0, $TMP0
1998 vmovdqa `32*0+32*64*2*2`($in_t), $T0
1999 vmovdqa `32*1+32*64*2*2`($in_t), $T1
2000 vpand $TMP0, $T0, $T0
2001 vpand $TMP0, $T1, $T1
2002 vpxor $T0, $R2a, $R2a
2003 vpxor $T1, $R2b, $R2b
2005 vpcmpeqd $INDEX3, $M0, $TMP0
2007 vmovdqa `32*0+32*64*2*3`($in_t), $T0
2008 vmovdqa `32*1+32*64*2*3`($in_t), $T1
2009 vpand $TMP0, $T0, $T0
2010 vpand $TMP0, $T1, $T1
2011 vpxor $T0, $R3a, $R3a
2012 vpxor $T1, $R3b, $R3b
2014 vpaddd (%rax), $M0, $M0 # increment
2015 lea 32*2($in_t), $in_t
2018 jnz .Lmulti_select_loop_avx2
2020 vmovdqu $R0a, 32*0-112($val)
2021 vmovdqu $R0b, 32*1-112($val)
2022 vmovdqu $R1a, 32*2-112($val)
2023 vmovdqu $R1b, 32*3-112($val)
2024 vmovdqu $R2a, 32*4-112($val)
2025 vmovdqu $R2b, 32*5-112($val)
2026 vmovdqu $R3a, 32*6-112($val)
2027 vmovdqu $R3b, 32*7-112($val)
2031 $code.=<<___ if ($win64);
2032 movaps 16*0(%rsp), %xmm6
2033 movaps 16*1(%rsp), %xmm7
2034 movaps 16*2(%rsp), %xmm8
2035 movaps 16*3(%rsp), %xmm9
2036 movaps 16*4(%rsp), %xmm10
2037 movaps 16*5(%rsp), %xmm11
2038 movaps 16*6(%rsp), %xmm12
2039 movaps 16*7(%rsp), %xmm13
2040 movaps 16*8(%rsp), %xmm14
2041 movaps 16*9(%rsp), %xmm15
2042 lea 8+16*10(%rsp), %rsp
2046 .size ecp_nistz256_avx2_multi_gather_w7,.-ecp_nistz256_avx2_multi_gather_w7
2048 .extern OPENSSL_ia32cap_P
2049 .globl ecp_nistz_avx2_eligible
2050 .type ecp_nistz_avx2_eligible,\@abi-omnipotent
2052 ecp_nistz_avx2_eligible:
2053 mov OPENSSL_ia32cap_P+8(%rip),%eax
2057 .size ecp_nistz_avx2_eligible,.-ecp_nistz_avx2_eligible
2060 }} else {{ # assembler is too old
2064 .globl ecp_nistz256_avx2_transpose_convert
2065 .globl ecp_nistz256_avx2_convert_transpose_back
2066 .globl ecp_nistz256_avx2_point_add_affine_x4
2067 .globl ecp_nistz256_avx2_point_add_affines_x4
2068 .globl ecp_nistz256_avx2_to_mont
2069 .globl ecp_nistz256_avx2_from_mont
2070 .globl ecp_nistz256_avx2_set1
2071 .globl ecp_nistz256_avx2_multi_gather_w7
2072 .type ecp_nistz256_avx2_multi_gather_w7,\@abi-omnipotent
2073 ecp_nistz256_avx2_transpose_convert:
2074 ecp_nistz256_avx2_convert_transpose_back:
2075 ecp_nistz256_avx2_point_add_affine_x4:
2076 ecp_nistz256_avx2_point_add_affines_x4:
2077 ecp_nistz256_avx2_to_mont:
2078 ecp_nistz256_avx2_from_mont:
2079 ecp_nistz256_avx2_set1:
2080 ecp_nistz256_avx2_multi_gather_w7:
2081 .byte 0x0f,0x0b # ud2
2083 .size ecp_nistz256_avx2_multi_gather_w7,.-ecp_nistz256_avx2_multi_gather_w7
2085 .globl ecp_nistz_avx2_eligible
2086 .type ecp_nistz_avx2_eligible,\@abi-omnipotent
2087 ecp_nistz_avx2_eligible:
2090 .size ecp_nistz_avx2_eligible,.-ecp_nistz_avx2_eligible
2094 foreach (split("\n",$code)) {
2095 s/\`([^\`]*)\`/eval($1)/geo;