1 #!/usr/bin/env perl
2 # $OpenBSD: ecp_nistz256-x86_64.pl,v 1.1 2016/11/04 17:33:20 miod Exp $
4 # Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
6 # Licensed under the OpenSSL license (the "License"). You may not use
7 # this file except in compliance with the License. You can obtain a copy
8 # in the file LICENSE in the source distribution or at
9 # https://www.openssl.org/source/license.html
11 # Copyright (c) 2014, Intel Corporation.
13 # Permission to use, copy, modify, and/or distribute this software for any
14 # purpose with or without fee is hereby granted, provided that the above
15 # copyright notice and this permission notice appear in all copies.
17 # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
18 # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
19 # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
20 # SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
21 # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
22 # OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
23 # CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
25 # Developers and authors:
26 # Shay Gueron (1, 2), and Vlad Krasnov (1)
27 # (1) Intel Corporation, Israel Development Center
28 # (2) University of Haifa
30 # Reference:
31 # S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
32 # 256 Bit Primes"
34 # Further optimization by <appro@openssl.org>:
36 # this/original with/without -DECP_NISTZ256_ASM(*)
37 # Opteron +12-49% +110-150%
38 # Bulldozer +14-45% +175-210%
39 # P4 +18-46% n/a :-(
40 # Westmere +12-34% +80-87%
41 # Sandy Bridge +9-35% +110-120%
42 # Ivy Bridge +9-35% +110-125%
43 # Haswell +8-37% +140-160%
44 # Broadwell +18-58% +145-210%
45 # Atom +15-50% +130-180%
46 # VIA Nano +43-160% +300-480%
48 # (*) "without -DECP_NISTZ256_ASM" refers to build with
49 # "enable-ec_nistp_64_gcc_128";
51 # Ranges denote minimum and maximum improvement coefficients depending
52 # on benchmark. Lower coefficients are for ECDSA sign, the relatively fastest
53 # server-side operation. Keep in mind that +100% means a 2x improvement.
55 $flavour = shift;
56 $output = shift;
57 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
59 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
61 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
62 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
63 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
64 die "can't locate x86_64-xlate.pl";
66 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
67 *STDOUT=*OUT;
69 $code.=<<___;
70 .text
72 # The polynomial
73 .align 64
74 .Lpoly:
75 .quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001
77 .LOne:
78 .long 1,1,1,1,1,1,1,1
79 .LTwo:
80 .long 2,2,2,2,2,2,2,2
81 .LThree:
82 .long 3,3,3,3,3,3,3,3
83 .LONE_mont:
84 .quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
85 ___
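# Reference note on the constants above: .Lpoly is the NIST P-256 prime
# p = 2^256 - 2^224 + 2^192 + 2^96 - 1 stored as four little-endian 64-bit
# limbs, and .LONE_mont is 2^256 mod p, i.e. the value 1 in the Montgomery
# representation used throughout this file (R = 2^256). .LOne, .LTwo and
# .LThree are 32-bit broadcast patterns; .LOne seeds the counters of the
# ecp_nistz256_select_w5/w7 loops further down.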
88 ################################################################################
89 # void ecp_nistz256_mul_by_2(uint64_t res[4], uint64_t a[4]);
91 my ($a0,$a1,$a2,$a3)=map("%r$_",(8..11));
92 my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rdx","%rcx","%r12","%r13");
93 my ($r_ptr,$a_ptr,$b_ptr)=("%rdi","%rsi","%rdx");
95 $code.=<<___;
97 .globl ecp_nistz256_mul_by_2
98 .type ecp_nistz256_mul_by_2,\@function,2
99 .align 64
100 ecp_nistz256_mul_by_2:
101 push %r12
102 push %r13
104 mov 8*0($a_ptr), $a0
105 mov 8*1($a_ptr), $a1
106 add $a0, $a0 # a0:a3+a0:a3
107 mov 8*2($a_ptr), $a2
108 adc $a1, $a1
109 mov 8*3($a_ptr), $a3
110 lea .Lpoly(%rip), $a_ptr
111 mov $a0, $t0
112 adc $a2, $a2
113 adc $a3, $a3
114 mov $a1, $t1
115 sbb $t4, $t4
117 sub 8*0($a_ptr), $a0
118 mov $a2, $t2
119 sbb 8*1($a_ptr), $a1
120 sbb 8*2($a_ptr), $a2
121 mov $a3, $t3
122 sbb 8*3($a_ptr), $a3
123 test $t4, $t4
125 cmovz $t0, $a0
126 cmovz $t1, $a1
127 mov $a0, 8*0($r_ptr)
128 cmovz $t2, $a2
129 mov $a1, 8*1($r_ptr)
130 cmovz $t3, $a3
131 mov $a2, 8*2($r_ptr)
132 mov $a3, 8*3($r_ptr)
134 pop %r13
135 pop %r12
137 .size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
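# Both ecp_nistz256_mul_by_2 above and ecp_nistz256_neg below follow the same
# branch-free pattern: compute the raw doubled value (or 0 - a), let sbb turn
# the final carry/borrow into an all-ones or all-zero mask, speculatively
# subtract (or add) .Lpoly, and pick between the raw and the adjusted limbs
# with a cmovz chain, so control flow never depends on the data.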
139 ################################################################################
140 # void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]);
141 .globl ecp_nistz256_neg
142 .type ecp_nistz256_neg,\@function,2
143 .align 32
144 ecp_nistz256_neg:
145 push %r12
146 push %r13
148 xor $a0, $a0
149 xor $a1, $a1
150 xor $a2, $a2
151 xor $a3, $a3
152 xor $t4, $t4
154 sub 8*0($a_ptr), $a0
155 sbb 8*1($a_ptr), $a1
156 sbb 8*2($a_ptr), $a2
157 mov $a0, $t0
158 sbb 8*3($a_ptr), $a3
159 lea .Lpoly(%rip), $a_ptr
160 mov $a1, $t1
161 sbb \$0, $t4
163 add 8*0($a_ptr), $a0
164 mov $a2, $t2
165 adc 8*1($a_ptr), $a1
166 adc 8*2($a_ptr), $a2
167 mov $a3, $t3
168 adc 8*3($a_ptr), $a3
169 test $t4, $t4
171 cmovz $t0, $a0
172 cmovz $t1, $a1
173 mov $a0, 8*0($r_ptr)
174 cmovz $t2, $a2
175 mov $a1, 8*1($r_ptr)
176 cmovz $t3, $a3
177 mov $a2, 8*2($r_ptr)
178 mov $a3, 8*3($r_ptr)
180 pop %r13
181 pop %r12
183 .size ecp_nistz256_neg,.-ecp_nistz256_neg
187 my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
188 my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
189 my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax");
190 my ($poly1,$poly3)=($acc6,$acc7);
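# For cross-checking: the __ecp_nistz256_mul_montq routine emitted below
# computes a*b*2^-256 mod p, i.e. Montgomery multiplication with R = 2^256.
# A standalone sketch of the same operation with Math::BigInt (illustration
# only, not used by this generator; ref_mul_mont is a hypothetical helper)
# might look like:
#
#	use Math::BigInt;
#	my $p = Math::BigInt->new(2)->bpow(256)
#	      - Math::BigInt->new(2)->bpow(224)
#	      + Math::BigInt->new(2)->bpow(192)
#	      + Math::BigInt->new(2)->bpow(96)
#	      - 1;
#	my $r_inv = Math::BigInt->new(2)->bpow(256)->bmodinv($p);
#	sub ref_mul_mont {		# $a, $b are Math::BigInt values < $p
#		my ($a, $b) = @_;
#		return ($a * $b * $r_inv) % $p;	# a*b*2^-256 mod p
#	}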
192 $code.=<<___;
193 ################################################################################
194 # void ecp_nistz256_mul_mont(
195 # uint64_t res[4],
196 # uint64_t a[4],
197 # uint64_t b[4]);
199 .globl ecp_nistz256_mul_mont
200 .type ecp_nistz256_mul_mont,\@function,3
201 .align 32
202 ecp_nistz256_mul_mont:
203 .Lmul_mont:
204 push %rbp
205 push %rbx
206 push %r12
207 push %r13
208 push %r14
209 push %r15
211 mov $b_org, $b_ptr
212 mov 8*0($b_org), %rax
213 mov 8*0($a_ptr), $acc1
214 mov 8*1($a_ptr), $acc2
215 mov 8*2($a_ptr), $acc3
216 mov 8*3($a_ptr), $acc4
218 call __ecp_nistz256_mul_montq
220 pop %r15
221 pop %r14
222 pop %r13
223 pop %r12
224 pop %rbx
225 pop %rbp
227 .size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
229 .type __ecp_nistz256_mul_montq,\@abi-omnipotent
230 .align 32
231 __ecp_nistz256_mul_montq:
232 ########################################################################
233 # Multiply a by b[0]
234 mov %rax, $t1
235 mulq $acc1
236 mov .Lpoly+8*1(%rip),$poly1
237 mov %rax, $acc0
238 mov $t1, %rax
239 mov %rdx, $acc1
241 mulq $acc2
242 mov .Lpoly+8*3(%rip),$poly3
243 add %rax, $acc1
244 mov $t1, %rax
245 adc \$0, %rdx
246 mov %rdx, $acc2
248 mulq $acc3
249 add %rax, $acc2
250 mov $t1, %rax
251 adc \$0, %rdx
252 mov %rdx, $acc3
254 mulq $acc4
255 add %rax, $acc3
256 mov $acc0, %rax
257 adc \$0, %rdx
258 xor $acc5, $acc5
259 mov %rdx, $acc4
261 ########################################################################
262 # First reduction step
263 # Basically now we want to multiply acc[0] by p256,
264 # and add the result to the acc.
265 # Due to the special form of p256 we do some optimizations
267 # acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0]
268 # then we add acc[0] and get acc[0] x 2^96
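# In full: .Lpoly[3] x 2^192 = 2^256 - 2^224 + 2^192, so
# acc[0] + acc[0] x p256 = acc[0] x 2^96 + acc[0] x .Lpoly[3] x 2^192.
# The shl/shr pair below adds acc[0] x 2^96 into acc[1..2], the mulq adds
# acc[0] x .Lpoly[3] into acc[3..4] with the carry going into acc[5], and
# the least significant limb acc[0] drops out entirely.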
270 mov $acc0, $t1
271 shl \$32, $acc0
272 mulq $poly3
273 shr \$32, $t1
274 add $acc0, $acc1 # +=acc[0]<<96
275 adc $t1, $acc2
276 adc %rax, $acc3
277 mov 8*1($b_ptr), %rax
278 adc %rdx, $acc4
279 adc \$0, $acc5
280 xor $acc0, $acc0
282 ########################################################################
283 # Multiply by b[1]
284 mov %rax, $t1
285 mulq 8*0($a_ptr)
286 add %rax, $acc1
287 mov $t1, %rax
288 adc \$0, %rdx
289 mov %rdx, $t0
291 mulq 8*1($a_ptr)
292 add $t0, $acc2
293 adc \$0, %rdx
294 add %rax, $acc2
295 mov $t1, %rax
296 adc \$0, %rdx
297 mov %rdx, $t0
299 mulq 8*2($a_ptr)
300 add $t0, $acc3
301 adc \$0, %rdx
302 add %rax, $acc3
303 mov $t1, %rax
304 adc \$0, %rdx
305 mov %rdx, $t0
307 mulq 8*3($a_ptr)
308 add $t0, $acc4
309 adc \$0, %rdx
310 add %rax, $acc4
311 mov $acc1, %rax
312 adc %rdx, $acc5
313 adc \$0, $acc0
315 ########################################################################
316 # Second reduction step
317 mov $acc1, $t1
318 shl \$32, $acc1
319 mulq $poly3
320 shr \$32, $t1
321 add $acc1, $acc2
322 adc $t1, $acc3
323 adc %rax, $acc4
324 mov 8*2($b_ptr), %rax
325 adc %rdx, $acc5
326 adc \$0, $acc0
327 xor $acc1, $acc1
329 ########################################################################
330 # Multiply by b[2]
331 mov %rax, $t1
332 mulq 8*0($a_ptr)
333 add %rax, $acc2
334 mov $t1, %rax
335 adc \$0, %rdx
336 mov %rdx, $t0
338 mulq 8*1($a_ptr)
339 add $t0, $acc3
340 adc \$0, %rdx
341 add %rax, $acc3
342 mov $t1, %rax
343 adc \$0, %rdx
344 mov %rdx, $t0
346 mulq 8*2($a_ptr)
347 add $t0, $acc4
348 adc \$0, %rdx
349 add %rax, $acc4
350 mov $t1, %rax
351 adc \$0, %rdx
352 mov %rdx, $t0
354 mulq 8*3($a_ptr)
355 add $t0, $acc5
356 adc \$0, %rdx
357 add %rax, $acc5
358 mov $acc2, %rax
359 adc %rdx, $acc0
360 adc \$0, $acc1
362 ########################################################################
363 # Third reduction step
364 mov $acc2, $t1
365 shl \$32, $acc2
366 mulq $poly3
367 shr \$32, $t1
368 add $acc2, $acc3
369 adc $t1, $acc4
370 adc %rax, $acc5
371 mov 8*3($b_ptr), %rax
372 adc %rdx, $acc0
373 adc \$0, $acc1
374 xor $acc2, $acc2
376 ########################################################################
377 # Multiply by b[3]
378 mov %rax, $t1
379 mulq 8*0($a_ptr)
380 add %rax, $acc3
381 mov $t1, %rax
382 adc \$0, %rdx
383 mov %rdx, $t0
385 mulq 8*1($a_ptr)
386 add $t0, $acc4
387 adc \$0, %rdx
388 add %rax, $acc4
389 mov $t1, %rax
390 adc \$0, %rdx
391 mov %rdx, $t0
393 mulq 8*2($a_ptr)
394 add $t0, $acc5
395 adc \$0, %rdx
396 add %rax, $acc5
397 mov $t1, %rax
398 adc \$0, %rdx
399 mov %rdx, $t0
401 mulq 8*3($a_ptr)
402 add $t0, $acc0
403 adc \$0, %rdx
404 add %rax, $acc0
405 mov $acc3, %rax
406 adc %rdx, $acc1
407 adc \$0, $acc2
409 ########################################################################
410 # Final reduction step
411 mov $acc3, $t1
412 shl \$32, $acc3
413 mulq $poly3
414 shr \$32, $t1
415 add $acc3, $acc4
416 adc $t1, $acc5
417 mov $acc4, $t0
418 adc %rax, $acc0
419 adc %rdx, $acc1
420 mov $acc5, $t1
421 adc \$0, $acc2
423 ########################################################################
424 # Branch-less conditional subtraction of P
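# At this point the product sits in acc4, acc5, acc0 and acc1, with a
# possible extra bit in acc2. Subtract .Lpoly once; if the subtraction
# borrows, the cmovc chain below restores the unsubtracted limbs, so the
# choice is made without a data-dependent branch.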
425 sub \$-1, $acc4 # .Lpoly[0]
426 mov $acc0, $t2
427 sbb $poly1, $acc5 # .Lpoly[1]
428 sbb \$0, $acc0 # .Lpoly[2]
429 mov $acc1, $t3
430 sbb $poly3, $acc1 # .Lpoly[3]
431 sbb \$0, $acc2
433 cmovc $t0, $acc4
434 cmovc $t1, $acc5
435 mov $acc4, 8*0($r_ptr)
436 cmovc $t2, $acc0
437 mov $acc5, 8*1($r_ptr)
438 cmovc $t3, $acc1
439 mov $acc0, 8*2($r_ptr)
440 mov $acc1, 8*3($r_ptr)
443 .size __ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq
445 ################################################################################
446 # void ecp_nistz256_sqr_mont(
447 # uint64_t res[4],
448 # uint64_t a[4]);
450 # we optimize the square according to S.Gueron and V.Krasnov,
451 # "Speeding up Big-Number Squaring"
452 .globl ecp_nistz256_sqr_mont
453 .type ecp_nistz256_sqr_mont,\@function,2
454 .align 32
455 ecp_nistz256_sqr_mont:
456 push %rbp
457 push %rbx
458 push %r12
459 push %r13
460 push %r14
461 push %r15
463 mov 8*0($a_ptr), %rax
464 mov 8*1($a_ptr), $acc6
465 mov 8*2($a_ptr), $acc7
466 mov 8*3($a_ptr), $acc0
468 call __ecp_nistz256_sqr_montq
470 pop %r15
471 pop %r14
472 pop %r13
473 pop %r12
474 pop %rbx
475 pop %rbp
477 .size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
479 .type __ecp_nistz256_sqr_montq,\@abi-omnipotent
480 .align 32
481 __ecp_nistz256_sqr_montq:
482 mov %rax, $acc5
483 mulq $acc6 # a[1]*a[0]
484 mov %rax, $acc1
485 mov $acc7, %rax
486 mov %rdx, $acc2
488 mulq $acc5 # a[0]*a[2]
489 add %rax, $acc2
490 mov $acc0, %rax
491 adc \$0, %rdx
492 mov %rdx, $acc3
494 mulq $acc5 # a[0]*a[3]
495 add %rax, $acc3
496 mov $acc7, %rax
497 adc \$0, %rdx
498 mov %rdx, $acc4
500 #################################
501 mulq $acc6 # a[1]*a[2]
502 add %rax, $acc3
503 mov $acc0, %rax
504 adc \$0, %rdx
505 mov %rdx, $t1
507 mulq $acc6 # a[1]*a[3]
508 add %rax, $acc4
509 mov $acc0, %rax
510 adc \$0, %rdx
511 add $t1, $acc4
512 mov %rdx, $acc5
513 adc \$0, $acc5
515 #################################
516 mulq $acc7 # a[2]*a[3]
517 xor $acc7, $acc7
518 add %rax, $acc5
519 mov 8*0($a_ptr), %rax
520 mov %rdx, $acc6
521 adc \$0, $acc6
523 add $acc1, $acc1 # acc1:6<<1
524 adc $acc2, $acc2
525 adc $acc3, $acc3
526 adc $acc4, $acc4
527 adc $acc5, $acc5
528 adc $acc6, $acc6
529 adc \$0, $acc7
531 mulq %rax
532 mov %rax, $acc0
533 mov 8*1($a_ptr), %rax
534 mov %rdx, $t0
536 mulq %rax
537 add $t0, $acc1
538 adc %rax, $acc2
539 mov 8*2($a_ptr), %rax
540 adc \$0, %rdx
541 mov %rdx, $t0
543 mulq %rax
544 add $t0, $acc3
545 adc %rax, $acc4
546 mov 8*3($a_ptr), %rax
547 adc \$0, %rdx
548 mov %rdx, $t0
550 mulq %rax
551 add $t0, $acc5
552 adc %rax, $acc6
553 mov $acc0, %rax
554 adc %rdx, $acc7
556 mov .Lpoly+8*1(%rip), $a_ptr
557 mov .Lpoly+8*3(%rip), $t1
559 ##########################################
560 # Now the reduction
561 # First iteration
562 mov $acc0, $t0
563 shl \$32, $acc0
564 mulq $t1
565 shr \$32, $t0
566 add $acc0, $acc1 # +=acc[0]<<96
567 adc $t0, $acc2
568 adc %rax, $acc3
569 mov $acc1, %rax
570 adc \$0, %rdx
572 ##########################################
573 # Second iteration
574 mov $acc1, $t0
575 shl \$32, $acc1
576 mov %rdx, $acc0
577 mulq $t1
578 shr \$32, $t0
579 add $acc1, $acc2
580 adc $t0, $acc3
581 adc %rax, $acc0
582 mov $acc2, %rax
583 adc \$0, %rdx
585 ##########################################
586 # Third iteration
587 mov $acc2, $t0
588 shl \$32, $acc2
589 mov %rdx, $acc1
590 mulq $t1
591 shr \$32, $t0
592 add $acc2, $acc3
593 adc $t0, $acc0
594 adc %rax, $acc1
595 mov $acc3, %rax
596 adc \$0, %rdx
598 ###########################################
599 # Last iteration
600 mov $acc3, $t0
601 shl \$32, $acc3
602 mov %rdx, $acc2
603 mulq $t1
604 shr \$32, $t0
605 add $acc3, $acc0
606 adc $t0, $acc1
607 adc %rax, $acc2
608 adc \$0, %rdx
609 xor $acc3, $acc3
611 ############################################
612 # Add the rest of the acc
613 add $acc0, $acc4
614 adc $acc1, $acc5
615 mov $acc4, $acc0
616 adc $acc2, $acc6
617 adc %rdx, $acc7
618 mov $acc5, $acc1
619 adc \$0, $acc3
621 sub \$-1, $acc4 # .Lpoly[0]
622 mov $acc6, $acc2
623 sbb $a_ptr, $acc5 # .Lpoly[1]
624 sbb \$0, $acc6 # .Lpoly[2]
625 mov $acc7, $t0
626 sbb $t1, $acc7 # .Lpoly[3]
627 sbb \$0, $acc3
629 cmovc $acc0, $acc4
630 cmovc $acc1, $acc5
631 mov $acc4, 8*0($r_ptr)
632 cmovc $acc2, $acc6
633 mov $acc5, 8*1($r_ptr)
634 cmovc $t0, $acc7
635 mov $acc6, 8*2($r_ptr)
636 mov $acc7, 8*3($r_ptr)
639 .size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq
644 my ($r_ptr,$in_ptr)=("%rdi","%rsi");
645 my ($acc0,$acc1,$acc2,$acc3)=map("%r$_",(8..11));
646 my ($t0,$t1,$t2)=("%rcx","%r12","%r13");
648 $code.=<<___;
649 ################################################################################
650 # void ecp_nistz256_from_mont(
651 # uint64_t res[4],
652 # uint64_t in[4]);
653 # This one performs Montgomery multiplication by 1, so we only need the reduction
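# i.e. from_mont(x) = x * 2^-256 mod p: four iterations of the word-by-word
# reduction used in mul_mont above fold the least significant limb into the
# upper limbs, followed by one conditional subtraction of .Lpoly.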
655 .globl ecp_nistz256_from_mont
656 .type ecp_nistz256_from_mont,\@function,2
657 .align 32
658 ecp_nistz256_from_mont:
659 push %r12
660 push %r13
662 mov 8*0($in_ptr), %rax
663 mov .Lpoly+8*3(%rip), $t2
664 mov 8*1($in_ptr), $acc1
665 mov 8*2($in_ptr), $acc2
666 mov 8*3($in_ptr), $acc3
667 mov %rax, $acc0
668 mov .Lpoly+8*1(%rip), $t1
670 #########################################
671 # First iteration
672 mov %rax, $t0
673 shl \$32, $acc0
674 mulq $t2
675 shr \$32, $t0
676 add $acc0, $acc1
677 adc $t0, $acc2
678 adc %rax, $acc3
679 mov $acc1, %rax
680 adc \$0, %rdx
682 #########################################
683 # Second iteration
684 mov $acc1, $t0
685 shl \$32, $acc1
686 mov %rdx, $acc0
687 mulq $t2
688 shr \$32, $t0
689 add $acc1, $acc2
690 adc $t0, $acc3
691 adc %rax, $acc0
692 mov $acc2, %rax
693 adc \$0, %rdx
695 ##########################################
696 # Third iteration
697 mov $acc2, $t0
698 shl \$32, $acc2
699 mov %rdx, $acc1
700 mulq $t2
701 shr \$32, $t0
702 add $acc2, $acc3
703 adc $t0, $acc0
704 adc %rax, $acc1
705 mov $acc3, %rax
706 adc \$0, %rdx
708 ###########################################
709 # Last iteration
710 mov $acc3, $t0
711 shl \$32, $acc3
712 mov %rdx, $acc2
713 mulq $t2
714 shr \$32, $t0
715 add $acc3, $acc0
716 adc $t0, $acc1
717 mov $acc0, $t0
718 adc %rax, $acc2
719 mov $acc1, $in_ptr
720 adc \$0, %rdx
722 ###########################################
723 # Branch-less conditional subtraction
724 sub \$-1, $acc0
725 mov $acc2, %rax
726 sbb $t1, $acc1
727 sbb \$0, $acc2
728 mov %rdx, $acc3
729 sbb $t2, %rdx
730 sbb $t2, $t2
732 cmovnz $t0, $acc0
733 cmovnz $in_ptr, $acc1
734 mov $acc0, 8*0($r_ptr)
735 cmovnz %rax, $acc2
736 mov $acc1, 8*1($r_ptr)
737 cmovz %rdx, $acc3
738 mov $acc2, 8*2($r_ptr)
739 mov $acc3, 8*3($r_ptr)
741 pop %r13
742 pop %r12
744 .size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
748 my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
749 my ($ONE,$INDEX,$Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("%xmm$_",(0..7));
750 my ($M0,$T0a,$T0b,$T0c,$T0d,$T0e,$T0f,$TMP0)=map("%xmm$_",(8..15));
751 my ($M1,$T2a,$T2b,$TMP2,$M2,$T2a,$T2b,$TMP2)=map("%xmm$_",(8..15));
753 $code.=<<___;
754 ################################################################################
755 # void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
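# Constant-time table lookup: all 16 table entries (96 bytes each, one
# projective point) are read on every call. A counter seeded from .LOne is
# compared against the broadcast index with pcmpeqd to form an all-ones or
# all-zero mask, each entry is AND-ed with that mask and OR-ed into the
# accumulator, so the memory access pattern is independent of the index.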
756 .globl ecp_nistz256_select_w5
757 .type ecp_nistz256_select_w5,\@abi-omnipotent
758 .align 32
759 ecp_nistz256_select_w5:
761 $code.=<<___ if ($win64);
762 lea -0x88(%rsp), %rax
763 .LSEH_begin_ecp_nistz256_select_w5:
764 .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
765 .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax)
766 .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax)
767 .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8, 0(%rax)
768 .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 0x10(%rax)
769 .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax)
770 .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax)
771 .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax)
772 .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax)
773 .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax)
774 .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax)
776 $code.=<<___;
777 movdqa .LOne(%rip), $ONE
778 movd $index, $INDEX
780 pxor $Ra, $Ra
781 pxor $Rb, $Rb
782 pxor $Rc, $Rc
783 pxor $Rd, $Rd
784 pxor $Re, $Re
785 pxor $Rf, $Rf
787 movdqa $ONE, $M0
788 pshufd \$0, $INDEX, $INDEX
790 mov \$16, %rax
791 .Lselect_loop_sse_w5:
793 movdqa $M0, $TMP0
794 paddd $ONE, $M0
795 pcmpeqd $INDEX, $TMP0
797 movdqa 16*0($in_t), $T0a
798 movdqa 16*1($in_t), $T0b
799 movdqa 16*2($in_t), $T0c
800 movdqa 16*3($in_t), $T0d
801 movdqa 16*4($in_t), $T0e
802 movdqa 16*5($in_t), $T0f
803 lea 16*6($in_t), $in_t
805 pand $TMP0, $T0a
806 pand $TMP0, $T0b
807 por $T0a, $Ra
808 pand $TMP0, $T0c
809 por $T0b, $Rb
810 pand $TMP0, $T0d
811 por $T0c, $Rc
812 pand $TMP0, $T0e
813 por $T0d, $Rd
814 pand $TMP0, $T0f
815 por $T0e, $Re
816 por $T0f, $Rf
818 dec %rax
819 jnz .Lselect_loop_sse_w5
821 movdqu $Ra, 16*0($val)
822 movdqu $Rb, 16*1($val)
823 movdqu $Rc, 16*2($val)
824 movdqu $Rd, 16*3($val)
825 movdqu $Re, 16*4($val)
826 movdqu $Rf, 16*5($val)
828 $code.=<<___ if ($win64);
829 movaps (%rsp), %xmm6
830 movaps 0x10(%rsp), %xmm7
831 movaps 0x20(%rsp), %xmm8
832 movaps 0x30(%rsp), %xmm9
833 movaps 0x40(%rsp), %xmm10
834 movaps 0x50(%rsp), %xmm11
835 movaps 0x60(%rsp), %xmm12
836 movaps 0x70(%rsp), %xmm13
837 movaps 0x80(%rsp), %xmm14
838 movaps 0x90(%rsp), %xmm15
839 lea 0xa8(%rsp), %rsp
840 .LSEH_end_ecp_nistz256_select_w5:
842 $code.=<<___;
844 .size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5
846 ################################################################################
847 # void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
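# Same constant-time scan as select_w5 above, but over 64 affine entries of
# 64 bytes each (x and y only), so the loop runs 64 times.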
848 .globl ecp_nistz256_select_w7
849 .type ecp_nistz256_select_w7,\@abi-omnipotent
850 .align 32
851 ecp_nistz256_select_w7:
853 $code.=<<___ if ($win64);
854 lea -0x88(%rsp), %rax
855 .LSEH_begin_ecp_nistz256_select_w7:
856 .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
857 .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax)
858 .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax)
859 .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8, 0(%rax)
860 .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 0x10(%rax)
861 .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax)
862 .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax)
863 .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax)
864 .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax)
865 .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax)
866 .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax)
868 $code.=<<___;
869 movdqa .LOne(%rip), $M0
870 movd $index, $INDEX
872 pxor $Ra, $Ra
873 pxor $Rb, $Rb
874 pxor $Rc, $Rc
875 pxor $Rd, $Rd
877 movdqa $M0, $ONE
878 pshufd \$0, $INDEX, $INDEX
879 mov \$64, %rax
881 .Lselect_loop_sse_w7:
882 movdqa $M0, $TMP0
883 paddd $ONE, $M0
884 movdqa 16*0($in_t), $T0a
885 movdqa 16*1($in_t), $T0b
886 pcmpeqd $INDEX, $TMP0
887 movdqa 16*2($in_t), $T0c
888 movdqa 16*3($in_t), $T0d
889 lea 16*4($in_t), $in_t
891 pand $TMP0, $T0a
892 pand $TMP0, $T0b
893 por $T0a, $Ra
894 pand $TMP0, $T0c
895 por $T0b, $Rb
896 pand $TMP0, $T0d
897 por $T0c, $Rc
898 prefetcht0 255($in_t)
899 por $T0d, $Rd
901 dec %rax
902 jnz .Lselect_loop_sse_w7
904 movdqu $Ra, 16*0($val)
905 movdqu $Rb, 16*1($val)
906 movdqu $Rc, 16*2($val)
907 movdqu $Rd, 16*3($val)
909 $code.=<<___ if ($win64);
910 movaps (%rsp), %xmm6
911 movaps 0x10(%rsp), %xmm7
912 movaps 0x20(%rsp), %xmm8
913 movaps 0x30(%rsp), %xmm9
914 movaps 0x40(%rsp), %xmm10
915 movaps 0x50(%rsp), %xmm11
916 movaps 0x60(%rsp), %xmm12
917 movaps 0x70(%rsp), %xmm13
918 movaps 0x80(%rsp), %xmm14
919 movaps 0x90(%rsp), %xmm15
920 lea 0xa8(%rsp), %rsp
921 .LSEH_end_ecp_nistz256_select_w7:
923 $code.=<<___;
925 .size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
929 ########################################################################
930 # This block implements the higher-level point_double, point_add and
931 # point_add_affine operations. The key to performance here is to allow
932 # the out-of-order execution logic to overlap computations from the next
933 # step with tail processing from the current step. By using a tailored
934 # calling sequence we minimize inter-step overhead and give the processor
935 # a better shot at overlapping operations...
937 # You will notice that the input data is copied to the stack. The trouble
938 # is that there are no registers to spare for holding the original pointers,
939 # and reloading them would create undesired dependencies on the effective
940 # address calculation paths. In other words, it is all done to favour the
941 # out-of-order execution logic.
942 # <appro@openssl.org>
944 my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
945 my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
946 my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rbp","%rcx",$acc4,$acc4);
947 my ($poly1,$poly3)=($acc6,$acc7);
949 sub load_for_mul () {
950 my ($a,$b,$src0) = @_;
951 my $bias = $src0 eq "%rax" ? 0 : -128;
953 " mov $b, $src0
954 lea $b, $b_ptr
955 mov 8*0+$a, $acc1
956 mov 8*1+$a, $acc2
957 lea $bias+$a, $a_ptr
958 mov 8*2+$a, $acc3
959 mov 8*3+$a, $acc4"
962 sub load_for_sqr () {
963 my ($a,$src0) = @_;
964 my $bias = $src0 eq "%rax" ? 0 : -128;
966 " mov 8*0+$a, $src0
967 mov 8*1+$a, $acc6
968 lea $bias+$a, $a_ptr
969 mov 8*2+$a, $acc7
970 mov 8*3+$a, $acc0"
974 ########################################################################
975 # operate in 4-5-0-1 "name space" that matches multiplication output
977 my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
979 $code.=<<___;
980 .type __ecp_nistz256_add_toq,\@abi-omnipotent
981 .align 32
982 __ecp_nistz256_add_toq:
983 add 8*0($b_ptr), $a0
984 adc 8*1($b_ptr), $a1
985 mov $a0, $t0
986 adc 8*2($b_ptr), $a2
987 adc 8*3($b_ptr), $a3
988 mov $a1, $t1
989 sbb $t4, $t4
991 sub \$-1, $a0
992 mov $a2, $t2
993 sbb $poly1, $a1
994 sbb \$0, $a2
995 mov $a3, $t3
996 sbb $poly3, $a3
997 test $t4, $t4
999 cmovz $t0, $a0
1000 cmovz $t1, $a1
1001 mov $a0, 8*0($r_ptr)
1002 cmovz $t2, $a2
1003 mov $a1, 8*1($r_ptr)
1004 cmovz $t3, $a3
1005 mov $a2, 8*2($r_ptr)
1006 mov $a3, 8*3($r_ptr)
1009 .size __ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq
1011 .type __ecp_nistz256_sub_fromq,\@abi-omnipotent
1012 .align 32
1013 __ecp_nistz256_sub_fromq:
1014 sub 8*0($b_ptr), $a0
1015 sbb 8*1($b_ptr), $a1
1016 mov $a0, $t0
1017 sbb 8*2($b_ptr), $a2
1018 sbb 8*3($b_ptr), $a3
1019 mov $a1, $t1
1020 sbb $t4, $t4
1022 add \$-1, $a0
1023 mov $a2, $t2
1024 adc $poly1, $a1
1025 adc \$0, $a2
1026 mov $a3, $t3
1027 adc $poly3, $a3
1028 test $t4, $t4
1030 cmovz $t0, $a0
1031 cmovz $t1, $a1
1032 mov $a0, 8*0($r_ptr)
1033 cmovz $t2, $a2
1034 mov $a1, 8*1($r_ptr)
1035 cmovz $t3, $a3
1036 mov $a2, 8*2($r_ptr)
1037 mov $a3, 8*3($r_ptr)
1040 .size __ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq
1042 .type __ecp_nistz256_subq,\@abi-omnipotent
1043 .align 32
1044 __ecp_nistz256_subq:
1045 sub $a0, $t0
1046 sbb $a1, $t1
1047 mov $t0, $a0
1048 sbb $a2, $t2
1049 sbb $a3, $t3
1050 mov $t1, $a1
1051 sbb $t4, $t4
1053 add \$-1, $t0
1054 mov $t2, $a2
1055 adc $poly1, $t1
1056 adc \$0, $t2
1057 mov $t3, $a3
1058 adc $poly3, $t3
1059 test $t4, $t4
1061 cmovnz $t0, $a0
1062 cmovnz $t1, $a1
1063 cmovnz $t2, $a2
1064 cmovnz $t3, $a3
1067 .size __ecp_nistz256_subq,.-__ecp_nistz256_subq
1069 .type __ecp_nistz256_mul_by_2q,\@abi-omnipotent
1070 .align 32
1071 __ecp_nistz256_mul_by_2q:
1072 add $a0, $a0 # a0:a3+a0:a3
1073 adc $a1, $a1
1074 mov $a0, $t0
1075 adc $a2, $a2
1076 adc $a3, $a3
1077 mov $a1, $t1
1078 sbb $t4, $t4
1080 sub \$-1, $a0
1081 mov $a2, $t2
1082 sbb $poly1, $a1
1083 sbb \$0, $a2
1084 mov $a3, $t3
1085 sbb $poly3, $a3
1086 test $t4, $t4
1088 cmovz $t0, $a0
1089 cmovz $t1, $a1
1090 mov $a0, 8*0($r_ptr)
1091 cmovz $t2, $a2
1092 mov $a1, 8*1($r_ptr)
1093 cmovz $t3, $a3
1094 mov $a2, 8*2($r_ptr)
1095 mov $a3, 8*3($r_ptr)
1098 .size __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q
1101 sub gen_double () {
1102 my $x = shift;
1103 my ($src0,$sfx,$bias);
1104 my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));
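# The sequence below is the usual Jacobian doubling; with S = 4*X*Y^2 and
# M = 3*(X - Z^2)*(X + Z^2) it computes
#   X3 = M^2 - 2*S
#   Y3 = M*(S - X3) - 8*Y^4
#   Z3 = 2*Y*Z
# with every multiplication carried out in Montgomery form.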
1106 if ($x ne "x") {
1107 $src0 = "%rax";
1108 $sfx = "";
1109 $bias = 0;
1111 $code.=<<___;
1112 .globl ecp_nistz256_point_double
1113 .type ecp_nistz256_point_double,\@function,2
1114 .align 32
1115 ecp_nistz256_point_double:
1117 } else {
1118 $src0 = "%rdx";
1119 $sfx = "x";
1120 $bias = 128;
1122 $code.=<<___;
1123 .type ecp_nistz256_point_doublex,\@function,2
1124 .align 32
1125 ecp_nistz256_point_doublex:
1126 .Lpoint_doublex:
1129 $code.=<<___;
1130 push %rbp
1131 push %rbx
1132 push %r12
1133 push %r13
1134 push %r14
1135 push %r15
1136 sub \$32*5+8, %rsp
1138 .Lpoint_double_shortcut$x:
1139 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr.x
1140 mov $a_ptr, $b_ptr # backup copy
1141 movdqu 0x10($a_ptr), %xmm1
1142 mov 0x20+8*0($a_ptr), $acc4 # load in_y in "5-4-0-1" order
1143 mov 0x20+8*1($a_ptr), $acc5
1144 mov 0x20+8*2($a_ptr), $acc0
1145 mov 0x20+8*3($a_ptr), $acc1
1146 mov .Lpoly+8*1(%rip), $poly1
1147 mov .Lpoly+8*3(%rip), $poly3
1148 movdqa %xmm0, $in_x(%rsp)
1149 movdqa %xmm1, $in_x+0x10(%rsp)
1150 lea 0x20($r_ptr), $acc2
1151 lea 0x40($r_ptr), $acc3
1152 movq $r_ptr, %xmm0
1153 movq $acc2, %xmm1
1154 movq $acc3, %xmm2
1156 lea $S(%rsp), $r_ptr
1157 call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(S, in_y);
1159 mov 0x40+8*0($a_ptr), $src0
1160 mov 0x40+8*1($a_ptr), $acc6
1161 mov 0x40+8*2($a_ptr), $acc7
1162 mov 0x40+8*3($a_ptr), $acc0
1163 lea 0x40-$bias($a_ptr), $a_ptr
1164 lea $Zsqr(%rsp), $r_ptr
1165 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Zsqr, in_z);
1167 `&load_for_sqr("$S(%rsp)", "$src0")`
1168 lea $S(%rsp), $r_ptr
1169 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(S, S);
1171 mov 0x20($b_ptr), $src0 # $b_ptr is still valid
1172 mov 0x40+8*0($b_ptr), $acc1
1173 mov 0x40+8*1($b_ptr), $acc2
1174 mov 0x40+8*2($b_ptr), $acc3
1175 mov 0x40+8*3($b_ptr), $acc4
1176 lea 0x40-$bias($b_ptr), $a_ptr
1177 lea 0x20($b_ptr), $b_ptr
1178 movq %xmm2, $r_ptr
1179 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, in_z, in_y);
1180 call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(res_z, res_z);
1182 mov $in_x+8*0(%rsp), $acc4 # "5-4-0-1" order
1183 mov $in_x+8*1(%rsp), $acc5
1184 lea $Zsqr(%rsp), $b_ptr
1185 mov $in_x+8*2(%rsp), $acc0
1186 mov $in_x+8*3(%rsp), $acc1
1187 lea $M(%rsp), $r_ptr
1188 call __ecp_nistz256_add_to$x # p256_add(M, in_x, Zsqr);
1190 mov $in_x+8*0(%rsp), $acc4 # "5-4-0-1" order
1191 mov $in_x+8*1(%rsp), $acc5
1192 lea $Zsqr(%rsp), $b_ptr
1193 mov $in_x+8*2(%rsp), $acc0
1194 mov $in_x+8*3(%rsp), $acc1
1195 lea $Zsqr(%rsp), $r_ptr
1196 call __ecp_nistz256_sub_from$x # p256_sub(Zsqr, in_x, Zsqr);
1198 `&load_for_sqr("$S(%rsp)", "$src0")`
1199 movq %xmm1, $r_ptr
1200 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_y, S);
1203 ######## ecp_nistz256_div_by_2(res_y, res_y); ##########################
1204 # operate in 4-5-6-7 "name space" that matches squaring output
1206 my ($poly1,$poly3)=($a_ptr,$t1);
1207 my ($a0,$a1,$a2,$a3,$t3,$t4,$t1)=($acc4,$acc5,$acc6,$acc7,$acc0,$acc1,$acc2);
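# Division by 2 in GF(p): if a is even, a/2 is simply a>>1; if a is odd,
# a/2 = (a + p)>>1, where the carry out of a + p supplies bit 255 after the
# shift. The code below adds .Lpoly unconditionally, keeps the sum only when
# the low bit of a is set (the cmovz chain), and then performs a four-limb
# right shift, pulling the saved carry into the top bit.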
1209 $code.=<<___;
1210 xor $t4, $t4
1211 mov $a0, $t0
1212 add \$-1, $a0
1213 mov $a1, $t1
1214 adc $poly1, $a1
1215 mov $a2, $t2
1216 adc \$0, $a2
1217 mov $a3, $t3
1218 adc $poly3, $a3
1219 adc \$0, $t4
1220 xor $a_ptr, $a_ptr # borrow $a_ptr
1221 test \$1, $t0
1223 cmovz $t0, $a0
1224 cmovz $t1, $a1
1225 cmovz $t2, $a2
1226 cmovz $t3, $a3
1227 cmovz $a_ptr, $t4
1229 mov $a1, $t0 # a0:a3>>1
1230 shr \$1, $a0
1231 shl \$63, $t0
1232 mov $a2, $t1
1233 shr \$1, $a1
1234 or $t0, $a0
1235 shl \$63, $t1
1236 mov $a3, $t2
1237 shr \$1, $a2
1238 or $t1, $a1
1239 shl \$63, $t2
1240 mov $a0, 8*0($r_ptr)
1241 shr \$1, $a3
1242 mov $a1, 8*1($r_ptr)
1243 shl \$63, $t4
1244 or $t2, $a2
1245 or $t4, $a3
1246 mov $a2, 8*2($r_ptr)
1247 mov $a3, 8*3($r_ptr)
1250 $code.=<<___;
1251 `&load_for_mul("$M(%rsp)", "$Zsqr(%rsp)", "$src0")`
1252 lea $M(%rsp), $r_ptr
1253 call __ecp_nistz256_mul_mont$x # p256_mul_mont(M, M, Zsqr);
1255 lea $tmp0(%rsp), $r_ptr
1256 call __ecp_nistz256_mul_by_2$x
1258 lea $M(%rsp), $b_ptr
1259 lea $M(%rsp), $r_ptr
1260 call __ecp_nistz256_add_to$x # p256_mul_by_3(M, M);
1262 `&load_for_mul("$S(%rsp)", "$in_x(%rsp)", "$src0")`
1263 lea $S(%rsp), $r_ptr
1264 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, in_x);
1266 lea $tmp0(%rsp), $r_ptr
1267 call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(tmp0, S);
1269 `&load_for_sqr("$M(%rsp)", "$src0")`
1270 movq %xmm0, $r_ptr
1271 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_x, M);
1273 lea $tmp0(%rsp), $b_ptr
1274 mov $acc6, $acc0 # harmonize sqr output and sub input
1275 mov $acc7, $acc1
1276 mov $a_ptr, $poly1
1277 mov $t1, $poly3
1278 call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, tmp0);
1280 mov $S+8*0(%rsp), $t0
1281 mov $S+8*1(%rsp), $t1
1282 mov $S+8*2(%rsp), $t2
1283 mov $S+8*3(%rsp), $acc2 # "4-5-0-1" order
1284 lea $S(%rsp), $r_ptr
1285 call __ecp_nistz256_sub$x # p256_sub(S, S, res_x);
1287 mov $M(%rsp), $src0
1288 lea $M(%rsp), $b_ptr
1289 mov $acc4, $acc6 # harmonize sub output and mul input
1290 xor %ecx, %ecx
1291 mov $acc4, $S+8*0(%rsp) # have to save:-(
1292 mov $acc5, $acc2
1293 mov $acc5, $S+8*1(%rsp)
1294 cmovz $acc0, $acc3
1295 mov $acc0, $S+8*2(%rsp)
1296 lea $S-$bias(%rsp), $a_ptr
1297 cmovz $acc1, $acc4
1298 mov $acc1, $S+8*3(%rsp)
1299 mov $acc6, $acc1
1300 lea $S(%rsp), $r_ptr
1301 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, M);
1303 movq %xmm1, $b_ptr
1304 movq %xmm1, $r_ptr
1305 call __ecp_nistz256_sub_from$x # p256_sub(res_y, S, res_y);
1307 add \$32*5+8, %rsp
1308 pop %r15
1309 pop %r14
1310 pop %r13
1311 pop %r12
1312 pop %rbx
1313 pop %rbp
1315 .size ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx
1318 &gen_double("q");
1320 sub gen_add () {
1321 my $x = shift;
1322 my ($src0,$sfx,$bias);
1323 my ($H,$Hsqr,$R,$Rsqr,$Hcub,
1324 $U1,$U2,$S1,$S2,
1325 $res_x,$res_y,$res_z,
1326 $in1_x,$in1_y,$in1_z,
1327 $in2_x,$in2_y,$in2_z)=map(32*$_,(0..17));
1328 my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
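# The sequence below is the usual Jacobian point addition; with
#   U1 = X1*Z2^2, U2 = X2*Z1^2, S1 = Y1*Z2^3, S2 = Y2*Z1^3,
#   H = U2 - U1, R = S2 - S1
# it computes
#   X3 = R^2 - H^3 - 2*U1*H^2
#   Y3 = R*(U1*H^2 - X3) - S1*H^3
#   Z3 = H*Z1*Z2
# falling back to point_double when the two inputs represent the same point,
# and handling inputs at infinity via the in1infty/in2infty masks.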
1330 if ($x ne "x") {
1331 $src0 = "%rax";
1332 $sfx = "";
1333 $bias = 0;
1335 $code.=<<___;
1336 .globl ecp_nistz256_point_add
1337 .type ecp_nistz256_point_add,\@function,3
1338 .align 32
1339 ecp_nistz256_point_add:
1341 } else {
1342 $src0 = "%rdx";
1343 $sfx = "x";
1344 $bias = 128;
1346 $code.=<<___;
1347 push %rbp
1348 push %rbx
1349 push %r12
1350 push %r13
1351 push %r14
1352 push %r15
1353 sub \$32*18+8, %rsp
1355 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr
1356 movdqu 0x10($a_ptr), %xmm1
1357 movdqu 0x20($a_ptr), %xmm2
1358 movdqu 0x30($a_ptr), %xmm3
1359 movdqu 0x40($a_ptr), %xmm4
1360 movdqu 0x50($a_ptr), %xmm5
1361 mov $a_ptr, $b_ptr # reassign
1362 mov $b_org, $a_ptr # reassign
1363 movdqa %xmm0, $in1_x(%rsp)
1364 movdqa %xmm1, $in1_x+0x10(%rsp)
1365 por %xmm0, %xmm1
1366 movdqa %xmm2, $in1_y(%rsp)
1367 movdqa %xmm3, $in1_y+0x10(%rsp)
1368 por %xmm2, %xmm3
1369 movdqa %xmm4, $in1_z(%rsp)
1370 movdqa %xmm5, $in1_z+0x10(%rsp)
1371 por %xmm1, %xmm3
1373 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$b_ptr
1374 pshufd \$0xb1, %xmm3, %xmm5
1375 movdqu 0x10($a_ptr), %xmm1
1376 movdqu 0x20($a_ptr), %xmm2
1377 por %xmm3, %xmm5
1378 movdqu 0x30($a_ptr), %xmm3
1379 mov 0x40+8*0($a_ptr), $src0 # load original in2_z
1380 mov 0x40+8*1($a_ptr), $acc6
1381 mov 0x40+8*2($a_ptr), $acc7
1382 mov 0x40+8*3($a_ptr), $acc0
1383 movdqa %xmm0, $in2_x(%rsp)
1384 pshufd \$0x1e, %xmm5, %xmm4
1385 movdqa %xmm1, $in2_x+0x10(%rsp)
1386 por %xmm0, %xmm1
1387 movq $r_ptr, %xmm0 # save $r_ptr
1388 movdqa %xmm2, $in2_y(%rsp)
1389 movdqa %xmm3, $in2_y+0x10(%rsp)
1390 por %xmm2, %xmm3
1391 por %xmm4, %xmm5
1392 pxor %xmm4, %xmm4
1393 por %xmm1, %xmm3
1395 lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid
1396 mov $src0, $in2_z+8*0(%rsp) # make in2_z copy
1397 mov $acc6, $in2_z+8*1(%rsp)
1398 mov $acc7, $in2_z+8*2(%rsp)
1399 mov $acc0, $in2_z+8*3(%rsp)
1400 lea $Z2sqr(%rsp), $r_ptr # Z2^2
1401 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z2sqr, in2_z);
1403 pcmpeqd %xmm4, %xmm5
1404 pshufd \$0xb1, %xmm3, %xmm4
1405 por %xmm3, %xmm4
1406 pshufd \$0, %xmm5, %xmm5 # in1infty
1407 pshufd \$0x1e, %xmm4, %xmm3
1408 por %xmm3, %xmm4
1409 pxor %xmm3, %xmm3
1410 pcmpeqd %xmm3, %xmm4
1411 pshufd \$0, %xmm4, %xmm4 # in2infty
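# in1infty (%xmm5) and in2infty (%xmm4) are now broadcast masks that are
# all-ones when the corresponding input has all-zero x and y words, which is
# how this code encodes the point at infinity; they later drive the
# pand/pandn copy_conditional selections of the result.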
1412 mov 0x40+8*0($b_ptr), $src0 # load original in1_z
1413 mov 0x40+8*1($b_ptr), $acc6
1414 mov 0x40+8*2($b_ptr), $acc7
1415 mov 0x40+8*3($b_ptr), $acc0
1416 movq $b_ptr, %xmm1
1418 lea 0x40-$bias($b_ptr), $a_ptr
1419 lea $Z1sqr(%rsp), $r_ptr # Z1^2
1420 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z);
1422 `&load_for_mul("$Z2sqr(%rsp)", "$in2_z(%rsp)", "$src0")`
1423 lea $S1(%rsp), $r_ptr # S1 = Z2^3
1424 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, Z2sqr, in2_z);
1426 `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
1427 lea $S2(%rsp), $r_ptr # S2 = Z1^3
1428 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z);
1430 `&load_for_mul("$S1(%rsp)", "$in1_y(%rsp)", "$src0")`
1431 lea $S1(%rsp), $r_ptr # S1 = Y1*Z2^3
1432 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, S1, in1_y);
1434 `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
1435 lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3
1436 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y);
1438 lea $S1(%rsp), $b_ptr
1439 lea $R(%rsp), $r_ptr # R = S2 - S1
1440 call __ecp_nistz256_sub_from$x # p256_sub(R, S2, S1);
1442 or $acc5, $acc4 # see if result is zero
1443 movdqa %xmm4, %xmm2
1444 or $acc0, $acc4
1445 or $acc1, $acc4
1446 por %xmm5, %xmm2 # in1infty || in2infty
1447 movq $acc4, %xmm3
1449 `&load_for_mul("$Z2sqr(%rsp)", "$in1_x(%rsp)", "$src0")`
1450 lea $U1(%rsp), $r_ptr # U1 = X1*Z2^2
1451 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U1, in1_x, Z2sqr);
1453 `&load_for_mul("$Z1sqr(%rsp)", "$in2_x(%rsp)", "$src0")`
1454 lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2
1455 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in2_x, Z1sqr);
1457 lea $U1(%rsp), $b_ptr
1458 lea $H(%rsp), $r_ptr # H = U2 - U1
1459 call __ecp_nistz256_sub_from$x # p256_sub(H, U2, U1);
1461 or $acc5, $acc4 # see if result is zero
1462 or $acc0, $acc4
1463 or $acc1, $acc4
1465 .byte 0x3e # predict taken
1466 jnz .Ladd_proceed$x # is_equal(U1,U2)?
1467 movq %xmm2, $acc0
1468 movq %xmm3, $acc1
1469 test $acc0, $acc0
1470 jnz .Ladd_proceed$x # (in1infty || in2infty)?
1471 test $acc1, $acc1
1472 jz .Ladd_double$x # is_equal(S1,S2)?
1474 movq %xmm0, $r_ptr # restore $r_ptr
1475 pxor %xmm0, %xmm0
1476 movdqu %xmm0, 0x00($r_ptr)
1477 movdqu %xmm0, 0x10($r_ptr)
1478 movdqu %xmm0, 0x20($r_ptr)
1479 movdqu %xmm0, 0x30($r_ptr)
1480 movdqu %xmm0, 0x40($r_ptr)
1481 movdqu %xmm0, 0x50($r_ptr)
1482 jmp .Ladd_done$x
1484 .align 32
1485 .Ladd_double$x:
1486 movq %xmm1, $a_ptr # restore $a_ptr
1487 movq %xmm0, $r_ptr # restore $r_ptr
1488 add \$`32*(18-5)`, %rsp # difference in frame sizes
1489 jmp .Lpoint_double_shortcut$x
1491 .align 32
1492 .Ladd_proceed$x:
1493 `&load_for_sqr("$R(%rsp)", "$src0")`
1494 lea $Rsqr(%rsp), $r_ptr # R^2
1495 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R);
1497 `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
1498 lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2
1499 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z);
1501 `&load_for_sqr("$H(%rsp)", "$src0")`
1502 lea $Hsqr(%rsp), $r_ptr # H^2
1503 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H);
1505 `&load_for_mul("$res_z(%rsp)", "$in2_z(%rsp)", "$src0")`
1506 lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2
1507 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, res_z, in2_z);
1509 `&load_for_mul("$Hsqr(%rsp)", "$H(%rsp)", "$src0")`
1510 lea $Hcub(%rsp), $r_ptr # H^3
1511 call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H);
1513 `&load_for_mul("$Hsqr(%rsp)", "$U1(%rsp)", "$src0")`
1514 lea $U2(%rsp), $r_ptr # U1*H^2
1515 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, U1, Hsqr);
1518 #######################################################################
1519 # operate in 4-5-0-1 "name space" that matches multiplication output
1521 my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
1522 my ($poly1, $poly3)=($acc6,$acc7);
1524 $code.=<<___;
1525 #lea $U2(%rsp), $a_ptr
1526 #lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2
1527 #call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2);
1529 add $acc0, $acc0 # a0:a3+a0:a3
1530 lea $Rsqr(%rsp), $a_ptr
1531 adc $acc1, $acc1
1532 mov $acc0, $t0
1533 adc $acc2, $acc2
1534 adc $acc3, $acc3
1535 mov $acc1, $t1
1536 sbb $t4, $t4
1538 sub \$-1, $acc0
1539 mov $acc2, $t2
1540 sbb $poly1, $acc1
1541 sbb \$0, $acc2
1542 mov $acc3, $t3
1543 sbb $poly3, $acc3
1544 test $t4, $t4
1546 cmovz $t0, $acc0
1547 mov 8*0($a_ptr), $t0
1548 cmovz $t1, $acc1
1549 mov 8*1($a_ptr), $t1
1550 cmovz $t2, $acc2
1551 mov 8*2($a_ptr), $t2
1552 cmovz $t3, $acc3
1553 mov 8*3($a_ptr), $t3
1555 call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr);
1557 lea $Hcub(%rsp), $b_ptr
1558 lea $res_x(%rsp), $r_ptr
1559 call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub);
1561 mov $U2+8*0(%rsp), $t0
1562 mov $U2+8*1(%rsp), $t1
1563 mov $U2+8*2(%rsp), $t2
1564 mov $U2+8*3(%rsp), $t3
1565 lea $res_y(%rsp), $r_ptr
1567 call __ecp_nistz256_sub$x # p256_sub(res_y, U2, res_x);
1569 mov $acc0, 8*0($r_ptr) # save the result, as
1570 mov $acc1, 8*1($r_ptr) # __ecp_nistz256_sub doesn't
1571 mov $acc2, 8*2($r_ptr)
1572 mov $acc3, 8*3($r_ptr)
1575 $code.=<<___;
1576 `&load_for_mul("$S1(%rsp)", "$Hcub(%rsp)", "$src0")`
1577 lea $S2(%rsp), $r_ptr
1578 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S1, Hcub);
1580 `&load_for_mul("$R(%rsp)", "$res_y(%rsp)", "$src0")`
1581 lea $res_y(%rsp), $r_ptr
1582 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_y, R, res_y);
1584 lea $S2(%rsp), $b_ptr
1585 lea $res_y(%rsp), $r_ptr
1586 call __ecp_nistz256_sub_from$x # p256_sub(res_y, res_y, S2);
1588 movq %xmm0, $r_ptr # restore $r_ptr
1590 movdqa %xmm5, %xmm0 # copy_conditional(res_z, in2_z, in1infty);
1591 movdqa %xmm5, %xmm1
1592 pandn $res_z(%rsp), %xmm0
1593 movdqa %xmm5, %xmm2
1594 pandn $res_z+0x10(%rsp), %xmm1
1595 movdqa %xmm5, %xmm3
1596 pand $in2_z(%rsp), %xmm2
1597 pand $in2_z+0x10(%rsp), %xmm3
1598 por %xmm0, %xmm2
1599 por %xmm1, %xmm3
1601 movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty);
1602 movdqa %xmm4, %xmm1
1603 pandn %xmm2, %xmm0
1604 movdqa %xmm4, %xmm2
1605 pandn %xmm3, %xmm1
1606 movdqa %xmm4, %xmm3
1607 pand $in1_z(%rsp), %xmm2
1608 pand $in1_z+0x10(%rsp), %xmm3
1609 por %xmm0, %xmm2
1610 por %xmm1, %xmm3
1611 movdqu %xmm2, 0x40($r_ptr)
1612 movdqu %xmm3, 0x50($r_ptr)
1614 movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty);
1615 movdqa %xmm5, %xmm1
1616 pandn $res_x(%rsp), %xmm0
1617 movdqa %xmm5, %xmm2
1618 pandn $res_x+0x10(%rsp), %xmm1
1619 movdqa %xmm5, %xmm3
1620 pand $in2_x(%rsp), %xmm2
1621 pand $in2_x+0x10(%rsp), %xmm3
1622 por %xmm0, %xmm2
1623 por %xmm1, %xmm3
1625 movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty);
1626 movdqa %xmm4, %xmm1
1627 pandn %xmm2, %xmm0
1628 movdqa %xmm4, %xmm2
1629 pandn %xmm3, %xmm1
1630 movdqa %xmm4, %xmm3
1631 pand $in1_x(%rsp), %xmm2
1632 pand $in1_x+0x10(%rsp), %xmm3
1633 por %xmm0, %xmm2
1634 por %xmm1, %xmm3
1635 movdqu %xmm2, 0x00($r_ptr)
1636 movdqu %xmm3, 0x10($r_ptr)
1638 movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty);
1639 movdqa %xmm5, %xmm1
1640 pandn $res_y(%rsp), %xmm0
1641 movdqa %xmm5, %xmm2
1642 pandn $res_y+0x10(%rsp), %xmm1
1643 movdqa %xmm5, %xmm3
1644 pand $in2_y(%rsp), %xmm2
1645 pand $in2_y+0x10(%rsp), %xmm3
1646 por %xmm0, %xmm2
1647 por %xmm1, %xmm3
1649 movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty);
1650 movdqa %xmm4, %xmm1
1651 pandn %xmm2, %xmm0
1652 movdqa %xmm4, %xmm2
1653 pandn %xmm3, %xmm1
1654 movdqa %xmm4, %xmm3
1655 pand $in1_y(%rsp), %xmm2
1656 pand $in1_y+0x10(%rsp), %xmm3
1657 por %xmm0, %xmm2
1658 por %xmm1, %xmm3
1659 movdqu %xmm2, 0x20($r_ptr)
1660 movdqu %xmm3, 0x30($r_ptr)
1662 .Ladd_done$x:
1663 add \$32*18+8, %rsp
1664 pop %r15
1665 pop %r14
1666 pop %r13
1667 pop %r12
1668 pop %rbx
1669 pop %rbp
1671 .size ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx
1674 &gen_add("q");
1676 sub gen_add_affine () {
1677 my $x = shift;
1678 my ($src0,$sfx,$bias);
1679 my ($U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr,
1680 $res_x,$res_y,$res_z,
1681 $in1_x,$in1_y,$in1_z,
1682 $in2_x,$in2_y)=map(32*$_,(0..14));
1683 my $Z1sqr = $S2;
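# Mixed addition with an affine second input (Z2 = 1); with
#   U2 = X2*Z1^2, S2 = Y2*Z1^3, H = U2 - X1, R = S2 - Y1
# it computes
#   X3 = R^2 - H^3 - 2*X1*H^2
#   Y3 = R*(X1*H^2 - X3) - Y1*H^3
#   Z3 = H*Z1
# substituting the Montgomery form of 1 for Z3 (and the affine second input
# for X3/Y3) when the first input is the point at infinity.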
1685 if ($x ne "x") {
1686 $src0 = "%rax";
1687 $sfx = "";
1688 $bias = 0;
1690 $code.=<<___;
1691 .globl ecp_nistz256_point_add_affine
1692 .type ecp_nistz256_point_add_affine,\@function,3
1693 .align 32
1694 ecp_nistz256_point_add_affine:
1696 } else {
1697 $src0 = "%rdx";
1698 $sfx = "x";
1699 $bias = 128;
1701 $code.=<<___;
1702 push %rbp
1703 push %rbx
1704 push %r12
1705 push %r13
1706 push %r14
1707 push %r15
1708 sub \$32*15+8, %rsp
1710 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr
1711 mov $b_org, $b_ptr # reassign
1712 movdqu 0x10($a_ptr), %xmm1
1713 movdqu 0x20($a_ptr), %xmm2
1714 movdqu 0x30($a_ptr), %xmm3
1715 movdqu 0x40($a_ptr), %xmm4
1716 movdqu 0x50($a_ptr), %xmm5
1717 mov 0x40+8*0($a_ptr), $src0 # load original in1_z
1718 mov 0x40+8*1($a_ptr), $acc6
1719 mov 0x40+8*2($a_ptr), $acc7
1720 mov 0x40+8*3($a_ptr), $acc0
1721 movdqa %xmm0, $in1_x(%rsp)
1722 movdqa %xmm1, $in1_x+0x10(%rsp)
1723 por %xmm0, %xmm1
1724 movdqa %xmm2, $in1_y(%rsp)
1725 movdqa %xmm3, $in1_y+0x10(%rsp)
1726 por %xmm2, %xmm3
1727 movdqa %xmm4, $in1_z(%rsp)
1728 movdqa %xmm5, $in1_z+0x10(%rsp)
1729 por %xmm1, %xmm3
1731 movdqu 0x00($b_ptr), %xmm0 # copy *(P256_POINT_AFFINE *)$b_ptr
1732 pshufd \$0xb1, %xmm3, %xmm5
1733 movdqu 0x10($b_ptr), %xmm1
1734 movdqu 0x20($b_ptr), %xmm2
1735 por %xmm3, %xmm5
1736 movdqu 0x30($b_ptr), %xmm3
1737 movdqa %xmm0, $in2_x(%rsp)
1738 pshufd \$0x1e, %xmm5, %xmm4
1739 movdqa %xmm1, $in2_x+0x10(%rsp)
1740 por %xmm0, %xmm1
1741 movq $r_ptr, %xmm0 # save $r_ptr
1742 movdqa %xmm2, $in2_y(%rsp)
1743 movdqa %xmm3, $in2_y+0x10(%rsp)
1744 por %xmm2, %xmm3
1745 por %xmm4, %xmm5
1746 pxor %xmm4, %xmm4
1747 por %xmm1, %xmm3
1749 lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid
1750 lea $Z1sqr(%rsp), $r_ptr # Z1^2
1751 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z);
1753 pcmpeqd %xmm4, %xmm5
1754 pshufd \$0xb1, %xmm3, %xmm4
1755 mov 0x00($b_ptr), $src0 # $b_ptr is still valid
1756 #lea 0x00($b_ptr), $b_ptr
1757 mov $acc4, $acc1 # harmonize sqr output and mul input
1758 por %xmm3, %xmm4
1759 pshufd \$0, %xmm5, %xmm5 # in1infty
1760 pshufd \$0x1e, %xmm4, %xmm3
1761 mov $acc5, $acc2
1762 por %xmm3, %xmm4
1763 pxor %xmm3, %xmm3
1764 mov $acc6, $acc3
1765 pcmpeqd %xmm3, %xmm4
1766 pshufd \$0, %xmm4, %xmm4 # in2infty
1768 lea $Z1sqr-$bias(%rsp), $a_ptr
1769 mov $acc7, $acc4
1770 lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2
1771 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, Z1sqr, in2_x);
1773 lea $in1_x(%rsp), $b_ptr
1774 lea $H(%rsp), $r_ptr # H = U2 - U1
1775 call __ecp_nistz256_sub_from$x # p256_sub(H, U2, in1_x);
1777 `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
1778 lea $S2(%rsp), $r_ptr # S2 = Z1^3
1779 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z);
1781 `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
1782 lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2
1783 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z);
1785 `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
1786 lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3
1787 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y);
1789 lea $in1_y(%rsp), $b_ptr
1790 lea $R(%rsp), $r_ptr # R = S2 - S1
1791 call __ecp_nistz256_sub_from$x # p256_sub(R, S2, in1_y);
1793 `&load_for_sqr("$H(%rsp)", "$src0")`
1794 lea $Hsqr(%rsp), $r_ptr # H^2
1795 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H);
1797 `&load_for_sqr("$R(%rsp)", "$src0")`
1798 lea $Rsqr(%rsp), $r_ptr # R^2
1799 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R);
1801 `&load_for_mul("$H(%rsp)", "$Hsqr(%rsp)", "$src0")`
1802 lea $Hcub(%rsp), $r_ptr # H^3
1803 call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H);
1805 `&load_for_mul("$Hsqr(%rsp)", "$in1_x(%rsp)", "$src0")`
1806 lea $U2(%rsp), $r_ptr # U1*H^2
1807 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in1_x, Hsqr);
1810 #######################################################################
1811 # operate in 4-5-0-1 "name space" that matches multiplication output
1813 my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
1814 my ($poly1, $poly3)=($acc6,$acc7);
1816 $code.=<<___;
1817 #lea $U2(%rsp), $a_ptr
1818 #lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2
1819 #call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2);
1821 add $acc0, $acc0 # a0:a3+a0:a3
1822 lea $Rsqr(%rsp), $a_ptr
1823 adc $acc1, $acc1
1824 mov $acc0, $t0
1825 adc $acc2, $acc2
1826 adc $acc3, $acc3
1827 mov $acc1, $t1
1828 sbb $t4, $t4
1830 sub \$-1, $acc0
1831 mov $acc2, $t2
1832 sbb $poly1, $acc1
1833 sbb \$0, $acc2
1834 mov $acc3, $t3
1835 sbb $poly3, $acc3
1836 test $t4, $t4
1838 cmovz $t0, $acc0
1839 mov 8*0($a_ptr), $t0
1840 cmovz $t1, $acc1
1841 mov 8*1($a_ptr), $t1
1842 cmovz $t2, $acc2
1843 mov 8*2($a_ptr), $t2
1844 cmovz $t3, $acc3
1845 mov 8*3($a_ptr), $t3
1847 call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr);
1849 lea $Hcub(%rsp), $b_ptr
1850 lea $res_x(%rsp), $r_ptr
1851 call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub);
1853 mov $U2+8*0(%rsp), $t0
1854 mov $U2+8*1(%rsp), $t1
1855 mov $U2+8*2(%rsp), $t2
1856 mov $U2+8*3(%rsp), $t3
1857 lea $H(%rsp), $r_ptr
1859 call __ecp_nistz256_sub$x # p256_sub(H, U2, res_x);
1861 mov $acc0, 8*0($r_ptr) # save the result, as
1862 mov $acc1, 8*1($r_ptr) # __ecp_nistz256_sub doesn't
1863 mov $acc2, 8*2($r_ptr)
1864 mov $acc3, 8*3($r_ptr)
1867 $code.=<<___;
1868 `&load_for_mul("$Hcub(%rsp)", "$in1_y(%rsp)", "$src0")`
1869 lea $S2(%rsp), $r_ptr
1870 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Hcub, in1_y);
1872 `&load_for_mul("$H(%rsp)", "$R(%rsp)", "$src0")`
1873 lea $H(%rsp), $r_ptr
1874 call __ecp_nistz256_mul_mont$x # p256_mul_mont(H, H, R);
1876 lea $S2(%rsp), $b_ptr
1877 lea $res_y(%rsp), $r_ptr
1878 call __ecp_nistz256_sub_from$x # p256_sub(res_y, H, S2);
1880 movq %xmm0, $r_ptr # restore $r_ptr
1882 movdqa %xmm5, %xmm0 # copy_conditional(res_z, ONE, in1infty);
1883 movdqa %xmm5, %xmm1
1884 pandn $res_z(%rsp), %xmm0
1885 movdqa %xmm5, %xmm2
1886 pandn $res_z+0x10(%rsp), %xmm1
1887 movdqa %xmm5, %xmm3
1888 pand .LONE_mont(%rip), %xmm2
1889 pand .LONE_mont+0x10(%rip), %xmm3
1890 por %xmm0, %xmm2
1891 por %xmm1, %xmm3
1893 movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty);
1894 movdqa %xmm4, %xmm1
1895 pandn %xmm2, %xmm0
1896 movdqa %xmm4, %xmm2
1897 pandn %xmm3, %xmm1
1898 movdqa %xmm4, %xmm3
1899 pand $in1_z(%rsp), %xmm2
1900 pand $in1_z+0x10(%rsp), %xmm3
1901 por %xmm0, %xmm2
1902 por %xmm1, %xmm3
1903 movdqu %xmm2, 0x40($r_ptr)
1904 movdqu %xmm3, 0x50($r_ptr)
1906 movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty);
1907 movdqa %xmm5, %xmm1
1908 pandn $res_x(%rsp), %xmm0
1909 movdqa %xmm5, %xmm2
1910 pandn $res_x+0x10(%rsp), %xmm1
1911 movdqa %xmm5, %xmm3
1912 pand $in2_x(%rsp), %xmm2
1913 pand $in2_x+0x10(%rsp), %xmm3
1914 por %xmm0, %xmm2
1915 por %xmm1, %xmm3
1917 movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty);
1918 movdqa %xmm4, %xmm1
1919 pandn %xmm2, %xmm0
1920 movdqa %xmm4, %xmm2
1921 pandn %xmm3, %xmm1
1922 movdqa %xmm4, %xmm3
1923 pand $in1_x(%rsp), %xmm2
1924 pand $in1_x+0x10(%rsp), %xmm3
1925 por %xmm0, %xmm2
1926 por %xmm1, %xmm3
1927 movdqu %xmm2, 0x00($r_ptr)
1928 movdqu %xmm3, 0x10($r_ptr)
1930 movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty);
1931 movdqa %xmm5, %xmm1
1932 pandn $res_y(%rsp), %xmm0
1933 movdqa %xmm5, %xmm2
1934 pandn $res_y+0x10(%rsp), %xmm1
1935 movdqa %xmm5, %xmm3
1936 pand $in2_y(%rsp), %xmm2
1937 pand $in2_y+0x10(%rsp), %xmm3
1938 por %xmm0, %xmm2
1939 por %xmm1, %xmm3
1941 movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty);
1942 movdqa %xmm4, %xmm1
1943 pandn %xmm2, %xmm0
1944 movdqa %xmm4, %xmm2
1945 pandn %xmm3, %xmm1
1946 movdqa %xmm4, %xmm3
1947 pand $in1_y(%rsp), %xmm2
1948 pand $in1_y+0x10(%rsp), %xmm3
1949 por %xmm0, %xmm2
1950 por %xmm1, %xmm3
1951 movdqu %xmm2, 0x20($r_ptr)
1952 movdqu %xmm3, 0x30($r_ptr)
1954 add \$32*15+8, %rsp
1955 pop %r15
1956 pop %r14
1957 pop %r13
1958 pop %r12
1959 pop %rbx
1960 pop %rbp
1962 .size ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx
1965 &gen_add_affine("q");
1969 $code =~ s/\`([^\`]*)\`/eval $1/gem;
1970 print $code;
1971 close STDOUT;