# $OpenBSD: ecp_nistz256-x86_64.pl,v 1.1 2016/11/04 17:33:20 miod Exp $
#
# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# Copyright (c) 2014, Intel Corporation.
#
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#
# Developers and authors:
# Shay Gueron (1, 2), and Vlad Krasnov (1)
# (1) Intel Corporation, Israel Development Center
# (2) University of Haifa
#
# Reference:
# S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
# 256 Bit Primes"
#
# Further optimization by <appro@openssl.org>:
#
#		this/original	with/without -DECP_NISTZ256_ASM(*)
# Opteron	+12-49%		+110-150%
# Bulldozer	+14-45%		+175-210%
# Westmere	+12-34%		+80-87%
# Sandy Bridge	+9-35%		+110-120%
# Ivy Bridge	+9-35%		+110-125%
# Haswell	+8-37%		+140-160%
# Broadwell	+18-58%		+145-210%
# Atom		+15-50%		+130-180%
# VIA Nano	+43-160%	+300-480%
#
# (*) "without -DECP_NISTZ256_ASM" refers to build with
#     "enable-ec_nistp_64_gcc_128";
#
# Ranges denote minimum and maximum improvement coefficients depending
# on benchmark. Lower coefficients are for ECDSA sign, the relatively
# fastest server-side operation. Keep in mind that +100% means 2x
# improvement.
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
.Lpoly:
.quad	0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001

.LONE_mont:
.quad	0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
################################################################################
# void ecp_nistz256_mul_by_2(uint64_t res[4], uint64_t a[4]);
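#
# Illustrative C model, for reference only and not part of the build
# (mul_by_2_ref is a hypothetical name): res = 2*a mod p is a 256-bit
# add followed by a single conditional subtraction of p, which the
# assembly below performs branchlessly with cmov.
#
#	static const uint64_t p[4] = {
#		0xffffffffffffffffULL, 0x00000000ffffffffULL,
#		0x0000000000000000ULL, 0xffffffff00000001ULL };
#
#	void mul_by_2_ref(uint64_t res[4], const uint64_t a[4])
#	{
#		unsigned __int128 t;
#		uint64_t r[4], carry = 0, borrow = 0;
#		for (int i = 0; i < 4; i++) {	/* r = a + a */
#			t = (unsigned __int128)a[i] + a[i] + carry;
#			r[i] = (uint64_t)t; carry = (uint64_t)(t >> 64);
#		}
#		for (int i = 0; i < 4; i++) {	/* res = r - p */
#			t = (unsigned __int128)r[i] - p[i] - borrow;
#			res[i] = (uint64_t)t; borrow = (uint64_t)(t >> 64) & 1;
#		}
#		if (borrow > carry)		/* 2*a < p: keep r */
#			for (int i = 0; i < 4; i++) res[i] = r[i];
#	}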
my ($a0,$a1,$a2,$a3)=map("%r$_",(8..11));
my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rdx","%rcx","%r12","%r13");
my ($r_ptr,$a_ptr,$b_ptr)=("%rdi","%rsi","%rdx");
.globl	ecp_nistz256_mul_by_2
.type	ecp_nistz256_mul_by_2,\@function,2
ecp_nistz256_mul_by_2:
	add	$a0, $a0		# a0:a3+a0:a3
	lea	.Lpoly(%rip), $a_ptr
.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
################################################################################
# void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]);
.globl	ecp_nistz256_neg
.type	ecp_nistz256_neg,\@function,2
ecp_nistz256_neg:
	lea	.Lpoly(%rip), $a_ptr
.size	ecp_nistz256_neg,.-ecp_nistz256_neg
my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax");
my ($poly1,$poly3)=($acc6,$acc7);
################################################################################
# void ecp_nistz256_mul_mont(
#	uint64_t res[4], uint64_t a[4], uint64_t b[4]);
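#
# A note on semantics (illustrative): with R = 2^256, inputs and outputs
# are in Montgomery form, and the routine computes
#	res = a * b * R^-1 mod p
# interleaving the four word-by-256-bit multiplications with four
# reduction steps, one after each word of b.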
.globl	ecp_nistz256_mul_mont
.type	ecp_nistz256_mul_mont,\@function,3
ecp_nistz256_mul_mont:
	mov	8*0($b_org), %rax
	mov	8*0($a_ptr), $acc1
	mov	8*1($a_ptr), $acc2
	mov	8*2($a_ptr), $acc3
	mov	8*3($a_ptr), $acc4

	call	__ecp_nistz256_mul_montq
.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
.type	__ecp_nistz256_mul_montq,\@abi-omnipotent
__ecp_nistz256_mul_montq:
	########################################################################
	mov	.Lpoly+8*1(%rip),$poly1
	mov	.Lpoly+8*3(%rip),$poly3
	########################################################################
	# First reduction step
	# Basically now we want to multiply acc[0] by p256,
	# and add the result to the acc.
	# Due to the special form of p256 we do some optimizations:
	#
	# acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0]
	# then we add acc[0] and get acc[0] x 2^96
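	#
	# Illustrative arithmetic behind this shortcut (comment only):
	# since p256 = 2^256 - 2^224 + 2^192 + 2^96 - 1, we have
	#
	#	acc[0]*p256 + acc[0]
	#	    = acc[0]*2^96 + acc[0]*(2^64 - 2^32 + 1)*2^192
	#
	# so one reduction step only needs a shifted add (acc[0]<<32 and
	# acc[0]>>32 into words 1 and 2) plus one 64x64-bit multiplication
	# by 0xffffffff00000001 added into words 3 and 4, instead of a
	# full 64x256-bit multiplication. In rough C:
	#
	#	unsigned __int128 t = (unsigned __int128)acc0
	#	    * 0xffffffff00000001ULL;
	#	acc1 += acc0 << 32;		/* acc0 * 2^96, low half  */
	#	acc2 += acc0 >> 32;		/* acc0 * 2^96, high half */
	#	acc3 += (uint64_t)t;		/* acc0 * p[3] * 2^192    */
	#	acc4 += (uint64_t)(t >> 64);	/*  (carries not shown)   */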
	add	$acc0, $acc1		# +=acc[0]<<96
	mov	8*1($b_ptr), %rax
	########################################################################

	########################################################################
	# Second reduction step
	mov	8*2($b_ptr), %rax
	########################################################################

	########################################################################
	# Third reduction step
	mov	8*3($b_ptr), %rax
	########################################################################

	########################################################################
	# Final reduction step

	########################################################################
	# Branch-less conditional subtraction of P
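	#
	# Illustrative C equivalent (comment only): subtract p
	# unconditionally and let the final borrow select the result via
	# cmov, so no secret-dependent branch is taken. Note that
	# p[0] = 2^64-1, hence the "sub \$-1" below.
	#
	#	uint64_t t[4];
	#	uint64_t borrow = sub_256(t, acc, p);	/* hypothetical helper */
	#	for (int i = 0; i < 4; i++)		/* compiled to cmovc   */
	#		res[i] = borrow ? acc[i] : t[i];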
	sub	\$-1, $acc4		# .Lpoly[0]
	sbb	$poly1, $acc5		# .Lpoly[1]
	sbb	\$0, $acc0		# .Lpoly[2]
	sbb	$poly3, $acc1		# .Lpoly[3]

	mov	$acc4, 8*0($r_ptr)
	mov	$acc5, 8*1($r_ptr)
	mov	$acc0, 8*2($r_ptr)
	mov	$acc1, 8*3($r_ptr)
.size	__ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq
################################################################################
# void ecp_nistz256_sqr_mont(
#	uint64_t res[4], uint64_t a[4]);
# we optimize the square according to S.Gueron and V.Krasnov,
# "Speeding up Big-Number Squaring"
.globl	ecp_nistz256_sqr_mont
.type	ecp_nistz256_sqr_mont,\@function,2
ecp_nistz256_sqr_mont:
	mov	8*0($a_ptr), %rax
	mov	8*1($a_ptr), $acc6
	mov	8*2($a_ptr), $acc7
	mov	8*3($a_ptr), $acc0

	call	__ecp_nistz256_sqr_montq
.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
.type	__ecp_nistz256_sqr_montq,\@abi-omnipotent
__ecp_nistz256_sqr_montq:
	mulq	$acc6			# a[1]*a[0]
	mulq	$acc5			# a[0]*a[2]
	mulq	$acc5			# a[0]*a[3]

	#################################
	mulq	$acc6			# a[1]*a[2]
	mulq	$acc6			# a[1]*a[3]

	#################################
	mulq	$acc7			# a[2]*a[3]
	mov	8*0($a_ptr), %rax

	add	$acc1, $acc1		# acc1:6<<1
	mov	8*1($a_ptr), %rax
	mov	8*2($a_ptr), %rax
	mov	8*3($a_ptr), %rax

	mov	.Lpoly+8*1(%rip), $a_ptr
	mov	.Lpoly+8*3(%rip), $t1
	##########################################
	add	$acc0, $acc1		# +=acc[0]<<96

	##########################################

	##########################################

	###########################################

	############################################
	# Add the rest of the acc
	sub	\$-1, $acc4		# .Lpoly[0]
	sbb	$a_ptr, $acc5		# .Lpoly[1]
	sbb	\$0, $acc6		# .Lpoly[2]
	sbb	$t1, $acc7		# .Lpoly[3]

	mov	$acc4, 8*0($r_ptr)
	mov	$acc5, 8*1($r_ptr)
	mov	$acc6, 8*2($r_ptr)
	mov	$acc7, 8*3($r_ptr)
.size	__ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq
my ($r_ptr,$in_ptr)=("%rdi","%rsi");
my ($acc0,$acc1,$acc2,$acc3)=map("%r$_",(8..11));
my ($t0,$t1,$t2)=("%rcx","%r12","%r13");
################################################################################
# void ecp_nistz256_from_mont(
#	uint64_t res[4], uint64_t in[4]);
# This one performs Montgomery multiplication by 1, so we only need the
# reduction.
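# In other words (illustrative, R = 2^256): res = in * R^-1 mod p, i.e.
# mul_mont(res, in, 1); multiplying by 1 leaves just the four reduction
# steps and the final conditional subtraction seen below.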
.globl	ecp_nistz256_from_mont
.type	ecp_nistz256_from_mont,\@function,2
ecp_nistz256_from_mont:
	mov	8*0($in_ptr), %rax
	mov	.Lpoly+8*3(%rip), $t2
	mov	8*1($in_ptr), $acc1
	mov	8*2($in_ptr), $acc2
	mov	8*3($in_ptr), $acc3
	mov	.Lpoly+8*1(%rip), $t1
	#########################################

	#########################################

	##########################################

	###########################################

	###########################################
	# Branch-less conditional subtraction
	cmovnz	$in_ptr, $acc1
	mov	$acc0, 8*0($r_ptr)
	mov	$acc1, 8*1($r_ptr)
	mov	$acc2, 8*2($r_ptr)
	mov	$acc3, 8*3($r_ptr)
.size	ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
my ($ONE,$INDEX,$Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("%xmm$_",(0..7));
my ($M0,$T0a,$T0b,$T0c,$T0d,$T0e,$T0f,$TMP0)=map("%xmm$_",(8..15));
my ($M1,$T1a,$T1b,$TMP1,$M2,$T2a,$T2b,$TMP2)=map("%xmm$_",(8..15));
################################################################################
# void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
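#
# Rough C model, for illustration only (select_w5_ref is a hypothetical
# name): every one of the 16 table entries (a 96-byte P256_POINT each)
# is read and masked, so the memory access pattern is independent of
# the secret index; entries are counted from 1, index 0 selects zero.
#
#	void select_w5_ref(uint64_t val[12], const uint64_t in_t[16*12],
#	    int index)
#	{
#		for (int j = 0; j < 12; j++) val[j] = 0;
#		for (int i = 0; i < 16; i++) {
#			uint64_t mask = 0 - (uint64_t)(i + 1 == index);
#			for (int j = 0; j < 12; j++)
#				val[j] |= in_t[i*12 + j] & mask;
#		}
#	}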
.globl	ecp_nistz256_select_w5
.type	ecp_nistz256_select_w5,\@abi-omnipotent
ecp_nistz256_select_w5:
$code.=<<___ if ($win64);
	lea	-0x88(%rsp), %rax
.LSEH_begin_ecp_nistz256_select_w5:
	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6, -0x20(%rax)
	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7, -0x10(%rax)
	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8, 0(%rax)
	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9, 0x10(%rax)
	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10, 0x20(%rax)
	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11, 0x30(%rax)
	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12, 0x40(%rax)
	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13, 0x50(%rax)
	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14, 0x60(%rax)
	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15, 0x70(%rax)
	movdqa	.LOne(%rip), $ONE
	pshufd	\$0, $INDEX, $INDEX

.Lselect_loop_sse_w5:
	pcmpeqd	$INDEX, $TMP0
	movdqa	16*0($in_t), $T0a
	movdqa	16*1($in_t), $T0b
	movdqa	16*2($in_t), $T0c
	movdqa	16*3($in_t), $T0d
	movdqa	16*4($in_t), $T0e
	movdqa	16*5($in_t), $T0f
	lea	16*6($in_t), $in_t
	jnz	.Lselect_loop_sse_w5
	movdqu	$Ra, 16*0($val)
	movdqu	$Rb, 16*1($val)
	movdqu	$Rc, 16*2($val)
	movdqu	$Rd, 16*3($val)
	movdqu	$Re, 16*4($val)
	movdqu	$Rf, 16*5($val)
$code.=<<___ if ($win64);
	movaps	0x10(%rsp), %xmm7
	movaps	0x20(%rsp), %xmm8
	movaps	0x30(%rsp), %xmm9
	movaps	0x40(%rsp), %xmm10
	movaps	0x50(%rsp), %xmm11
	movaps	0x60(%rsp), %xmm12
	movaps	0x70(%rsp), %xmm13
	movaps	0x80(%rsp), %xmm14
	movaps	0x90(%rsp), %xmm15
.LSEH_end_ecp_nistz256_select_w5:
.size	ecp_nistz256_select_w5,.-ecp_nistz256_select_w5
################################################################################
# void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
.globl	ecp_nistz256_select_w7
.type	ecp_nistz256_select_w7,\@abi-omnipotent
ecp_nistz256_select_w7:
$code.=<<___ if ($win64);
	lea	-0x88(%rsp), %rax
.LSEH_begin_ecp_nistz256_select_w7:
	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6, -0x20(%rax)
	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7, -0x10(%rax)
	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8, 0(%rax)
	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9, 0x10(%rax)
	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10, 0x20(%rax)
	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11, 0x30(%rax)
	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12, 0x40(%rax)
	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13, 0x50(%rax)
	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14, 0x60(%rax)
	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15, 0x70(%rax)
	movdqa	.LOne(%rip), $M0
	pshufd	\$0, $INDEX, $INDEX

.Lselect_loop_sse_w7:
	movdqa	16*0($in_t), $T0a
	movdqa	16*1($in_t), $T0b
	pcmpeqd	$INDEX, $TMP0
	movdqa	16*2($in_t), $T0c
	movdqa	16*3($in_t), $T0d
	lea	16*4($in_t), $in_t

	prefetcht0	255($in_t)
	jnz	.Lselect_loop_sse_w7
	movdqu	$Ra, 16*0($val)
	movdqu	$Rb, 16*1($val)
	movdqu	$Rc, 16*2($val)
	movdqu	$Rd, 16*3($val)
$code.=<<___ if ($win64);
	movaps	0x10(%rsp), %xmm7
	movaps	0x20(%rsp), %xmm8
	movaps	0x30(%rsp), %xmm9
	movaps	0x40(%rsp), %xmm10
	movaps	0x50(%rsp), %xmm11
	movaps	0x60(%rsp), %xmm12
	movaps	0x70(%rsp), %xmm13
	movaps	0x80(%rsp), %xmm14
	movaps	0x90(%rsp), %xmm15
.LSEH_end_ecp_nistz256_select_w7:
.size	ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
########################################################################
# This block implements higher level point_double, point_add and
# point_add_affine. The key to performance in this case is to allow
# out-of-order execution logic to overlap computations from next step
# with tail processing from current step. By using a tailored calling
# sequence we minimize inter-step overhead to give the processor a
# better shot at overlapping operations...
#
# You will notice that input data is copied to stack. Trouble is that
# there are no registers to spare for holding original pointers, and
# reloading them would create undesired dependencies on effective
# address calculation paths. In other words it's all done to favour
# out-of-order execution logic.
#						<appro@openssl.org>
my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rbp","%rcx",$acc4,$acc4);
my ($poly1,$poly3)=($acc6,$acc7);
sub load_for_mul () {
my ($a,$b,$src0) = @_;
my $bias = $src0 eq "%rax" ? 0 : -128;
sub load_for_sqr () {
my ($a,$src0) = @_;
my $bias = $src0 eq "%rax" ? 0 : -128;
########################################################################
# operate in 4-5-0-1 "name space" that matches multiplication output
my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
.type	__ecp_nistz256_add_toq,\@abi-omnipotent
__ecp_nistz256_add_toq:
	mov	$a0, 8*0($r_ptr)
	mov	$a1, 8*1($r_ptr)
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)
.size	__ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq
.type	__ecp_nistz256_sub_fromq,\@abi-omnipotent
__ecp_nistz256_sub_fromq:
	sub	8*0($b_ptr), $a0
	sbb	8*1($b_ptr), $a1
	sbb	8*2($b_ptr), $a2
	sbb	8*3($b_ptr), $a3

	mov	$a0, 8*0($r_ptr)
	mov	$a1, 8*1($r_ptr)
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)
.size	__ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq
.type	__ecp_nistz256_subq,\@abi-omnipotent
__ecp_nistz256_subq:
.size	__ecp_nistz256_subq,.-__ecp_nistz256_subq
.type	__ecp_nistz256_mul_by_2q,\@abi-omnipotent
__ecp_nistz256_mul_by_2q:
	add	$a0, $a0		# a0:a3+a0:a3
	mov	$a0, 8*0($r_ptr)
	mov	$a1, 8*1($r_ptr)
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)
.size	__ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q
my ($src0,$sfx,$bias);
my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));
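#
# Sketch of the doubling sequence implemented below (the p256_*
# comments on the individual calls mark the steps):
#
#	S     = 2*in_y;			S = S^2		# 4*Y^2
#	Zsqr  = in_z^2
#	res_z = 2*in_y*in_z
#	M     = 3*(in_x + Zsqr)*(in_x - Zsqr)
#	res_y = S^2/2					# 8*Y^4
#	S     = S*in_x					# 4*X*Y^2
#	res_x = M^2 - 2*S
#	res_y = M*(S - res_x) - res_y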
.globl	ecp_nistz256_point_double
.type	ecp_nistz256_point_double,\@function,2
ecp_nistz256_point_double:
.type	ecp_nistz256_point_doublex,\@function,2
ecp_nistz256_point_doublex:
.Lpoint_double_shortcut$x:
	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$a_ptr.x
	mov	$a_ptr, $b_ptr			# backup copy
	movdqu	0x10($a_ptr), %xmm1
	mov	0x20+8*0($a_ptr), $acc4		# load in_y in "5-4-0-1" order
	mov	0x20+8*1($a_ptr), $acc5
	mov	0x20+8*2($a_ptr), $acc0
	mov	0x20+8*3($a_ptr), $acc1
	mov	.Lpoly+8*1(%rip), $poly1
	mov	.Lpoly+8*3(%rip), $poly3
	movdqa	%xmm0, $in_x(%rsp)
	movdqa	%xmm1, $in_x+0x10(%rsp)
	lea	0x20($r_ptr), $acc2
	lea	0x40($r_ptr), $acc3
	lea	$S(%rsp), $r_ptr
	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(S, in_y);

	mov	0x40+8*0($a_ptr), $src0
	mov	0x40+8*1($a_ptr), $acc6
	mov	0x40+8*2($a_ptr), $acc7
	mov	0x40+8*3($a_ptr), $acc0
	lea	0x40-$bias($a_ptr), $a_ptr
	lea	$Zsqr(%rsp), $r_ptr
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Zsqr, in_z);
	`&load_for_sqr("$S(%rsp)", "$src0")`
	lea	$S(%rsp), $r_ptr
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(S, S);
	mov	0x20($b_ptr), $src0		# $b_ptr is still valid
	mov	0x40+8*0($b_ptr), $acc1
	mov	0x40+8*1($b_ptr), $acc2
	mov	0x40+8*2($b_ptr), $acc3
	mov	0x40+8*3($b_ptr), $acc4
	lea	0x40-$bias($b_ptr), $a_ptr
	lea	0x20($b_ptr), $b_ptr
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, in_z, in_y);
	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(res_z, res_z);
	mov	$in_x+8*0(%rsp), $acc4		# "5-4-0-1" order
	mov	$in_x+8*1(%rsp), $acc5
	lea	$Zsqr(%rsp), $b_ptr
	mov	$in_x+8*2(%rsp), $acc0
	mov	$in_x+8*3(%rsp), $acc1
	lea	$M(%rsp), $r_ptr
	call	__ecp_nistz256_add_to$x		# p256_add(M, in_x, Zsqr);

	mov	$in_x+8*0(%rsp), $acc4		# "5-4-0-1" order
	mov	$in_x+8*1(%rsp), $acc5
	lea	$Zsqr(%rsp), $b_ptr
	mov	$in_x+8*2(%rsp), $acc0
	mov	$in_x+8*3(%rsp), $acc1
	lea	$Zsqr(%rsp), $r_ptr
	call	__ecp_nistz256_sub_from$x	# p256_sub(Zsqr, in_x, Zsqr);
	`&load_for_sqr("$S(%rsp)", "$src0")`
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(res_y, S);

	######## ecp_nistz256_div_by_2(res_y, res_y); ##########################
	# operate in 4-5-6-7 "name space" that matches squaring output
my ($poly1,$poly3)=($a_ptr,$t1);
my ($a0,$a1,$a2,$a3,$t3,$t4,$t1)=($acc4,$acc5,$acc6,$acc7,$acc0,$acc1,$acc2);
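	#
	# Illustrative C equivalent of the halving (comment only): p is
	# odd, so when a is odd, a + p is even and (a + p)/2 == a/2 mod p;
	# the 257-bit sum is kept via the carry flag and shifted right as
	# a whole, with cmov rather than a branch doing the selection:
	#
	#	uint64_t t[4];
	#	uint64_t carry = add_256(t, a, p);  /* hypothetical helper */
	#	if (a[0] & 1)			    /* realized with cmov  */
	#		res = (t >> 1) | (carry << 255);
	#	else
	#		res = a >> 1;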
	xor	$a_ptr, $a_ptr		# borrow $a_ptr
	mov	$a1, $t0		# a0:a3>>1
	mov	$a0, 8*0($r_ptr)
	mov	$a1, 8*1($r_ptr)
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)
	`&load_for_mul("$M(%rsp)", "$Zsqr(%rsp)", "$src0")`
	lea	$M(%rsp), $r_ptr
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(M, M, Zsqr);

	lea	$tmp0(%rsp), $r_ptr
	call	__ecp_nistz256_mul_by_2$x

	lea	$M(%rsp), $b_ptr
	lea	$M(%rsp), $r_ptr
	call	__ecp_nistz256_add_to$x		# p256_mul_by_3(M, M);

	`&load_for_mul("$S(%rsp)", "$in_x(%rsp)", "$src0")`
	lea	$S(%rsp), $r_ptr
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S, S, in_x);

	lea	$tmp0(%rsp), $r_ptr
	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(tmp0, S);
	`&load_for_sqr("$M(%rsp)", "$src0")`
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(res_x, M);

	lea	$tmp0(%rsp), $b_ptr
	mov	$acc6, $acc0			# harmonize sqr output and sub input
	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, tmp0);

	mov	$S+8*0(%rsp), $t0
	mov	$S+8*1(%rsp), $t1
	mov	$S+8*2(%rsp), $t2
	mov	$S+8*3(%rsp), $acc2		# "4-5-0-1" order
	lea	$S(%rsp), $r_ptr
	call	__ecp_nistz256_sub$x		# p256_sub(S, S, res_x);
	lea	$M(%rsp), $b_ptr
	mov	$acc4, $acc6			# harmonize sub output and mul input
	mov	$acc4, $S+8*0(%rsp)		# have to save:-(
	mov	$acc5, $S+8*1(%rsp)
	mov	$acc0, $S+8*2(%rsp)
	lea	$S-$bias(%rsp), $a_ptr
	mov	$acc1, $S+8*3(%rsp)
	lea	$S(%rsp), $r_ptr
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S, S, M);

	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, S, res_y);
.size	ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx
my ($src0,$sfx,$bias);
my ($H,$Hsqr,$R,$Rsqr,$Hcub,
    $U1,$U2,$S1,$S2,
    $res_x,$res_y,$res_z,
    $in1_x,$in1_y,$in1_z,
    $in2_x,$in2_y,$in2_z)=map(32*$_,(0..17));
my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
.globl	ecp_nistz256_point_add
.type	ecp_nistz256_point_add,\@function,3
ecp_nistz256_point_add:
	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$a_ptr
	movdqu	0x10($a_ptr), %xmm1
	movdqu	0x20($a_ptr), %xmm2
	movdqu	0x30($a_ptr), %xmm3
	movdqu	0x40($a_ptr), %xmm4
	movdqu	0x50($a_ptr), %xmm5
	mov	$a_ptr, $b_ptr			# reassign
	mov	$b_org, $a_ptr			# reassign
	movdqa	%xmm0, $in1_x(%rsp)
	movdqa	%xmm1, $in1_x+0x10(%rsp)
	movdqa	%xmm2, $in1_y(%rsp)
	movdqa	%xmm3, $in1_y+0x10(%rsp)
	movdqa	%xmm4, $in1_z(%rsp)
	movdqa	%xmm5, $in1_z+0x10(%rsp)
	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$b_ptr
	pshufd	\$0xb1, %xmm3, %xmm5
	movdqu	0x10($a_ptr), %xmm1
	movdqu	0x20($a_ptr), %xmm2
	movdqu	0x30($a_ptr), %xmm3
	mov	0x40+8*0($a_ptr), $src0		# load original in2_z
	mov	0x40+8*1($a_ptr), $acc6
	mov	0x40+8*2($a_ptr), $acc7
	mov	0x40+8*3($a_ptr), $acc0
	movdqa	%xmm0, $in2_x(%rsp)
	pshufd	\$0x1e, %xmm5, %xmm4
	movdqa	%xmm1, $in2_x+0x10(%rsp)
	movq	$r_ptr, %xmm0			# save $r_ptr
	movdqa	%xmm2, $in2_y(%rsp)
	movdqa	%xmm3, $in2_y+0x10(%rsp)
	lea	0x40-$bias($a_ptr), $a_ptr	# $a_ptr is still valid
	mov	$src0, $in2_z+8*0(%rsp)		# make in2_z copy
	mov	$acc6, $in2_z+8*1(%rsp)
	mov	$acc7, $in2_z+8*2(%rsp)
	mov	$acc0, $in2_z+8*3(%rsp)
	lea	$Z2sqr(%rsp), $r_ptr		# Z2^2
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z2sqr, in2_z);
	pcmpeqd	%xmm4, %xmm5
	pshufd	\$0xb1, %xmm3, %xmm4
	pshufd	\$0, %xmm5, %xmm5		# in1infty
	pshufd	\$0x1e, %xmm4, %xmm3
	pcmpeqd	%xmm3, %xmm4
	pshufd	\$0, %xmm4, %xmm4		# in2infty
	mov	0x40+8*0($b_ptr), $src0		# load original in1_z
	mov	0x40+8*1($b_ptr), $acc6
	mov	0x40+8*2($b_ptr), $acc7
	mov	0x40+8*3($b_ptr), $acc0

	lea	0x40-$bias($b_ptr), $a_ptr
	lea	$Z1sqr(%rsp), $r_ptr		# Z1^2
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z1sqr, in1_z);
	`&load_for_mul("$Z2sqr(%rsp)", "$in2_z(%rsp)", "$src0")`
	lea	$S1(%rsp), $r_ptr		# S1 = Z2^3
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S1, Z2sqr, in2_z);

	`&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
	lea	$S2(%rsp), $r_ptr		# S2 = Z1^3
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Z1sqr, in1_z);

	`&load_for_mul("$S1(%rsp)", "$in1_y(%rsp)", "$src0")`
	lea	$S1(%rsp), $r_ptr		# S1 = Y1*Z2^3
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S1, S1, in1_y);

	`&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
	lea	$S2(%rsp), $r_ptr		# S2 = Y2*Z1^3
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S2, in2_y);

	lea	$S1(%rsp), $b_ptr
	lea	$R(%rsp), $r_ptr		# R = S2 - S1
	call	__ecp_nistz256_sub_from$x	# p256_sub(R, S2, S1);
	or	$acc5, $acc4			# see if result is zero
	por	%xmm5, %xmm2			# in1infty || in2infty

	`&load_for_mul("$Z2sqr(%rsp)", "$in1_x(%rsp)", "$src0")`
	lea	$U1(%rsp), $r_ptr		# U1 = X1*Z2^2
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U1, in1_x, Z2sqr);

	`&load_for_mul("$Z1sqr(%rsp)", "$in2_x(%rsp)", "$src0")`
	lea	$U2(%rsp), $r_ptr		# U2 = X2*Z1^2
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, in2_x, Z1sqr);

	lea	$U1(%rsp), $b_ptr
	lea	$H(%rsp), $r_ptr		# H = U2 - U1
	call	__ecp_nistz256_sub_from$x	# p256_sub(H, U2, U1);
	or	$acc5, $acc4			# see if result is zero

	.byte	0x3e				# predict taken
	jnz	.Ladd_proceed$x			# is_equal(U1,U2)?
	jnz	.Ladd_proceed$x			# (in1infty || in2infty)?
	jz	.Ladd_double$x			# is_equal(S1,S2)?
	movq	%xmm0, $r_ptr			# restore $r_ptr
	movdqu	%xmm0, 0x00($r_ptr)
	movdqu	%xmm0, 0x10($r_ptr)
	movdqu	%xmm0, 0x20($r_ptr)
	movdqu	%xmm0, 0x30($r_ptr)
	movdqu	%xmm0, 0x40($r_ptr)
	movdqu	%xmm0, 0x50($r_ptr)
	movq	%xmm1, $a_ptr			# restore $a_ptr
	movq	%xmm0, $r_ptr			# restore $r_ptr
	add	\$`32*(18-5)`, %rsp		# difference in frame sizes
	jmp	.Lpoint_double_shortcut$x
	`&load_for_sqr("$R(%rsp)", "$src0")`
	lea	$Rsqr(%rsp), $r_ptr		# R^2
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Rsqr, R);

	`&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1*Z2
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, H, in1_z);

	`&load_for_sqr("$H(%rsp)", "$src0")`
	lea	$Hsqr(%rsp), $r_ptr		# H^2
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Hsqr, H);

	`&load_for_mul("$res_z(%rsp)", "$in2_z(%rsp)", "$src0")`
	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1*Z2
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, res_z, in2_z);

	`&load_for_mul("$Hsqr(%rsp)", "$H(%rsp)", "$src0")`
	lea	$Hcub(%rsp), $r_ptr		# H^3
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(Hcub, Hsqr, H);

	`&load_for_mul("$Hsqr(%rsp)", "$U1(%rsp)", "$src0")`
	lea	$U2(%rsp), $r_ptr		# U1*H^2
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, U1, Hsqr);
#######################################################################
# operate in 4-5-0-1 "name space" that matches multiplication output
my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
my ($poly1, $poly3)=($acc6,$acc7);

	#lea	$U2(%rsp), $a_ptr
	#lea	$Hsqr(%rsp), $r_ptr		# 2*U1*H^2
	#call	__ecp_nistz256_mul_by_2	# ecp_nistz256_mul_by_2(Hsqr, U2);
	add	$acc0, $acc0		# a0:a3+a0:a3
	lea	$Rsqr(%rsp), $a_ptr
	mov	8*0($a_ptr), $t0
	mov	8*1($a_ptr), $t1
	mov	8*2($a_ptr), $t2
	mov	8*3($a_ptr), $t3
	call	__ecp_nistz256_sub$x		# p256_sub(res_x, Rsqr, Hsqr);
	lea	$Hcub(%rsp), $b_ptr
	lea	$res_x(%rsp), $r_ptr
	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, Hcub);

	mov	$U2+8*0(%rsp), $t0
	mov	$U2+8*1(%rsp), $t1
	mov	$U2+8*2(%rsp), $t2
	mov	$U2+8*3(%rsp), $t3
	lea	$res_y(%rsp), $r_ptr
	call	__ecp_nistz256_sub$x		# p256_sub(res_y, U2, res_x);

	mov	$acc0, 8*0($r_ptr)		# save the result, as
	mov	$acc1, 8*1($r_ptr)		# __ecp_nistz256_sub doesn't
	mov	$acc2, 8*2($r_ptr)
	mov	$acc3, 8*3($r_ptr)
	`&load_for_mul("$S1(%rsp)", "$Hcub(%rsp)", "$src0")`
	lea	$S2(%rsp), $r_ptr
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S1, Hcub);

	`&load_for_mul("$R(%rsp)", "$res_y(%rsp)", "$src0")`
	lea	$res_y(%rsp), $r_ptr
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_y, R, res_y);

	lea	$S2(%rsp), $b_ptr
	lea	$res_y(%rsp), $r_ptr
	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, res_y, S2);
	movq	%xmm0, $r_ptr		# restore $r_ptr
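	#
	# What follows is a chain of SSE2 copy_conditional(dst, src, mask)
	# steps; rough C model (comment only), with mask all-ones or
	# all-zeroes as produced by the pcmpeqd/pshufd sequences above:
	#
	#	for (int i = 0; i < 4; i++)
	#		dst[i] = (src[i] & mask) | (dst[i] & ~mask);
	#
	# realized with pand/pandn/por, so no secret-dependent branch or
	# load address is ever formed.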
	movdqa	%xmm5, %xmm0		# copy_conditional(res_z, in2_z, in1infty);
	pandn	$res_z(%rsp), %xmm0
	pandn	$res_z+0x10(%rsp), %xmm1
	pand	$in2_z(%rsp), %xmm2
	pand	$in2_z+0x10(%rsp), %xmm3

	movdqa	%xmm4, %xmm0		# copy_conditional(res_z, in1_z, in2infty);
	pand	$in1_z(%rsp), %xmm2
	pand	$in1_z+0x10(%rsp), %xmm3
	movdqu	%xmm2, 0x40($r_ptr)
	movdqu	%xmm3, 0x50($r_ptr)

	movdqa	%xmm5, %xmm0		# copy_conditional(res_x, in2_x, in1infty);
	pandn	$res_x(%rsp), %xmm0
	pandn	$res_x+0x10(%rsp), %xmm1
	pand	$in2_x(%rsp), %xmm2
	pand	$in2_x+0x10(%rsp), %xmm3

	movdqa	%xmm4, %xmm0		# copy_conditional(res_x, in1_x, in2infty);
	pand	$in1_x(%rsp), %xmm2
	pand	$in1_x+0x10(%rsp), %xmm3
	movdqu	%xmm2, 0x00($r_ptr)
	movdqu	%xmm3, 0x10($r_ptr)

	movdqa	%xmm5, %xmm0		# copy_conditional(res_y, in2_y, in1infty);
	pandn	$res_y(%rsp), %xmm0
	pandn	$res_y+0x10(%rsp), %xmm1
	pand	$in2_y(%rsp), %xmm2
	pand	$in2_y+0x10(%rsp), %xmm3

	movdqa	%xmm4, %xmm0		# copy_conditional(res_y, in1_y, in2infty);
	pand	$in1_y(%rsp), %xmm2
	pand	$in1_y+0x10(%rsp), %xmm3
	movdqu	%xmm2, 0x20($r_ptr)
	movdqu	%xmm3, 0x30($r_ptr)
.size	ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx
sub gen_add_affine () {
my ($src0,$sfx,$bias);
my ($U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr,
    $res_x,$res_y,$res_z,
    $in1_x,$in1_y,$in1_z,
    $in2_x,$in2_y)=map(32*$_,(0..14));
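#
# Sketch of the mixed addition computed below (second input affine,
# i.e. Z2 = 1; see the p256_* comments on the individual calls):
#
#	Z1sqr = in1_z^2
#	U2 = in2_x*Z1sqr;		H = U2 - in1_x
#	S2 = in2_y*in1_z*Z1sqr;		R = S2 - in1_y
#	res_z = H*in1_z
#	res_x = R^2 - H^3 - 2*in1_x*H^2
#	res_y = R*(in1_x*H^2 - res_x) - in1_y*H^3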
.globl	ecp_nistz256_point_add_affine
.type	ecp_nistz256_point_add_affine,\@function,3
ecp_nistz256_point_add_affine:
	movdqu	0x00($a_ptr), %xmm0	# copy	*(P256_POINT *)$a_ptr
	mov	$b_org, $b_ptr		# reassign
	movdqu	0x10($a_ptr), %xmm1
	movdqu	0x20($a_ptr), %xmm2
	movdqu	0x30($a_ptr), %xmm3
	movdqu	0x40($a_ptr), %xmm4
	movdqu	0x50($a_ptr), %xmm5
	mov	0x40+8*0($a_ptr), $src0	# load original in1_z
	mov	0x40+8*1($a_ptr), $acc6
	mov	0x40+8*2($a_ptr), $acc7
	mov	0x40+8*3($a_ptr), $acc0
	movdqa	%xmm0, $in1_x(%rsp)
	movdqa	%xmm1, $in1_x+0x10(%rsp)
	movdqa	%xmm2, $in1_y(%rsp)
	movdqa	%xmm3, $in1_y+0x10(%rsp)
	movdqa	%xmm4, $in1_z(%rsp)
	movdqa	%xmm5, $in1_z+0x10(%rsp)
	movdqu	0x00($b_ptr), %xmm0	# copy	*(P256_POINT_AFFINE *)$b_ptr
	pshufd	\$0xb1, %xmm3, %xmm5
	movdqu	0x10($b_ptr), %xmm1
	movdqu	0x20($b_ptr), %xmm2
	movdqu	0x30($b_ptr), %xmm3
	movdqa	%xmm0, $in2_x(%rsp)
	pshufd	\$0x1e, %xmm5, %xmm4
	movdqa	%xmm1, $in2_x+0x10(%rsp)
	movq	$r_ptr, %xmm0		# save $r_ptr
	movdqa	%xmm2, $in2_y(%rsp)
	movdqa	%xmm3, $in2_y+0x10(%rsp)
	lea	0x40-$bias($a_ptr), $a_ptr	# $a_ptr is still valid
	lea	$Z1sqr(%rsp), $r_ptr		# Z1^2
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z1sqr, in1_z);

	pcmpeqd	%xmm4, %xmm5
	pshufd	\$0xb1, %xmm3, %xmm4
	mov	0x00($b_ptr), $src0		# $b_ptr is still valid
	#lea	0x00($b_ptr), $b_ptr
	mov	$acc4, $acc1			# harmonize sqr output and mul input
	pshufd	\$0, %xmm5, %xmm5		# in1infty
	pshufd	\$0x1e, %xmm4, %xmm3
	pcmpeqd	%xmm3, %xmm4
	pshufd	\$0, %xmm4, %xmm4		# in2infty
	lea	$Z1sqr-$bias(%rsp), $a_ptr
	lea	$U2(%rsp), $r_ptr		# U2 = X2*Z1^2
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, Z1sqr, in2_x);

	lea	$in1_x(%rsp), $b_ptr
	lea	$H(%rsp), $r_ptr		# H = U2 - U1
	call	__ecp_nistz256_sub_from$x	# p256_sub(H, U2, in1_x);
	`&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
	lea	$S2(%rsp), $r_ptr		# S2 = Z1^3
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Z1sqr, in1_z);

	`&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1*Z2
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, H, in1_z);

	`&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
	lea	$S2(%rsp), $r_ptr		# S2 = Y2*Z1^3
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S2, in2_y);

	lea	$in1_y(%rsp), $b_ptr
	lea	$R(%rsp), $r_ptr		# R = S2 - S1
	call	__ecp_nistz256_sub_from$x	# p256_sub(R, S2, in1_y);

	`&load_for_sqr("$H(%rsp)", "$src0")`
	lea	$Hsqr(%rsp), $r_ptr		# H^2
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Hsqr, H);

	`&load_for_sqr("$R(%rsp)", "$src0")`
	lea	$Rsqr(%rsp), $r_ptr		# R^2
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Rsqr, R);

	`&load_for_mul("$H(%rsp)", "$Hsqr(%rsp)", "$src0")`
	lea	$Hcub(%rsp), $r_ptr		# H^3
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(Hcub, Hsqr, H);

	`&load_for_mul("$Hsqr(%rsp)", "$in1_x(%rsp)", "$src0")`
	lea	$U2(%rsp), $r_ptr		# U1*H^2
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, in1_x, Hsqr);
#######################################################################
# operate in 4-5-0-1 "name space" that matches multiplication output
my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
my ($poly1, $poly3)=($acc6,$acc7);

	#lea	$U2(%rsp), $a_ptr
	#lea	$Hsqr(%rsp), $r_ptr		# 2*U1*H^2
	#call	__ecp_nistz256_mul_by_2	# ecp_nistz256_mul_by_2(Hsqr, U2);
	add	$acc0, $acc0		# a0:a3+a0:a3
	lea	$Rsqr(%rsp), $a_ptr
	mov	8*0($a_ptr), $t0
	mov	8*1($a_ptr), $t1
	mov	8*2($a_ptr), $t2
	mov	8*3($a_ptr), $t3
	call	__ecp_nistz256_sub$x		# p256_sub(res_x, Rsqr, Hsqr);
	lea	$Hcub(%rsp), $b_ptr
	lea	$res_x(%rsp), $r_ptr
	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, Hcub);

	mov	$U2+8*0(%rsp), $t0
	mov	$U2+8*1(%rsp), $t1
	mov	$U2+8*2(%rsp), $t2
	mov	$U2+8*3(%rsp), $t3
	lea	$H(%rsp), $r_ptr
	call	__ecp_nistz256_sub$x		# p256_sub(H, U2, res_x);

	mov	$acc0, 8*0($r_ptr)		# save the result, as
	mov	$acc1, 8*1($r_ptr)		# __ecp_nistz256_sub doesn't
	mov	$acc2, 8*2($r_ptr)
	mov	$acc3, 8*3($r_ptr)
	`&load_for_mul("$Hcub(%rsp)", "$in1_y(%rsp)", "$src0")`
	lea	$S2(%rsp), $r_ptr
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Hcub, in1_y);

	`&load_for_mul("$H(%rsp)", "$R(%rsp)", "$src0")`
	lea	$H(%rsp), $r_ptr
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(H, H, R);

	lea	$S2(%rsp), $b_ptr
	lea	$res_y(%rsp), $r_ptr
	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, H, S2);

	movq	%xmm0, $r_ptr		# restore $r_ptr
	movdqa	%xmm5, %xmm0		# copy_conditional(res_z, ONE, in1infty);
	pandn	$res_z(%rsp), %xmm0
	pandn	$res_z+0x10(%rsp), %xmm1
	pand	.LONE_mont(%rip), %xmm2
	pand	.LONE_mont+0x10(%rip), %xmm3

	movdqa	%xmm4, %xmm0		# copy_conditional(res_z, in1_z, in2infty);
	pand	$in1_z(%rsp), %xmm2
	pand	$in1_z+0x10(%rsp), %xmm3
	movdqu	%xmm2, 0x40($r_ptr)
	movdqu	%xmm3, 0x50($r_ptr)

	movdqa	%xmm5, %xmm0		# copy_conditional(res_x, in2_x, in1infty);
	pandn	$res_x(%rsp), %xmm0
	pandn	$res_x+0x10(%rsp), %xmm1
	pand	$in2_x(%rsp), %xmm2
	pand	$in2_x+0x10(%rsp), %xmm3

	movdqa	%xmm4, %xmm0		# copy_conditional(res_x, in1_x, in2infty);
	pand	$in1_x(%rsp), %xmm2
	pand	$in1_x+0x10(%rsp), %xmm3
	movdqu	%xmm2, 0x00($r_ptr)
	movdqu	%xmm3, 0x10($r_ptr)

	movdqa	%xmm5, %xmm0		# copy_conditional(res_y, in2_y, in1infty);
	pandn	$res_y(%rsp), %xmm0
	pandn	$res_y+0x10(%rsp), %xmm1
	pand	$in2_y(%rsp), %xmm2
	pand	$in2_y+0x10(%rsp), %xmm3

	movdqa	%xmm4, %xmm0		# copy_conditional(res_y, in1_y, in2infty);
	pand	$in1_y(%rsp), %xmm2
	pand	$in1_y+0x10(%rsp), %xmm3
	movdqu	%xmm2, 0x20($r_ptr)
	movdqu	%xmm3, 0x30($r_ptr)
.size	ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx

&gen_add_affine("q");
$code =~ s/\`([^\`]*)\`/eval $1/gem;