1 #!/usr/bin/env perl
2 # $OpenBSD: ecp_nistz256-x86_64.pl,v 1.1 2016/11/04 17:33:20 miod Exp $
4 # Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
6 # Licensed under the OpenSSL license (the "License"). You may not use
7 # this file except in compliance with the License. You can obtain a copy
8 # in the file LICENSE in the source distribution or at
9 # https://www.openssl.org/source/license.html
11 # Copyright (c) 2014, Intel Corporation.
13 # Permission to use, copy, modify, and/or distribute this software for any
14 # purpose with or without fee is hereby granted, provided that the above
15 # copyright notice and this permission notice appear in all copies.
17 # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
18 # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
19 # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
20 # SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
21 # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
22 # OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
23 # CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
25 # Developers and authors:
26 # Shay Gueron (1, 2), and Vlad Krasnov (1)
27 # (1) Intel Corporation, Israel Development Center
28 # (2) University of Haifa
30 # Reference:
31 # S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
32 # 256 Bit Primes"
34 # Further optimization by <appro@openssl.org>:
36 # this/original with/without -DECP_NISTZ256_ASM(*)
37 # Opteron +12-49% +110-150%
38 # Bulldozer +14-45% +175-210%
39 # P4 +18-46% n/a :-(
40 # Westmere +12-34% +80-87%
41 # Sandy Bridge +9-35% +110-120%
42 # Ivy Bridge +9-35% +110-125%
43 # Haswell +8-37% +140-160%
44 # Broadwell +18-58% +145-210%
45 # Atom +15-50% +130-180%
46 # VIA Nano +43-160% +300-480%
48 # (*) "without -DECP_NISTZ256_ASM" refers to build with
49 # "enable-ec_nistp_64_gcc_128";
51 # Ranges denote minimum and maximum improvement coefficients depending
52 # on benchmark. Lower coefficients are for ECDSA sign, the relatively fastest
53 # server-side operation. Keep in mind that +100% means a 2x improvement.
55 $flavour = shift;
56 $output = shift;
57 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
59 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
61 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
62 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
63 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
64 die "can't locate x86_64-xlate.pl";
66 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
67 *STDOUT=*OUT;
69 $code.=<<___;
70 .text
72 # The polynomial
73 .align 64
74 .Lpoly:
75 .quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001
77 .LOne:
78 .long 1,1,1,1,1,1,1,1
79 .LTwo:
80 .long 2,2,2,2,2,2,2,2
81 .LThree:
82 .long 3,3,3,3,3,3,3,3
83 .LONE_mont:
84 .quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
85 ___
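# Reference note on the constants above: .Lpoly is the NIST P-256 prime
# p = 2^256 - 2^224 + 2^192 + 2^96 - 1 stored as four little-endian 64-bit
# limbs, and .LONE_mont is 2^256 mod p, i.e. the value 1 in the Montgomery
# representation used throughout this file (R = 2^256). .LOne, .LTwo and
# .LThree are 32-bit broadcast patterns; .LOne seeds the counters of the
# ecp_nistz256_select_w5/w7 loops further down.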
88 ################################################################################
89 # void ecp_nistz256_mul_by_2(uint64_t res[4], uint64_t a[4]);
91 my ($a0,$a1,$a2,$a3)=map("%r$_",(8..11));
92 my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rdx","%rcx","%r12","%r13");
93 my ($r_ptr,$a_ptr,$b_ptr)=("%rdi","%rsi","%rdx");
95 $code.=<<___;
97 .globl ecp_nistz256_mul_by_2
98 .type ecp_nistz256_mul_by_2,\@function,2
99 .align 64
100 ecp_nistz256_mul_by_2:
101 push %r12
102 push %r13
104 mov 8*0($a_ptr), $a0
105 mov 8*1($a_ptr), $a1
106 add $a0, $a0 # a0:a3+a0:a3
107 mov 8*2($a_ptr), $a2
108 adc $a1, $a1
109 mov 8*3($a_ptr), $a3
110 lea .Lpoly(%rip), $a_ptr
111 mov $a0, $t0
112 adc $a2, $a2
113 adc $a3, $a3
114 mov $a1, $t1
115 sbb $t4, $t4
117 sub 8*0($a_ptr), $a0
118 mov $a2, $t2
119 sbb 8*1($a_ptr), $a1
120 sbb 8*2($a_ptr), $a2
121 mov $a3, $t3
122 sbb 8*3($a_ptr), $a3
123 test $t4, $t4
125 cmovz $t0, $a0
126 cmovz $t1, $a1
127 mov $a0, 8*0($r_ptr)
128 cmovz $t2, $a2
129 mov $a1, 8*1($r_ptr)
130 cmovz $t3, $a3
131 mov $a2, 8*2($r_ptr)
132 mov $a3, 8*3($r_ptr)
134 pop %r13
135 pop %r12
137 .size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
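# Both ecp_nistz256_mul_by_2 above and ecp_nistz256_neg below follow the same
# branch-free pattern: compute the raw doubled value (or 0 - a), let sbb turn
# the final carry/borrow into an all-ones or all-zero mask, speculatively
# subtract (or add) .Lpoly, and pick between the raw and the adjusted limbs
# with a cmovz chain, so control flow never depends on the data.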
139 ################################################################################
140 # void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]);
141 .globl ecp_nistz256_neg
142 .type ecp_nistz256_neg,\@function,2
143 .align 32
144 ecp_nistz256_neg:
145 push %r12
146 push %r13
148 xor $a0, $a0
149 xor $a1, $a1
150 xor $a2, $a2
151 xor $a3, $a3
152 xor $t4, $t4
154 sub 8*0($a_ptr), $a0
155 sbb 8*1($a_ptr), $a1
156 sbb 8*2($a_ptr), $a2
157 mov $a0, $t0
158 sbb 8*3($a_ptr), $a3
159 lea .Lpoly(%rip), $a_ptr
160 mov $a1, $t1
161 sbb \$0, $t4
163 add 8*0($a_ptr), $a0
164 mov $a2, $t2
165 adc 8*1($a_ptr), $a1
166 adc 8*2($a_ptr), $a2
167 mov $a3, $t3
168 adc 8*3($a_ptr), $a3
169 test $t4, $t4
171 cmovz $t0, $a0
172 cmovz $t1, $a1
173 mov $a0, 8*0($r_ptr)
174 cmovz $t2, $a2
175 mov $a1, 8*1($r_ptr)
176 cmovz $t3, $a3
177 mov $a2, 8*2($r_ptr)
178 mov $a3, 8*3($r_ptr)
180 pop %r13
181 pop %r12
183 .size ecp_nistz256_neg,.-ecp_nistz256_neg
187 my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
188 my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
189 my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax");
190 my ($poly1,$poly3)=($acc6,$acc7);
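# For cross-checking: the __ecp_nistz256_mul_montq routine emitted below
# computes a*b*2^-256 mod p, i.e. Montgomery multiplication with R = 2^256.
# A standalone sketch of the same operation with Math::BigInt (illustration
# only, not used by this generator; ref_mul_mont is a hypothetical helper)
# might look like:
#
#	use Math::BigInt;
#	my $p = Math::BigInt->new(2)->bpow(256)
#	      - Math::BigInt->new(2)->bpow(224)
#	      + Math::BigInt->new(2)->bpow(192)
#	      + Math::BigInt->new(2)->bpow(96)
#	      - 1;
#	my $r_inv = Math::BigInt->new(2)->bpow(256)->bmodinv($p);
#	sub ref_mul_mont {		# $a, $b are Math::BigInt values < $p
#		my ($a, $b) = @_;
#		return ($a * $b * $r_inv) % $p;	# a*b*2^-256 mod p
#	}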
192 $code.=<<___;
193 ################################################################################
194 # void ecp_nistz256_mul_mont(
195 # uint64_t res[4],
196 # uint64_t a[4],
197 # uint64_t b[4]);
199 .globl ecp_nistz256_mul_mont
200 .type ecp_nistz256_mul_mont,\@function,3
201 .align 32
202 ecp_nistz256_mul_mont:
203 .Lmul_mont:
204 push %rbp
205 push %rbx
206 push %r12
207 push %r13
208 push %r14
209 push %r15
211 mov $b_org, $b_ptr
212 mov 8*0($b_org), %rax
213 mov 8*0($a_ptr), $acc1
214 mov 8*1($a_ptr), $acc2
215 mov 8*2($a_ptr), $acc3
216 mov 8*3($a_ptr), $acc4
218 call __ecp_nistz256_mul_montq
220 pop %r15
221 pop %r14
222 pop %r13
223 pop %r12
224 pop %rbx
225 pop %rbp
227 .size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
229 .type __ecp_nistz256_mul_montq,\@abi-omnipotent
230 .align 32
231 __ecp_nistz256_mul_montq:
232 ########################################################################
233 # Multiply a by b[0]
234 mov %rax, $t1
235 mulq $acc1
236 mov .Lpoly+8*1(%rip),$poly1
237 mov %rax, $acc0
238 mov $t1, %rax
239 mov %rdx, $acc1
241 mulq $acc2
242 mov .Lpoly+8*3(%rip),$poly3
243 add %rax, $acc1
244 mov $t1, %rax
245 adc \$0, %rdx
246 mov %rdx, $acc2
248 mulq $acc3
249 add %rax, $acc2
250 mov $t1, %rax
251 adc \$0, %rdx
252 mov %rdx, $acc3
254 mulq $acc4
255 add %rax, $acc3
256 mov $acc0, %rax
257 adc \$0, %rdx
258 xor $acc5, $acc5
259 mov %rdx, $acc4
261 ########################################################################
262 # First reduction step
263 # Basically now we want to multiply acc[0] by p256,
264 # and add the result to the acc.
265 # Due to the special form of p256 we do some optimizations
267 # acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0]
268 # then we add acc[0] and get acc[0] x 2^96
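# In full: .Lpoly[3] x 2^192 = 2^256 - 2^224 + 2^192, so
# acc[0] + acc[0] x p256 = acc[0] x 2^96 + acc[0] x .Lpoly[3] x 2^192.
# The shl/shr pair below adds acc[0] x 2^96 into acc[1..2], the mulq adds
# acc[0] x .Lpoly[3] into acc[3..4] with the carry going into acc[5], and
# the least significant limb acc[0] drops out entirely.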
270 mov $acc0, $t1
271 shl \$32, $acc0
272 mulq $poly3
273 shr \$32, $t1
274 add $acc0, $acc1 # +=acc[0]<<96
275 adc $t1, $acc2
276 adc %rax, $acc3
277 mov 8*1($b_ptr), %rax
278 adc %rdx, $acc4
279 adc \$0, $acc5
280 xor $acc0, $acc0
282 ########################################################################
283 # Multiply by b[1]
284 mov %rax, $t1
285 mulq 8*0($a_ptr)
286 add %rax, $acc1
287 mov $t1, %rax
288 adc \$0, %rdx
289 mov %rdx, $t0
291 mulq 8*1($a_ptr)
292 add $t0, $acc2
293 adc \$0, %rdx
294 add %rax, $acc2
295 mov $t1, %rax
296 adc \$0, %rdx
297 mov %rdx, $t0
299 mulq 8*2($a_ptr)
300 add $t0, $acc3
301 adc \$0, %rdx
302 add %rax, $acc3
303 mov $t1, %rax
304 adc \$0, %rdx
305 mov %rdx, $t0
307 mulq 8*3($a_ptr)
308 add $t0, $acc4
309 adc \$0, %rdx
310 add %rax, $acc4
311 mov $acc1, %rax
312 adc %rdx, $acc5
313 adc \$0, $acc0
315 ########################################################################
316 # Second reduction step
317 mov $acc1, $t1
318 shl \$32, $acc1
319 mulq $poly3
320 shr \$32, $t1
321 add $acc1, $acc2
322 adc $t1, $acc3
323 adc %rax, $acc4
324 mov 8*2($b_ptr), %rax
325 adc %rdx, $acc5
326 adc \$0, $acc0
327 xor $acc1, $acc1
329 ########################################################################
330 # Multiply by b[2]
331 mov %rax, $t1
332 mulq 8*0($a_ptr)
333 add %rax, $acc2
334 mov $t1, %rax
335 adc \$0, %rdx
336 mov %rdx, $t0
338 mulq 8*1($a_ptr)
339 add $t0, $acc3
340 adc \$0, %rdx
341 add %rax, $acc3
342 mov $t1, %rax
343 adc \$0, %rdx
344 mov %rdx, $t0
346 mulq 8*2($a_ptr)
347 add $t0, $acc4
348 adc \$0, %rdx
349 add %rax, $acc4
350 mov $t1, %rax
351 adc \$0, %rdx
352 mov %rdx, $t0
354 mulq 8*3($a_ptr)
355 add $t0, $acc5
356 adc \$0, %rdx
357 add %rax, $acc5
358 mov $acc2, %rax
359 adc %rdx, $acc0
360 adc \$0, $acc1
362 ########################################################################
363 # Third reduction step
364 mov $acc2, $t1
365 shl \$32, $acc2
366 mulq $poly3
367 shr \$32, $t1
368 add $acc2, $acc3
369 adc $t1, $acc4
370 adc %rax, $acc5
371 mov 8*3($b_ptr), %rax
372 adc %rdx, $acc0
373 adc \$0, $acc1
374 xor $acc2, $acc2
376 ########################################################################
377 # Multiply by b[3]
378 mov %rax, $t1
379 mulq 8*0($a_ptr)
380 add %rax, $acc3
381 mov $t1, %rax
382 adc \$0, %rdx
383 mov %rdx, $t0
385 mulq 8*1($a_ptr)
386 add $t0, $acc4
387 adc \$0, %rdx
388 add %rax, $acc4
389 mov $t1, %rax
390 adc \$0, %rdx
391 mov %rdx, $t0
393 mulq 8*2($a_ptr)
394 add $t0, $acc5
395 adc \$0, %rdx
396 add %rax, $acc5
397 mov $t1, %rax
398 adc \$0, %rdx
399 mov %rdx, $t0
401 mulq 8*3($a_ptr)
402 add $t0, $acc0
403 adc \$0, %rdx
404 add %rax, $acc0
405 mov $acc3, %rax
406 adc %rdx, $acc1
407 adc \$0, $acc2
409 ########################################################################
410 # Final reduction step
411 mov $acc3, $t1
412 shl \$32, $acc3
413 mulq $poly3
414 shr \$32, $t1
415 add $acc3, $acc4
416 adc $t1, $acc5
417 mov $acc4, $t0
418 adc %rax, $acc0
419 adc %rdx, $acc1
420 mov $acc5, $t1
421 adc \$0, $acc2
423 ########################################################################
424 # Branch-less conditional subtraction of P
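# At this point the product sits in acc4, acc5, acc0 and acc1, with a
# possible extra bit in acc2. Subtract .Lpoly once; if the subtraction
# borrows, the cmovc chain below restores the unsubtracted limbs, so the
# choice is made without a data-dependent branch.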
425 sub \$-1, $acc4 # .Lpoly[0]
426 mov $acc0, $t2
427 sbb $poly1, $acc5 # .Lpoly[1]
428 sbb \$0, $acc0 # .Lpoly[2]
429 mov $acc1, $t3
430 sbb $poly3, $acc1 # .Lpoly[3]
431 sbb \$0, $acc2
433 cmovc $t0, $acc4
434 cmovc $t1, $acc5
435 mov $acc4, 8*0($r_ptr)
436 cmovc $t2, $acc0
437 mov $acc5, 8*1($r_ptr)
438 cmovc $t3, $acc1
439 mov $acc0, 8*2($r_ptr)
440 mov $acc1, 8*3($r_ptr)
443 .size __ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq
445 ################################################################################
446 # void ecp_nistz256_sqr_mont(
447 # uint64_t res[4],
448 # uint64_t a[4]);
450 # we optimize the square according to S.Gueron and V.Krasnov,
451 # "Speeding up Big-Number Squaring"
452 .globl ecp_nistz256_sqr_mont
453 .type ecp_nistz256_sqr_mont,\@function,2
454 .align 32
455 ecp_nistz256_sqr_mont:
456 push %rbp
457 push %rbx
458 push %r12
459 push %r13
460 push %r14
461 push %r15
463 mov 8*0($a_ptr), %rax
464 mov 8*1($a_ptr), $acc6
465 mov 8*2($a_ptr), $acc7
466 mov 8*3($a_ptr), $acc0
468 call __ecp_nistz256_sqr_montq
470 pop %r15
471 pop %r14
472 pop %r13
473 pop %r12
474 pop %rbx
475 pop %rbp
477 .size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
479 .type __ecp_nistz256_sqr_montq,\@abi-omnipotent
480 .align 32
481 __ecp_nistz256_sqr_montq:
482 mov %rax, $acc5
483 mulq $acc6 # a[1]*a[0]
484 mov %rax, $acc1
485 mov $acc7, %rax
486 mov %rdx, $acc2
488 mulq $acc5 # a[0]*a[2]
489 add %rax, $acc2
490 mov $acc0, %rax
491 adc \$0, %rdx
492 mov %rdx, $acc3
494 mulq $acc5 # a[0]*a[3]
495 add %rax, $acc3
496 mov $acc7, %rax
497 adc \$0, %rdx
498 mov %rdx, $acc4
500 #################################
501 mulq $acc6 # a[1]*a[2]
502 add %rax, $acc3
503 mov $acc0, %rax
504 adc \$0, %rdx
505 mov %rdx, $t1
507 mulq $acc6 # a[1]*a[3]
508 add %rax, $acc4
509 mov $acc0, %rax
510 adc \$0, %rdx
511 add $t1, $acc4
512 mov %rdx, $acc5
513 adc \$0, $acc5
515 #################################
516 mulq $acc7 # a[2]*a[3]
517 xor $acc7, $acc7
518 add %rax, $acc5
519 mov 8*0($a_ptr), %rax
520 mov %rdx, $acc6
521 adc \$0, $acc6
523 add $acc1, $acc1 # acc1:6<<1
524 adc $acc2, $acc2
525 adc $acc3, $acc3
526 adc $acc4, $acc4
527 adc $acc5, $acc5
528 adc $acc6, $acc6
529 adc \$0, $acc7
531 mulq %rax
532 mov %rax, $acc0
533 mov 8*1($a_ptr), %rax
534 mov %rdx, $t0
536 mulq %rax
537 add $t0, $acc1
538 adc %rax, $acc2
539 mov 8*2($a_ptr), %rax
540 adc \$0, %rdx
541 mov %rdx, $t0
543 mulq %rax
544 add $t0, $acc3
545 adc %rax, $acc4
546 mov 8*3($a_ptr), %rax
547 adc \$0, %rdx
548 mov %rdx, $t0
550 mulq %rax
551 add $t0, $acc5
552 adc %rax, $acc6
553 mov $acc0, %rax
554 adc %rdx, $acc7
556 mov .Lpoly+8*1(%rip), $a_ptr
557 mov .Lpoly+8*3(%rip), $t1
559 ##########################################
560 # Now the reduction
561 # First iteration
562 mov $acc0, $t0
563 shl \$32, $acc0
564 mulq $t1
565 shr \$32, $t0
566 add $acc0, $acc1 # +=acc[0]<<96
567 adc $t0, $acc2
568 adc %rax, $acc3
569 mov $acc1, %rax
570 adc \$0, %rdx
572 ##########################################
573 # Second iteration
574 mov $acc1, $t0
575 shl \$32, $acc1
576 mov %rdx, $acc0
577 mulq $t1
578 shr \$32, $t0
579 add $acc1, $acc2
580 adc $t0, $acc3
581 adc %rax, $acc0
582 mov $acc2, %rax
583 adc \$0, %rdx
585 ##########################################
586 # Third iteration
587 mov $acc2, $t0
588 shl \$32, $acc2
589 mov %rdx, $acc1
590 mulq $t1
591 shr \$32, $t0
592 add $acc2, $acc3
593 adc $t0, $acc0
594 adc %rax, $acc1
595 mov $acc3, %rax
596 adc \$0, %rdx
598 ###########################################
599 # Last iteration
600 mov $acc3, $t0
601 shl \$32, $acc3
602 mov %rdx, $acc2
603 mulq $t1
604 shr \$32, $t0
605 add $acc3, $acc0
606 adc $t0, $acc1
607 adc %rax, $acc2
608 adc \$0, %rdx
609 xor $acc3, $acc3
611 ############################################
612 # Add the rest of the acc
613 add $acc0, $acc4
614 adc $acc1, $acc5
615 mov $acc4, $acc0
616 adc $acc2, $acc6
617 adc %rdx, $acc7
618 mov $acc5, $acc1
619 adc \$0, $acc3
621 sub \$-1, $acc4 # .Lpoly[0]
622 mov $acc6, $acc2
623 sbb $a_ptr, $acc5 # .Lpoly[1]
624 sbb \$0, $acc6 # .Lpoly[2]
625 mov $acc7, $t0
626 sbb $t1, $acc7 # .Lpoly[3]
627 sbb \$0, $acc3
629 cmovc $acc0, $acc4
630 cmovc $acc1, $acc5
631 mov $acc4, 8*0($r_ptr)
632 cmovc $acc2, $acc6
633 mov $acc5, 8*1($r_ptr)
634 cmovc $t0, $acc7
635 mov $acc6, 8*2($r_ptr)
636 mov $acc7, 8*3($r_ptr)
639 .size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq
644 my ($r_ptr,$in_ptr)=("%rdi","%rsi");
645 my ($acc0,$acc1,$acc2,$acc3)=map("%r$_",(8..11));
646 my ($t0,$t1,$t2)=("%rcx","%r12","%r13");
648 $code.=<<___;
649 ################################################################################
650 # void ecp_nistz256_from_mont(
651 # uint64_t res[4],
652 # uint64_t in[4]);
653 # This one performs Montgomery multiplication by 1, so we only need the reduction
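# i.e. from_mont(x) = x * 2^-256 mod p: four iterations of the word-by-word
# reduction used in mul_mont above fold the least significant limb into the
# upper limbs, followed by one conditional subtraction of .Lpoly.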
655 .globl ecp_nistz256_from_mont
656 .type ecp_nistz256_from_mont,\@function,2
657 .align 32
658 ecp_nistz256_from_mont:
659 push %r12
660 push %r13
662 mov 8*0($in_ptr), %rax
663 mov .Lpoly+8*3(%rip), $t2
664 mov 8*1($in_ptr), $acc1
665 mov 8*2($in_ptr), $acc2
666 mov 8*3($in_ptr), $acc3
667 mov %rax, $acc0
668 mov .Lpoly+8*1(%rip), $t1
670 #########################################
671 # First iteration
672 mov %rax, $t0
673 shl \$32, $acc0
674 mulq $t2
675 shr \$32, $t0
676 add $acc0, $acc1
677 adc $t0, $acc2
678 adc %rax, $acc3
679 mov $acc1, %rax
680 adc \$0, %rdx
682 #########################################
683 # Second iteration
684 mov $acc1, $t0
685 shl \$32, $acc1
686 mov %rdx, $acc0
687 mulq $t2
688 shr \$32, $t0
689 add $acc1, $acc2
690 adc $t0, $acc3
691 adc %rax, $acc0
692 mov $acc2, %rax
693 adc \$0, %rdx
695 ##########################################
696 # Third iteration
697 mov $acc2, $t0
698 shl \$32, $acc2
699 mov %rdx, $acc1
700 mulq $t2
701 shr \$32, $t0
702 add $acc2, $acc3
703 adc $t0, $acc0
704 adc %rax, $acc1
705 mov $acc3, %rax
706 adc \$0, %rdx
708 ###########################################
709 # Last iteration
710 mov $acc3, $t0
711 shl \$32, $acc3
712 mov %rdx, $acc2
713 mulq $t2
714 shr \$32, $t0
715 add $acc3, $acc0
716 adc $t0, $acc1
717 mov $acc0, $t0
718 adc %rax, $acc2
719 mov $acc1, $in_ptr
720 adc \$0, %rdx
722 ###########################################
723 # Branch-less conditional subtraction
724 sub \$-1, $acc0
725 mov $acc2, %rax
726 sbb $t1, $acc1
727 sbb \$0, $acc2
728 mov %rdx, $acc3
729 sbb $t2, %rdx
730 sbb $t2, $t2
732 cmovnz $t0, $acc0
733 cmovnz $in_ptr, $acc1
734 mov $acc0, 8*0($r_ptr)
735 cmovnz %rax, $acc2
736 mov $acc1, 8*1($r_ptr)
737 cmovz %rdx, $acc3
738 mov $acc2, 8*2($r_ptr)
739 mov $acc3, 8*3($r_ptr)
741 pop %r13
742 pop %r12
744 .size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
748 my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
749 my ($ONE,$INDEX,$Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("%xmm$_",(0..7));
750 my ($M0,$T0a,$T0b,$T0c,$T0d,$T0e,$T0f,$TMP0)=map("%xmm$_",(8..15));
751 my ($M1,$T2a,$T2b,$TMP2,$M2,$T2a,$T2b,$TMP2)=map("%xmm$_",(8..15));
753 $code.=<<___;
754 ################################################################################
755 # void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
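# Constant-time table lookup: all 16 table entries (96 bytes each, one
# projective point) are read on every call. A counter seeded from .LOne is
# compared against the broadcast index with pcmpeqd to form an all-ones or
# all-zero mask, each entry is AND-ed with that mask and OR-ed into the
# accumulator, so the memory access pattern is independent of the index.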
756 .globl ecp_nistz256_select_w5
757 .type ecp_nistz256_select_w5,\@abi-omnipotent
758 .align 32
759 ecp_nistz256_select_w5:
761 $code.=<<___ if ($win64);
762 lea -0x88(%rsp), %rax
763 .LSEH_begin_ecp_nistz256_select_w5:
764 .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
765 .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax)
766 .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax)
767 .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8, 0(%rax)
768 .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 0x10(%rax)
769 .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax)
770 .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax)
771 .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax)
772 .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax)
773 .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax)
774 .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax)
776 $code.=<<___;
777 movdqa .LOne(%rip), $ONE
778 movd $index, $INDEX
780 pxor $Ra, $Ra
781 pxor $Rb, $Rb
782 pxor $Rc, $Rc
783 pxor $Rd, $Rd
784 pxor $Re, $Re
785 pxor $Rf, $Rf
787 movdqa $ONE, $M0
788 pshufd \$0, $INDEX, $INDEX
790 mov \$16, %rax
791 .Lselect_loop_sse_w5:
793 movdqa $M0, $TMP0
794 paddd $ONE, $M0
795 pcmpeqd $INDEX, $TMP0
797 movdqa 16*0($in_t), $T0a
798 movdqa 16*1($in_t), $T0b
799 movdqa 16*2($in_t), $T0c
800 movdqa 16*3($in_t), $T0d
801 movdqa 16*4($in_t), $T0e
802 movdqa 16*5($in_t), $T0f
803 lea 16*6($in_t), $in_t
805 pand $TMP0, $T0a
806 pand $TMP0, $T0b
807 por $T0a, $Ra
808 pand $TMP0, $T0c
809 por $T0b, $Rb
810 pand $TMP0, $T0d
811 por $T0c, $Rc
812 pand $TMP0, $T0e
813 por $T0d, $Rd
814 pand $TMP0, $T0f
815 por $T0e, $Re
816 por $T0f, $Rf
818 dec %rax
819 jnz .Lselect_loop_sse_w5
821 movdqu $Ra, 16*0($val)
822 movdqu $Rb, 16*1($val)
823 movdqu $Rc, 16*2($val)
824 movdqu $Rd, 16*3($val)
825 movdqu $Re, 16*4($val)
826 movdqu $Rf, 16*5($val)
828 $code.=<<___ if ($win64);
829 movaps (%rsp), %xmm6
830 movaps 0x10(%rsp), %xmm7
831 movaps 0x20(%rsp), %xmm8
832 movaps 0x30(%rsp), %xmm9
833 movaps 0x40(%rsp), %xmm10
834 movaps 0x50(%rsp), %xmm11
835 movaps 0x60(%rsp), %xmm12
836 movaps 0x70(%rsp), %xmm13
837 movaps 0x80(%rsp), %xmm14
838 movaps 0x90(%rsp), %xmm15
839 lea 0xa8(%rsp), %rsp
840 .LSEH_end_ecp_nistz256_select_w5:
842 $code.=<<___;
844 .size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5
846 ################################################################################
847 # void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
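# Same constant-time scan as select_w5 above, but over 64 affine entries of
# 64 bytes each (x and y only), so the loop runs 64 times.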
848 .globl ecp_nistz256_select_w7
849 .type ecp_nistz256_select_w7,\@abi-omnipotent
850 .align 32
851 ecp_nistz256_select_w7:
853 $code.=<<___ if ($win64);
854 lea -0x88(%rsp), %rax
855 .LSEH_begin_ecp_nistz256_select_w7:
856 .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
857 .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax)
858 .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax)
859 .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8, 0(%rax)
860 .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 0x10(%rax)
861 .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax)
862 .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax)
863 .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax)
864 .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax)
865 .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax)
866 .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax)
868 $code.=<<___;
869 movdqa .LOne(%rip), $M0
870 movd $index, $INDEX
872 pxor $Ra, $Ra
873 pxor $Rb, $Rb
874 pxor $Rc, $Rc
875 pxor $Rd, $Rd
877 movdqa $M0, $ONE
878 pshufd \$0, $INDEX, $INDEX
879 mov \$64, %rax
881 .Lselect_loop_sse_w7:
882 movdqa $M0, $TMP0
883 paddd $ONE, $M0
884 movdqa 16*0($in_t), $T0a
885 movdqa 16*1($in_t), $T0b
886 pcmpeqd $INDEX, $TMP0
887 movdqa 16*2($in_t), $T0c
888 movdqa 16*3($in_t), $T0d
889 lea 16*4($in_t), $in_t
891 pand $TMP0, $T0a
892 pand $TMP0, $T0b
893 por $T0a, $Ra
894 pand $TMP0, $T0c
895 por $T0b, $Rb
896 pand $TMP0, $T0d
897 por $T0c, $Rc
898 prefetcht0 255($in_t)
899 por $T0d, $Rd
901 dec %rax
902 jnz .Lselect_loop_sse_w7
904 movdqu $Ra, 16*0($val)
905 movdqu $Rb, 16*1($val)
906 movdqu $Rc, 16*2($val)
907 movdqu $Rd, 16*3($val)
909 $code.=<<___ if ($win64);
910 movaps (%rsp), %xmm6
911 movaps 0x10(%rsp), %xmm7
912 movaps 0x20(%rsp), %xmm8
913 movaps 0x30(%rsp), %xmm9
914 movaps 0x40(%rsp), %xmm10
915 movaps 0x50(%rsp), %xmm11
916 movaps 0x60(%rsp), %xmm12
917 movaps 0x70(%rsp), %xmm13
918 movaps 0x80(%rsp), %xmm14
919 movaps 0x90(%rsp), %xmm15
920 lea 0xa8(%rsp), %rsp
921 .LSEH_end_ecp_nistz256_select_w7:
923 $code.=<<___;
925 .size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
929 ########################################################################
930 # This block implements the higher-level point_double, point_add and
931 # point_add_affine operations. The key to performance here is to allow
932 # the out-of-order execution logic to overlap computations from the next
933 # step with tail processing from the current step. By using a tailored
934 # calling sequence we minimize inter-step overhead and give the processor
935 # a better shot at overlapping operations...
937 # You will notice that the input data is copied to the stack. The trouble
938 # is that there are no registers to spare for holding the original pointers,
939 # and reloading them would create undesired dependencies on the effective
940 # address calculation paths. In other words, it is all done to favour the
941 # out-of-order execution logic.
942 # <appro@openssl.org>
944 my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
945 my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
946 my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rbp","%rcx",$acc4,$acc4);
947 my ($poly1,$poly3)=($acc6,$acc7);
949 sub load_for_mul () {
950 my ($a,$b,$src0) = @_;
951 my $bias = $src0 eq "%rax" ? 0 : -128;
953 " mov $b, $src0
954 lea $b, $b_ptr
955 mov 8*0+$a, $acc1
956 mov 8*1+$a, $acc2
957 lea $bias+$a, $a_ptr
958 mov 8*2+$a, $acc3
959 mov 8*3+$a, $acc4"
962 sub load_for_sqr () {
963 my ($a,$src0) = @_;
964 my $bias = $src0 eq "%rax" ? 0 : -128;
966 " mov 8*0+$a, $src0
967 mov 8*1+$a, $acc6
968 lea $bias+$a, $a_ptr
969 mov 8*2+$a, $acc7
970 mov 8*3+$a, $acc0"
974 ########################################################################
975 # operate in 4-5-0-1 "name space" that matches multiplication output
977 my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
979 $code.=<<___;
980 .type __ecp_nistz256_add_toq,\@abi-omnipotent
981 .align 32
982 __ecp_nistz256_add_toq:
983 add 8*0($b_ptr), $a0
984 adc 8*1($b_ptr), $a1
985 mov $a0, $t0
986 adc 8*2($b_ptr), $a2
987 adc 8*3($b_ptr), $a3
988 mov $a1, $t1
989 sbb $t4, $t4
991 sub \$-1, $a0
992 mov $a2, $t2
993 sbb $poly1, $a1
994 sbb \$0, $a2
995 mov $a3, $t3
996 sbb $poly3, $a3
997 test $t4, $t4
999 cmovz $t0, $a0
1000 cmovz $t1, $a1
1001 mov $a0, 8*0($r_ptr)
1002 cmovz $t2, $a2
1003 mov $a1, 8*1($r_ptr)
1004 cmovz $t3, $a3
1005 mov $a2, 8*2($r_ptr)
1006 mov $a3, 8*3($r_ptr)
1009 .size __ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq
1011 .type __ecp_nistz256_sub_fromq,\@abi-omnipotent
1012 .align 32
1013 __ecp_nistz256_sub_fromq:
1014 sub 8*0($b_ptr), $a0
1015 sbb 8*1($b_ptr), $a1
1016 mov $a0, $t0
1017 sbb 8*2($b_ptr), $a2
1018 sbb 8*3($b_ptr), $a3
1019 mov $a1, $t1
1020 sbb $t4, $t4
1022 add \$-1, $a0
1023 mov $a2, $t2
1024 adc $poly1, $a1
1025 adc \$0, $a2
1026 mov $a3, $t3
1027 adc $poly3, $a3
1028 test $t4, $t4
1030 cmovz $t0, $a0
1031 cmovz $t1, $a1
1032 mov $a0, 8*0($r_ptr)
1033 cmovz $t2, $a2
1034 mov $a1, 8*1($r_ptr)
1035 cmovz $t3, $a3
1036 mov $a2, 8*2($r_ptr)
1037 mov $a3, 8*3($r_ptr)
1040 .size __ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq
1042 .type __ecp_nistz256_subq,\@abi-omnipotent
1043 .align 32
1044 __ecp_nistz256_subq:
1045 sub $a0, $t0
1046 sbb $a1, $t1
1047 mov $t0, $a0
1048 sbb $a2, $t2
1049 sbb $a3, $t3
1050 mov $t1, $a1
1051 sbb $t4, $t4
1053 add \$-1, $t0
1054 mov $t2, $a2
1055 adc $poly1, $t1
1056 adc \$0, $t2
1057 mov $t3, $a3
1058 adc $poly3, $t3
1059 test $t4, $t4
1061 cmovnz $t0, $a0
1062 cmovnz $t1, $a1
1063 cmovnz $t2, $a2
1064 cmovnz $t3, $a3
1067 .size __ecp_nistz256_subq,.-__ecp_nistz256_subq
1069 .type __ecp_nistz256_mul_by_2q,\@abi-omnipotent
1070 .align 32
1071 __ecp_nistz256_mul_by_2q:
1072 add $a0, $a0 # a0:a3+a0:a3
1073 adc $a1, $a1
1074 mov $a0, $t0
1075 adc $a2, $a2
1076 adc $a3, $a3
1077 mov $a1, $t1
1078 sbb $t4, $t4
1080 sub \$-1, $a0
1081 mov $a2, $t2
1082 sbb $poly1, $a1
1083 sbb \$0, $a2
1084 mov $a3, $t3
1085 sbb $poly3, $a3
1086 test $t4, $t4
1088 cmovz $t0, $a0
1089 cmovz $t1, $a1
1090 mov $a0, 8*0($r_ptr)
1091 cmovz $t2, $a2
1092 mov $a1, 8*1($r_ptr)
1093 cmovz $t3, $a3
1094 mov $a2, 8*2($r_ptr)
1095 mov $a3, 8*3($r_ptr)
1098 .size __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q
1101 sub gen_double () {
1102 my $x = shift;
1103 my ($src0,$sfx,$bias);
1104 my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));
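# The sequence below is the usual Jacobian doubling; with S = 4*X*Y^2 and
# M = 3*(X - Z^2)*(X + Z^2) it computes
#   X3 = M^2 - 2*S
#   Y3 = M*(S - X3) - 8*Y^4
#   Z3 = 2*Y*Z
# with every multiplication carried out in Montgomery form.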
1106 if ($x ne "x") {
1107 $src0 = "%rax";
1108 $sfx = "";
1109 $bias = 0;
1111 $code.=<<___;
1112 .globl ecp_nistz256_point_double
1113 .type ecp_nistz256_point_double,\@function,2
1114 .align 32
1115 ecp_nistz256_point_double:
1117 } else {
1118 $src0 = "%rdx";
1119 $sfx = "x";
1120 $bias = 128;
1122 $code.=<<___;
1123 .type ecp_nistz256_point_doublex,\@function,2
1124 .align 32
1125 ecp_nistz256_point_doublex:
1126 .Lpoint_doublex:
1129 $code.=<<___;
1130 push %rbp
1131 push %rbx
1132 push %r12
1133 push %r13
1134 push %r14
1135 push %r15
1136 sub \$32*5+8, %rsp
1138 .Lpoint_double_shortcut$x:
1139 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr.x
1140 mov $a_ptr, $b_ptr # backup copy
1141 movdqu 0x10($a_ptr), %xmm1
1142 mov 0x20+8*0($a_ptr), $acc4 # load in_y in "5-4-0-1" order
1143 mov 0x20+8*1($a_ptr), $acc5
1144 mov 0x20+8*2($a_ptr), $acc0
1145 mov 0x20+8*3($a_ptr), $acc1
1146 mov .Lpoly+8*1(%rip), $poly1
1147 mov .Lpoly+8*3(%rip), $poly3
1148 movdqa %xmm0, $in_x(%rsp)
1149 movdqa %xmm1, $in_x+0x10(%rsp)
1150 lea 0x20($r_ptr), $acc2
1151 lea 0x40($r_ptr), $acc3
1152 movq $r_ptr, %xmm0
1153 movq $acc2, %xmm1
1154 movq $acc3, %xmm2
1156 lea $S(%rsp), $r_ptr
1157 call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(S, in_y);
1159 mov 0x40+8*0($a_ptr), $src0
1160 mov 0x40+8*1($a_ptr), $acc6
1161 mov 0x40+8*2($a_ptr), $acc7
1162 mov 0x40+8*3($a_ptr), $acc0
1163 lea 0x40-$bias($a_ptr), $a_ptr
1164 lea $Zsqr(%rsp), $r_ptr
1165 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Zsqr, in_z);
1167 `&load_for_sqr("$S(%rsp)", "$src0")`
1168 lea $S(%rsp), $r_ptr
1169 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(S, S);
1171 mov 0x20($b_ptr), $src0 # $b_ptr is still valid
1172 mov 0x40+8*0($b_ptr), $acc1
1173 mov 0x40+8*1($b_ptr), $acc2
1174 mov 0x40+8*2($b_ptr), $acc3
1175 mov 0x40+8*3($b_ptr), $acc4
1176 lea 0x40-$bias($b_ptr), $a_ptr
1177 lea 0x20($b_ptr), $b_ptr
1178 movq %xmm2, $r_ptr
1179 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, in_z, in_y);
1180 call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(res_z, res_z);
1182 mov $in_x+8*0(%rsp), $acc4 # "5-4-0-1" order
1183 mov $in_x+8*1(%rsp), $acc5
1184 lea $Zsqr(%rsp), $b_ptr
1185 mov $in_x+8*2(%rsp), $acc0
1186 mov $in_x+8*3(%rsp), $acc1
1187 lea $M(%rsp), $r_ptr
1188 call __ecp_nistz256_add_to$x # p256_add(M, in_x, Zsqr);
1190 mov $in_x+8*0(%rsp), $acc4 # "5-4-0-1" order
1191 mov $in_x+8*1(%rsp), $acc5
1192 lea $Zsqr(%rsp), $b_ptr
1193 mov $in_x+8*2(%rsp), $acc0
1194 mov $in_x+8*3(%rsp), $acc1
1195 lea $Zsqr(%rsp), $r_ptr
1196 call __ecp_nistz256_sub_from$x # p256_sub(Zsqr, in_x, Zsqr);
1198 `&load_for_sqr("$S(%rsp)", "$src0")`
1199 movq %xmm1, $r_ptr
1200 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_y, S);
1203 ######## ecp_nistz256_div_by_2(res_y, res_y); ##########################
1204 # operate in 4-5-6-7 "name space" that matches squaring output
1206 my ($poly1,$poly3)=($a_ptr,$t1);
1207 my ($a0,$a1,$a2,$a3,$t3,$t4,$t1)=($acc4,$acc5,$acc6,$acc7,$acc0,$acc1,$acc2);
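# Division by 2 in GF(p): if a is even, a/2 is simply a>>1; if a is odd,
# a/2 = (a + p)>>1, where the carry out of a + p supplies bit 255 after the
# shift. The code below adds .Lpoly unconditionally, keeps the sum only when
# the low bit of a is set (the cmovz chain), and then performs a four-limb
# right shift, pulling the saved carry into the top bit.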
1209 $code.=<<___;
1210 xor $t4, $t4
1211 mov $a0, $t0
1212 add \$-1, $a0
1213 mov $a1, $t1
1214 adc $poly1, $a1
1215 mov $a2, $t2
1216 adc \$0, $a2
1217 mov $a3, $t3
1218 adc $poly3, $a3
1219 adc \$0, $t4
1220 xor $a_ptr, $a_ptr # borrow $a_ptr
1221 test \$1, $t0
1223 cmovz $t0, $a0
1224 cmovz $t1, $a1
1225 cmovz $t2, $a2
1226 cmovz $t3, $a3
1227 cmovz $a_ptr, $t4
1229 mov $a1, $t0 # a0:a3>>1
1230 shr \$1, $a0
1231 shl \$63, $t0
1232 mov $a2, $t1
1233 shr \$1, $a1
1234 or $t0, $a0
1235 shl \$63, $t1
1236 mov $a3, $t2
1237 shr \$1, $a2
1238 or $t1, $a1
1239 shl \$63, $t2
1240 mov $a0, 8*0($r_ptr)
1241 shr \$1, $a3
1242 mov $a1, 8*1($r_ptr)
1243 shl \$63, $t4
1244 or $t2, $a2
1245 or $t4, $a3
1246 mov $a2, 8*2($r_ptr)
1247 mov $a3, 8*3($r_ptr)
1250 $code.=<<___;
1251 `&load_for_mul("$M(%rsp)", "$Zsqr(%rsp)", "$src0")`
1252 lea $M(%rsp), $r_ptr
1253 call __ecp_nistz256_mul_mont$x # p256_mul_mont(M, M, Zsqr);
1255 lea $tmp0(%rsp), $r_ptr
1256 call __ecp_nistz256_mul_by_2$x
1258 lea $M(%rsp), $b_ptr
1259 lea $M(%rsp), $r_ptr
1260 call __ecp_nistz256_add_to$x # p256_mul_by_3(M, M);
1262 `&load_for_mul("$S(%rsp)", "$in_x(%rsp)", "$src0")`
1263 lea $S(%rsp), $r_ptr
1264 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, in_x);
1266 lea $tmp0(%rsp), $r_ptr
1267 call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(tmp0, S);
1269 `&load_for_sqr("$M(%rsp)", "$src0")`
1270 movq %xmm0, $r_ptr
1271 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_x, M);
1273 lea $tmp0(%rsp), $b_ptr
1274 mov $acc6, $acc0 # harmonize sqr output and sub input
1275 mov $acc7, $acc1
1276 mov $a_ptr, $poly1
1277 mov $t1, $poly3
1278 call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, tmp0);
1280 mov $S+8*0(%rsp), $t0
1281 mov $S+8*1(%rsp), $t1
1282 mov $S+8*2(%rsp), $t2
1283 mov $S+8*3(%rsp), $acc2 # "4-5-0-1" order
1284 lea $S(%rsp), $r_ptr
1285 call __ecp_nistz256_sub$x # p256_sub(S, S, res_x);
1287 mov $M(%rsp), $src0
1288 lea $M(%rsp), $b_ptr
1289 mov $acc4, $acc6 # harmonize sub output and mul input
1290 xor %ecx, %ecx
1291 mov $acc4, $S+8*0(%rsp) # have to save:-(
1292 mov $acc5, $acc2
1293 mov $acc5, $S+8*1(%rsp)
1294 cmovz $acc0, $acc3
1295 mov $acc0, $S+8*2(%rsp)
1296 lea $S-$bias(%rsp), $a_ptr
1297 cmovz $acc1, $acc4
1298 mov $acc1, $S+8*3(%rsp)
1299 mov $acc6, $acc1
1300 lea $S(%rsp), $r_ptr
1301 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, M);
1303 movq %xmm1, $b_ptr
1304 movq %xmm1, $r_ptr
1305 call __ecp_nistz256_sub_from$x # p256_sub(res_y, S, res_y);
1307 add \$32*5+8, %rsp
1308 pop %r15
1309 pop %r14
1310 pop %r13
1311 pop %r12
1312 pop %rbx
1313 pop %rbp
1315 .size ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx
1318 &gen_double("q");
1320 sub gen_add () {
1321 my $x = shift;
1322 my ($src0,$sfx,$bias);
1323 my ($H,$Hsqr,$R,$Rsqr,$Hcub,
1324 $U1,$U2,$S1,$S2,
1325 $res_x,$res_y,$res_z,
1326 $in1_x,$in1_y,$in1_z,
1327 $in2_x,$in2_y,$in2_z)=map(32*$_,(0..17));
1328 my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
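# The sequence below is the usual Jacobian point addition; with
#   U1 = X1*Z2^2, U2 = X2*Z1^2, S1 = Y1*Z2^3, S2 = Y2*Z1^3,
#   H = U2 - U1, R = S2 - S1
# it computes
#   X3 = R^2 - H^3 - 2*U1*H^2
#   Y3 = R*(U1*H^2 - X3) - S1*H^3
#   Z3 = H*Z1*Z2
# falling back to point_double when the two inputs represent the same point,
# and handling inputs at infinity via the in1infty/in2infty masks.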
1330 if ($x ne "x") {
1331 $src0 = "%rax";
1332 $sfx = "";
1333 $bias = 0;
1335 $code.=<<___;
1336 .globl ecp_nistz256_point_add
1337 .type ecp_nistz256_point_add,\@function,3
1338 .align 32
1339 ecp_nistz256_point_add:
1341 } else {
1342 $src0 = "%rdx";
1343 $sfx = "x";
1344 $bias = 128;
1346 $code.=<<___;
1347 push %rbp
1348 push %rbx
1349 push %r12
1350 push %r13
1351 push %r14
1352 push %r15
1353 sub \$32*18+8, %rsp
1355 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr
1356 movdqu 0x10($a_ptr), %xmm1
1357 movdqu 0x20($a_ptr), %xmm2
1358 movdqu 0x30($a_ptr), %xmm3
1359 movdqu 0x40($a_ptr), %xmm4
1360 movdqu 0x50($a_ptr), %xmm5
1361 mov $a_ptr, $b_ptr # reassign
1362 mov $b_org, $a_ptr # reassign
1363 movdqa %xmm0, $in1_x(%rsp)
1364 movdqa %xmm1, $in1_x+0x10(%rsp)
1365 por %xmm0, %xmm1
1366 movdqa %xmm2, $in1_y(%rsp)
1367 movdqa %xmm3, $in1_y+0x10(%rsp)
1368 por %xmm2, %xmm3
1369 movdqa %xmm4, $in1_z(%rsp)
1370 movdqa %xmm5, $in1_z+0x10(%rsp)
1371 por %xmm1, %xmm3
1373 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$b_ptr
1374 pshufd \$0xb1, %xmm3, %xmm5
1375 movdqu 0x10($a_ptr), %xmm1
1376 movdqu 0x20($a_ptr), %xmm2
1377 por %xmm3, %xmm5
1378 movdqu 0x30($a_ptr), %xmm3
1379 mov 0x40+8*0($a_ptr), $src0 # load original in2_z
1380 mov 0x40+8*1($a_ptr), $acc6
1381 mov 0x40+8*2($a_ptr), $acc7
1382 mov 0x40+8*3($a_ptr), $acc0
1383 movdqa %xmm0, $in2_x(%rsp)
1384 pshufd \$0x1e, %xmm5, %xmm4
1385 movdqa %xmm1, $in2_x+0x10(%rsp)
1386 por %xmm0, %xmm1
1387 movq $r_ptr, %xmm0 # save $r_ptr
1388 movdqa %xmm2, $in2_y(%rsp)
1389 movdqa %xmm3, $in2_y+0x10(%rsp)
1390 por %xmm2, %xmm3
1391 por %xmm4, %xmm5
1392 pxor %xmm4, %xmm4
1393 por %xmm1, %xmm3
1395 lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid
1396 mov $src0, $in2_z+8*0(%rsp) # make in2_z copy
1397 mov $acc6, $in2_z+8*1(%rsp)
1398 mov $acc7, $in2_z+8*2(%rsp)
1399 mov $acc0, $in2_z+8*3(%rsp)
1400 lea $Z2sqr(%rsp), $r_ptr # Z2^2
1401 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z2sqr, in2_z);
1403 pcmpeqd %xmm4, %xmm5
1404 pshufd \$0xb1, %xmm3, %xmm4
1405 por %xmm3, %xmm4
1406 pshufd \$0, %xmm5, %xmm5 # in1infty
1407 pshufd \$0x1e, %xmm4, %xmm3
1408 por %xmm3, %xmm4
1409 pxor %xmm3, %xmm3
1410 pcmpeqd %xmm3, %xmm4
1411 pshufd \$0, %xmm4, %xmm4 # in2infty
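# in1infty (%xmm5) and in2infty (%xmm4) are now broadcast masks that are
# all-ones when the corresponding input has all-zero x and y words, which is
# how this code encodes the point at infinity; they later drive the
# pand/pandn copy_conditional selections of the result.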
1412 mov 0x40+8*0($b_ptr), $src0 # load original in1_z
1413 mov 0x40+8*1($b_ptr), $acc6
1414 mov 0x40+8*2($b_ptr), $acc7
1415 mov 0x40+8*3($b_ptr), $acc0
1416 movq $b_ptr, %xmm1
1418 lea 0x40-$bias($b_ptr), $a_ptr
1419 lea $Z1sqr(%rsp), $r_ptr # Z1^2
1420 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z);
1422 `&load_for_mul("$Z2sqr(%rsp)", "$in2_z(%rsp)", "$src0")`
1423 lea $S1(%rsp), $r_ptr # S1 = Z2^3
1424 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, Z2sqr, in2_z);
1426 `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
1427 lea $S2(%rsp), $r_ptr # S2 = Z1^3
1428 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z);
1430 `&load_for_mul("$S1(%rsp)", "$in1_y(%rsp)", "$src0")`
1431 lea $S1(%rsp), $r_ptr # S1 = Y1*Z2^3
1432 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, S1, in1_y);
1434 `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
1435 lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3
1436 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y);
1438 lea $S1(%rsp), $b_ptr
1439 lea $R(%rsp), $r_ptr # R = S2 - S1
1440 call __ecp_nistz256_sub_from$x # p256_sub(R, S2, S1);
1442 or $acc5, $acc4 # see if result is zero
1443 movdqa %xmm4, %xmm2
1444 or $acc0, $acc4
1445 or $acc1, $acc4
1446 por %xmm5, %xmm2 # in1infty || in2infty
1447 movq $acc4, %xmm3
1449 `&load_for_mul("$Z2sqr(%rsp)", "$in1_x(%rsp)", "$src0")`
1450 lea $U1(%rsp), $r_ptr # U1 = X1*Z2^2
1451 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U1, in1_x, Z2sqr);
1453 `&load_for_mul("$Z1sqr(%rsp)", "$in2_x(%rsp)", "$src0")`
1454 lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2
1455 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in2_x, Z1sqr);
1457 lea $U1(%rsp), $b_ptr
1458 lea $H(%rsp), $r_ptr # H = U2 - U1
1459 call __ecp_nistz256_sub_from$x # p256_sub(H, U2, U1);
1461 or $acc5, $acc4 # see if result is zero
1462 or $acc0, $acc4
1463 or $acc1, $acc4
1465 .byte 0x3e # predict taken
1466 jnz .Ladd_proceed$x # is_equal(U1,U2)?
1467 movq %xmm2, $acc0
1468 movq %xmm3, $acc1
1469 test $acc0, $acc0
1470 jnz .Ladd_proceed$x # (in1infty || in2infty)?
1471 test $acc1, $acc1
1472 jz .Ladd_double$x # is_equal(S1,S2)?
1474 movq %xmm0, $r_ptr # restore $r_ptr
1475 pxor %xmm0, %xmm0
1476 movdqu %xmm0, 0x00($r_ptr)
1477 movdqu %xmm0, 0x10($r_ptr)
1478 movdqu %xmm0, 0x20($r_ptr)
1479 movdqu %xmm0, 0x30($r_ptr)
1480 movdqu %xmm0, 0x40($r_ptr)
1481 movdqu %xmm0, 0x50($r_ptr)
1482 jmp .Ladd_done$x
1484 .align 32
1485 .Ladd_double$x:
1486 movq %xmm1, $a_ptr # restore $a_ptr
1487 movq %xmm0, $r_ptr # restore $r_ptr
1488 add \$`32*(18-5)`, %rsp # difference in frame sizes
1489 jmp .Lpoint_double_shortcut$x
1491 .align 32
1492 .Ladd_proceed$x:
1493 `&load_for_sqr("$R(%rsp)", "$src0")`
1494 lea $Rsqr(%rsp), $r_ptr # R^2
1495 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R);
1497 `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
1498 lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2
1499 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z);
1501 `&load_for_sqr("$H(%rsp)", "$src0")`
1502 lea $Hsqr(%rsp), $r_ptr # H^2
1503 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H);
1505 `&load_for_mul("$res_z(%rsp)", "$in2_z(%rsp)", "$src0")`
1506 lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2
1507 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, res_z, in2_z);
1509 `&load_for_mul("$Hsqr(%rsp)", "$H(%rsp)", "$src0")`
1510 lea $Hcub(%rsp), $r_ptr # H^3
1511 call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H);
1513 `&load_for_mul("$Hsqr(%rsp)", "$U1(%rsp)", "$src0")`
1514 lea $U2(%rsp), $r_ptr # U1*H^2
1515 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, U1, Hsqr);
1518 #######################################################################
1519 # operate in 4-5-0-1 "name space" that matches multiplication output
1521 my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
1522 my ($poly1, $poly3)=($acc6,$acc7);
1524 $code.=<<___;
1525 #lea $U2(%rsp), $a_ptr
1526 #lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2
1527 #call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2);
1529 add $acc0, $acc0 # a0:a3+a0:a3
1530 lea $Rsqr(%rsp), $a_ptr
1531 adc $acc1, $acc1
1532 mov $acc0, $t0
1533 adc $acc2, $acc2
1534 adc $acc3, $acc3
1535 mov $acc1, $t1
1536 sbb $t4, $t4
1538 sub \$-1, $acc0
1539 mov $acc2, $t2
1540 sbb $poly1, $acc1
1541 sbb \$0, $acc2
1542 mov $acc3, $t3
1543 sbb $poly3, $acc3
1544 test $t4, $t4
1546 cmovz $t0, $acc0
1547 mov 8*0($a_ptr), $t0
1548 cmovz $t1, $acc1
1549 mov 8*1($a_ptr), $t1
1550 cmovz $t2, $acc2
1551 mov 8*2($a_ptr), $t2
1552 cmovz $t3, $acc3
1553 mov 8*3($a_ptr), $t3
1555 call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr);
1557 lea $Hcub(%rsp), $b_ptr
1558 lea $res_x(%rsp), $r_ptr
1559 call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub);
1561 mov $U2+8*0(%rsp), $t0
1562 mov $U2+8*1(%rsp), $t1
1563 mov $U2+8*2(%rsp), $t2
1564 mov $U2+8*3(%rsp), $t3
1565 lea $res_y(%rsp), $r_ptr
1567 call __ecp_nistz256_sub$x # p256_sub(res_y, U2, res_x);
1569 mov $acc0, 8*0($r_ptr) # save the result, as
1570 mov $acc1, 8*1($r_ptr) # __ecp_nistz256_sub doesn't
1571 mov $acc2, 8*2($r_ptr)
1572 mov $acc3, 8*3($r_ptr)
1575 $code.=<<___;
1576 `&load_for_mul("$S1(%rsp)", "$Hcub(%rsp)", "$src0")`
1577 lea $S2(%rsp), $r_ptr
1578 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S1, Hcub);
1580 `&load_for_mul("$R(%rsp)", "$res_y(%rsp)", "$src0")`
1581 lea $res_y(%rsp), $r_ptr
1582 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_y, R, res_y);
1584 lea $S2(%rsp), $b_ptr
1585 lea $res_y(%rsp), $r_ptr
1586 call __ecp_nistz256_sub_from$x # p256_sub(res_y, res_y, S2);
1588 movq %xmm0, $r_ptr # restore $r_ptr
1590 movdqa %xmm5, %xmm0 # copy_conditional(res_z, in2_z, in1infty);
1591 movdqa %xmm5, %xmm1
1592 pandn $res_z(%rsp), %xmm0
1593 movdqa %xmm5, %xmm2
1594 pandn $res_z+0x10(%rsp), %xmm1
1595 movdqa %xmm5, %xmm3
1596 pand $in2_z(%rsp), %xmm2
1597 pand $in2_z+0x10(%rsp), %xmm3
1598 por %xmm0, %xmm2
1599 por %xmm1, %xmm3
1601 movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty);
1602 movdqa %xmm4, %xmm1
1603 pandn %xmm2, %xmm0
1604 movdqa %xmm4, %xmm2
1605 pandn %xmm3, %xmm1
1606 movdqa %xmm4, %xmm3
1607 pand $in1_z(%rsp), %xmm2
1608 pand $in1_z+0x10(%rsp), %xmm3
1609 por %xmm0, %xmm2
1610 por %xmm1, %xmm3
1611 movdqu %xmm2, 0x40($r_ptr)
1612 movdqu %xmm3, 0x50($r_ptr)
1614 movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty);
1615 movdqa %xmm5, %xmm1
1616 pandn $res_x(%rsp), %xmm0
1617 movdqa %xmm5, %xmm2
1618 pandn $res_x+0x10(%rsp), %xmm1
1619 movdqa %xmm5, %xmm3
1620 pand $in2_x(%rsp), %xmm2
1621 pand $in2_x+0x10(%rsp), %xmm3
1622 por %xmm0, %xmm2
1623 por %xmm1, %xmm3
1625 movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty);
1626 movdqa %xmm4, %xmm1
1627 pandn %xmm2, %xmm0
1628 movdqa %xmm4, %xmm2
1629 pandn %xmm3, %xmm1
1630 movdqa %xmm4, %xmm3
1631 pand $in1_x(%rsp), %xmm2
1632 pand $in1_x+0x10(%rsp), %xmm3
1633 por %xmm0, %xmm2
1634 por %xmm1, %xmm3
1635 movdqu %xmm2, 0x00($r_ptr)
1636 movdqu %xmm3, 0x10($r_ptr)
1638 movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty);
1639 movdqa %xmm5, %xmm1
1640 pandn $res_y(%rsp), %xmm0
1641 movdqa %xmm5, %xmm2
1642 pandn $res_y+0x10(%rsp), %xmm1
1643 movdqa %xmm5, %xmm3
1644 pand $in2_y(%rsp), %xmm2
1645 pand $in2_y+0x10(%rsp), %xmm3
1646 por %xmm0, %xmm2
1647 por %xmm1, %xmm3
1649 movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty);
1650 movdqa %xmm4, %xmm1
1651 pandn %xmm2, %xmm0
1652 movdqa %xmm4, %xmm2
1653 pandn %xmm3, %xmm1
1654 movdqa %xmm4, %xmm3
1655 pand $in1_y(%rsp), %xmm2
1656 pand $in1_y+0x10(%rsp), %xmm3
1657 por %xmm0, %xmm2
1658 por %xmm1, %xmm3
1659 movdqu %xmm2, 0x20($r_ptr)
1660 movdqu %xmm3, 0x30($r_ptr)
1662 .Ladd_done$x:
1663 add \$32*18+8, %rsp
1664 pop %r15
1665 pop %r14
1666 pop %r13
1667 pop %r12
1668 pop %rbx
1669 pop %rbp
1671 .size ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx
1674 &gen_add("q");
1676 sub gen_add_affine () {
1677 my $x = shift;
1678 my ($src0,$sfx,$bias);
1679 my ($U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr,
1680 $res_x,$res_y,$res_z,
1681 $in1_x,$in1_y,$in1_z,
1682 $in2_x,$in2_y)=map(32*$_,(0..14));
1683 my $Z1sqr = $S2;
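# Mixed addition with an affine second input (Z2 = 1); with
#   U2 = X2*Z1^2, S2 = Y2*Z1^3, H = U2 - X1, R = S2 - Y1
# it computes
#   X3 = R^2 - H^3 - 2*X1*H^2
#   Y3 = R*(X1*H^2 - X3) - Y1*H^3
#   Z3 = H*Z1
# substituting the Montgomery form of 1 for Z3 (and the affine second input
# for X3/Y3) when the first input is the point at infinity.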
1685 if ($x ne "x") {
1686 $src0 = "%rax";
1687 $sfx = "";
1688 $bias = 0;
1690 $code.=<<___;
1691 .globl ecp_nistz256_point_add_affine
1692 .type ecp_nistz256_point_add_affine,\@function,3
1693 .align 32
1694 ecp_nistz256_point_add_affine:
1696 } else {
1697 $src0 = "%rdx";
1698 $sfx = "x";
1699 $bias = 128;
1701 $code.=<<___;
1702 push %rbp
1703 push %rbx
1704 push %r12
1705 push %r13
1706 push %r14
1707 push %r15
1708 sub \$32*15+8, %rsp
1710 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr
1711 mov $b_org, $b_ptr # reassign
1712 movdqu 0x10($a_ptr), %xmm1
1713 movdqu 0x20($a_ptr), %xmm2
1714 movdqu 0x30($a_ptr), %xmm3
1715 movdqu 0x40($a_ptr), %xmm4
1716 movdqu 0x50($a_ptr), %xmm5
1717 mov 0x40+8*0($a_ptr), $src0 # load original in1_z
1718 mov 0x40+8*1($a_ptr), $acc6
1719 mov 0x40+8*2($a_ptr), $acc7
1720 mov 0x40+8*3($a_ptr), $acc0
1721 movdqa %xmm0, $in1_x(%rsp)
1722 movdqa %xmm1, $in1_x+0x10(%rsp)
1723 por %xmm0, %xmm1
1724 movdqa %xmm2, $in1_y(%rsp)
1725 movdqa %xmm3, $in1_y+0x10(%rsp)
1726 por %xmm2, %xmm3
1727 movdqa %xmm4, $in1_z(%rsp)
1728 movdqa %xmm5, $in1_z+0x10(%rsp)
1729 por %xmm1, %xmm3
1731 movdqu 0x00($b_ptr), %xmm0 # copy *(P256_POINT_AFFINE *)$b_ptr
1732 pshufd \$0xb1, %xmm3, %xmm5
1733 movdqu 0x10($b_ptr), %xmm1
1734 movdqu 0x20($b_ptr), %xmm2
1735 por %xmm3, %xmm5
1736 movdqu 0x30($b_ptr), %xmm3
1737 movdqa %xmm0, $in2_x(%rsp)
1738 pshufd \$0x1e, %xmm5, %xmm4
1739 movdqa %xmm1, $in2_x+0x10(%rsp)
1740 por %xmm0, %xmm1
1741 movq $r_ptr, %xmm0 # save $r_ptr
1742 movdqa %xmm2, $in2_y(%rsp)
1743 movdqa %xmm3, $in2_y+0x10(%rsp)
1744 por %xmm2, %xmm3
1745 por %xmm4, %xmm5
1746 pxor %xmm4, %xmm4
1747 por %xmm1, %xmm3
1749 lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid
1750 lea $Z1sqr(%rsp), $r_ptr # Z1^2
1751 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z);
1753 pcmpeqd %xmm4, %xmm5
1754 pshufd \$0xb1, %xmm3, %xmm4
1755 mov 0x00($b_ptr), $src0 # $b_ptr is still valid
1756 #lea 0x00($b_ptr), $b_ptr
1757 mov $acc4, $acc1 # harmonize sqr output and mul input
1758 por %xmm3, %xmm4
1759 pshufd \$0, %xmm5, %xmm5 # in1infty
1760 pshufd \$0x1e, %xmm4, %xmm3
1761 mov $acc5, $acc2
1762 por %xmm3, %xmm4
1763 pxor %xmm3, %xmm3
1764 mov $acc6, $acc3
1765 pcmpeqd %xmm3, %xmm4
1766 pshufd \$0, %xmm4, %xmm4 # in2infty
1768 lea $Z1sqr-$bias(%rsp), $a_ptr
1769 mov $acc7, $acc4
1770 lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2
1771 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, Z1sqr, in2_x);
1773 lea $in1_x(%rsp), $b_ptr
1774 lea $H(%rsp), $r_ptr # H = U2 - U1
1775 call __ecp_nistz256_sub_from$x # p256_sub(H, U2, in1_x);
1777 `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
1778 lea $S2(%rsp), $r_ptr # S2 = Z1^3
1779 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z);
1781 `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
1782 lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2
1783 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z);
1785 `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
1786 lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3
1787 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y);
1789 lea $in1_y(%rsp), $b_ptr
1790 lea $R(%rsp), $r_ptr # R = S2 - S1
1791 call __ecp_nistz256_sub_from$x # p256_sub(R, S2, in1_y);
1793 `&load_for_sqr("$H(%rsp)", "$src0")`
1794 lea $Hsqr(%rsp), $r_ptr # H^2
1795 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H);
1797 `&load_for_sqr("$R(%rsp)", "$src0")`
1798 lea $Rsqr(%rsp), $r_ptr # R^2
1799 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R);
1801 `&load_for_mul("$H(%rsp)", "$Hsqr(%rsp)", "$src0")`
1802 lea $Hcub(%rsp), $r_ptr # H^3
1803 call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H);
1805 `&load_for_mul("$Hsqr(%rsp)", "$in1_x(%rsp)", "$src0")`
1806 lea $U2(%rsp), $r_ptr # U1*H^2
1807 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in1_x, Hsqr);
1810 #######################################################################
1811 # operate in 4-5-0-1 "name space" that matches multiplication output
1813 my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
1814 my ($poly1, $poly3)=($acc6,$acc7);
1816 $code.=<<___;
1817 #lea $U2(%rsp), $a_ptr
1818 #lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2
1819 #call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2);
1821 add $acc0, $acc0 # a0:a3+a0:a3
1822 lea $Rsqr(%rsp), $a_ptr
1823 adc $acc1, $acc1
1824 mov $acc0, $t0
1825 adc $acc2, $acc2
1826 adc $acc3, $acc3
1827 mov $acc1, $t1
1828 sbb $t4, $t4
1830 sub \$-1, $acc0
1831 mov $acc2, $t2
1832 sbb $poly1, $acc1
1833 sbb \$0, $acc2
1834 mov $acc3, $t3
1835 sbb $poly3, $acc3
1836 test $t4, $t4
1838 cmovz $t0, $acc0
1839 mov 8*0($a_ptr), $t0
1840 cmovz $t1, $acc1
1841 mov 8*1($a_ptr), $t1
1842 cmovz $t2, $acc2
1843 mov 8*2($a_ptr), $t2
1844 cmovz $t3, $acc3
1845 mov 8*3($a_ptr), $t3
1847 call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr);
1849 lea $Hcub(%rsp), $b_ptr
1850 lea $res_x(%rsp), $r_ptr
1851 call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub);
1853 mov $U2+8*0(%rsp), $t0
1854 mov $U2+8*1(%rsp), $t1
1855 mov $U2+8*2(%rsp), $t2
1856 mov $U2+8*3(%rsp), $t3
1857 lea $H(%rsp), $r_ptr
1859 call __ecp_nistz256_sub$x # p256_sub(H, U2, res_x);
1861 mov $acc0, 8*0($r_ptr) # save the result, as
1862 mov $acc1, 8*1($r_ptr) # __ecp_nistz256_sub doesn't
1863 mov $acc2, 8*2($r_ptr)
1864 mov $acc3, 8*3($r_ptr)
1867 $code.=<<___;
1868 `&load_for_mul("$Hcub(%rsp)", "$in1_y(%rsp)", "$src0")`
1869 lea $S2(%rsp), $r_ptr
1870 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Hcub, in1_y);
1872 `&load_for_mul("$H(%rsp)", "$R(%rsp)", "$src0")`
1873 lea $H(%rsp), $r_ptr
1874 call __ecp_nistz256_mul_mont$x # p256_mul_mont(H, H, R);
1876 lea $S2(%rsp), $b_ptr
1877 lea $res_y(%rsp), $r_ptr
1878 call __ecp_nistz256_sub_from$x # p256_sub(res_y, H, S2);
1880 movq %xmm0, $r_ptr # restore $r_ptr
1882 movdqa %xmm5, %xmm0 # copy_conditional(res_z, ONE, in1infty);
1883 movdqa %xmm5, %xmm1
1884 pandn $res_z(%rsp), %xmm0
1885 movdqa %xmm5, %xmm2
1886 pandn $res_z+0x10(%rsp), %xmm1
1887 movdqa %xmm5, %xmm3
1888 pand .LONE_mont(%rip), %xmm2
1889 pand .LONE_mont+0x10(%rip), %xmm3
1890 por %xmm0, %xmm2
1891 por %xmm1, %xmm3
1893 movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty);
1894 movdqa %xmm4, %xmm1
1895 pandn %xmm2, %xmm0
1896 movdqa %xmm4, %xmm2
1897 pandn %xmm3, %xmm1
1898 movdqa %xmm4, %xmm3
1899 pand $in1_z(%rsp), %xmm2
1900 pand $in1_z+0x10(%rsp), %xmm3
1901 por %xmm0, %xmm2
1902 por %xmm1, %xmm3
1903 movdqu %xmm2, 0x40($r_ptr)
1904 movdqu %xmm3, 0x50($r_ptr)
1906 movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty);
1907 movdqa %xmm5, %xmm1
1908 pandn $res_x(%rsp), %xmm0
1909 movdqa %xmm5, %xmm2
1910 pandn $res_x+0x10(%rsp), %xmm1
1911 movdqa %xmm5, %xmm3
1912 pand $in2_x(%rsp), %xmm2
1913 pand $in2_x+0x10(%rsp), %xmm3
1914 por %xmm0, %xmm2
1915 por %xmm1, %xmm3
1917 movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty);
1918 movdqa %xmm4, %xmm1
1919 pandn %xmm2, %xmm0
1920 movdqa %xmm4, %xmm2
1921 pandn %xmm3, %xmm1
1922 movdqa %xmm4, %xmm3
1923 pand $in1_x(%rsp), %xmm2
1924 pand $in1_x+0x10(%rsp), %xmm3
1925 por %xmm0, %xmm2
1926 por %xmm1, %xmm3
1927 movdqu %xmm2, 0x00($r_ptr)
1928 movdqu %xmm3, 0x10($r_ptr)
1930 movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty);
1931 movdqa %xmm5, %xmm1
1932 pandn $res_y(%rsp), %xmm0
1933 movdqa %xmm5, %xmm2
1934 pandn $res_y+0x10(%rsp), %xmm1
1935 movdqa %xmm5, %xmm3
1936 pand $in2_y(%rsp), %xmm2
1937 pand $in2_y+0x10(%rsp), %xmm3
1938 por %xmm0, %xmm2
1939 por %xmm1, %xmm3
1941 movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty);
1942 movdqa %xmm4, %xmm1
1943 pandn %xmm2, %xmm0
1944 movdqa %xmm4, %xmm2
1945 pandn %xmm3, %xmm1
1946 movdqa %xmm4, %xmm3
1947 pand $in1_y(%rsp), %xmm2
1948 pand $in1_y+0x10(%rsp), %xmm3
1949 por %xmm0, %xmm2
1950 por %xmm1, %xmm3
1951 movdqu %xmm2, 0x20($r_ptr)
1952 movdqu %xmm3, 0x30($r_ptr)
1954 add \$32*15+8, %rsp
1955 pop %r15
1956 pop %r14
1957 pop %r13
1958 pop %r12
1959 pop %rbx
1960 pop %rbp
1962 .size ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx
1965 &gen_add_affine("q");
1969 $code =~ s/\`([^\`]*)\`/eval $1/gem;
1970 print $code;
1971 close STDOUT;