 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
25 #include <sys/asm_linkage.h>
26 #include <sys/x86_archext.h>
27 #include <sys/controlregs.h>
30 #if defined(MMX_MANAGE)
34 #define KPREEMPT_DISABLE call kpr_disable
35 #define KPREEMPT_ENABLE call kpr_enable
36 #define TEST_TS(reg) \
43 #define KPREEMPT_DISABLE
44 #define KPREEMPT_ENABLE
46 #define TEST_TS(reg) \
#define SAVE_MMX_PROLOG(sreg, nreg) \
	subl $_MUL(MMX_SIZE, nreg + MMX_ALIGN), %esp; \
	addl $MMX_ALIGN, sreg; \
	andl $-1![MMX_ALIGN-1], sreg;

#define RSTOR_MMX_EPILOG(nreg) \
	addl $_MUL(MMX_SIZE, nreg + MMX_ALIGN), %esp;
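/
/ A sketch of the intent of the prologue, in C-like terms only (this assumes
/ sreg has already been seeded with the stack pointer by an elided line, and
/ restates the masking expression as the usual round-up idiom):
/
/	esp -= nreg * MMX_SIZE + MMX_ALIGN;		/* space + alignment slack */
/	sreg = (sreg + MMX_ALIGN) & ~(MMX_ALIGN - 1);	/* align the save area */
/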
#define SAVE_MMX_0TO4(sreg) \
	SAVE_MMX_PROLOG(sreg, 5); \
	movq %mm2, 16(sreg); \
	movq %mm3, 24(sreg); \
#define RSTOR_MMX_0TO4(sreg) \
	movq 16(sreg), %mm2; \
	movq 24(sreg), %mm3; \
	movq 32(sreg), %mm4; \

#endif /* MMX_MANAGE */
/ Note: this file contains implementations for
/
/ One set of implementations is for SSE2-capable models.
/ The other uses no MMX, SSE, or SSE2 instructions, only
/ the x86 32 X 32 -> 64 unsigned multiply instruction, MUL.
/
/ The code for the implementations is grouped by SSE2 vs UMUL,
/ rather than grouping pairs of implementations for each function.
/ This is because the bignum implementation gets "imprinted"
/ on the correct implementation, at the time of first use,
/ so none of the code for the other implementations is ever
/ executed. So, it is a no-brainer to lay out the code to minimize
/ the "footprint" of executed code.
/ Can we use SSE2 instructions? Return value is non-zero
/
/ Using the cpuid instruction directly would work equally
/ well in userland and in the kernel, but we do not use the
/ cpuid instruction in the kernel; we use x86_featureset,
/ instead. This means we honor any decisions the kernel
/ startup code may have made in setting this variable,
/ including disabling SSE2. It might even be a good idea
/ to honor this kind of setting in userland, as well, but
/ the variable, x86_featureset, is not readily available to
/ userland processes.
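/
/ For reference, a userland check can query cpuid itself. A minimal C sketch
/ using the GCC/Clang <cpuid.h> helper (an assumption for illustration, not
/ part of this file):
/
/	#include <cpuid.h>
/
/	static int
/	can_use_sse2(void)
/	{
/		unsigned int eax, ebx, ecx, edx;
/
/		if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) == 0)
/			return (0);
/		return ((edx & (1u << 26)) != 0);  /* CPUID.01H:EDX bit 26 = SSE2 */
/	}
/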
ENTRY(bignum_use_sse2)
bt $X86FSET_SSE2, x86_featureset
movl $1, %eax / Get feature information
movl %edx, %eax / set return value
andl $CPUID_INTC_EDX_SSE2, %eax
SET_SIZE(bignum_use_sse2)
/ ------------------------------------------------------------------------
/		SSE2 Implementations
/ ------------------------------------------------------------------------

/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions

/ big_mul_set_vec_sse2_r(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/ Does not touch the following registers: %esi, %edi, %mm4
/
/ This is strictly for internal use.
/ The interface is very light-weight.
/ All parameters are passed in registers.
/ It does not conform to the SYSV x86 ABI.
/ So, don't even think about calling this function directly from C code.
/
/ The basic multiply digit loop is unrolled 8 times.
/ Each comment is preceded by an instance number.
/ Instructions that have been moved retain their original, "natural"
/ instance number. It should be easier this way to follow
/ the step-wise refinement process that went into constructing
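/
/ For reference, the computation this routine performs, written as plain C
/ (a sketch, not part of the build; the "_ref" name is ours):
/
/	#include <stdint.h>
/
/	uint32_t
/	big_mul_set_vec_ref(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/	{
/		uint64_t p;
/		uint32_t cy = 0;
/		int i;
/
/		for (i = 0; i < len; i++) {
/			p = (uint64_t)digit * a[i] + cy;	/* pmuludq + paddq */
/			r[i] = (uint32_t)p;			/* low 32 bits -> r[i] */
/			cy = (uint32_t)(p >> 32);		/* high 32 bits -> carry */
/		}
/		return (cy);
/	}
/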
168 ENTRY(big_mul_set_vec_sse2_r)
169 xorl %eax, %eax / if (len == 0) return (0);
173 pxor %mm0, %mm0 / cy = 0
178 movd 0(%ebx), %mm1 / 1: mm1 = a[i]
179 pmuludq %mm3, %mm1 / 1: mm1 = digit * a[i]
180 paddq %mm1, %mm0 / 1: mm0 = digit * a[i] + cy;
181 movd 4(%ebx), %mm1 / 2: mm1 = a[i]
182 movd %mm0, 0(%edx) / 1: r[i] = product[31..0]
183 psrlq $32, %mm0 / 1: cy = product[63..32]
185 pmuludq %mm3, %mm1 / 2: mm1 = digit * a[i]
186 paddq %mm1, %mm0 / 2: mm0 = digit * a[i] + cy;
187 movd 8(%ebx), %mm1 / 3: mm1 = a[i]
188 movd %mm0, 4(%edx) / 2: r[i] = product[31..0]
189 psrlq $32, %mm0 / 2: cy = product[63..32]
191 pmuludq %mm3, %mm1 / 3: mm1 = digit * a[i]
192 paddq %mm1, %mm0 / 3: mm0 = digit * a[i] + cy;
193 movd 12(%ebx), %mm1 / 4: mm1 = a[i]
194 movd %mm0, 8(%edx) / 3: r[i] = product[31..0]
195 psrlq $32, %mm0 / 3: cy = product[63..32]
197 pmuludq %mm3, %mm1 / 4: mm1 = digit * a[i]
198 paddq %mm1, %mm0 / 4: mm0 = digit * a[i] + cy;
199 movd 16(%ebx), %mm1 / 5: mm1 = a[i]
200 movd %mm0, 12(%edx) / 4: r[i] = product[31..0]
201 psrlq $32, %mm0 / 4: cy = product[63..32]
203 pmuludq %mm3, %mm1 / 5: mm1 = digit * a[i]
204 paddq %mm1, %mm0 / 5: mm0 = digit * a[i] + cy;
205 movd 20(%ebx), %mm1 / 6: mm1 = a[i]
206 movd %mm0, 16(%edx) / 5: r[i] = product[31..0]
207 psrlq $32, %mm0 / 5: cy = product[63..32]
209 pmuludq %mm3, %mm1 / 6: mm1 = digit * a[i]
210 paddq %mm1, %mm0 / 6: mm0 = digit * a[i] + cy;
211 movd 24(%ebx), %mm1 / 7: mm1 = a[i]
212 movd %mm0, 20(%edx) / 6: r[i] = product[31..0]
213 psrlq $32, %mm0 / 6: cy = product[63..32]
215 pmuludq %mm3, %mm1 / 7: mm1 = digit * a[i]
216 paddq %mm1, %mm0 / 7: mm0 = digit * a[i] + cy;
217 movd 28(%ebx), %mm1 / 8: mm1 = a[i]
218 movd %mm0, 24(%edx) / 7: r[i] = product[31..0]
219 psrlq $32, %mm0 / 7: cy = product[63..32]
221 pmuludq %mm3, %mm1 / 8: mm1 = digit * a[i]
222 paddq %mm1, %mm0 / 8: mm0 = digit * a[i] + cy;
223 movd %mm0, 28(%edx) / 8: r[i] = product[31..0]
224 psrlq $32, %mm0 / 8: cy = product[63..32]
226 leal UNROLL32(%ebx), %ebx / a += UNROLL
227 leal UNROLL32(%edx), %edx / r += UNROLL
228 subl $UNROLL, %ecx / len -= UNROLL
233 movd 0(%ebx), %mm1 / 1: mm1 = a[i]
234 pmuludq %mm3, %mm1 / 1: mm1 = digit * a[i]
235 paddq %mm1, %mm0 / 1: mm0 = digit * a[i] + cy;
236 movd %mm0, 0(%edx) / 1: r[i] = product[31..0]
237 psrlq $32, %mm0 / 1: cy = product[63..32]
241 movd 4(%ebx), %mm1 / 2: mm1 = a[i]
242 pmuludq %mm3, %mm1 / 2: mm1 = digit * a[i]
243 paddq %mm1, %mm0 / 2: mm0 = digit * a[i] + cy;
244 movd %mm0, 4(%edx) / 2: r[i] = product[31..0]
245 psrlq $32, %mm0 / 2: cy = product[63..32]
249 movd 8(%ebx), %mm1 / 3: mm1 = a[i]
250 pmuludq %mm3, %mm1 / 3: mm1 = digit * a[i]
251 paddq %mm1, %mm0 / 3: mm0 = digit * a[i] + cy;
252 movd %mm0, 8(%edx) / 3: r[i] = product[31..0]
253 psrlq $32, %mm0 / 3: cy = product[63..32]
257 movd 12(%ebx), %mm1 / 4: mm1 = a[i]
258 pmuludq %mm3, %mm1 / 4: mm1 = digit * a[i]
259 paddq %mm1, %mm0 / 4: mm0 = digit * a[i] + cy;
260 movd %mm0, 12(%edx) / 4: r[i] = product[31..0]
261 psrlq $32, %mm0 / 4: cy = product[63..32]
265 movd 16(%ebx), %mm1 / 5: mm1 = a[i]
266 pmuludq %mm3, %mm1 / 5: mm1 = digit * a[i]
267 paddq %mm1, %mm0 / 5: mm0 = digit * a[i] + cy;
268 movd %mm0, 16(%edx) / 5: r[i] = product[31..0]
269 psrlq $32, %mm0 / 5: cy = product[63..32]
273 movd 20(%ebx), %mm1 / 6: mm1 = a[i]
274 pmuludq %mm3, %mm1 / 6: mm1 = digit * a[i]
275 paddq %mm1, %mm0 / 6: mm0 = digit * a[i] + cy;
276 movd %mm0, 20(%edx) / 6: r[i] = product[31..0]
277 psrlq $32, %mm0 / 6: cy = product[63..32]
281 movd 24(%ebx), %mm1 / 7: mm1 = a[i]
282 pmuludq %mm3, %mm1 / 7: mm1 = digit * a[i]
283 paddq %mm1, %mm0 / 7: mm0 = digit * a[i] + cy;
284 movd %mm0, 24(%edx) / 7: r[i] = product[31..0]
285 psrlq $32, %mm0 / 7: cy = product[63..32]
288 movd %mm0, %eax / return (cy)
289 / no emms. caller is responsible for emms
291 SET_SIZE(big_mul_set_vec_sse2_r)
294 / r = a * digit, r and a are vectors of length len
295 / returns the carry digit
296 / Suitable only for x86 models that support SSE2 instruction set extensions
301 / digit 20(%ebp) %mm3
/ In userland, there is just the one function, big_mul_set_vec_sse2().
/ But in the kernel, there are two variations:
/    1. big_mul_set_vec_sse2() which does what is necessary to save and
/       restore state, if necessary, and to ensure that preemption is
/       disabled.
/    2. big_mul_set_vec_sse2_nsv() which just does the work;
/       it is the caller's responsibility to ensure that MMX state
/       does not need to be saved and restored and that preemption
/       is already disabled.
#if defined(MMX_MANAGE)

ENTRY(big_mul_set_vec_sse2)
call big_mul_set_vec_sse2_r
call big_mul_set_vec_sse2_r
SET_SIZE(big_mul_set_vec_sse2)

ENTRY(big_mul_set_vec_sse2_nsv)
call big_mul_set_vec_sse2_r
SET_SIZE(big_mul_set_vec_sse2_nsv)

#else /* !defined(MMX_MANAGE) */

/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/ digit 20(%ebp) %mm3

ENTRY(big_mul_set_vec_sse2)
call big_mul_set_vec_sse2_r
SET_SIZE(big_mul_set_vec_sse2)

#endif /* MMX_MANAGE */
/ r = r + a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions

/ big_mul_add_vec_sse2_r(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ This is strictly for internal use.
/ The interface is very light-weight.
/ All parameters are passed in registers.
/ It does not conform to the SYSV x86 ABI.
/ So, don't even think about calling this function directly from C code.
/
/ The basic multiply digit loop is unrolled 8 times.
/ Each comment is preceded by an instance number.
/ Instructions that have been moved retain their original, "natural"
/ instance number. It should be easier this way to follow
/ the step-wise refinement process that went into constructing
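/
/ For reference, the computation this routine performs, in plain C (a sketch,
/ not part of the build; the "_ref" name is ours). Note that
/ digit * a[i] + r[i] + cy always fits in 64 bits:
/
/	uint32_t
/	big_mul_add_vec_ref(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/	{
/		uint64_t p;
/		uint32_t cy = 0;
/		int i;
/
/		for (i = 0; i < len; i++) {
/			p = (uint64_t)digit * a[i] + r[i] + cy;
/			r[i] = (uint32_t)p;		/* low 32 bits back to r[i] */
/			cy = (uint32_t)(p >> 32);	/* carry for the next digit */
/		}
/		return (cy);
/	}
/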
424 ENTRY(big_mul_add_vec_sse2_r)
429 pxor %mm0, %mm0 / cy = 0
434 movd 0(%ebx), %mm1 / 1: mm1 = a[i]
435 movd 0(%edx), %mm2 / 1: mm2 = r[i]
436 pmuludq %mm3, %mm1 / 1: mm1 = digit * a[i]
437 paddq %mm1, %mm2 / 1: mm2 = digit * a[i] + r[i]
438 movd 4(%ebx), %mm1 / 2: mm1 = a[i]
439 paddq %mm2, %mm0 / 1: mm0 = digit * a[i] + r[i] + cy;
440 movd %mm0, 0(%edx) / 1: r[i] = product[31..0]
441 movd 4(%edx), %mm2 / 2: mm2 = r[i]
442 psrlq $32, %mm0 / 1: cy = product[63..32]
444 pmuludq %mm3, %mm1 / 2: mm1 = digit * a[i]
445 paddq %mm1, %mm2 / 2: mm2 = digit * a[i] + r[i]
446 movd 8(%ebx), %mm1 / 3: mm1 = a[i]
447 paddq %mm2, %mm0 / 2: mm0 = digit * a[i] + r[i] + cy;
448 movd %mm0, 4(%edx) / 2: r[i] = product[31..0]
449 movd 8(%edx), %mm2 / 3: mm2 = r[i]
450 psrlq $32, %mm0 / 2: cy = product[63..32]
452 pmuludq %mm3, %mm1 / 3: mm1 = digit * a[i]
453 paddq %mm1, %mm2 / 3: mm2 = digit * a[i] + r[i]
454 movd 12(%ebx), %mm1 / 4: mm1 = a[i]
455 paddq %mm2, %mm0 / 3: mm0 = digit * a[i] + r[i] + cy;
456 movd %mm0, 8(%edx) / 3: r[i] = product[31..0]
457 movd 12(%edx), %mm2 / 4: mm2 = r[i]
458 psrlq $32, %mm0 / 3: cy = product[63..32]
460 pmuludq %mm3, %mm1 / 4: mm1 = digit * a[i]
461 paddq %mm1, %mm2 / 4: mm2 = digit * a[i] + r[i]
462 movd 16(%ebx), %mm1 / 5: mm1 = a[i]
463 paddq %mm2, %mm0 / 4: mm0 = digit * a[i] + r[i] + cy;
464 movd %mm0, 12(%edx) / 4: r[i] = product[31..0]
465 movd 16(%edx), %mm2 / 5: mm2 = r[i]
466 psrlq $32, %mm0 / 4: cy = product[63..32]
468 pmuludq %mm3, %mm1 / 5: mm1 = digit * a[i]
469 paddq %mm1, %mm2 / 5: mm2 = digit * a[i] + r[i]
470 movd 20(%ebx), %mm1 / 6: mm1 = a[i]
471 paddq %mm2, %mm0 / 5: mm0 = digit * a[i] + r[i] + cy;
472 movd %mm0, 16(%edx) / 5: r[i] = product[31..0]
473 movd 20(%edx), %mm2 / 6: mm2 = r[i]
474 psrlq $32, %mm0 / 5: cy = product[63..32]
476 pmuludq %mm3, %mm1 / 6: mm1 = digit * a[i]
477 paddq %mm1, %mm2 / 6: mm2 = digit * a[i] + r[i]
478 movd 24(%ebx), %mm1 / 7: mm1 = a[i]
479 paddq %mm2, %mm0 / 6: mm0 = digit * a[i] + r[i] + cy;
480 movd %mm0, 20(%edx) / 6: r[i] = product[31..0]
481 movd 24(%edx), %mm2 / 7: mm2 = r[i]
482 psrlq $32, %mm0 / 6: cy = product[63..32]
484 pmuludq %mm3, %mm1 / 7: mm1 = digit * a[i]
485 paddq %mm1, %mm2 / 7: mm2 = digit * a[i] + r[i]
486 movd 28(%ebx), %mm1 / 8: mm1 = a[i]
487 paddq %mm2, %mm0 / 7: mm0 = digit * a[i] + r[i] + cy;
488 movd %mm0, 24(%edx) / 7: r[i] = product[31..0]
489 movd 28(%edx), %mm2 / 8: mm2 = r[i]
490 psrlq $32, %mm0 / 7: cy = product[63..32]
492 pmuludq %mm3, %mm1 / 8: mm1 = digit * a[i]
493 paddq %mm1, %mm2 / 8: mm2 = digit * a[i] + r[i]
494 paddq %mm2, %mm0 / 8: mm0 = digit * a[i] + r[i] + cy;
495 movd %mm0, 28(%edx) / 8: r[i] = product[31..0]
496 psrlq $32, %mm0 / 8: cy = product[63..32]
498 leal UNROLL32(%ebx), %ebx / a += UNROLL
499 leal UNROLL32(%edx), %edx / r += UNROLL
500 subl $UNROLL, %ecx / len -= UNROLL
505 movd 0(%ebx), %mm1 / 1: mm1 = a[i]
506 movd 0(%edx), %mm2 / 1: mm2 = r[i]
507 pmuludq %mm3, %mm1 / 1: mm1 = digit * a[i]
508 paddq %mm1, %mm2 / 1: mm2 = digit * a[i] + r[i]
509 paddq %mm2, %mm0 / 1: mm0 = digit * a[i] + r[i] + cy;
510 movd %mm0, 0(%edx) / 1: r[i] = product[31..0]
511 psrlq $32, %mm0 / 1: cy = product[63..32]
515 movd 4(%ebx), %mm1 / 2: mm1 = a[i]
516 movd 4(%edx), %mm2 / 2: mm2 = r[i]
517 pmuludq %mm3, %mm1 / 2: mm1 = digit * a[i]
518 paddq %mm1, %mm2 / 2: mm2 = digit * a[i] + r[i]
519 paddq %mm2, %mm0 / 2: mm0 = digit * a[i] + r[i] + cy;
520 movd %mm0, 4(%edx) / 2: r[i] = product[31..0]
521 psrlq $32, %mm0 / 2: cy = product[63..32]
525 movd 8(%ebx), %mm1 / 3: mm1 = a[i]
526 movd 8(%edx), %mm2 / 3: mm2 = r[i]
527 pmuludq %mm3, %mm1 / 3: mm1 = digit * a[i]
528 paddq %mm1, %mm2 / 3: mm2 = digit * a[i] + r[i]
529 paddq %mm2, %mm0 / 3: mm0 = digit * a[i] + r[i] + cy;
530 movd %mm0, 8(%edx) / 3: r[i] = product[31..0]
531 psrlq $32, %mm0 / 3: cy = product[63..32]
535 movd 12(%ebx), %mm1 / 4: mm1 = a[i]
536 movd 12(%edx), %mm2 / 4: mm2 = r[i]
537 pmuludq %mm3, %mm1 / 4: mm1 = digit * a[i]
538 paddq %mm1, %mm2 / 4: mm2 = digit * a[i] + r[i]
539 paddq %mm2, %mm0 / 4: mm0 = digit * a[i] + r[i] + cy;
540 movd %mm0, 12(%edx) / 4: r[i] = product[31..0]
541 psrlq $32, %mm0 / 4: cy = product[63..32]
545 movd 16(%ebx), %mm1 / 5: mm1 = a[i]
546 movd 16(%edx), %mm2 / 5: mm2 = r[i]
547 pmuludq %mm3, %mm1 / 5: mm1 = digit * a[i]
548 paddq %mm1, %mm2 / 5: mm2 = digit * a[i] + r[i]
549 paddq %mm2, %mm0 / 5: mm0 = digit * a[i] + r[i] + cy;
550 movd %mm0, 16(%edx) / 5: r[i] = product[31..0]
551 psrlq $32, %mm0 / 5: cy = product[63..32]
555 movd 20(%ebx), %mm1 / 6: mm1 = a[i]
556 movd 20(%edx), %mm2 / 6: mm2 = r[i]
557 pmuludq %mm3, %mm1 / 6: mm1 = digit * a[i]
558 paddq %mm1, %mm2 / 6: mm2 = digit * a[i] + r[i]
559 paddq %mm2, %mm0 / 6: mm0 = digit * a[i] + r[i] + cy;
560 movd %mm0, 20(%edx) / 6: r[i] = product[31..0]
561 psrlq $32, %mm0 / 6: cy = product[63..32]
565 movd 24(%ebx), %mm1 / 7: mm1 = a[i]
566 movd 24(%edx), %mm2 / 7: mm2 = r[i]
567 pmuludq %mm3, %mm1 / 7: mm1 = digit * a[i]
568 paddq %mm1, %mm2 / 7: mm2 = digit * a[i] + r[i]
569 paddq %mm2, %mm0 / 7: mm0 = digit * a[i] + r[i] + cy;
570 movd %mm0, 24(%edx) / 7: r[i] = product[31..0]
571 psrlq $32, %mm0 / 7: cy = product[63..32]
575 / no emms. caller is responsible for emms
577 SET_SIZE(big_mul_add_vec_sse2_r)
580 / r = r + a * digit, r and a are vectors of length len
581 / returns the carry digit
582 / Suitable only for x86 models that support SSE2 instruction set extensions
587 / digit 20(%ebp) %mm3
/ In userland, there is just the one function, big_mul_add_vec_sse2().
/ But in the kernel, there are two variations:
/    1. big_mul_add_vec_sse2() which does what is necessary to save and
/       restore state, if necessary, and to ensure that preemption is
/       disabled.
/    2. big_mul_add_vec_sse2_nsv() which just does the work;
/       it is the caller's responsibility to ensure that MMX state
/       does not need to be saved and restored and that preemption
/       is already disabled.
#if defined(MMX_MANAGE)

ENTRY(big_mul_add_vec_sse2)
call big_mul_add_vec_sse2_r
call big_mul_add_vec_sse2_r
SET_SIZE(big_mul_add_vec_sse2)

ENTRY(big_mul_add_vec_sse2_nsv)
call big_mul_add_vec_sse2_r
SET_SIZE(big_mul_add_vec_sse2_nsv)

#else /* !defined(MMX_MANAGE) */

ENTRY(big_mul_add_vec_sse2)
call big_mul_add_vec_sse2_r
SET_SIZE(big_mul_add_vec_sse2)

#endif /* MMX_MANAGE */
/ big_mul_vec_sse2(uint32_t *r, uint32_t *a, int alen, uint32_t *b, int blen)
/
/ r[alen] = big_mul_set_vec_sse2(r, a, alen, b[0]);
/ for (i = 1; i < blen; ++i)
/     r[alen + i] = big_mul_add_vec_sse2(r+i, a, alen, b[i]);
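/
/ Fleshed out as a complete C sketch (using the "_ref" reference loops shown
/ earlier; r must have room for alen + blen digits):
/
/	void
/	big_mul_vec_ref(uint32_t *r, uint32_t *a, int alen, uint32_t *b, int blen)
/	{
/		int i;
/
/		r[alen] = big_mul_set_vec_ref(r, a, alen, b[0]);
/		for (i = 1; i < blen; ++i)
/			r[alen + i] = big_mul_add_vec_ref(r + i, a, alen, b[i]);
/	}
/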
#if defined(MMX_MANAGE)
ENTRY(big_mul_vec_sse2_fc)
ENTRY(big_mul_vec_sse2)

#if defined(MMX_MANAGE)
call big_mul_set_vec_sse2_nsv
call big_mul_set_vec_sse2
movl %eax, (%ebx,%edi,4)
leal (%ebx,%ebp,4), %eax
#if defined(MMX_MANAGE)
call big_mul_add_vec_sse2_nsv
call big_mul_add_vec_sse2
leal (%ebp,%edi), %ecx
movl %eax, (%ebx,%ecx,4)
#if defined(MMX_MANAGE)
#if defined(MMX_MANAGE)
SET_SIZE(big_mul_vec_sse2_fc)
SET_SIZE(big_mul_vec_sse2)

#if defined(MMX_MANAGE)

ENTRY(big_mul_vec_sse2)
movl 24(%ebp), %eax / blen
movl 20(%ebp), %eax / b
movl 16(%ebp), %eax / alen
movl 12(%ebp), %eax / a
movl 8(%ebp), %eax / r
call big_mul_vec_sse2_fc
SET_SIZE(big_mul_vec_sse2)

#endif /* MMX_MANAGE */
/ r = a * a, r and a are vectors of length len
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ This function is not suitable for a truly general-purpose multiprecision
/ arithmetic library, because it does not work for "small" numbers, that is,
/ numbers of 1 or 2 digits. big_mul() just uses the ordinary big_mul_vec()
/ for any small numbers.
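/
/ A plain C reference for what this routine computes (a sketch, not a
/ transcription of the assembly's scheduling; the "_ref" helpers are the
/ reference loops shown earlier). The off-diagonal triangle is built first,
/ then every word is doubled while the squares of the diagonal are added in:
/
/	void
/	big_sqr_vec_ref(uint32_t *r, uint32_t *a, int alen)
/	{
/		uint64_t p, cy;
/		int i;
/
/		/* triangle: r[1 .. 2*alen-2] = sum of a[i]*a[j] for i < j */
/		r[alen] = big_mul_set_vec_ref(&r[1], &a[1], alen - 1, a[0]);
/		for (i = 1; i < alen - 1; ++i)
/			r[alen + i] = big_mul_add_vec_ref(&r[2*i + 1],
/			    &a[i + 1], alen - 1 - i, a[i]);
/		r[0] = 0;
/		r[2*alen - 1] = 0;
/
/		/* double the triangle and add the diagonal squares a[i]**2 */
/		cy = 0;
/		for (i = 0; i < alen; ++i) {
/			p = (uint64_t)a[i] * a[i];
/			cy += 2 * (uint64_t)r[2*i] + (uint32_t)p;
/			r[2*i] = (uint32_t)cy;
/			cy >>= 32;
/			cy += 2 * (uint64_t)r[2*i + 1] + (p >> 32);
/			r[2*i + 1] = (uint32_t)cy;
/			cy >>= 32;
/		}
/	}
/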
#if defined(MMX_MANAGE)
ENTRY(big_sqr_vec_sse2_fc)
ENTRY(big_sqr_vec_sse2)
/ r[1..alen] = a[0] * a[1..alen-1]

movl 8(%ebp), %edi / r = arg(r)
movl 12(%ebp), %esi / a = arg(a)
movl 16(%ebp), %ecx / cnt = arg(alen)
movd %ecx, %mm4 / save_cnt = arg(alen)
leal 4(%edi), %edx / dst = &r[1]
movl %esi, %ebx / src = a
movd 0(%ebx), %mm3 / mm3 = a[0]
leal 4(%ebx), %ebx / src = &a[1]
subl $1, %ecx / --cnt
call big_mul_set_vec_sse2_r / r[1..alen-1] = a[0] * a[1..alen-1]
movl %edi, %edx / dst = r
movl %esi, %ebx / src = a
movd %mm4, %ecx / cnt = save_cnt
movl %eax, (%edx, %ecx, 4) / r[cnt] = cy
/ /* High-level vector C pseudocode */
/
/ for (i = 1; i < alen-1; ++i)
/     r[2*i + 1 ... ] += a[i] * a[i+1 .. alen-1]
/
/ /* Same thing, but slightly lower level C-like pseudocode */
/
/ r = &arg_r[2*i + 1];
/ r[cnt] = big_mul_add_vec_sse2_r(r, a, cnt, digit);
/
/ /* Same thing, but even lower level
/  * For example, pointers are raw pointers,
/  * with no scaling by object size.
/
/ r = arg_r + 12; /* i == 1; 2i + 1 == 3; 4*3 == 12; */
/ digit = *(arg_a + 4);
/ cy = big_mul_add_vec_sse2_r();
/ *(r + 4 * cnt) = cy;
leal 4(%edi), %edi / r += 4; r = &r[1]
leal 4(%esi), %esi / a += 4; a = &a[1]
movd %mm4, %ecx / cnt = save
subl $2, %ecx / cnt = alen - 2; i in 1..alen-2
movd %ecx, %mm4 / save_cnt
jecxz .L32 / while (cnt != 0) {
movd 0(%esi), %mm3 / digit = a[i]
leal 4(%esi), %esi / a += 4; a = &a[1]; a = &a[i + 1]
leal 8(%edi), %edi / r += 8; r = &r[2]; r = &r[2 * i + 1]
movl %edi, %edx / edx = r
movl %esi, %ebx / ebx = a
cmp $1, %ecx / The last triangle term is special
call big_mul_add_vec_sse2_r
movd %mm4, %ecx / cnt = save_cnt
movl %eax, (%edi, %ecx, 4) / r[cnt] = cy
subl $1, %ecx / --cnt
movd %ecx, %mm4 / save_cnt = cnt
movd 0(%ebx), %mm1 / mm1 = a[i + 1]
movd 0(%edx), %mm2 / mm2 = r[2 * i + 1]
pmuludq %mm3, %mm1 / mm1 = p = digit * a[i + 1]
paddq %mm1, %mm2 / mm2 = r[2 * i + 1] + p
movd %mm2, 0(%edx) / r[2 * i + 1] += lo32(p)
psrlq $32, %mm2 / mm2 = cy
movd %mm2, 4(%edx) / r[2 * i + 2] = cy
movd %mm2, 8(%edx) / r[2 * i + 3] = 0
movl 8(%ebp), %edx / r = arg(r)
movl 12(%ebp), %ebx / a = arg(a)
movl 16(%ebp), %ecx / cnt = arg(alen)

/ compute low-order corner
movd 0(%ebx), %mm2 / mm2 = a[0]
pmuludq %mm2, %mm2 / mm2 = p = a[0]**2
movd %mm2, 0(%edx) / r[0] = lo32(p)
psrlq $32, %mm2 / mm2 = cy = hi32(p)
movd 4(%edx), %mm1 / mm1 = r[1]
psllq $1, %mm1 / mm1 = p = 2 * r[1]
paddq %mm1, %mm2 / mm2 = t = p + cy
movd %mm2, 4(%edx) / r[1] = low32(t)
psrlq $32, %mm2 / mm2 = cy = hi32(t)
/ r[2..$-3] = inner_diagonal[*]**2 + 2 * r[2..$-3]
subl $2, %ecx / cnt = alen - 2
movd 4(%ebx), %mm0 / mm0 = diag = a[i+1]
pmuludq %mm0, %mm0 / mm0 = p = diag**2
paddq %mm0, %mm2 / mm2 = t = p + cy
movd %eax, %mm1 / mm1 = lo32(t)
psrlq $32, %mm2 / mm2 = hi32(t)
movd 8(%edx), %mm3 / mm3 = r[2*i]
psllq $1, %mm3 / mm3 = 2*r[2*i]
paddq %mm3, %mm1 / mm1 = 2*r[2*i] + lo32(t)
movd %mm1, 8(%edx) / r[2*i] = 2*r[2*i] + lo32(t)
movd 12(%edx), %mm3 / mm3 = r[2*i+1]
psllq $1, %mm3 / mm3 = 2*r[2*i+1]
paddq %mm3, %mm2 / mm2 = 2*r[2*i+1] + hi32(t)
movd %mm2, 12(%edx) / r[2*i+1] = mm2
psrlq $32, %mm2 / mm2 = cy
leal 8(%edx), %edx / r += 2
leal 4(%ebx), %ebx / ++a
subl $1, %ecx / --cnt
/ Carry from last triangle term must participate in doubling,
/ but this step isn't paired up with squaring the elements
/ of the inner diagonal.
/ r[$-3..$-2] = 2 * r[$-3..$-2] + cy
959 movd 8(%edx), %mm3 / mm3 = r[2*i]
960 psllq $1, %mm3 / mm3 = 2*r[2*i]
961 paddq %mm3, %mm2 / mm2 = 2*r[2*i] + cy
962 movd %mm2, 8(%edx) / r[2*i] = lo32(2*r[2*i] + cy)
963 psrlq $32, %mm2 / mm2 = cy = hi32(2*r[2*i] + cy)
965 movd 12(%edx), %mm3 / mm3 = r[2*i+1]
966 psllq $1, %mm3 / mm3 = 2*r[2*i+1]
967 paddq %mm3, %mm2 / mm2 = 2*r[2*i+1] + cy
968 movd %mm2, 12(%edx) / r[2*i+1] = mm2
969 psrlq $32, %mm2 / mm2 = cy
971 / compute high-order corner and add it in
974 / r[alen + alen - 2] += lo32(t)
976 / r[alen + alen - 1] = cy
977 movd 4(%ebx), %mm0 / mm0 = a[$-1]
978 movd 8(%edx), %mm3 / mm3 = r[$-2]
979 pmuludq %mm0, %mm0 / mm0 = p = a[$-1]**2
980 paddq %mm0, %mm2 / mm2 = t = p + cy
981 paddq %mm3, %mm2 / mm2 = r[$-2] + t
982 movd %mm2, 8(%edx) / r[$-2] = lo32(r[$-2] + t)
983 psrlq $32, %mm2 / mm2 = cy = hi32(r[$-2] + t)
986 movd %mm2, 12(%edx) / r[$-1] += cy
994 #if defined(MMX_MANAGE)
996 SET_SIZE(big_sqr_vec_sse2_fc)
1000 SET_SIZE(big_sqr_vec_sse2)
1004 #if defined(MMX_MANAGE)
1005 ENTRY(big_sqr_vec_sse2)
1014 call big_sqr_vec_sse2_fc
1015 RSTOR_MMX_0TO4(%edi)
1020 call big_sqr_vec_sse2_fc
1028 SET_SIZE(big_sqr_vec_sse2)
1030 #endif /* MMX_MANAGE */
1032 / ------------------------------------------------------------------------
1033 / UMUL Implementations
1034 / ------------------------------------------------------------------------
1037 / r = a * digit, r and a are vectors of length len
1038 / returns the carry digit
1039 / Does not use any MMX, SSE, or SSE2 instructions.
1040 / Uses x86 unsigned 32 X 32 -> 64 multiply instruction, MUL.
1041 / This is a fall-back implementation for x86 models that do not support
1042 / the PMULUDQ instruction.
1045 / big_mul_set_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
1047 / r 8(%ebp) %edx %edi
1048 / a 12(%ebp) %ebx %esi
1050 / digit 20(%ebp) %esi
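/
/ The inner loop below maps onto C as follows (the same reference loop shown
/ for the SSE2 version, restated for the MUL/ADC pattern; sketch only):
/
/	p = (uint64_t)a[i] * digit;	/* mull leaves the product in edx:eax */
/	p += cy;			/* addl / adcl $0 propagate the carry */
/	r[i] = (uint32_t)p;		/* movl %eax, (%edi) */
/	cy = (uint32_t)(p >> 32);	/* movl %edx, %ebx */
/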
1052 ENTRY(big_mul_set_vec_umul)
1059 xorl %ebx, %ebx / cy = 0
1066 movl (%esi), %eax / eax = a[i]
1067 leal 4(%esi), %esi / ++a
1068 mull 20(%ebp) / edx:eax = a[i] * digit
1070 adcl $0, %edx / edx:eax = a[i] * digit + cy
1071 movl %eax, (%edi) / r[i] = product[31..0]
1072 movl %edx, %ebx / cy = product[63..32]
1073 leal 4(%edi), %edi / ++r
1075 jnz .L55 / while (len != 0)
1083 SET_SIZE(big_mul_set_vec_umul)
1086 / r = r + a * digit, r and a are vectors of length len
1087 / returns the carry digit
1088 / Does not use any MMX, SSE, or SSE2 instructions.
1089 / Uses x86 unsigned 32 X 32 -> 64 multiply instruction, MUL.
1090 / This is a fall-back implementation for x86 models that do not support
1091 / the PMULUDQ instruction.
1094 / big_mul_add_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
1096 / r 8(%ebp) %edx %edi
1097 / a 12(%ebp) %ebx %esi
1099 / digit 20(%ebp) %esi
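/
/ Likewise for the add variant, in C terms (sketch only; both additions fit
/ in the 64-bit product without overflow):
/
/	p = (uint64_t)a[i] * digit;	/* mull */
/	p += r[i];			/* first addl/adcl pair */
/	p += cy;			/* second addl/adcl pair */
/	r[i] = (uint32_t)p;
/	cy = (uint32_t)(p >> 32);
/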
1101 ENTRY(big_mul_add_vec_umul)
1108 xorl %ebx, %ebx / cy = 0
1115 movl (%esi), %eax / eax = a[i]
1116 leal 4(%esi), %esi / ++a
1117 mull 20(%ebp) / edx:eax = a[i] * digit
1119 adcl $0, %edx / edx:eax = a[i] * digit + r[i]
1121 adcl $0, %edx / edx:eax = a[i] * digit + r[i] + cy
1122 movl %eax, (%edi) / r[i] = product[31..0]
1123 movl %edx, %ebx / cy = product[63..32]
1124 leal 4(%edi), %edi / ++r
1126 jnz .L65 / while (len != 0)
1134 SET_SIZE(big_mul_add_vec_umul)