 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
25 #include <sys/asm_linkage.h>
26 #include <sys/x86_archext.h>
27 #include <sys/controlregs.h>
30 #if defined(MMX_MANAGE)
34 #define KPREEMPT_DISABLE call kpr_disable
35 #define KPREEMPT_ENABLE call kpr_enable
36 #define TEST_TS(reg) \
43 #define KPREEMPT_DISABLE
44 #define KPREEMPT_ENABLE
46 #define TEST_TS(reg) \
#define SAVE_MMX_PROLOG(sreg, nreg) \
	subl $_MUL(MMX_SIZE, nreg + MMX_ALIGN), %esp; \
	addl $MMX_ALIGN, sreg; \
	andl $-1![MMX_ALIGN-1], sreg;

#define RSTOR_MMX_EPILOG(nreg) \
	addl $_MUL(MMX_SIZE, nreg + MMX_ALIGN), %esp;
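/
/ A sketch of the intent of the prologue, in C-like terms only (this assumes
/ sreg has already been seeded with the stack pointer by an elided line, and
/ restates the masking expression as the usual round-up idiom):
/
/	esp -= nreg * MMX_SIZE + MMX_ALIGN;		/* space + alignment slack */
/	sreg = (sreg + MMX_ALIGN) & ~(MMX_ALIGN - 1);	/* align the save area */
/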
#define SAVE_MMX_0TO4(sreg) \
	SAVE_MMX_PROLOG(sreg, 5); \
	movq %mm2, 16(sreg); \
	movq %mm3, 24(sreg); \
#define RSTOR_MMX_0TO4(sreg) \
	movq 16(sreg), %mm2; \
	movq 24(sreg), %mm3; \
	movq 32(sreg), %mm4; \

#endif /* MMX_MANAGE */
/ Note: this file contains implementations for
/
/ One set of implementations is for SSE2-capable models.
/ The other uses no MMX, SSE, or SSE2 instructions, only
/ the x86 32 X 32 -> 64 unsigned multiply instruction, MUL.
/
/ The code for the implementations is grouped by SSE2 vs UMUL,
/ rather than grouping pairs of implementations for each function.
/ This is because the bignum implementation gets "imprinted"
/ on the correct implementation, at the time of first use,
/ so none of the code for the other implementations is ever
/ executed. So, it is a no-brainer to lay out the code to minimize
/ the "footprint" of executed code.
/ Can we use SSE2 instructions? Return value is non-zero
/
/ Using the cpuid instruction directly would work equally
/ well in userland and in the kernel, but we do not use the
/ cpuid instruction in the kernel; we use x86_featureset,
/ instead. This means we honor any decisions the kernel
/ startup code may have made in setting this variable,
/ including disabling SSE2. It might even be a good idea
/ to honor this kind of setting in userland, as well, but
/ the variable, x86_featureset, is not readily available to
/ userland processes.
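/
/ For reference, a userland check can query cpuid itself. A minimal C sketch
/ using the GCC/Clang <cpuid.h> helper (an assumption for illustration, not
/ part of this file):
/
/	#include <cpuid.h>
/
/	static int
/	can_use_sse2(void)
/	{
/		unsigned int eax, ebx, ecx, edx;
/
/		if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) == 0)
/			return (0);
/		return ((edx & (1u << 26)) != 0);  /* CPUID.01H:EDX bit 26 = SSE2 */
/	}
/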
ENTRY(bignum_use_sse2)
bt $X86FSET_SSE2, x86_featureset
movl $1, %eax / Get feature information
movl %edx, %eax / set return value
andl $CPUID_INTC_EDX_SSE2, %eax
SET_SIZE(bignum_use_sse2)
/ ------------------------------------------------------------------------
/		SSE2 Implementations
/ ------------------------------------------------------------------------

/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions

/ big_mul_set_vec_sse2_r(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/ Does not touch the following registers: %esi, %edi, %mm4
/
/ This is strictly for internal use.
/ The interface is very light-weight.
/ All parameters are passed in registers.
/ It does not conform to the SYSV x86 ABI.
/ So, don't even think about calling this function directly from C code.
/
/ The basic multiply digit loop is unrolled 8 times.
/ Each comment is preceded by an instance number.
/ Instructions that have been moved retain their original, "natural"
/ instance number. It should be easier this way to follow
/ the step-wise refinement process that went into constructing
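/
/ For reference, the computation this routine performs, written as plain C
/ (a sketch, not part of the build; the "_ref" name is ours):
/
/	#include <stdint.h>
/
/	uint32_t
/	big_mul_set_vec_ref(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/	{
/		uint64_t p;
/		uint32_t cy = 0;
/		int i;
/
/		for (i = 0; i < len; i++) {
/			p = (uint64_t)digit * a[i] + cy;	/* pmuludq + paddq */
/			r[i] = (uint32_t)p;			/* low 32 bits -> r[i] */
/			cy = (uint32_t)(p >> 32);		/* high 32 bits -> carry */
/		}
/		return (cy);
/	}
/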
168 ENTRY(big_mul_set_vec_sse2_r)
169 xorl %eax, %eax / if (len == 0) return (0);
173 pxor %mm0, %mm0 / cy = 0
178 movd 0(%ebx), %mm1 / 1: mm1 = a[i]
179 pmuludq %mm3, %mm1 / 1: mm1 = digit * a[i]
180 paddq %mm1, %mm0 / 1: mm0 = digit * a[i] + cy;
181 movd 4(%ebx), %mm1 / 2: mm1 = a[i]
182 movd %mm0, 0(%edx) / 1: r[i] = product[31..0]
183 psrlq $32, %mm0 / 1: cy = product[63..32]
185 pmuludq %mm3, %mm1 / 2: mm1 = digit * a[i]
186 paddq %mm1, %mm0 / 2: mm0 = digit * a[i] + cy;
187 movd 8(%ebx), %mm1 / 3: mm1 = a[i]
188 movd %mm0, 4(%edx) / 2: r[i] = product[31..0]
189 psrlq $32, %mm0 / 2: cy = product[63..32]
191 pmuludq %mm3, %mm1 / 3: mm1 = digit * a[i]
192 paddq %mm1, %mm0 / 3: mm0 = digit * a[i] + cy;
193 movd 12(%ebx), %mm1 / 4: mm1 = a[i]
194 movd %mm0, 8(%edx) / 3: r[i] = product[31..0]
195 psrlq $32, %mm0 / 3: cy = product[63..32]
197 pmuludq %mm3, %mm1 / 4: mm1 = digit * a[i]
198 paddq %mm1, %mm0 / 4: mm0 = digit * a[i] + cy;
199 movd 16(%ebx), %mm1 / 5: mm1 = a[i]
200 movd %mm0, 12(%edx) / 4: r[i] = product[31..0]
201 psrlq $32, %mm0 / 4: cy = product[63..32]
203 pmuludq %mm3, %mm1 / 5: mm1 = digit * a[i]
204 paddq %mm1, %mm0 / 5: mm0 = digit * a[i] + cy;
205 movd 20(%ebx), %mm1 / 6: mm1 = a[i]
206 movd %mm0, 16(%edx) / 5: r[i] = product[31..0]
207 psrlq $32, %mm0 / 5: cy = product[63..32]
209 pmuludq %mm3, %mm1 / 6: mm1 = digit * a[i]
210 paddq %mm1, %mm0 / 6: mm0 = digit * a[i] + cy;
211 movd 24(%ebx), %mm1 / 7: mm1 = a[i]
212 movd %mm0, 20(%edx) / 6: r[i] = product[31..0]
213 psrlq $32, %mm0 / 6: cy = product[63..32]
215 pmuludq %mm3, %mm1 / 7: mm1 = digit * a[i]
216 paddq %mm1, %mm0 / 7: mm0 = digit * a[i] + cy;
217 movd 28(%ebx), %mm1 / 8: mm1 = a[i]
218 movd %mm0, 24(%edx) / 7: r[i] = product[31..0]
219 psrlq $32, %mm0 / 7: cy = product[63..32]
221 pmuludq %mm3, %mm1 / 8: mm1 = digit * a[i]
222 paddq %mm1, %mm0 / 8: mm0 = digit * a[i] + cy;
223 movd %mm0, 28(%edx) / 8: r[i] = product[31..0]
224 psrlq $32, %mm0 / 8: cy = product[63..32]
226 leal UNROLL32(%ebx), %ebx / a += UNROLL
227 leal UNROLL32(%edx), %edx / r += UNROLL
228 subl $UNROLL, %ecx / len -= UNROLL
233 movd 0(%ebx), %mm1 / 1: mm1 = a[i]
234 pmuludq %mm3, %mm1 / 1: mm1 = digit * a[i]
235 paddq %mm1, %mm0 / 1: mm0 = digit * a[i] + cy;
236 movd %mm0, 0(%edx) / 1: r[i] = product[31..0]
237 psrlq $32, %mm0 / 1: cy = product[63..32]
241 movd 4(%ebx), %mm1 / 2: mm1 = a[i]
242 pmuludq %mm3, %mm1 / 2: mm1 = digit * a[i]
243 paddq %mm1, %mm0 / 2: mm0 = digit * a[i] + cy;
244 movd %mm0, 4(%edx) / 2: r[i] = product[31..0]
245 psrlq $32, %mm0 / 2: cy = product[63..32]
249 movd 8(%ebx), %mm1 / 3: mm1 = a[i]
250 pmuludq %mm3, %mm1 / 3: mm1 = digit * a[i]
251 paddq %mm1, %mm0 / 3: mm0 = digit * a[i] + cy;
252 movd %mm0, 8(%edx) / 3: r[i] = product[31..0]
253 psrlq $32, %mm0 / 3: cy = product[63..32]
257 movd 12(%ebx), %mm1 / 4: mm1 = a[i]
258 pmuludq %mm3, %mm1 / 4: mm1 = digit * a[i]
259 paddq %mm1, %mm0 / 4: mm0 = digit * a[i] + cy;
260 movd %mm0, 12(%edx) / 4: r[i] = product[31..0]
261 psrlq $32, %mm0 / 4: cy = product[63..32]
265 movd 16(%ebx), %mm1 / 5: mm1 = a[i]
266 pmuludq %mm3, %mm1 / 5: mm1 = digit * a[i]
267 paddq %mm1, %mm0 / 5: mm0 = digit * a[i] + cy;
268 movd %mm0, 16(%edx) / 5: r[i] = product[31..0]
269 psrlq $32, %mm0 / 5: cy = product[63..32]
273 movd 20(%ebx), %mm1 / 6: mm1 = a[i]
274 pmuludq %mm3, %mm1 / 6: mm1 = digit * a[i]
275 paddq %mm1, %mm0 / 6: mm0 = digit * a[i] + cy;
276 movd %mm0, 20(%edx) / 6: r[i] = product[31..0]
277 psrlq $32, %mm0 / 6: cy = product[63..32]
281 movd 24(%ebx), %mm1 / 7: mm1 = a[i]
282 pmuludq %mm3, %mm1 / 7: mm1 = digit * a[i]
283 paddq %mm1, %mm0 / 7: mm0 = digit * a[i] + cy;
284 movd %mm0, 24(%edx) / 7: r[i] = product[31..0]
285 psrlq $32, %mm0 / 7: cy = product[63..32]
288 movd %mm0, %eax / return (cy)
289 / no emms. caller is responsible for emms
291 SET_SIZE(big_mul_set_vec_sse2_r)
294 / r = a * digit, r and a are vectors of length len
295 / returns the carry digit
296 / Suitable only for x86 models that support SSE2 instruction set extensions
301 / digit 20(%ebp) %mm3
/ In userland, there is just the one function, big_mul_set_vec_sse2().
/ But in the kernel, there are two variations:
/    1. big_mul_set_vec_sse2() which does what is necessary to save and
/       restore state, if necessary, and to ensure that preemption is
/       disabled.
/    2. big_mul_set_vec_sse2_nsv() which just does the work;
/       it is the caller's responsibility to ensure that MMX state
/       does not need to be saved and restored and that preemption
/       is already disabled.
#if defined(MMX_MANAGE)

ENTRY(big_mul_set_vec_sse2)
call big_mul_set_vec_sse2_r
call big_mul_set_vec_sse2_r
SET_SIZE(big_mul_set_vec_sse2)

ENTRY(big_mul_set_vec_sse2_nsv)
call big_mul_set_vec_sse2_r
SET_SIZE(big_mul_set_vec_sse2_nsv)

#else /* !defined(MMX_MANAGE) */

/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/ digit 20(%ebp) %mm3

ENTRY(big_mul_set_vec_sse2)
call big_mul_set_vec_sse2_r
SET_SIZE(big_mul_set_vec_sse2)

#endif /* MMX_MANAGE */
/ r = r + a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions

/ big_mul_add_vec_sse2_r(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ This is strictly for internal use.
/ The interface is very light-weight.
/ All parameters are passed in registers.
/ It does not conform to the SYSV x86 ABI.
/ So, don't even think about calling this function directly from C code.
/
/ The basic multiply digit loop is unrolled 8 times.
/ Each comment is preceded by an instance number.
/ Instructions that have been moved retain their original, "natural"
/ instance number. It should be easier this way to follow
/ the step-wise refinement process that went into constructing
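/
/ For reference, the computation this routine performs, in plain C (a sketch,
/ not part of the build; the "_ref" name is ours). Note that
/ digit * a[i] + r[i] + cy always fits in 64 bits:
/
/	uint32_t
/	big_mul_add_vec_ref(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/	{
/		uint64_t p;
/		uint32_t cy = 0;
/		int i;
/
/		for (i = 0; i < len; i++) {
/			p = (uint64_t)digit * a[i] + r[i] + cy;
/			r[i] = (uint32_t)p;		/* low 32 bits back to r[i] */
/			cy = (uint32_t)(p >> 32);	/* carry for the next digit */
/		}
/		return (cy);
/	}
/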
424 ENTRY(big_mul_add_vec_sse2_r)
429 pxor %mm0, %mm0 / cy = 0
434 movd 0(%ebx), %mm1 / 1: mm1 = a[i]
435 movd 0(%edx), %mm2 / 1: mm2 = r[i]
436 pmuludq %mm3, %mm1 / 1: mm1 = digit * a[i]
437 paddq %mm1, %mm2 / 1: mm2 = digit * a[i] + r[i]
438 movd 4(%ebx), %mm1 / 2: mm1 = a[i]
439 paddq %mm2, %mm0 / 1: mm0 = digit * a[i] + r[i] + cy;
440 movd %mm0, 0(%edx) / 1: r[i] = product[31..0]
441 movd 4(%edx), %mm2 / 2: mm2 = r[i]
442 psrlq $32, %mm0 / 1: cy = product[63..32]
444 pmuludq %mm3, %mm1 / 2: mm1 = digit * a[i]
445 paddq %mm1, %mm2 / 2: mm2 = digit * a[i] + r[i]
446 movd 8(%ebx), %mm1 / 3: mm1 = a[i]
447 paddq %mm2, %mm0 / 2: mm0 = digit * a[i] + r[i] + cy;
448 movd %mm0, 4(%edx) / 2: r[i] = product[31..0]
449 movd 8(%edx), %mm2 / 3: mm2 = r[i]
450 psrlq $32, %mm0 / 2: cy = product[63..32]
452 pmuludq %mm3, %mm1 / 3: mm1 = digit * a[i]
453 paddq %mm1, %mm2 / 3: mm2 = digit * a[i] + r[i]
454 movd 12(%ebx), %mm1 / 4: mm1 = a[i]
455 paddq %mm2, %mm0 / 3: mm0 = digit * a[i] + r[i] + cy;
456 movd %mm0, 8(%edx) / 3: r[i] = product[31..0]
457 movd 12(%edx), %mm2 / 4: mm2 = r[i]
458 psrlq $32, %mm0 / 3: cy = product[63..32]
460 pmuludq %mm3, %mm1 / 4: mm1 = digit * a[i]
461 paddq %mm1, %mm2 / 4: mm2 = digit * a[i] + r[i]
462 movd 16(%ebx), %mm1 / 5: mm1 = a[i]
463 paddq %mm2, %mm0 / 4: mm0 = digit * a[i] + r[i] + cy;
464 movd %mm0, 12(%edx) / 4: r[i] = product[31..0]
465 movd 16(%edx), %mm2 / 5: mm2 = r[i]
466 psrlq $32, %mm0 / 4: cy = product[63..32]
468 pmuludq %mm3, %mm1 / 5: mm1 = digit * a[i]
469 paddq %mm1, %mm2 / 5: mm2 = digit * a[i] + r[i]
470 movd 20(%ebx), %mm1 / 6: mm1 = a[i]
471 paddq %mm2, %mm0 / 5: mm0 = digit * a[i] + r[i] + cy;
472 movd %mm0, 16(%edx) / 5: r[i] = product[31..0]
473 movd 20(%edx), %mm2 / 6: mm2 = r[i]
474 psrlq $32, %mm0 / 5: cy = product[63..32]
476 pmuludq %mm3, %mm1 / 6: mm1 = digit * a[i]
477 paddq %mm1, %mm2 / 6: mm2 = digit * a[i] + r[i]
478 movd 24(%ebx), %mm1 / 7: mm1 = a[i]
479 paddq %mm2, %mm0 / 6: mm0 = digit * a[i] + r[i] + cy;
480 movd %mm0, 20(%edx) / 6: r[i] = product[31..0]
481 movd 24(%edx), %mm2 / 7: mm2 = r[i]
482 psrlq $32, %mm0 / 6: cy = product[63..32]
484 pmuludq %mm3, %mm1 / 7: mm1 = digit * a[i]
485 paddq %mm1, %mm2 / 7: mm2 = digit * a[i] + r[i]
486 movd 28(%ebx), %mm1 / 8: mm1 = a[i]
487 paddq %mm2, %mm0 / 7: mm0 = digit * a[i] + r[i] + cy;
488 movd %mm0, 24(%edx) / 7: r[i] = product[31..0]
489 movd 28(%edx), %mm2 / 8: mm2 = r[i]
490 psrlq $32, %mm0 / 7: cy = product[63..32]
492 pmuludq %mm3, %mm1 / 8: mm1 = digit * a[i]
493 paddq %mm1, %mm2 / 8: mm2 = digit * a[i] + r[i]
494 paddq %mm2, %mm0 / 8: mm0 = digit * a[i] + r[i] + cy;
495 movd %mm0, 28(%edx) / 8: r[i] = product[31..0]
496 psrlq $32, %mm0 / 8: cy = product[63..32]
498 leal UNROLL32(%ebx), %ebx / a += UNROLL
499 leal UNROLL32(%edx), %edx / r += UNROLL
500 subl $UNROLL, %ecx / len -= UNROLL
505 movd 0(%ebx), %mm1 / 1: mm1 = a[i]
506 movd 0(%edx), %mm2 / 1: mm2 = r[i]
507 pmuludq %mm3, %mm1 / 1: mm1 = digit * a[i]
508 paddq %mm1, %mm2 / 1: mm2 = digit * a[i] + r[i]
509 paddq %mm2, %mm0 / 1: mm0 = digit * a[i] + r[i] + cy;
510 movd %mm0, 0(%edx) / 1: r[i] = product[31..0]
511 psrlq $32, %mm0 / 1: cy = product[63..32]
515 movd 4(%ebx), %mm1 / 2: mm1 = a[i]
516 movd 4(%edx), %mm2 / 2: mm2 = r[i]
517 pmuludq %mm3, %mm1 / 2: mm1 = digit * a[i]
518 paddq %mm1, %mm2 / 2: mm2 = digit * a[i] + r[i]
519 paddq %mm2, %mm0 / 2: mm0 = digit * a[i] + r[i] + cy;
520 movd %mm0, 4(%edx) / 2: r[i] = product[31..0]
521 psrlq $32, %mm0 / 2: cy = product[63..32]
525 movd 8(%ebx), %mm1 / 3: mm1 = a[i]
526 movd 8(%edx), %mm2 / 3: mm2 = r[i]
527 pmuludq %mm3, %mm1 / 3: mm1 = digit * a[i]
528 paddq %mm1, %mm2 / 3: mm2 = digit * a[i] + r[i]
529 paddq %mm2, %mm0 / 3: mm0 = digit * a[i] + r[i] + cy;
530 movd %mm0, 8(%edx) / 3: r[i] = product[31..0]
531 psrlq $32, %mm0 / 3: cy = product[63..32]
535 movd 12(%ebx), %mm1 / 4: mm1 = a[i]
536 movd 12(%edx), %mm2 / 4: mm2 = r[i]
537 pmuludq %mm3, %mm1 / 4: mm1 = digit * a[i]
538 paddq %mm1, %mm2 / 4: mm2 = digit * a[i] + r[i]
539 paddq %mm2, %mm0 / 4: mm0 = digit * a[i] + r[i] + cy;
540 movd %mm0, 12(%edx) / 4: r[i] = product[31..0]
541 psrlq $32, %mm0 / 4: cy = product[63..32]
545 movd 16(%ebx), %mm1 / 5: mm1 = a[i]
546 movd 16(%edx), %mm2 / 5: mm2 = r[i]
547 pmuludq %mm3, %mm1 / 5: mm1 = digit * a[i]
548 paddq %mm1, %mm2 / 5: mm2 = digit * a[i] + r[i]
549 paddq %mm2, %mm0 / 5: mm0 = digit * a[i] + r[i] + cy;
550 movd %mm0, 16(%edx) / 5: r[i] = product[31..0]
551 psrlq $32, %mm0 / 5: cy = product[63..32]
555 movd 20(%ebx), %mm1 / 6: mm1 = a[i]
556 movd 20(%edx), %mm2 / 6: mm2 = r[i]
557 pmuludq %mm3, %mm1 / 6: mm1 = digit * a[i]
558 paddq %mm1, %mm2 / 6: mm2 = digit * a[i] + r[i]
559 paddq %mm2, %mm0 / 6: mm0 = digit * a[i] + r[i] + cy;
560 movd %mm0, 20(%edx) / 6: r[i] = product[31..0]
561 psrlq $32, %mm0 / 6: cy = product[63..32]
565 movd 24(%ebx), %mm1 / 7: mm1 = a[i]
566 movd 24(%edx), %mm2 / 7: mm2 = r[i]
567 pmuludq %mm3, %mm1 / 7: mm1 = digit * a[i]
568 paddq %mm1, %mm2 / 7: mm2 = digit * a[i] + r[i]
569 paddq %mm2, %mm0 / 7: mm0 = digit * a[i] + r[i] + cy;
570 movd %mm0, 24(%edx) / 7: r[i] = product[31..0]
571 psrlq $32, %mm0 / 7: cy = product[63..32]
575 / no emms. caller is responsible for emms
577 SET_SIZE(big_mul_add_vec_sse2_r)
580 / r = r + a * digit, r and a are vectors of length len
581 / returns the carry digit
582 / Suitable only for x86 models that support SSE2 instruction set extensions
587 / digit 20(%ebp) %mm3
/ In userland, there is just the one function, big_mul_add_vec_sse2().
/ But in the kernel, there are two variations:
/    1. big_mul_add_vec_sse2() which does what is necessary to save and
/       restore state, if necessary, and to ensure that preemption is
/       disabled.
/    2. big_mul_add_vec_sse2_nsv() which just does the work;
/       it is the caller's responsibility to ensure that MMX state
/       does not need to be saved and restored and that preemption
/       is already disabled.
#if defined(MMX_MANAGE)

ENTRY(big_mul_add_vec_sse2)
call big_mul_add_vec_sse2_r
call big_mul_add_vec_sse2_r
SET_SIZE(big_mul_add_vec_sse2)

ENTRY(big_mul_add_vec_sse2_nsv)
call big_mul_add_vec_sse2_r
SET_SIZE(big_mul_add_vec_sse2_nsv)

#else /* !defined(MMX_MANAGE) */

ENTRY(big_mul_add_vec_sse2)
call big_mul_add_vec_sse2_r
SET_SIZE(big_mul_add_vec_sse2)

#endif /* MMX_MANAGE */
/ big_mul_vec_sse2(uint32_t *r, uint32_t *a, int alen, uint32_t *b, int blen)
/
/ r[alen] = big_mul_set_vec_sse2(r, a, alen, b[0]);
/ for (i = 1; i < blen; ++i)
/     r[alen + i] = big_mul_add_vec_sse2(r+i, a, alen, b[i]);
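/
/ Fleshed out as a complete C sketch (using the "_ref" reference loops shown
/ earlier; r must have room for alen + blen digits):
/
/	void
/	big_mul_vec_ref(uint32_t *r, uint32_t *a, int alen, uint32_t *b, int blen)
/	{
/		int i;
/
/		r[alen] = big_mul_set_vec_ref(r, a, alen, b[0]);
/		for (i = 1; i < blen; ++i)
/			r[alen + i] = big_mul_add_vec_ref(r + i, a, alen, b[i]);
/	}
/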
#if defined(MMX_MANAGE)
ENTRY(big_mul_vec_sse2_fc)
ENTRY(big_mul_vec_sse2)

#if defined(MMX_MANAGE)
call big_mul_set_vec_sse2_nsv
call big_mul_set_vec_sse2
movl %eax, (%ebx,%edi,4)
leal (%ebx,%ebp,4), %eax
#if defined(MMX_MANAGE)
call big_mul_add_vec_sse2_nsv
call big_mul_add_vec_sse2
leal (%ebp,%edi), %ecx
movl %eax, (%ebx,%ecx,4)
#if defined(MMX_MANAGE)
#if defined(MMX_MANAGE)
SET_SIZE(big_mul_vec_sse2_fc)
SET_SIZE(big_mul_vec_sse2)

#if defined(MMX_MANAGE)

ENTRY(big_mul_vec_sse2)
movl 24(%ebp), %eax / blen
movl 20(%ebp), %eax / b
movl 16(%ebp), %eax / alen
movl 12(%ebp), %eax / a
movl 8(%ebp), %eax / r
call big_mul_vec_sse2_fc
SET_SIZE(big_mul_vec_sse2)

#endif /* MMX_MANAGE */
/ r = a * a, r and a are vectors of length len
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ This function is not suitable for a truly general-purpose multiprecision
/ arithmetic library, because it does not work for "small" numbers, that is,
/ numbers of 1 or 2 digits. big_mul() just uses the ordinary big_mul_vec()
/ for any small numbers.
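/
/ A plain C reference for what this routine computes (a sketch, not a
/ transcription of the assembly's scheduling; the "_ref" helpers are the
/ reference loops shown earlier). The off-diagonal triangle is built first,
/ then every word is doubled while the squares of the diagonal are added in:
/
/	void
/	big_sqr_vec_ref(uint32_t *r, uint32_t *a, int alen)
/	{
/		uint64_t p, cy;
/		int i;
/
/		/* triangle: r[1 .. 2*alen-2] = sum of a[i]*a[j] for i < j */
/		r[alen] = big_mul_set_vec_ref(&r[1], &a[1], alen - 1, a[0]);
/		for (i = 1; i < alen - 1; ++i)
/			r[alen + i] = big_mul_add_vec_ref(&r[2*i + 1],
/			    &a[i + 1], alen - 1 - i, a[i]);
/		r[0] = 0;
/		r[2*alen - 1] = 0;
/
/		/* double the triangle and add the diagonal squares a[i]**2 */
/		cy = 0;
/		for (i = 0; i < alen; ++i) {
/			p = (uint64_t)a[i] * a[i];
/			cy += 2 * (uint64_t)r[2*i] + (uint32_t)p;
/			r[2*i] = (uint32_t)cy;
/			cy >>= 32;
/			cy += 2 * (uint64_t)r[2*i + 1] + (p >> 32);
/			r[2*i + 1] = (uint32_t)cy;
/			cy >>= 32;
/		}
/	}
/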
#if defined(MMX_MANAGE)
ENTRY(big_sqr_vec_sse2_fc)
ENTRY(big_sqr_vec_sse2)
/ r[1..alen] = a[0] * a[1..alen-1]

movl 8(%ebp), %edi / r = arg(r)
movl 12(%ebp), %esi / a = arg(a)
movl 16(%ebp), %ecx / cnt = arg(alen)
movd %ecx, %mm4 / save_cnt = arg(alen)
leal 4(%edi), %edx / dst = &r[1]
movl %esi, %ebx / src = a
movd 0(%ebx), %mm3 / mm3 = a[0]
leal 4(%ebx), %ebx / src = &a[1]
subl $1, %ecx / --cnt
call big_mul_set_vec_sse2_r / r[1..alen-1] = a[0] * a[1..alen-1]
movl %edi, %edx / dst = r
movl %esi, %ebx / src = a
movd %mm4, %ecx / cnt = save_cnt
movl %eax, (%edx, %ecx, 4) / r[cnt] = cy
/ /* High-level vector C pseudocode */
/
/ for (i = 1; i < alen-1; ++i)
/     r[2*i + 1 ... ] += a[i] * a[i+1 .. alen-1]
/
/ /* Same thing, but slightly lower level C-like pseudocode */
/
/ r = &arg_r[2*i + 1];
/ r[cnt] = big_mul_add_vec_sse2_r(r, a, cnt, digit);
/
/ /* Same thing, but even lower level
/  * For example, pointers are raw pointers,
/  * with no scaling by object size.
/
/ r = arg_r + 12; /* i == 1; 2i + 1 == 3; 4*3 == 12; */
/ digit = *(arg_a + 4);
/ cy = big_mul_add_vec_sse2_r();
/ *(r + 4 * cnt) = cy;
leal 4(%edi), %edi / r += 4; r = &r[1]
leal 4(%esi), %esi / a += 4; a = &a[1]
movd %mm4, %ecx / cnt = save
subl $2, %ecx / cnt = alen - 2; i in 1..alen-2
movd %ecx, %mm4 / save_cnt
jecxz .L32 / while (cnt != 0) {
movd 0(%esi), %mm3 / digit = a[i]
leal 4(%esi), %esi / a += 4; a = &a[1]; a = &a[i + 1]
leal 8(%edi), %edi / r += 8; r = &r[2]; r = &r[2 * i + 1]
movl %edi, %edx / edx = r
movl %esi, %ebx / ebx = a
cmp $1, %ecx / The last triangle term is special
call big_mul_add_vec_sse2_r
movd %mm4, %ecx / cnt = save_cnt
movl %eax, (%edi, %ecx, 4) / r[cnt] = cy
subl $1, %ecx / --cnt
movd %ecx, %mm4 / save_cnt = cnt
movd 0(%ebx), %mm1 / mm1 = a[i + 1]
movd 0(%edx), %mm2 / mm2 = r[2 * i + 1]
pmuludq %mm3, %mm1 / mm1 = p = digit * a[i + 1]
paddq %mm1, %mm2 / mm2 = r[2 * i + 1] + p
movd %mm2, 0(%edx) / r[2 * i + 1] += lo32(p)
psrlq $32, %mm2 / mm2 = cy
movd %mm2, 4(%edx) / r[2 * i + 2] = cy
movd %mm2, 8(%edx) / r[2 * i + 3] = 0
movl 8(%ebp), %edx / r = arg(r)
movl 12(%ebp), %ebx / a = arg(a)
movl 16(%ebp), %ecx / cnt = arg(alen)

/ compute low-order corner
movd 0(%ebx), %mm2 / mm2 = a[0]
pmuludq %mm2, %mm2 / mm2 = p = a[0]**2
movd %mm2, 0(%edx) / r[0] = lo32(p)
psrlq $32, %mm2 / mm2 = cy = hi32(p)
movd 4(%edx), %mm1 / mm1 = r[1]
psllq $1, %mm1 / mm1 = p = 2 * r[1]
paddq %mm1, %mm2 / mm2 = t = p + cy
movd %mm2, 4(%edx) / r[1] = low32(t)
psrlq $32, %mm2 / mm2 = cy = hi32(t)
/ r[2..$-3] = inner_diagonal[*]**2 + 2 * r[2..$-3]
subl $2, %ecx / cnt = alen - 2
movd 4(%ebx), %mm0 / mm0 = diag = a[i+1]
pmuludq %mm0, %mm0 / mm0 = p = diag**2
paddq %mm0, %mm2 / mm2 = t = p + cy
movd %eax, %mm1 / mm1 = lo32(t)
psrlq $32, %mm2 / mm2 = hi32(t)
movd 8(%edx), %mm3 / mm3 = r[2*i]
psllq $1, %mm3 / mm3 = 2*r[2*i]
paddq %mm3, %mm1 / mm1 = 2*r[2*i] + lo32(t)
movd %mm1, 8(%edx) / r[2*i] = 2*r[2*i] + lo32(t)
movd 12(%edx), %mm3 / mm3 = r[2*i+1]
psllq $1, %mm3 / mm3 = 2*r[2*i+1]
paddq %mm3, %mm2 / mm2 = 2*r[2*i+1] + hi32(t)
movd %mm2, 12(%edx) / r[2*i+1] = mm2
psrlq $32, %mm2 / mm2 = cy
leal 8(%edx), %edx / r += 2
leal 4(%ebx), %ebx / ++a
subl $1, %ecx / --cnt
/ Carry from last triangle term must participate in doubling,
/ but this step isn't paired up with squaring the elements
/ of the inner diagonal.
/ r[$-3..$-2] = 2 * r[$-3..$-2] + cy
959 movd 8(%edx), %mm3 / mm3 = r[2*i]
960 psllq $1, %mm3 / mm3 = 2*r[2*i]
961 paddq %mm3, %mm2 / mm2 = 2*r[2*i] + cy
962 movd %mm2, 8(%edx) / r[2*i] = lo32(2*r[2*i] + cy)
963 psrlq $32, %mm2 / mm2 = cy = hi32(2*r[2*i] + cy)
965 movd 12(%edx), %mm3 / mm3 = r[2*i+1]
966 psllq $1, %mm3 / mm3 = 2*r[2*i+1]
967 paddq %mm3, %mm2 / mm2 = 2*r[2*i+1] + cy
968 movd %mm2, 12(%edx) / r[2*i+1] = mm2
969 psrlq $32, %mm2 / mm2 = cy
971 / compute high-order corner and add it in
974 / r[alen + alen - 2] += lo32(t)
976 / r[alen + alen - 1] = cy
977 movd 4(%ebx), %mm0 / mm0 = a[$-1]
978 movd 8(%edx), %mm3 / mm3 = r[$-2]
979 pmuludq %mm0, %mm0 / mm0 = p = a[$-1]**2
980 paddq %mm0, %mm2 / mm2 = t = p + cy
981 paddq %mm3, %mm2 / mm2 = r[$-2] + t
982 movd %mm2, 8(%edx) / r[$-2] = lo32(r[$-2] + t)
983 psrlq $32, %mm2 / mm2 = cy = hi32(r[$-2] + t)
986 movd %mm2, 12(%edx) / r[$-1] += cy
994 #if defined(MMX_MANAGE)
996 SET_SIZE(big_sqr_vec_sse2_fc)
1000 SET_SIZE(big_sqr_vec_sse2)
1004 #if defined(MMX_MANAGE)
1005 ENTRY(big_sqr_vec_sse2)
1014 call big_sqr_vec_sse2_fc
1015 RSTOR_MMX_0TO4(%edi)
1020 call big_sqr_vec_sse2_fc
1028 SET_SIZE(big_sqr_vec_sse2)
1030 #endif /* MMX_MANAGE */
1032 / ------------------------------------------------------------------------
1033 / UMUL Implementations
1034 / ------------------------------------------------------------------------
1037 / r = a * digit, r and a are vectors of length len
1038 / returns the carry digit
1039 / Does not use any MMX, SSE, or SSE2 instructions.
1040 / Uses x86 unsigned 32 X 32 -> 64 multiply instruction, MUL.
1041 / This is a fall-back implementation for x86 models that do not support
1042 / the PMULUDQ instruction.
1045 / big_mul_set_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
1047 / r 8(%ebp) %edx %edi
1048 / a 12(%ebp) %ebx %esi
1050 / digit 20(%ebp) %esi
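/
/ The inner loop below maps onto C as follows (the same reference loop shown
/ for the SSE2 version, restated for the MUL/ADC pattern; sketch only):
/
/	p = (uint64_t)a[i] * digit;	/* mull leaves the product in edx:eax */
/	p += cy;			/* addl / adcl $0 propagate the carry */
/	r[i] = (uint32_t)p;		/* movl %eax, (%edi) */
/	cy = (uint32_t)(p >> 32);	/* movl %edx, %ebx */
/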
1052 ENTRY(big_mul_set_vec_umul)
1059 xorl %ebx, %ebx / cy = 0
1066 movl (%esi), %eax / eax = a[i]
1067 leal 4(%esi), %esi / ++a
1068 mull 20(%ebp) / edx:eax = a[i] * digit
1070 adcl $0, %edx / edx:eax = a[i] * digit + cy
1071 movl %eax, (%edi) / r[i] = product[31..0]
1072 movl %edx, %ebx / cy = product[63..32]
1073 leal 4(%edi), %edi / ++r
1075 jnz .L55 / while (len != 0)
1083 SET_SIZE(big_mul_set_vec_umul)
1086 / r = r + a * digit, r and a are vectors of length len
1087 / returns the carry digit
1088 / Does not use any MMX, SSE, or SSE2 instructions.
1089 / Uses x86 unsigned 32 X 32 -> 64 multiply instruction, MUL.
1090 / This is a fall-back implementation for x86 models that do not support
1091 / the PMULUDQ instruction.
1094 / big_mul_add_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
1096 / r 8(%ebp) %edx %edi
1097 / a 12(%ebp) %ebx %esi
1099 / digit 20(%ebp) %esi
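/
/ Likewise for the add variant, in C terms (sketch only; both additions fit
/ in the 64-bit product without overflow):
/
/	p = (uint64_t)a[i] * digit;	/* mull */
/	p += r[i];			/* first addl/adcl pair */
/	p += cy;			/* second addl/adcl pair */
/	r[i] = (uint32_t)p;
/	cy = (uint32_t)(p >> 32);
/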
1101 ENTRY(big_mul_add_vec_umul)
1108 xorl %ebx, %ebx / cy = 0
1115 movl (%esi), %eax / eax = a[i]
1116 leal 4(%esi), %esi / ++a
1117 mull 20(%ebp) / edx:eax = a[i] * digit
1119 adcl $0, %edx / edx:eax = a[i] * digit + r[i]
1121 adcl $0, %edx / edx:eax = a[i] * digit + r[i] + cy
1122 movl %eax, (%edi) / r[i] = product[31..0]
1123 movl %edx, %ebx / cy = product[63..32]
1124 leal 4(%edi), %edi / ++r
1126 jnz .L65 / while (len != 0)
1134 SET_SIZE(big_mul_add_vec_umul)