import less(1)
[unleashed/tickless.git] / usr / src / common / bignum / i386 / bignum_i386_asm.s
blob8c8fd21fdcca6785a4588103e619866858c846a3
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
25 #include <sys/asm_linkage.h>
26 #include <sys/x86_archext.h>
27 #include <sys/controlregs.h>
30 #if defined(MMX_MANAGE)
/* Helper macros used by the routines below that touch MMX registers when MMX_MANAGE is defined. */
32 #if defined(_KERNEL)
/* _KERNEL build: the preemption hooks expand to calls to kpr_disable and kpr_enable. */
34 #define KPREEMPT_DISABLE call kpr_disable
35 #define KPREEMPT_ENABLE call kpr_enable
/* TEST_TS(reg): copy %cr0 into reg, execute clts, then test the CR0_TS bit of the copy. */
/* The flags left by the final testl tell the caller whether TS was set in the copied value. */
36 #define TEST_TS(reg) \
37 movl %cr0, reg; \
38 clts; \
39 testl $CR0_TS, reg
41 #else /* _KERNEL */
/* Non-_KERNEL build: the preemption hooks expand to nothing. */
43 #define KPREEMPT_DISABLE
44 #define KPREEMPT_ENABLE
/* TEST_TS(reg) loads 0 into reg here, so the CR0_TS test always produces a zero result. */
46 #define TEST_TS(reg) \
47 movl $0, reg; \
48 testl $CR0_TS, reg
50 #endif /* _KERNEL */
/* MMX_SIZE: byte spacing of the saved registers (see the movq offsets below). */
/* MMX_ALIGN: alignment applied to the save-area pointer. */
52 #define MMX_SIZE 8
53 #define MMX_ALIGN 8
/* SAVE_MMX_PROLOG(sreg, nreg): reserve stack space and point sreg into it, rounded to MMX_ALIGN. */
/* NOTE(review): _MUL comes from a header not shown here; -1![MMX_ALIGN-1] presumably uses the */
/* Solaris assembler AND-NOT operator with [ ] grouping to build the alignment mask - confirm. */
55 #define SAVE_MMX_PROLOG(sreg, nreg) \
56 subl $_MUL(MMX_SIZE, nreg + MMX_ALIGN), %esp; \
57 movl %esp, sreg; \
58 addl $MMX_ALIGN, sreg; \
59 andl $-1![MMX_ALIGN-1], sreg;
/* RSTOR_MMX_EPILOG(nreg): add back to %esp the same amount SAVE_MMX_PROLOG subtracted for nreg. */
61 #define RSTOR_MMX_EPILOG(nreg) \
62 addl $_MUL(MMX_SIZE, nreg + MMX_ALIGN), %esp;
/* SAVE_MMX_0TO4(sreg): reserve a save area, then store %mm0-%mm4 at offsets 0..32 from sreg. */
64 #define SAVE_MMX_0TO4(sreg) \
65 SAVE_MMX_PROLOG(sreg, 5); \
66 movq %mm0, 0(sreg); \
67 movq %mm1, 8(sreg); \
68 movq %mm2, 16(sreg); \
69 movq %mm3, 24(sreg); \
70 movq %mm4, 32(sreg)
/* RSTOR_MMX_0TO4(sreg): reload %mm0-%mm4 from offsets 0..32 of sreg, then release the save area. */
72 #define RSTOR_MMX_0TO4(sreg) \
73 movq 0(sreg), %mm0; \
74 movq 8(sreg), %mm1; \
75 movq 16(sreg), %mm2; \
76 movq 24(sreg), %mm3; \
77 movq 32(sreg), %mm4; \
78 RSTOR_MMX_EPILOG(5)
80 #endif /* MMX_MANAGE */
82 / Note: this file contains implementations for
83 / big_mul_set_vec()
84 / big_mul_add_vec()
85 / big_mul_vec()
86 / big_sqr_vec()
87 / One set of implementations is for SSE2-capable models.
88 / The other uses no MMX, SSE, or SSE2 instructions, only
89 / the x86 32 X 32 -> 64 unsigned multiply instruction, MUL.
91 / The code for the implementations is grouped by SSE2 vs UMUL,
92 / rather than grouping pairs of implementations for each function.
93 / This is because the bignum implementation gets "imprinted"
94 / on the correct implementation, at the time of first use,
95 / so none of the code for the other implementations is ever
96 / executed. So, it is a no-brainer to lay out the code to minimize
97 / the "footprint" of executed code.
99 / Can we use SSE2 instructions? Return value is non-zero
100 / if we can.
102 / Note:
103 / Using the cpuid instruction directly would work equally
104 / well in userland and in the kernel, but we do not use the
105 / cpuid instruction in the kernel, we use x86_featureset,
106 / instead. This means we honor any decisions the kernel
107 / startup code may have made in setting this variable,
108 / including disabling SSE2. It might even be a good idea
109 / to honor this kind of setting in userland, as well, but
110 / the variable, x86_featureset is not readily available to
111 / userland processes.
113 / uint32_t
114 / bignum_use_sse2()
116 ENTRY(bignum_use_sse2)
/*
 * Returns non-zero in %eax when SSE2 may be used, zero otherwise.
 * _KERNEL builds test bit X86FSET_SSE2 of x86_featureset; other builds
 * execute cpuid with %eax = 1 and mask %edx with CPUID_INTC_EDX_SSE2.
 */
117 #if defined(_KERNEL)
118 xor %eax, %eax
119 bt $X86FSET_SSE2, x86_featureset / CF = the SSE2 feature bit
120 adc %eax, %eax / eax = 0 + 0 + CF
121 #else /* _KERNEL */
122 pushl %ebx / cpuid overwrites %ebx; keep the caller value
123 movl $1, %eax / Get feature information
124 cpuid
125 movl %edx, %eax / set return value
126 popl %ebx
127 andl $CPUID_INTC_EDX_SSE2, %eax
128 #endif /* _KERNEL */
129 ret / return to the caller instead of running into the next routine
130 SET_SIZE(bignum_use_sse2)
133 / ------------------------------------------------------------------------
134 / SSE2 Implementations
135 / ------------------------------------------------------------------------
137 / r = a * digit, r and a are vectors of length len
138 / returns the carry digit
139 / Suitable only for x86 models that support SSE2 instruction set extensions
141 / uint32_t
142 / big_mul_set_vec_sse2_r(uint32_t *r, uint32_t *a, int len, uint32_t digit)
144 / r %edx
145 / a %ebx
146 / len %ecx
147 / digit %mm3
149 / Does not touch the following registers: %esi, %edi, %mm4
151 / N.B.:
152 / This is strictly for internal use.
153 / The interface is very light-weight.
154 / All parameters are passed in registers.
155 / It does not conform to the SYSV x86 ABI.
156 / So, don't even think about calling this function directly from C code.
158 / The basic multiply digit loop is unrolled 8 times.
159 / Each comment is preceded by an instance number.
160 / Instructions that have been moved retain their original, "natural"
161 / instance number. It should be easier this way to follow
162 / the step-wise refinement process that went into constructing
163 / the final code.
165 #define UNROLL 8
166 #define UNROLL32 32
/* UNROLL digits are handled per pass of the unrolled loop; UNROLL32 is that same span in bytes. */
168 ENTRY(big_mul_set_vec_sse2_r)
/*
 * Register interface: %edx = r, %ebx = a, %ecx = len, %mm3 = digit.
 * Returns the carry digit in %eax; returns 0 when len == 0.
 * Modifies %ebx, %ecx, %edx, %mm0 and %mm1.
 * cy is cleared before the len test because the exit path at .L17 always
 * loads %eax from %mm0; clearing it later left the len == 0 result undefined.
 */
169 pxor %mm0, %mm0 / cy = 0
170 testl %ecx, %ecx
171 jz .L17 / if (len == 0) return (0);
175 .L15:
176 cmpl $UNROLL, %ecx / fewer than UNROLL digits left?
177 jl .L16 / yes: finish in the one-digit-at-a-time tail
178 movd 0(%ebx), %mm1 / 1: mm1 = a[i]
179 pmuludq %mm3, %mm1 / 1: mm1 = digit * a[i]
180 paddq %mm1, %mm0 / 1: mm0 = digit * a[i] + cy;
181 movd 4(%ebx), %mm1 / 2: mm1 = a[i]
182 movd %mm0, 0(%edx) / 1: r[i] = product[31..0]
183 psrlq $32, %mm0 / 1: cy = product[63..32]
185 pmuludq %mm3, %mm1 / 2: mm1 = digit * a[i]
186 paddq %mm1, %mm0 / 2: mm0 = digit * a[i] + cy;
187 movd 8(%ebx), %mm1 / 3: mm1 = a[i]
188 movd %mm0, 4(%edx) / 2: r[i] = product[31..0]
189 psrlq $32, %mm0 / 2: cy = product[63..32]
191 pmuludq %mm3, %mm1 / 3: mm1 = digit * a[i]
192 paddq %mm1, %mm0 / 3: mm0 = digit * a[i] + cy;
193 movd 12(%ebx), %mm1 / 4: mm1 = a[i]
194 movd %mm0, 8(%edx) / 3: r[i] = product[31..0]
195 psrlq $32, %mm0 / 3: cy = product[63..32]
197 pmuludq %mm3, %mm1 / 4: mm1 = digit * a[i]
198 paddq %mm1, %mm0 / 4: mm0 = digit * a[i] + cy;
199 movd 16(%ebx), %mm1 / 5: mm1 = a[i]
200 movd %mm0, 12(%edx) / 4: r[i] = product[31..0]
201 psrlq $32, %mm0 / 4: cy = product[63..32]
203 pmuludq %mm3, %mm1 / 5: mm1 = digit * a[i]
204 paddq %mm1, %mm0 / 5: mm0 = digit * a[i] + cy;
205 movd 20(%ebx), %mm1 / 6: mm1 = a[i]
206 movd %mm0, 16(%edx) / 5: r[i] = product[31..0]
207 psrlq $32, %mm0 / 5: cy = product[63..32]
209 pmuludq %mm3, %mm1 / 6: mm1 = digit * a[i]
210 paddq %mm1, %mm0 / 6: mm0 = digit * a[i] + cy;
211 movd 24(%ebx), %mm1 / 7: mm1 = a[i]
212 movd %mm0, 20(%edx) / 6: r[i] = product[31..0]
213 psrlq $32, %mm0 / 6: cy = product[63..32]
215 pmuludq %mm3, %mm1 / 7: mm1 = digit * a[i]
216 paddq %mm1, %mm0 / 7: mm0 = digit * a[i] + cy;
217 movd 28(%ebx), %mm1 / 8: mm1 = a[i]
218 movd %mm0, 24(%edx) / 7: r[i] = product[31..0]
219 psrlq $32, %mm0 / 7: cy = product[63..32]
221 pmuludq %mm3, %mm1 / 8: mm1 = digit * a[i]
222 paddq %mm1, %mm0 / 8: mm0 = digit * a[i] + cy;
223 movd %mm0, 28(%edx) / 8: r[i] = product[31..0]
224 psrlq $32, %mm0 / 8: cy = product[63..32]
226 leal UNROLL32(%ebx), %ebx / a += UNROLL
227 leal UNROLL32(%edx), %edx / r += UNROLL
228 subl $UNROLL, %ecx / len -= UNROLL
229 jz .L17
230 jmp .L15
/* Tail: fewer than UNROLL digits remain; handle them one at a time, leaving as soon as len reaches 0. */
232 .L16:
233 movd 0(%ebx), %mm1 / 1: mm1 = a[i]
234 pmuludq %mm3, %mm1 / 1: mm1 = digit * a[i]
235 paddq %mm1, %mm0 / 1: mm0 = digit * a[i] + cy;
236 movd %mm0, 0(%edx) / 1: r[i] = product[31..0]
237 psrlq $32, %mm0 / 1: cy = product[63..32]
238 subl $1, %ecx
239 jz .L17
241 movd 4(%ebx), %mm1 / 2: mm1 = a[i]
242 pmuludq %mm3, %mm1 / 2: mm1 = digit * a[i]
243 paddq %mm1, %mm0 / 2: mm0 = digit * a[i] + cy;
244 movd %mm0, 4(%edx) / 2: r[i] = product[31..0]
245 psrlq $32, %mm0 / 2: cy = product[63..32]
246 subl $1, %ecx
247 jz .L17
249 movd 8(%ebx), %mm1 / 3: mm1 = a[i]
250 pmuludq %mm3, %mm1 / 3: mm1 = digit * a[i]
251 paddq %mm1, %mm0 / 3: mm0 = digit * a[i] + cy;
252 movd %mm0, 8(%edx) / 3: r[i] = product[31..0]
253 psrlq $32, %mm0 / 3: cy = product[63..32]
254 subl $1, %ecx
255 jz .L17
257 movd 12(%ebx), %mm1 / 4: mm1 = a[i]
258 pmuludq %mm3, %mm1 / 4: mm1 = digit * a[i]
259 paddq %mm1, %mm0 / 4: mm0 = digit * a[i] + cy;
260 movd %mm0, 12(%edx) / 4: r[i] = product[31..0]
261 psrlq $32, %mm0 / 4: cy = product[63..32]
262 subl $1, %ecx
263 jz .L17
265 movd 16(%ebx), %mm1 / 5: mm1 = a[i]
266 pmuludq %mm3, %mm1 / 5: mm1 = digit * a[i]
267 paddq %mm1, %mm0 / 5: mm0 = digit * a[i] + cy;
268 movd %mm0, 16(%edx) / 5: r[i] = product[31..0]
269 psrlq $32, %mm0 / 5: cy = product[63..32]
270 subl $1, %ecx
271 jz .L17
273 movd 20(%ebx), %mm1 / 6: mm1 = a[i]
274 pmuludq %mm3, %mm1 / 6: mm1 = digit * a[i]
275 paddq %mm1, %mm0 / 6: mm0 = digit * a[i] + cy;
276 movd %mm0, 20(%edx) / 6: r[i] = product[31..0]
277 psrlq $32, %mm0 / 6: cy = product[63..32]
278 subl $1, %ecx
279 jz .L17
281 movd 24(%ebx), %mm1 / 7: mm1 = a[i]
282 pmuludq %mm3, %mm1 / 7: mm1 = digit * a[i]
283 paddq %mm1, %mm0 / 7: mm0 = digit * a[i] + cy;
284 movd %mm0, 24(%edx) / 7: r[i] = product[31..0]
285 psrlq $32, %mm0 / 7: cy = product[63..32]
287 .L17:
288 movd %mm0, %eax / return (cy)
289 / no emms. caller is responsible for emms
290 ret
291 SET_SIZE(big_mul_set_vec_sse2_r)
294 / r = a * digit, r and a are vectors of length len
295 / returns the carry digit
296 / Suitable only for x86 models that support SSE2 instruction set extensions
298 / r 8(%ebp) %edx
299 / a 12(%ebp) %ebx
300 / len 16(%ebp) %ecx
301 / digit 20(%ebp) %mm3
303 / In userland, there is just the one function, big_mul_set_vec_sse2().
304 / But in the kernel, there are two variations:
305 / 1. big_mul_set_vec_sse2() which does what is necessary to save and
306 / restore state, if necessary, and to ensure that preemption is
307 / disabled.
308 / 2. big_mul_set_vec_sse2_nsv() which just does the work;
309 / it is the caller's responsibility to ensure that MMX state
310 / does not need to be saved and restored and that preemption
311 / is already disabled.
313 #if defined(MMX_MANAGE)
/*
 * C-callable entry: loads r, a, len, digit from the stack into the registers
 * big_mul_set_vec_sse2_r expects and returns its carry in %eax.
 * This variant brackets the work with KPREEMPT_DISABLE/KPREEMPT_ENABLE and
 * saves/restores %mm0-%mm4 unless TEST_TS reported TS as set.
 */
314 ENTRY(big_mul_set_vec_sse2)
315 pushl %ebp
316 movl %esp, %ebp
317 pushl %ebx
318 pushl %esi
319 KPREEMPT_DISABLE
320 TEST_TS(%ebx)
321 pushl %ebx / keep the value TEST_TS read; pushl leaves the flags alone
322 jnz .setvec_no_save / TS was set: skip the MMX save/restore
323 pushl %edi
324 SAVE_MMX_0TO4(%edi)
325 movl 8(%ebp), %edx
326 movl 12(%ebp), %ebx
327 movl 16(%ebp), %ecx
328 movd 20(%ebp), %mm3
329 call big_mul_set_vec_sse2_r
330 movl %eax, %esi / park the carry; %eax is not kept across the exit sequence
331 RSTOR_MMX_0TO4(%edi)
332 popl %edi
333 jmp .setvec_rtn
335 .setvec_no_save:
336 movl 8(%ebp), %edx
337 movl 12(%ebp), %ebx
338 movl 16(%ebp), %ecx
339 movd 20(%ebp), %mm3
340 call big_mul_set_vec_sse2_r
341 movl %eax, %esi
343 .setvec_rtn:
344 emms
345 popl %ebx / the %cr0 value pushed after TEST_TS
346 movl %ebx, %cr0
347 KPREEMPT_ENABLE
348 movl %esi, %eax / return value = carry
349 popl %esi
350 popl %ebx
351 leave
352 ret
353 SET_SIZE(big_mul_set_vec_sse2)
/* _nsv: same register setup and call, with no preemption or MMX state handling and no emms. */
355 ENTRY(big_mul_set_vec_sse2_nsv)
356 pushl %ebp
357 movl %esp, %ebp
358 pushl %ebx
359 movl 8(%ebp), %edx
360 movl 12(%ebp), %ebx
361 movl 16(%ebp), %ecx
362 movd 20(%ebp), %mm3
363 call big_mul_set_vec_sse2_r
364 popl %ebx
365 leave
366 ret
367 SET_SIZE(big_mul_set_vec_sse2_nsv)
369 #else /* !defined(MMX_MANAGE) */
371 / r = a * digit, r and a are vectors of length len
372 / returns the carry digit
373 / Suitable only for x86 models that support SSE2 instruction set extensions
375 / r 8(%ebp) %edx
376 / a 12(%ebp) %ebx
377 / len 16(%ebp) %ecx
378 / digit 20(%ebp) %mm3
380 ENTRY(big_mul_set_vec_sse2)
381 pushl %ebp
382 movl %esp, %ebp
383 pushl %ebx
384 movl 8(%ebp), %edx
385 movl 12(%ebp), %ebx
386 movl 16(%ebp), %ecx
387 movd 20(%ebp), %mm3
388 call big_mul_set_vec_sse2_r
389 popl %ebx
390 emms
391 leave
392 ret
393 SET_SIZE(big_mul_set_vec_sse2)
395 #endif /* MMX_MANAGE */
398 / r = r + a * digit, r and a are vectors of length len
399 / returns the carry digit
400 / Suitable only for x86 models that support SSE2 instruction set extensions
402 / uint32_t
403 / big_mul_add_vec_sse2_r(uint32_t *r, uint32_t *a, int len, uint32_t digit)
405 / r %edx
406 / a %ebx
407 / len %ecx
408 / digit %mm3
410 / N.B.:
411 / This is strictly for internal use.
412 / The interface is very light-weight.
413 / All parameters are passed in registers.
414 / It does not conform to the SYSV x86 ABI.
415 / So, don't even think about calling this function directly from C code.
417 / The basic multiply digit loop is unrolled 8 times.
418 / Each comment is preceded by an instance number.
419 / Instructions that have been moved retain their original, "natural"
420 / instance number. It should be easier this way to follow
421 / the step-wise refinement process that went into constructing
422 / the final code.
424 ENTRY(big_mul_add_vec_sse2_r)
/*
 * Register interface: %edx = r, %ebx = a, %ecx = len, %mm3 = digit.
 * Returns the carry digit in %eax; returns 0 when len == 0.
 * Modifies %ebx, %ecx, %edx, %mm0, %mm1 and %mm2.
 * cy is cleared before the len test because the exit path at .L27 always
 * loads %eax from %mm0; clearing it later left the len == 0 result undefined.
 */
425 pxor %mm0, %mm0 / cy = 0
426 testl %ecx, %ecx
427 jz .L27 / if (len == 0) return (0);
431 .L25:
432 cmpl $UNROLL, %ecx / fewer than UNROLL digits left?
433 jl .L26 / yes: finish in the one-digit-at-a-time tail
434 movd 0(%ebx), %mm1 / 1: mm1 = a[i]
435 movd 0(%edx), %mm2 / 1: mm2 = r[i]
436 pmuludq %mm3, %mm1 / 1: mm1 = digit * a[i]
437 paddq %mm1, %mm2 / 1: mm2 = digit * a[i] + r[i]
438 movd 4(%ebx), %mm1 / 2: mm1 = a[i]
439 paddq %mm2, %mm0 / 1: mm0 = digit * a[i] + r[i] + cy;
440 movd %mm0, 0(%edx) / 1: r[i] = product[31..0]
441 movd 4(%edx), %mm2 / 2: mm2 = r[i]
442 psrlq $32, %mm0 / 1: cy = product[63..32]
444 pmuludq %mm3, %mm1 / 2: mm1 = digit * a[i]
445 paddq %mm1, %mm2 / 2: mm2 = digit * a[i] + r[i]
446 movd 8(%ebx), %mm1 / 3: mm1 = a[i]
447 paddq %mm2, %mm0 / 2: mm0 = digit * a[i] + r[i] + cy;
448 movd %mm0, 4(%edx) / 2: r[i] = product[31..0]
449 movd 8(%edx), %mm2 / 3: mm2 = r[i]
450 psrlq $32, %mm0 / 2: cy = product[63..32]
452 pmuludq %mm3, %mm1 / 3: mm1 = digit * a[i]
453 paddq %mm1, %mm2 / 3: mm2 = digit * a[i] + r[i]
454 movd 12(%ebx), %mm1 / 4: mm1 = a[i]
455 paddq %mm2, %mm0 / 3: mm0 = digit * a[i] + r[i] + cy;
456 movd %mm0, 8(%edx) / 3: r[i] = product[31..0]
457 movd 12(%edx), %mm2 / 4: mm2 = r[i]
458 psrlq $32, %mm0 / 3: cy = product[63..32]
460 pmuludq %mm3, %mm1 / 4: mm1 = digit * a[i]
461 paddq %mm1, %mm2 / 4: mm2 = digit * a[i] + r[i]
462 movd 16(%ebx), %mm1 / 5: mm1 = a[i]
463 paddq %mm2, %mm0 / 4: mm0 = digit * a[i] + r[i] + cy;
464 movd %mm0, 12(%edx) / 4: r[i] = product[31..0]
465 movd 16(%edx), %mm2 / 5: mm2 = r[i]
466 psrlq $32, %mm0 / 4: cy = product[63..32]
468 pmuludq %mm3, %mm1 / 5: mm1 = digit * a[i]
469 paddq %mm1, %mm2 / 5: mm2 = digit * a[i] + r[i]
470 movd 20(%ebx), %mm1 / 6: mm1 = a[i]
471 paddq %mm2, %mm0 / 5: mm0 = digit * a[i] + r[i] + cy;
472 movd %mm0, 16(%edx) / 5: r[i] = product[31..0]
473 movd 20(%edx), %mm2 / 6: mm2 = r[i]
474 psrlq $32, %mm0 / 5: cy = product[63..32]
476 pmuludq %mm3, %mm1 / 6: mm1 = digit * a[i]
477 paddq %mm1, %mm2 / 6: mm2 = digit * a[i] + r[i]
478 movd 24(%ebx), %mm1 / 7: mm1 = a[i]
479 paddq %mm2, %mm0 / 6: mm0 = digit * a[i] + r[i] + cy;
480 movd %mm0, 20(%edx) / 6: r[i] = product[31..0]
481 movd 24(%edx), %mm2 / 7: mm2 = r[i]
482 psrlq $32, %mm0 / 6: cy = product[63..32]
484 pmuludq %mm3, %mm1 / 7: mm1 = digit * a[i]
485 paddq %mm1, %mm2 / 7: mm2 = digit * a[i] + r[i]
486 movd 28(%ebx), %mm1 / 8: mm1 = a[i]
487 paddq %mm2, %mm0 / 7: mm0 = digit * a[i] + r[i] + cy;
488 movd %mm0, 24(%edx) / 7: r[i] = product[31..0]
489 movd 28(%edx), %mm2 / 8: mm2 = r[i]
490 psrlq $32, %mm0 / 7: cy = product[63..32]
492 pmuludq %mm3, %mm1 / 8: mm1 = digit * a[i]
493 paddq %mm1, %mm2 / 8: mm2 = digit * a[i] + r[i]
494 paddq %mm2, %mm0 / 8: mm0 = digit * a[i] + r[i] + cy;
495 movd %mm0, 28(%edx) / 8: r[i] = product[31..0]
496 psrlq $32, %mm0 / 8: cy = product[63..32]
498 leal UNROLL32(%ebx), %ebx / a += UNROLL
499 leal UNROLL32(%edx), %edx / r += UNROLL
500 subl $UNROLL, %ecx / len -= UNROLL
501 jz .L27
502 jmp .L25
/* Tail: fewer than UNROLL digits remain; handle them one at a time, leaving as soon as len reaches 0. */
504 .L26:
505 movd 0(%ebx), %mm1 / 1: mm1 = a[i]
506 movd 0(%edx), %mm2 / 1: mm2 = r[i]
507 pmuludq %mm3, %mm1 / 1: mm1 = digit * a[i]
508 paddq %mm1, %mm2 / 1: mm2 = digit * a[i] + r[i]
509 paddq %mm2, %mm0 / 1: mm0 = digit * a[i] + r[i] + cy;
510 movd %mm0, 0(%edx) / 1: r[i] = product[31..0]
511 psrlq $32, %mm0 / 1: cy = product[63..32]
512 subl $1, %ecx
513 jz .L27
515 movd 4(%ebx), %mm1 / 2: mm1 = a[i]
516 movd 4(%edx), %mm2 / 2: mm2 = r[i]
517 pmuludq %mm3, %mm1 / 2: mm1 = digit * a[i]
518 paddq %mm1, %mm2 / 2: mm2 = digit * a[i] + r[i]
519 paddq %mm2, %mm0 / 2: mm0 = digit * a[i] + r[i] + cy;
520 movd %mm0, 4(%edx) / 2: r[i] = product[31..0]
521 psrlq $32, %mm0 / 2: cy = product[63..32]
522 subl $1, %ecx
523 jz .L27
525 movd 8(%ebx), %mm1 / 3: mm1 = a[i]
526 movd 8(%edx), %mm2 / 3: mm2 = r[i]
527 pmuludq %mm3, %mm1 / 3: mm1 = digit * a[i]
528 paddq %mm1, %mm2 / 3: mm2 = digit * a[i] + r[i]
529 paddq %mm2, %mm0 / 3: mm0 = digit * a[i] + r[i] + cy;
530 movd %mm0, 8(%edx) / 3: r[i] = product[31..0]
531 psrlq $32, %mm0 / 3: cy = product[63..32]
532 subl $1, %ecx
533 jz .L27
535 movd 12(%ebx), %mm1 / 4: mm1 = a[i]
536 movd 12(%edx), %mm2 / 4: mm2 = r[i]
537 pmuludq %mm3, %mm1 / 4: mm1 = digit * a[i]
538 paddq %mm1, %mm2 / 4: mm2 = digit * a[i] + r[i]
539 paddq %mm2, %mm0 / 4: mm0 = digit * a[i] + r[i] + cy;
540 movd %mm0, 12(%edx) / 4: r[i] = product[31..0]
541 psrlq $32, %mm0 / 4: cy = product[63..32]
542 subl $1, %ecx
543 jz .L27
545 movd 16(%ebx), %mm1 / 5: mm1 = a[i]
546 movd 16(%edx), %mm2 / 5: mm2 = r[i]
547 pmuludq %mm3, %mm1 / 5: mm1 = digit * a[i]
548 paddq %mm1, %mm2 / 5: mm2 = digit * a[i] + r[i]
549 paddq %mm2, %mm0 / 5: mm0 = digit * a[i] + r[i] + cy;
550 movd %mm0, 16(%edx) / 5: r[i] = product[31..0]
551 psrlq $32, %mm0 / 5: cy = product[63..32]
552 subl $1, %ecx
553 jz .L27
555 movd 20(%ebx), %mm1 / 6: mm1 = a[i]
556 movd 20(%edx), %mm2 / 6: mm2 = r[i]
557 pmuludq %mm3, %mm1 / 6: mm1 = digit * a[i]
558 paddq %mm1, %mm2 / 6: mm2 = digit * a[i] + r[i]
559 paddq %mm2, %mm0 / 6: mm0 = digit * a[i] + r[i] + cy;
560 movd %mm0, 20(%edx) / 6: r[i] = product[31..0]
561 psrlq $32, %mm0 / 6: cy = product[63..32]
562 subl $1, %ecx
563 jz .L27
565 movd 24(%ebx), %mm1 / 7: mm1 = a[i]
566 movd 24(%edx), %mm2 / 7: mm2 = r[i]
567 pmuludq %mm3, %mm1 / 7: mm1 = digit * a[i]
568 paddq %mm1, %mm2 / 7: mm2 = digit * a[i] + r[i]
569 paddq %mm2, %mm0 / 7: mm0 = digit * a[i] + r[i] + cy;
570 movd %mm0, 24(%edx) / 7: r[i] = product[31..0]
571 psrlq $32, %mm0 / 7: cy = product[63..32]
573 .L27:
574 movd %mm0, %eax / return (cy)
575 / no emms. caller is responsible for emms
576 ret
577 SET_SIZE(big_mul_add_vec_sse2_r)
580 / r = r + a * digit, r and a are vectors of length len
581 / returns the carry digit
582 / Suitable only for x86 models that support SSE2 instruction set extensions
584 / r 8(%ebp) %edx
585 / a 12(%ebp) %ebx
586 / len 16(%ebp) %ecx
587 / digit 20(%ebp) %mm3
589 / In userland, there is just the one function, big_mul_add_vec_sse2().
590 / But in the kernel, there are two variations:
591 / 1. big_mul_add_vec_sse2() which does what is necessary to save and
592 / restore state, if necessary, and to ensure that preemption is
593 / disabled.
594 / 2. big_mul_add_vec_sse2_nsv() which just does the work;
595 / it is the caller's responsibility to ensure that MMX state
596 / does not need to be saved and restored and that preemption
597 / is already disabled.
600 #if defined(MMX_MANAGE)
/*
 * C-callable entry: loads r, a, len, digit from the stack into the registers
 * big_mul_add_vec_sse2_r expects and returns its carry in %eax.
 * This variant brackets the work with KPREEMPT_DISABLE/KPREEMPT_ENABLE and
 * saves/restores %mm0-%mm4 unless TEST_TS reported TS as set.
 */
602 ENTRY(big_mul_add_vec_sse2)
603 pushl %ebp
604 movl %esp, %ebp
605 pushl %ebx
606 pushl %esi
607 KPREEMPT_DISABLE
608 TEST_TS(%ebx)
609 pushl %ebx / keep the value TEST_TS read; pushl leaves the flags alone
610 jnz .addvec_no_save / TS was set: skip the MMX save/restore
611 pushl %edi
612 SAVE_MMX_0TO4(%edi)
613 movl 8(%ebp), %edx
614 movl 12(%ebp), %ebx
615 movl 16(%ebp), %ecx
616 movd 20(%ebp), %mm3
617 call big_mul_add_vec_sse2_r
618 movl %eax, %esi / park the carry; %eax is not kept across the exit sequence
619 RSTOR_MMX_0TO4(%edi)
620 popl %edi
621 jmp .addvec_rtn
623 .addvec_no_save:
624 movl 8(%ebp), %edx
625 movl 12(%ebp), %ebx
626 movl 16(%ebp), %ecx
627 movd 20(%ebp), %mm3
628 call big_mul_add_vec_sse2_r
629 movl %eax, %esi
631 .addvec_rtn:
632 emms
633 popl %ebx / the %cr0 value pushed after TEST_TS
634 movl %ebx, %cr0
635 KPREEMPT_ENABLE
636 movl %esi, %eax / return value = carry
637 popl %esi
638 popl %ebx
639 leave
640 ret
641 SET_SIZE(big_mul_add_vec_sse2)
/* _nsv: same register setup and call, with no preemption or MMX state handling and no emms. */
643 ENTRY(big_mul_add_vec_sse2_nsv)
644 pushl %ebp
645 movl %esp, %ebp
646 pushl %ebx
647 movl 8(%ebp), %edx
648 movl 12(%ebp), %ebx
649 movl 16(%ebp), %ecx
650 movd 20(%ebp), %mm3
651 call big_mul_add_vec_sse2_r
652 popl %ebx
653 leave
654 ret
655 SET_SIZE(big_mul_add_vec_sse2_nsv)
658 #else /* !defined(MMX_MANAGE) */
/* Without MMX_MANAGE: load the stack arguments into registers, call the _r routine, then emms. */
660 ENTRY(big_mul_add_vec_sse2)
661 pushl %ebp
662 movl %esp, %ebp
663 pushl %ebx
664 movl 8(%ebp), %edx
665 movl 12(%ebp), %ebx
666 movl 16(%ebp), %ecx
667 movd 20(%ebp), %mm3
668 call big_mul_add_vec_sse2_r
669 popl %ebx
670 emms
671 leave
672 ret
673 SET_SIZE(big_mul_add_vec_sse2)
675 #endif /* MMX_MANAGE */
678 / void
679 / big_mul_vec_sse2(uint32_t *r, uint32_t *a, int alen, uint32_t *b, int blen)
680 / {
681 / int i;
683 / r[alen] = big_mul_set_vec_sse2(r, a, alen, b[0]);
684 / for (i = 1; i < blen; ++i)
685 / r[alen + i] = big_mul_add_vec_sse2(r+i, a, alen, b[i]);
686 / }
/*
 * Vector multiply driver: one "set" call for b[0], then one "add" call per
 * remaining digit of b, storing each returned carry into r.
 * With MMX_MANAGE defined this body is named big_mul_vec_sse2_fc and calls
 * the _nsv variants; otherwise it is big_mul_vec_sse2 and calls the plain ones.
 * No frame pointer is set up; arguments and locals are addressed off %esp.
 */
689 #if defined(MMX_MANAGE)
690 ENTRY(big_mul_vec_sse2_fc)
691 #else
692 ENTRY(big_mul_vec_sse2)
693 #endif
/* Reserve two local slots; after the four pushes below they sit at 16(%esp) and 20(%esp). */
694 subl $0x8, %esp
695 pushl %ebx
696 pushl %ebp
697 pushl %esi
698 pushl %edi
/* 40(%esp) is the fourth stack argument here (b); keep a copy in the local at 20(%esp). */
699 movl 40(%esp), %eax
700 movl %eax, 20(%esp)
/* Each push lowers %esp by 4, so re-reading 40(%esp) walks back through the earlier arguments. */
701 pushl (%eax)
702 movl 40(%esp), %edi
703 pushl %edi
704 movl 40(%esp), %esi
705 pushl %esi
706 movl 40(%esp), %ebx
707 pushl %ebx
708 #if defined(MMX_MANAGE)
709 call big_mul_set_vec_sse2_nsv
710 #else
711 call big_mul_set_vec_sse2
712 #endif
713 addl $0x10, %esp
/* Store the returned carry at index %edi of the array at %ebx (r[alen] in the pseudo-code above). */
714 movl %eax, (%ebx,%edi,4)
/* Copy the fifth argument (blen) into the local at 16(%esp); nothing more to do when it is <= 1. */
715 movl 44(%esp), %eax
716 movl %eax, 16(%esp)
717 cmpl $0x1, %eax
718 jle .mulvec_rtn
/* %ebp is the loop index i, starting at 1. */
719 movl $0x1, %ebp
721 .align 16
722 .mulvec_add:
/* Push b[i], alen, a, and the address of r[i] as the four arguments of the add routine. */
723 movl 20(%esp), %eax
724 pushl (%eax,%ebp,4)
725 pushl %edi
726 pushl %esi
727 leal (%ebx,%ebp,4), %eax
728 pushl %eax
729 #if defined(MMX_MANAGE)
730 call big_mul_add_vec_sse2_nsv
731 #else
732 call big_mul_add_vec_sse2
733 #endif
734 addl $0x10, %esp
/* Store the returned carry at r[alen + i], then advance i and loop while i < blen. */
735 leal (%ebp,%edi), %ecx
736 movl %eax, (%ebx,%ecx,4)
737 incl %ebp
738 cmpl 16(%esp), %ebp
739 jl .mulvec_add
740 .mulvec_rtn:
/* The _nsv routines called under MMX_MANAGE do not execute emms themselves, so it is done here. */
741 #if defined(MMX_MANAGE)
742 emms
743 #endif
744 popl %edi
745 popl %esi
746 popl %ebp
747 popl %ebx
748 addl $0x8, %esp
749 ret
750 #if defined(MMX_MANAGE)
751 SET_SIZE(big_mul_vec_sse2_fc)
752 #else
753 SET_SIZE(big_mul_vec_sse2)
754 #endif
756 #if defined(MMX_MANAGE)
/*
 * MMX_MANAGE entry for big_mul_vec_sse2(r, a, alen, b, blen): wraps
 * big_mul_vec_sse2_fc with KPREEMPT_DISABLE/KPREEMPT_ENABLE and saves and
 * restores %mm0-%mm4 unless TEST_TS reported TS as set.
 * Locals: -8(%ebp) = value read by TEST_TS, -4(%ebp) = MMX save-area pointer.
 */
758 ENTRY(big_mul_vec_sse2)
759 pushl %ebp
760 movl %esp, %ebp
761 subl $8, %esp
762 pushl %edi
763 KPREEMPT_DISABLE
764 TEST_TS(%eax)
765 movl %eax, -8(%ebp) / movl leaves the flags from TEST_TS intact
766 jnz .mulvec_no_save / TS was set: skip saving the MMX registers
767 SAVE_MMX_0TO4(%edi)
768 movl %edi, -4(%ebp)
769 .mulvec_no_save:
770 movl 24(%ebp), %eax / blen
771 pushl %eax
772 movl 20(%ebp), %eax / b
773 pushl %eax
774 movl 16(%ebp), %eax / alen
775 pushl %eax
776 movl 12(%ebp), %eax / a
777 pushl %eax
778 movl 8(%ebp), %eax / r
779 pushl %eax
780 call big_mul_vec_sse2_fc
781 addl $20, %esp / drop the five pushed arguments
782 movl -8(%ebp), %eax
783 testl $CR0_TS, %eax / same decision as on entry
784 jnz .mulvec_no_rstr
785 movl -4(%ebp), %edi
786 RSTOR_MMX_0TO4(%edi)
787 .mulvec_no_rstr:
788 movl %eax, %cr0 / write back the %cr0 value read on entry
789 KPREEMPT_ENABLE
790 popl %edi
791 leave
792 ret
793 SET_SIZE(big_mul_vec_sse2)
795 #endif /* MMX_MANAGE */
/* Retire the unroll constants defined for the vector multiply loops above. */
799 #undef UNROLL
800 #undef UNROLL32
803 / r = a * a, r and a are vectors of length len
804 / Suitable only for x86 models that support SSE2 instruction set extensions
806 / This function is not suitable for a truly general-purpose multiprecision
807 / arithmetic library, because it does not work for "small" numbers, that is
808 / numbers of 1 or 2 digits. big_mul() just uses the ordinary big_mul_vec()
809 / for any small numbers.
/*
 * Arguments are read at 8(%ebp) = r, 12(%ebp) = a, 16(%ebp) = alen.
 * With MMX_MANAGE defined this body is big_sqr_vec_sse2_fc: it builds no
 * frame of its own and reads the arguments through the %ebp it is entered
 * with. Otherwise it is big_sqr_vec_sse2 and sets up %ebp itself.
 */
811 #if defined(MMX_MANAGE)
812 ENTRY(big_sqr_vec_sse2_fc)
813 #else
814 ENTRY(big_sqr_vec_sse2)
815 pushl %ebp
816 movl %esp, %ebp
817 #endif
819 pushl %ebx
820 pushl %edi
821 pushl %esi
823 / r[1..alen] = a[0] * a[1..alen-1]
825 movl 8(%ebp), %edi / r = arg(r)
826 movl 12(%ebp), %esi / a = arg(a)
827 movl 16(%ebp), %ecx / cnt = arg(alen)
828 movd %ecx, %mm4 / save_cnt = arg(alen)
829 leal 4(%edi), %edx / dst = &r[1]
830 movl %esi, %ebx / src = a
831 movd 0(%ebx), %mm3 / mm3 = a[0]
832 leal 4(%ebx), %ebx / src = &a[1]
833 subl $1, %ecx / --cnt
834 call big_mul_set_vec_sse2_r / r[1..alen-1] = a[0] * a[1..alen-1]
835 movl %edi, %edx / dst = r
836 movl %esi, %ebx / src = a
837 movd %mm4, %ecx / cnt = save_cnt
838 movl %eax, (%edx, %ecx, 4) / r[cnt] = cy
840 / /* High-level vector C pseudocode */
841 / for (i = 1; i < alen-1; ++i)
842 / r[2*i + 1 ... ] += a[i] * a[i+1 .. alen-1]
844 / /* Same thing, but slightly lower level C-like pseudocode */
845 / i = 1;
846 / r = &arg_r[2*i + 1];
847 / a = &arg_a[i + 1];
848 / digit = arg_a[i];
849 / cnt = alen - 3;
850 / while (cnt != 0) {
851 / r[cnt] = big_mul_add_vec_sse2_r(r, a, cnt, digit);
852 / r += 2;
853 / ++a;
854 / --cnt;
855 / }
857 / /* Same thing, but even lower level
858 / * For example, pointers are raw pointers,
859 / * with no scaling by object size.
860 / */
861 / r = arg_r + 12; /* i == 1; 2i + 1 == 3; 4*3 == 12; */
862 / a = arg_a + 8;
863 / digit = *(arg_a + 4);
864 / cnt = alen - 3;
865 / while (cnt != 0) {
866 / cy = big_mul_add_vec_sse2_r();
867 / *(r + 4 * cnt) = cy;
868 / r += 8;
869 / a += 4;
870 / --cnt;
871 / }
873 leal 4(%edi), %edi / r += 4; r = &r[1]
874 leal 4(%esi), %esi / a += 4; a = &a[1]
875 movd %mm4, %ecx / cnt = save
876 subl $2, %ecx / cnt = alen - 2; i in 1..alen-2
877 movd %ecx, %mm4 / save_cnt
878 jecxz .L32 / while (cnt != 0) {
879 .L31:
880 movd 0(%esi), %mm3 / digit = a[i]
881 leal 4(%esi), %esi / a += 4; a = &a[1]; a = &a[i + 1]
882 leal 8(%edi), %edi / r += 8; r = &r[2]; r = &r[2 * i + 1]
883 movl %edi, %edx / edx = r
884 movl %esi, %ebx / ebx = a
885 cmp $1, %ecx / The last triangle term is special
886 jz .L32
887 call big_mul_add_vec_sse2_r
888 movd %mm4, %ecx / cnt = save_cnt
889 movl %eax, (%edi, %ecx, 4) / r[cnt] = cy
890 subl $1, %ecx / --cnt
891 movd %ecx, %mm4 / save_cnt = cnt
892 jmp .L31 / }
894 .L32:
895 movd 0(%ebx), %mm1 / mm1 = a[i + 1]
896 movd 0(%edx), %mm2 / mm2 = r[2 * i + 1]
897 pmuludq %mm3, %mm1 / mm1 = p = digit * a[i + 1]
898 paddq %mm1, %mm2 / mm2 = r[2 * i + 1] + p
899 movd %mm2, 0(%edx) / r[2 * i + 1] += lo32(p)
900 psrlq $32, %mm2 / mm2 = cy
901 movd %mm2, 4(%edx) / r[2 * i + 2] = cy
902 pxor %mm2, %mm2
903 movd %mm2, 8(%edx) / r[2 * i + 3] = 0
905 movl 8(%ebp), %edx / r = arg(r)
906 movl 12(%ebp), %ebx / a = arg(a)
907 movl 16(%ebp), %ecx / cnt = arg(alen)
909 / compute low-order corner
910 / p = a[0]**2
911 / r[0] = lo32(p)
912 / cy = hi32(p)
913 movd 0(%ebx), %mm2 / mm2 = a[0]
914 pmuludq %mm2, %mm2 / mm2 = p = a[0]**2
915 movd %mm2, 0(%edx) / r[0] = lo32(p)
916 psrlq $32, %mm2 / mm2 = cy = hi32(p)
918 / p = 2 * r[1]
919 / t = p + cy
920 / r[1] = lo32(t)
921 / cy = hi32(t)
922 movd 4(%edx), %mm1 / mm1 = r[1]
923 psllq $1, %mm1 / mm1 = p = 2 * r[1]
924 paddq %mm1, %mm2 / mm2 = t = p + cy
925 movd %mm2, 4(%edx) / r[1] = low32(t)
926 psrlq $32, %mm2 / mm2 = cy = hi32(t)
928 / r[2..$-3] = inner_diagonal[*]**2 + 2 * r[2..$-3]
929 subl $2, %ecx / cnt = alen - 2
930 .L34:
931 movd 4(%ebx), %mm0 / mm0 = diag = a[i+1]
932 pmuludq %mm0, %mm0 / mm0 = p = diag**2
933 paddq %mm0, %mm2 / mm2 = t = p + cy
934 movd %mm2, %eax
935 movd %eax, %mm1 / mm1 = lo32(t)
936 psrlq $32, %mm2 / mm2 = hi32(t)
938 movd 8(%edx), %mm3 / mm3 = r[2*i]
939 psllq $1, %mm3 / mm3 = 2*r[2*i]
940 paddq %mm3, %mm1 / mm1 = 2*r[2*i] + lo32(t)
941 movd %mm1, 8(%edx) / r[2*i] = 2*r[2*i] + lo32(t)
942 psrlq $32, %mm1
943 paddq %mm1, %mm2
945 movd 12(%edx), %mm3 / mm3 = r[2*i+1]
946 psllq $1, %mm3 / mm3 = 2*r[2*i+1]
947 paddq %mm3, %mm2 / mm2 = 2*r[2*i+1] + hi32(t)
948 movd %mm2, 12(%edx) / r[2*i+1] = mm2
949 psrlq $32, %mm2 / mm2 = cy
950 leal 8(%edx), %edx / r += 2
951 leal 4(%ebx), %ebx / ++a
952 subl $1, %ecx / --cnt
953 jnz .L34
955 / Carry from last triangle term must participate in doubling,
956 / but this step isn't paired up with squaring the elements
957 / of the inner diagonal.
958 / r[$-3..$-2] += 2 * r[$-3..$-2] + cy
959 movd 8(%edx), %mm3 / mm3 = r[2*i]
960 psllq $1, %mm3 / mm3 = 2*r[2*i]
961 paddq %mm3, %mm2 / mm2 = 2*r[2*i] + cy
962 movd %mm2, 8(%edx) / r[2*i] = lo32(2*r[2*i] + cy)
963 psrlq $32, %mm2 / mm2 = cy = hi32(2*r[2*i] + cy)
965 movd 12(%edx), %mm3 / mm3 = r[2*i+1]
966 psllq $1, %mm3 / mm3 = 2*r[2*i+1]
967 paddq %mm3, %mm2 / mm2 = 2*r[2*i+1] + cy
968 movd %mm2, 12(%edx) / r[2*i+1] = mm2
969 psrlq $32, %mm2 / mm2 = cy
971 / compute high-order corner and add it in
972 / p = a[alen - 1]**2
973 / t = p + cy
974 / r[alen + alen - 2] += lo32(t)
975 / cy = hi32(t)
976 / r[alen + alen - 1] = cy
977 movd 4(%ebx), %mm0 / mm0 = a[$-1]
978 movd 8(%edx), %mm3 / mm3 = r[$-2]
979 pmuludq %mm0, %mm0 / mm0 = p = a[$-1]**2
980 paddq %mm0, %mm2 / mm2 = t = p + cy
981 paddq %mm3, %mm2 / mm2 = r[$-2] + t
982 movd %mm2, 8(%edx) / r[$-2] = lo32(r[$-2] + t)
983 psrlq $32, %mm2 / mm2 = cy = hi32(r[$-2] + t)
984 movd 12(%edx), %mm3
985 paddq %mm3, %mm2
986 movd %mm2, 12(%edx) / r[$-1] += cy
988 .L35:
989 emms
990 popl %esi
991 popl %edi
992 popl %ebx
994 #if defined(MMX_MANAGE)
995 ret
996 SET_SIZE(big_sqr_vec_sse2_fc)
997 #else
998 leave
999 ret
1000 SET_SIZE(big_sqr_vec_sse2)
1001 #endif
1004 #if defined(MMX_MANAGE)
/*
 * MMX_MANAGE entry for big_sqr_vec_sse2: sets up the %ebp frame that
 * big_sqr_vec_sse2_fc reads its arguments through, and wraps the call with
 * KPREEMPT_DISABLE/KPREEMPT_ENABLE plus a save/restore of %mm0-%mm4 unless
 * TEST_TS reported TS as set.
 * The %cr0 copy is held in %eax and on the stack. %ebx is callee-saved and
 * this routine never saves it, so it must not be used as the scratch register.
 */
1005 ENTRY(big_sqr_vec_sse2)
1006 pushl %ebp
1007 movl %esp, %ebp
1008 KPREEMPT_DISABLE
1009 TEST_TS(%eax)
1010 pushl %eax / keep the value TEST_TS read; pushl leaves the flags alone
1011 jnz .sqr_no_save / TS was set: skip the MMX save/restore
1012 pushl %edi
1013 SAVE_MMX_0TO4(%edi)
1014 call big_sqr_vec_sse2_fc
1015 RSTOR_MMX_0TO4(%edi)
1016 popl %edi
1017 jmp .sqr_rtn
1019 .sqr_no_save:
1020 call big_sqr_vec_sse2_fc
1022 .sqr_rtn:
1023 popl %eax / the %cr0 value pushed after TEST_TS
1024 movl %eax, %cr0
1025 KPREEMPT_ENABLE
1026 leave
1027 ret
1028 SET_SIZE(big_sqr_vec_sse2)
1030 #endif /* MMX_MANAGE */
1032 / ------------------------------------------------------------------------
1033 / UMUL Implementations
1034 / ------------------------------------------------------------------------
1037 / r = a * digit, r and a are vectors of length len
1038 / returns the carry digit
1039 / Does not use any MMX, SSE, or SSE2 instructions.
1040 / Uses x86 unsigned 32 X 32 -> 64 multiply instruction, MUL.
1041 / This is a fall-back implementation for x86 models that do not support
1042 / the PMULUDQ instruction.
1044 / uint32_t
1045 / big_mul_set_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
1047 / r 8(%ebp) %edx %edi
1048 / a 12(%ebp) %ebx %esi
1049 / len 16(%ebp) %ecx
1050 / digit 20(%ebp) %esi
1052 ENTRY(big_mul_set_vec_umul)
/*
 * Stack arguments: 8(%ebp) = r, 12(%ebp) = a, 16(%ebp) = len, 20(%ebp) = digit.
 * Writes len digits of a * digit into r and returns the final carry in %eax
 * (0 when len == 0). Uses only integer registers.
 */
1053 pushl %ebp
1054 movl %esp, %ebp
1055 pushl %esi
1056 pushl %edi
1057 pushl %ebx
1058 movl 16(%ebp), %ecx
1059 xorl %ebx, %ebx / cy = 0
1060 testl %ecx, %ecx
1061 movl 8(%ebp), %edi / movl does not change the flags set by testl
1062 movl 12(%ebp), %esi
1063 je .L57 / len == 0: return cy = 0
1065 .L55:
1066 movl (%esi), %eax / eax = a[i]
1067 leal 4(%esi), %esi / ++a
1068 mull 20(%ebp) / edx:eax = a[i] * digit
1069 addl %ebx, %eax
1070 adcl $0, %edx / edx:eax = a[i] * digit + cy
1071 movl %eax, (%edi) / r[i] = product[31..0]
1072 movl %edx, %ebx / cy = product[63..32]
1073 leal 4(%edi), %edi / ++r
1074 decl %ecx / --len
1075 jnz .L55 / while (len != 0)
1076 .L57:
1077 movl %ebx, %eax / return (cy)
1078 popl %ebx
1079 popl %edi
1080 popl %esi
1081 leave
1082 ret
1083 SET_SIZE(big_mul_set_vec_umul)
1086 / r = r + a * digit, r and a are vectors of length len
1087 / returns the carry digit
1088 / Does not use any MMX, SSE, or SSE2 instructions.
1089 / Uses x86 unsigned 32 X 32 -> 64 multiply instruction, MUL.
1090 / This is a fall-back implementation for x86 models that do not support
1091 / the PMULUDQ instruction.
1093 / uint32_t
1094 / big_mul_add_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
1096 / r 8(%ebp) %edx %edi
1097 / a 12(%ebp) %ebx %esi
1098 / len 16(%ebp) %ecx
1099 / digit 20(%ebp) %esi
1101 ENTRY(big_mul_add_vec_umul)
/*
 * Stack arguments: 8(%ebp) = r, 12(%ebp) = a, 16(%ebp) = len, 20(%ebp) = digit.
 * Adds a * digit into the len digits of r and returns the final carry in %eax
 * (0 when len == 0). Uses only integer registers.
 */
1102 pushl %ebp
1103 movl %esp, %ebp
1104 pushl %esi
1105 pushl %edi
1106 pushl %ebx
1107 movl 16(%ebp), %ecx
1108 xorl %ebx, %ebx / cy = 0
1109 testl %ecx, %ecx
1110 movl 8(%ebp), %edi / movl does not change the flags set by testl
1111 movl 12(%ebp), %esi
1112 je .L67 / len == 0: return cy = 0
1113 .align 4
1114 .L65:
1115 movl (%esi), %eax / eax = a[i]
1116 leal 4(%esi), %esi / ++a
1117 mull 20(%ebp) / edx:eax = a[i] * digit
1118 addl (%edi), %eax
1119 adcl $0, %edx / edx:eax = a[i] * digit + r[i]
1120 addl %ebx, %eax
1121 adcl $0, %edx / edx:eax = a[i] * digit + r[i] + cy
1122 movl %eax, (%edi) / r[i] = product[31..0]
1123 movl %edx, %ebx / cy = product[63..32]
1124 leal 4(%edi), %edi / ++r
1125 decl %ecx / --len
1126 jnz .L65 / while (len != 0)
1127 .L67:
1128 movl %ebx, %eax / return (cy)
1129 popl %ebx
1130 popl %edi
1131 popl %esi
1132 leave
1133 ret
1134 SET_SIZE(big_mul_add_vec_umul)