4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 #include <sys/asm_linkage.h>
28 #if defined(lint) || defined(__lint)
30 #include <sys/types.h>
34 big_mul_set_vec
(uint64_t
*r
, uint64_t
*a, int len
, uint64_t digit
)
39 big_mul_add_vec
(uint64_t
*r
, uint64_t
*a, int len
, uint64_t digit
)
44 big_sqr_vec
(uint64_t
*r
, uint64_t
*a, int len
)
/ ------------------------------------------------------------------------
/
/  Implementation of big_mul_set_vec which exploits
/  the 64X64->128 bit unsigned multiply instruction.
/
/  As defined in Sun's bignum library for pkcs11, bignums are
/  composed of an array of 64-bit "digits" or "chunks" along with
/  descriptive information.
/
/ ------------------------------------------------------------------------

/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ r and a are 64 bit aligned.
/
/ big_mul_set_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/
/ uint64_t big_mul_set_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/
/ r[i] = lo64(a[i] * digit + cy) for i = 0 .. len - 1, carrying the high
/ 64 bits of each product into the next digit.  Returns the final carry.
/
/ ABI:	System V AMD64, leaf function, no stack use.
/ In:	%rdi = r, %rsi = a, %edx = len (32-bit int), %rcx = digit
/ Out:	%rax = carry out of the most significant digit (0 if len <= 0)
/ Clobbers: %rdx, %rsi, %rdi, %r8, %r9, %r11, flags
/
/ Register roles:
/	%r8  = digits still to process
/	%r9  = cy, the carry into the current digit
/	%r11 = a[i + 1], loaded early so the load overlaps the multiply
/
/ len is a C int, so only %edx is defined on entry; it is sign-extended
/ before use rather than trusting the upper half of %rdx.

	ENTRY(big_mul_set_vec)
	xorq	%r9, %r9		/ cy = 0
	movslq	%edx, %r8		/ r8 = (int64_t)len; %rdx is used by mul
	testq	%r8, %r8
	jle	.Lset_done		/ if (len <= 0) return (0)

.Lset_loop8:
	cmpq	$8, %r8			/ fewer than 8 digits left?
	jb	.Lset_tail

	movq	0(%rsi), %rax		/ rax = a[0]
	movq	8(%rsi), %r11		/ prefetch a[1]
	mulq	%rcx			/ p = a[0] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	16(%rsi), %r11		/ prefetch a[2]
	mulq	%rcx			/ p = a[1] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	24(%rsi), %r11		/ prefetch a[3]
	mulq	%rcx			/ p = a[2] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	32(%rsi), %r11		/ prefetch a[4]
	mulq	%rcx			/ p = a[3] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	40(%rsi), %r11		/ prefetch a[5]
	mulq	%rcx			/ p = a[4] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	48(%rsi), %r11		/ prefetch a[6]
	mulq	%rcx			/ p = a[5] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	56(%rsi), %r11		/ prefetch a[7]
	mulq	%rcx			/ p = a[6] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	mulq	%rcx			/ p = a[7] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 56(%rdi)		/ r[7] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	addq	$64, %rsi		/ a += 8
	addq	$64, %rdi		/ r += 8
	subq	$8, %r8			/ len -= 8
	jnz	.Lset_loop8
	jmp	.Lset_done

/ Between 1 and 7 digits remain; handle them one at a time and leave as
/ soon as the count reaches zero.
.Lset_tail:
	movq	0(%rsi), %rax
	mulq	%rcx			/ p = a[0] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.Lset_done

	movq	8(%rsi), %rax
	mulq	%rcx			/ p = a[1] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.Lset_done

	movq	16(%rsi), %rax
	mulq	%rcx			/ p = a[2] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.Lset_done

	movq	24(%rsi), %rax
	mulq	%rcx			/ p = a[3] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.Lset_done

	movq	32(%rsi), %rax
	mulq	%rcx			/ p = a[4] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.Lset_done

	movq	40(%rsi), %rax
	mulq	%rcx			/ p = a[5] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.Lset_done

	movq	48(%rsi), %rax
	mulq	%rcx			/ p = a[6] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

.Lset_done:
	movq	%r9, %rax		/ return (cy)
	ret
	SET_SIZE(big_mul_set_vec)
/ ------------------------------------------------------------------------
/
/  Implementation of big_mul_add_vec which exploits
/  the 64X64->128 bit unsigned multiply instruction.
/
/  As defined in Sun's bignum library for pkcs11, bignums are
/  composed of an array of 64-bit "digits" or "chunks" along with
/  descriptive information.
/
/ ------------------------------------------------------------------------

/ r += a * digit, r and a are vectors of length len
/ returns the carry digit
/ r and a are 64 bit aligned.
/
/ big_mul_add_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/
/ uint64_t big_mul_add_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/
/ r[i] = lo64(a[i] * digit + r[i] + cy) for i = 0 .. len - 1, carrying the
/ high 64 bits into the next digit.  Returns the final carry.  The sum
/ a[i] * digit + r[i] + cy always fits in 128 bits, so the two adcq into
/ %rdx can never overflow.
/
/ ABI:	System V AMD64, leaf function, no stack use.
/ In:	%rdi = r, %rsi = a, %edx = len (32-bit int), %rcx = digit
/ Out:	%rax = carry out of the most significant digit (0 if len <= 0)
/ Clobbers: %rdx, %rsi, %rdi, %r8, %r9, %r10, %r11, flags
/
/ Register roles:
/	%r8  = digits still to process
/	%r9  = cy, the carry into the current digit
/	%r10 = r[i], loaded ahead of the add
/	%r11 = a[i + 1], loaded early so the load overlaps the multiply
/
/ len is a C int, so only %edx is defined on entry; it is sign-extended
/ before use rather than trusting the upper half of %rdx.

	ENTRY(big_mul_add_vec)
	xorq	%r9, %r9		/ cy = 0
	movslq	%edx, %r8		/ r8 = (int64_t)len; %rdx is used by mul
	testq	%r8, %r8
	jle	.Ladd_done		/ if (len <= 0) return (0)

.Ladd_loop8:
	cmpq	$8, %r8			/ fewer than 8 digits left?
	jb	.Ladd_tail

	movq	0(%rsi), %rax		/ rax = a[0]
	movq	0(%rdi), %r10		/ r10 = r[0]
	movq	8(%rsi), %r11		/ prefetch a[1]
	mulq	%rcx			/ p = a[0] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[0]
	movq	8(%rdi), %r10		/ prefetch r[1]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	16(%rsi), %r11		/ prefetch a[2]
	mulq	%rcx			/ p = a[1] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[1]
	movq	16(%rdi), %r10		/ prefetch r[2]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	24(%rsi), %r11		/ prefetch a[3]
	mulq	%rcx			/ p = a[2] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[2]
	movq	24(%rdi), %r10		/ prefetch r[3]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	32(%rsi), %r11		/ prefetch a[4]
	mulq	%rcx			/ p = a[3] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[3]
	movq	32(%rdi), %r10		/ prefetch r[4]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	40(%rsi), %r11		/ prefetch a[5]
	mulq	%rcx			/ p = a[4] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[4]
	movq	40(%rdi), %r10		/ prefetch r[5]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	48(%rsi), %r11		/ prefetch a[6]
	mulq	%rcx			/ p = a[5] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[5]
	movq	48(%rdi), %r10		/ prefetch r[6]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	56(%rsi), %r11		/ prefetch a[7]
	mulq	%rcx			/ p = a[6] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[6]
	movq	56(%rdi), %r10		/ prefetch r[7]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	mulq	%rcx			/ p = a[7] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[7]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 56(%rdi)		/ r[7] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	addq	$64, %rsi		/ a += 8
	addq	$64, %rdi		/ r += 8
	subq	$8, %r8			/ len -= 8
	jnz	.Ladd_loop8
	jmp	.Ladd_done

/ Between 1 and 7 digits remain; handle them one at a time and leave as
/ soon as the count reaches zero.
.Ladd_tail:
	movq	0(%rsi), %rax
	movq	0(%rdi), %r10
	mulq	%rcx			/ p = a[0] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[0]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.Ladd_done

	movq	8(%rsi), %rax
	movq	8(%rdi), %r10
	mulq	%rcx			/ p = a[1] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[1]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.Ladd_done

	movq	16(%rsi), %rax
	movq	16(%rdi), %r10
	mulq	%rcx			/ p = a[2] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[2]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.Ladd_done

	movq	24(%rsi), %rax
	movq	24(%rdi), %r10
	mulq	%rcx			/ p = a[3] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[3]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.Ladd_done

	movq	32(%rsi), %rax
	movq	32(%rdi), %r10
	mulq	%rcx			/ p = a[4] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[4]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.Ladd_done

	movq	40(%rsi), %rax
	movq	40(%rdi), %r10
	mulq	%rcx			/ p = a[5] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[5]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.Ladd_done

	movq	48(%rsi), %rax
	movq	48(%rdi), %r10
	mulq	%rcx			/ p = a[6] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[6]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

.Ladd_done:
	movq	%r9, %rax		/ return (cy)
	ret
	SET_SIZE(big_mul_add_vec)
/ void
/ big_sqr_vec(uint64_t *r, uint64_t *a, int len)
/
/ Computes the square of the len-digit vector a into r[0 .. 2*len - 1].
/
/ Method:
/   1. Accumulate every cross product a[i] * a[j], i < j, into r starting
/      at r[1], using big_mul_set_vec for the first row and
/      big_mul_add_vec for the remaining rows.
/   2. Walk r once more, doubling the accumulated cross products and
/      adding the diagonal squares a[row] ** 2 with carry propagation.
/
/ ABI:	System V AMD64.
/ In:	%rdi = r, %rsi = a, %edx = len (32-bit int)
/ Out:	none
/ Clobbers: %rax, %rcx, %rdx, %rsi, %rdi, %r8 - %r11, flags
/ Saved and restored here: %rbx, %rbp, %r12 - %r15
/
/ len is a C int, so only %edx is defined on entry; it is sign-extended
/ before use.  len <= 0 is a no-op, and len == 1 is handled directly
/ because the row loop below counts tlen down with "while (--tlen != 0)"
/ and must start from tlen >= 1.
/
/ Stack: nine 8-byte pushes follow the return address, so %rsp is
/ 16-byte aligned at both call sites.

	ENTRY(big_sqr_vec)
	movslq	%edx, %rdx		/ len = (int64_t)len
	cmpq	$1, %rdx
	jg	.Lsqr_general
	jl	.Lsqr_ret		/ len <= 0: nothing to do

	movq	0(%rsi), %rax		/ len == 1:
	mulq	%rax			/ s = a[0] ** 2
	movq	%rax, 0(%rdi)		/ r[0] = lo64(s)
	movq	%rdx, 8(%rdi)		/ r[1] = hi64(s)
.Lsqr_ret:
	ret

.Lsqr_general:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	pushq	%rdx			/ save arg3, len
	pushq	%rsi			/ save arg2, a
	pushq	%rdi			/ save arg1, r

	leaq	8(%rdi), %r13		/ tr = r + 1
	movq	%rsi, %r14		/ ta = a
	movq	%rdx, %r15		/ tlen = len
	decq	%r15			/ tlen = len - 1
	movq	%r13, %rdi		/ arg1 = tr
	leaq	8(%r14), %rsi		/ arg2 = ta + 1
	movq	%r15, %rdx		/ arg3 = tlen
	movq	0(%r14), %rcx		/ arg4 = ta[0]
	call	big_mul_set_vec
	movq	%rax, 0(%r13, %r15, 8)	/ tr[tlen] = cy

.Lsqr_rows:
	decq	%r15			/ --tlen
	jz	.Lsqr_diag		/ while (--tlen != 0)

	addq	$16, %r13		/ tr += 2
	addq	$8, %r14		/ ++ta
	movq	%r13, %rdi		/ arg1 = tr
	leaq	8(%r14), %rsi		/ arg2 = ta + 1
	movq	%r15, %rdx		/ arg3 = tlen
	movq	0(%r14), %rcx		/ arg4 = ta[0]
	call	big_mul_add_vec
	movq	%rax, 0(%r13, %r15, 8)	/ tr[tlen] = cy
	jmp	.Lsqr_rows

.Lsqr_diag:
/ No more function calls after this.
/ Restore arguments to registers.
/ However, do not use %rdx for arg3, len, because it is heavily
/ used by the hardware MUL instruction.  Use %r8, instead.
	movq	0(%rsp), %rdi		/ %rdi == arg1 == r
	movq	8(%rsp), %rsi		/ %rsi == arg2 == a
	movq	16(%rsp), %r8		/ %r8  == arg3 == len

	movq	0(%rsi), %rax		/ %rax = a[0];
	mulq	%rax			/ s = %rdx:%rax = a[0]**2
	movq	%rax, 0(%rdi)		/ r[0] = lo64(s)
	movq	%rdx, %r9		/ cy = hi64(s)
	xorq	%rdx, %rdx
	movq	8(%rdi), %rax		/ p = %rdx:%rax = r[1]
	addq	%rax, %rax
	adcq	$0, %rdx		/ p = p << 1
	addq	%r9, %rax
	adcq	$0, %rdx		/ p = (r[1] << 1) + cy
	movq	%rax, 8(%rdi)		/ r[1] = lo64(p)
	movq	%rdx, %r9		/ cy = hi64(p)
	movq	$1, %r11		/ row = 1
	movq	$2, %r12		/ col = 2
	movq	%r8, %r15
	decq	%r15			/ tlen = len - 1

.Lsqr_diag_loop:
	cmpq	%r8, %r11		/ len - row
	jae	.Lsqr_last		/ while (row < len)

	movq	0(%rsi, %r11, 8), %rax	/ s = (uint128_t)a[row]
	mulq	%rax			/ s = s * s
	xorq	%rbx, %rbx
	movq	0(%rdi, %r12, 8), %rcx	/ p = (uint128_t)r[col]
	addq	%rcx, %rcx
	adcq	$0, %rbx		/ p = p << 1
	addq	%rcx, %rax
	adcq	%rbx, %rdx		/ t = p + s
	xorq	%r10, %r10
	movq	%rax, %rbp		/ t2 = 0:lo64(t)
	addq	%r9, %rbp
	adcq	$0, %r10		/ t2 = %r10:%rbp = lo64(t) + cy
	movq	%rbp, 0(%rdi, %r12, 8)	/ r[col] = lo64(t2)
	xorq	%rcx, %rcx
	movq	%rdx, %r9
	addq	%r10, %r9
	adcq	$0, %rcx		/ cy = %rcx:%r9 = hi64(t) + hi64(t2)
	cmpq	%r11, %r15
	je	.Lsqr_last		/ if (row == len - 1) break
	xorq	%rdx, %rdx
	movq	8(%rdi, %r12, 8), %rax
	addq	%rax, %rax
	adcq	$0, %rdx		/ p = r[col+1] << 1
	addq	%r9, %rax
	adcq	%rcx, %rdx		/ p = (lo64(r[col+1]) << 1) + cy
	movq	%rax, 8(%rdi, %r12, 8)	/ r[col+1] = lo64(p)
	movq	%rdx, %r9		/ cy = hi64(p)

	incq	%r11			/ ++row
	addq	$2, %r12		/ col += 2
	jmp	.Lsqr_diag_loop

.Lsqr_last:
	movq	%r9, 8(%rdi, %r12, 8)	/ r[col+1] = lo64(cy)

	addq	$24, %rsp		/ skip %rdi, %rsi, %rdx
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbp
	popq	%rbx

	ret

	SET_SIZE(big_sqr_vec)