 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/asm_linkage.h>
/ ------------------------------------------------------------------------
/	Implementation of big_mul_set_vec which exploits
/	the 64X64->128 bit unsigned multiply instruction.
/
/	As defined in Sun's bignum library for pkcs11, bignums are
/	composed of an array of 64-bit "digits" or "chunks" along with
/	descriptive information.
/ ------------------------------------------------------------------------

/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ r and a are 64 bit aligned.
/
/ big_mul_set_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
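
/ For reference, a minimal C sketch of the operation implemented below,
/ assuming a compiler that provides the __uint128_t extension.  This is an
/ illustrative equivalent only (hypothetical helper name), not the stub
/ used by the build:
/
/	uint64_t
/	big_mul_set_vec_c(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/	{
/		uint64_t cy = 0;
/		int i;
/
/		for (i = 0; i < len; i++) {
/			__uint128_t p = (__uint128_t)a[i] * digit + cy;
/			r[i] = (uint64_t)p;		/* lo(p) */
/			cy = (uint64_t)(p >> 64);	/* hi(p) */
/		}
/		return (cy);
/	}
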
	ENTRY(big_mul_set_vec)
	xorq	%rax, %rax		/ if (len == 0) return (0)

	movq	%rdx, %r8		/ Use r8 for len; %rdx is used by mul
	xorq	%r9, %r9		/ cy = 0

	cmpq	$8, %r8			/ 8 - len
	movq	0(%rsi), %rax		/ rax = a[0]
	movq	8(%rsi), %r11		/ prefetch a[1]
	mulq	%rcx			/ p = a[0] * digit
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	16(%rsi), %r11		/ prefetch a[2]
	mulq	%rcx			/ p = a[1] * digit
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	24(%rsi), %r11		/ prefetch a[3]
	mulq	%rcx			/ p = a[2] * digit
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	32(%rsi), %r11		/ prefetch a[4]
	mulq	%rcx			/ p = a[3] * digit
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	40(%rsi), %r11		/ prefetch a[5]
	mulq	%rcx			/ p = a[4] * digit
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	48(%rsi), %r11		/ prefetch a[6]
	mulq	%rcx			/ p = a[5] * digit
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	56(%rsi), %r11		/ prefetch a[7]
	mulq	%rcx			/ p = a[6] * digit
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	mulq	%rcx			/ p = a[7] * digit
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 56(%rdi)		/ r[7] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	mulq	%rcx			/ p = a[0] * digit
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	mulq	%rcx			/ p = a[1] * digit
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	mulq	%rcx			/ p = a[2] * digit
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	mulq	%rcx			/ p = a[3] * digit
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	mulq	%rcx			/ p = a[4] * digit
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	mulq	%rcx			/ p = a[5] * digit
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	mulq	%rcx			/ p = a[6] * digit
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	SET_SIZE(big_mul_set_vec)
/ ------------------------------------------------------------------------
/	Implementation of big_mul_add_vec which exploits
/	the 64X64->128 bit unsigned multiply instruction.
/
/	As defined in Sun's bignum library for pkcs11, bignums are
/	composed of an array of 64-bit "digits" or "chunks" along with
/	descriptive information.
/ ------------------------------------------------------------------------

/ r += a * digit, r and a are vectors of length len
/ returns the carry digit
/ r and a are 64 bit aligned.
/
/ big_mul_add_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
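
/ For reference, a minimal C sketch of the multiply-accumulate step this
/ routine implements, assuming a compiler that provides the __uint128_t
/ extension.  Illustrative equivalent only (hypothetical helper name), not
/ the stub used by the build:
/
/	uint64_t
/	big_mul_add_vec_c(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/	{
/		uint64_t cy = 0;
/		int i;
/
/		for (i = 0; i < len; i++) {
/			__uint128_t p = (__uint128_t)a[i] * digit + r[i] + cy;
/			r[i] = (uint64_t)p;		/* lo(p) */
/			cy = (uint64_t)(p >> 64);	/* hi(p) */
/		}
/		return (cy);
/	}
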
	ENTRY(big_mul_add_vec)
	xorq	%rax, %rax		/ if (len == 0) return (0)

	movq	%rdx, %r8		/ Use r8 for len; %rdx is used by mul
	xorq	%r9, %r9		/ cy = 0

	cmpq	$8, %r8			/ 8 - len
	movq	0(%rsi), %rax		/ rax = a[0]
	movq	0(%rdi), %r10		/ r10 = r[0]
	movq	8(%rsi), %r11		/ prefetch a[1]
	mulq	%rcx			/ p = a[0] * digit
	adcq	$0, %rdx		/ p += r[0]
	movq	8(%rdi), %r10		/ prefetch r[1]
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	16(%rsi), %r11		/ prefetch a[2]
	mulq	%rcx			/ p = a[1] * digit
	adcq	$0, %rdx		/ p += r[1]
	movq	16(%rdi), %r10		/ prefetch r[2]
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	24(%rsi), %r11		/ prefetch a[3]
	mulq	%rcx			/ p = a[2] * digit
	adcq	$0, %rdx		/ p += r[2]
	movq	24(%rdi), %r10		/ prefetch r[3]
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	32(%rsi), %r11		/ prefetch a[4]
	mulq	%rcx			/ p = a[3] * digit
	adcq	$0, %rdx		/ p += r[3]
	movq	32(%rdi), %r10		/ prefetch r[4]
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	40(%rsi), %r11		/ prefetch a[5]
	mulq	%rcx			/ p = a[4] * digit
	adcq	$0, %rdx		/ p += r[4]
	movq	40(%rdi), %r10		/ prefetch r[5]
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	48(%rsi), %r11		/ prefetch a[6]
	mulq	%rcx			/ p = a[5] * digit
	adcq	$0, %rdx		/ p += r[5]
	movq	48(%rdi), %r10		/ prefetch r[6]
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	56(%rsi), %r11		/ prefetch a[7]
	mulq	%rcx			/ p = a[6] * digit
	adcq	$0, %rdx		/ p += r[6]
	movq	56(%rdi), %r10		/ prefetch r[7]
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	mulq	%rcx			/ p = a[7] * digit
	adcq	$0, %rdx		/ p += r[7]
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 56(%rdi)		/ r[7] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	mulq	%rcx			/ p = a[0] * digit
	adcq	$0, %rdx		/ p += r[0]
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	mulq	%rcx			/ p = a[1] * digit
	adcq	$0, %rdx		/ p += r[1]
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	mulq	%rcx			/ p = a[2] * digit
	adcq	$0, %rdx		/ p += r[2]
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	mulq	%rcx			/ p = a[3] * digit
	adcq	$0, %rdx		/ p += r[3]
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	mulq	%rcx			/ p = a[4] * digit
	adcq	$0, %rdx		/ p += r[4]
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	mulq	%rcx			/ p = a[5] * digit
	adcq	$0, %rdx		/ p += r[5]
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	mulq	%rcx			/ p = a[6] * digit
	adcq	$0, %rdx		/ p += r[6]
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	SET_SIZE(big_mul_add_vec)
/ big_sqr_vec(uint64_t *r, uint64_t *a, int len)
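
/ For reference, a schoolbook C sketch of what big_sqr_vec computes
/ (r = a * a, with r twice as long as a), written in terms of the
/ illustrative helpers sketched above.  The assembly below is faster: it
/ computes each off-diagonal cross product once, doubles it, and adds the
/ diagonal squares, rather than doing a full len x len multiply.
/
/	void
/	big_sqr_vec_c(uint64_t *r, uint64_t *a, int len)
/	{
/		int i;
/
/		r[len] = big_mul_set_vec_c(r, a, len, a[0]);
/		for (i = 1; i < len; i++)
/			r[len + i] = big_mul_add_vec_c(r + i, a, len, a[i]);
/	}
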
	pushq	%rdx			/ save arg3, len
	pushq	%rsi			/ save arg2, a
	pushq	%rdi			/ save arg1, r

	leaq	8(%rdi), %r13		/ tr = r + 1
	movq	%rsi, %r14		/ ta = a
	movq	%rdx, %r15		/ tlen = len
	decq	%r15			/ tlen = len - 1
	movq	%r13, %rdi		/ arg1 = tr
	leaq	8(%r14), %rsi		/ arg2 = ta + 1
	movq	%r15, %rdx		/ arg3 = tlen
	movq	0(%r14), %rcx		/ arg4 = ta[0]
	movq	%rax, 0(%r13, %r15, 8)	/ tr[tlen] = cy
	jz	.L32			/ while (--tlen != 0)

	addq	$16, %r13		/ tr += 2
	movq	%r13, %rdi		/ arg1 = tr
	leaq	8(%r14), %rsi		/ arg2 = ta + 1
	movq	%r15, %rdx		/ arg3 = tlen
	movq	0(%r14), %rcx		/ arg4 = ta[0]
	movq	%rax, 0(%r13, %r15, 8)	/ tr[tlen] = cy

	/ No more function calls after this.
	/ Restore arguments to registers.
	/ However, don't use %rdx for arg3, len, because it is heavily
	/ used by the hardware MUL instruction.  Use %r8, instead.
	movq	0(%rsp), %rdi		/ %rdi == arg1 == r
	movq	8(%rsp), %rsi		/ %rsi == arg2 == a
	movq	16(%rsp), %r8		/ %r8 == arg3 == len

	movq	0(%rsi), %rax		/ %rax = a[0]
	mulq	%rax			/ s = %rdx:%rax = a[0]**2
	movq	%rax, 0(%rdi)		/ r[0] = lo64(s)
	movq	%rdx, %r9		/ cy = hi64(s)
	movq	8(%rdi), %rax		/ p = %rdx:%rax = r[1]
	adcq	$0, %rdx		/ p = p << 1
	adcq	$0, %rdx		/ p = (r[1] << 1) + cy
	movq	%rax, 8(%rdi)		/ r[1] = lo64(p)
	movq	%rdx, %r9		/ cy = hi64(p)
	movq	$1, %r11		/ row = 1
	movq	$2, %r12		/ col = 2

	decq	%r15			/ tlen = len - 1
	cmpq	%r8, %r11		/ len - row
	jae	.L34			/ while (row < len)

	movq	0(%rsi, %r11, 8), %rax	/ s = (uint128_t)a[row]
	mulq	%rax			/ s = s * s
	movq	0(%rdi, %r12, 8), %rcx	/ p = (uint128_t)r[col]
	adcq	$0, %rbx		/ p = p << 1
	adcq	%rbx, %rdx		/ t = p + s
	movq	%rax, %rbp		/ t2 = 0:lo64(t)
	adcq	$0, %r10		/ t2 = %r10:%rbp = lo64(t) + cy
	movq	%rbp, 0(%rdi, %r12, 8)	/ r[col] = lo64(t2)
	adcq	$0, %rcx		/ cy = hi64(t) + hi64(t2)
	je	.L34			/ if (row == len - 1) break

	movq	8(%rdi, %r12, 8), %rax
	adcq	%rcx, %rdx		/ p = (lo64(r[col+1]) << 1) + cy
	movq	%rax, 8(%rdi, %r12, 8)	/ r[col+1] = lo64(p)
	movq	%rdx, %r9		/ cy = hi64(p)

	addq	$2, %r12		/ col += 2

	movq	%r9, 8(%rdi, %r12, 8)	/ r[col+1] = lo64(cy)
	addq	$24, %rsp		/ skip %rdi, %rsi, %rdx

	SET_SIZE(big_sqr_vec)