dmake: do not set MAKEFLAGS=k
[unleashed/tickless.git] / usr / src / common / bignum / amd64 / bignum_amd64_asm.s
blob835854b1940d6e2c68aa87ff167cee1376509334
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 #include <sys/asm_linkage.h>
/ ------------------------------------------------------------------------
/
/  Implementation of big_mul_set_vec which exploits
/  the 64X64->128 bit unsigned multiply instruction.
/
/  As defined in the Sun bignum library for pkcs11, bignums are
/  composed of an array of 64-bit "digits" or "chunks" along with
/  descriptive information.
/
/ ------------------------------------------------------------------------

/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ r and a are 64 bit aligned.
/
/ uint64_t
/ big_mul_set_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/
/ In:	%rdi = r, %rsi = a, %edx = len, %rcx = digit
/ Out:	%rax = carry out of the most significant product digit,
/	or 0 when len <= 0 (nothing is loaded or stored in that case)
/ Uses:	%rax, %rdx, %rsi, %rdi, %r8, %r9, %r11 and the flags.
/	No stack is used and no callee-saved register is touched.
/
/ Register roles inside the function:
/	%r8  = number of digits still to process
/	%r9  = cy, the running carry digit
/	%r11 = the next digit of a, loaded ahead of the multiply
/
/ Every step computes p = a[i] * digit + cy as a 128-bit value in
/ %rdx:%rax.  The sum cannot overflow 128 bits because
/ (2^64 - 1)^2 + (2^64 - 1) < 2^128.
/
	ENTRY(big_mul_set_vec)
	xorq	%r9, %r9		/ cy = 0; this is also the result if len <= 0
	movslq	%edx, %r8		/ r8 = len; only %edx is defined for an int arg,
					/ and %rdx itself is clobbered by mulq
	testq	%r8, %r8
	jle	.L17			/ if (len <= 0) return (0)

.L15:
	cmpq	$8, %r8			/ fewer than 8 digits left?
	jb	.L16			/ yes, finish them one at a time
	movq	0(%rsi), %rax		/ rax = a[0]
	movq	8(%rsi), %r11		/ prefetch a[1]
	mulq	%rcx			/ p = a[0] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	16(%rsi), %r11		/ prefetch a[2]
	mulq	%rcx			/ p = a[1] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	24(%rsi), %r11		/ prefetch a[3]
	mulq	%rcx			/ p = a[2] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	32(%rsi), %r11		/ prefetch a[4]
	mulq	%rcx			/ p = a[3] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	40(%rsi), %r11		/ prefetch a[5]
	mulq	%rcx			/ p = a[4] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	48(%rsi), %r11		/ prefetch a[6]
	mulq	%rcx			/ p = a[5] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	56(%rsi), %r11		/ prefetch a[7]
	mulq	%rcx			/ p = a[6] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	mulq	%rcx			/ p = a[7] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 56(%rdi)		/ r[7] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	addq	$64, %rsi		/ a += 8
	addq	$64, %rdi		/ r += 8
	subq	$8, %r8			/ len -= 8

	jz	.L17			/ nothing left: return cy
	jmp	.L15

	/ Tail: 1 to 7 digits remain (0 < r8 < 8 on entry to .L16).
.L16:
	movq	0(%rsi), %rax
	mulq	%rcx			/ p = a[0] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	8(%rsi), %rax
	mulq	%rcx			/ p = a[1] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	16(%rsi), %rax
	mulq	%rcx			/ p = a[2] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	24(%rsi), %rax
	mulq	%rcx			/ p = a[3] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	32(%rsi), %rax
	mulq	%rcx			/ p = a[4] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	40(%rsi), %rax
	mulq	%rcx			/ p = a[5] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	/ Seventh tail digit: r8 was at most 7 at .L16, so this is the
	/ last one and control simply falls through to the return.
	movq	48(%rsi), %rax
	mulq	%rcx			/ p = a[6] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

.L17:
	movq	%r9, %rax		/ return (cy)
	ret
	SET_SIZE(big_mul_set_vec)
/ ------------------------------------------------------------------------
/
/  Implementation of big_mul_add_vec which exploits
/  the 64X64->128 bit unsigned multiply instruction.
/
/  As defined in the Sun bignum library for pkcs11, bignums are
/  composed of an array of 64-bit "digits" or "chunks" along with
/  descriptive information.
/
/ ------------------------------------------------------------------------

/ r += a * digit, r and a are vectors of length len
/ returns the carry digit
/ r and a are 64 bit aligned.
/
/ uint64_t
/ big_mul_add_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/
/ In:	%rdi = r, %rsi = a, %edx = len, %rcx = digit
/ Out:	%rax = carry out of the most significant digit,
/	or 0 when len <= 0 (nothing is loaded or stored in that case)
/ Uses:	%rax, %rdx, %rsi, %rdi, %r8, %r9, %r10, %r11 and the flags.
/	No stack is used and no callee-saved register is touched.
/
/ Register roles inside the function:
/	%r8  = number of digits still to process
/	%r9  = cy, the running carry digit
/	%r10 = the current digit of r, loaded ahead of its use
/	%r11 = the next digit of a, loaded ahead of the multiply
/
/ Every step computes p = a[i] * digit + r[i] + cy as a 128-bit value
/ in %rdx:%rax.  The sum cannot overflow 128 bits because
/ (2^64 - 1)^2 + 2 * (2^64 - 1) = 2^128 - 1.
/
	ENTRY(big_mul_add_vec)
	xorq	%r9, %r9		/ cy = 0; this is also the result if len <= 0
	movslq	%edx, %r8		/ r8 = len; only %edx is defined for an int arg,
					/ and %rdx itself is clobbered by mulq
	testq	%r8, %r8
	jle	.L27			/ if (len <= 0) return (0)

.L25:
	cmpq	$8, %r8			/ fewer than 8 digits left?
	jb	.L26			/ yes, finish them one at a time
	movq	0(%rsi), %rax		/ rax = a[0]
	movq	0(%rdi), %r10		/ r10 = r[0]
	movq	8(%rsi), %r11		/ prefetch a[1]
	mulq	%rcx			/ p = a[0] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[0]
	movq	8(%rdi), %r10		/ prefetch r[1]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	16(%rsi), %r11		/ prefetch a[2]
	mulq	%rcx			/ p = a[1] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[1]
	movq	16(%rdi), %r10		/ prefetch r[2]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	24(%rsi), %r11		/ prefetch a[3]
	mulq	%rcx			/ p = a[2] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[2]
	movq	24(%rdi), %r10		/ prefetch r[3]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	32(%rsi), %r11		/ prefetch a[4]
	mulq	%rcx			/ p = a[3] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[3]
	movq	32(%rdi), %r10		/ prefetch r[4]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	40(%rsi), %r11		/ prefetch a[5]
	mulq	%rcx			/ p = a[4] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[4]
	movq	40(%rdi), %r10		/ prefetch r[5]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	48(%rsi), %r11		/ prefetch a[6]
	mulq	%rcx			/ p = a[5] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[5]
	movq	48(%rdi), %r10		/ prefetch r[6]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	56(%rsi), %r11		/ prefetch a[7]
	mulq	%rcx			/ p = a[6] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[6]
	movq	56(%rdi), %r10		/ prefetch r[7]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	mulq	%rcx			/ p = a[7] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[7]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 56(%rdi)		/ r[7] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	addq	$64, %rsi		/ a += 8
	addq	$64, %rdi		/ r += 8
	subq	$8, %r8			/ len -= 8

	jz	.L27			/ nothing left: return cy
	jmp	.L25

	/ Tail: 1 to 7 digits remain (0 < r8 < 8 on entry to .L26).
.L26:
	movq	0(%rsi), %rax
	movq	0(%rdi), %r10
	mulq	%rcx			/ p = a[0] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[0]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	8(%rsi), %rax
	movq	8(%rdi), %r10
	mulq	%rcx			/ p = a[1] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[1]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	16(%rsi), %rax
	movq	16(%rdi), %r10
	mulq	%rcx			/ p = a[2] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[2]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	24(%rsi), %rax
	movq	24(%rdi), %r10
	mulq	%rcx			/ p = a[3] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[3]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	32(%rsi), %rax
	movq	32(%rdi), %r10
	mulq	%rcx			/ p = a[4] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[4]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	40(%rsi), %rax
	movq	40(%rdi), %r10
	mulq	%rcx			/ p = a[5] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[5]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	/ Seventh tail digit: r8 was at most 7 at .L26, so this is the
	/ last one and control simply falls through to the return.
	movq	48(%rsi), %rax
	movq	48(%rdi), %r10
	mulq	%rcx			/ p = a[6] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[6]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

.L27:
	movq	%r9, %rax		/ return (cy)
	ret
	SET_SIZE(big_mul_add_vec)
/ void
/ big_sqr_vec(uint64_t *r, uint64_t *a, int len)
/
/ r = a * a, where a has len digits and r receives 2 * len digits.
/
/ In:	%rdi = r, %rsi = a, %edx = len
/ Out:	nothing is returned; r[0 .. 2*len-1] is written.
/	len <= 0 writes nothing; len == 1 is handled by one multiply.
/
/ The general case (len >= 2) works in two phases:
/   1. Calls to big_mul_set_vec and big_mul_add_vec accumulate the
/      products a[i] * a[j] for i < j into r, starting at r[1].
/   2. A single pass then doubles that partial result and adds the
/      squares a[row]**2, propagating the carry upwards; r[0] and the
/      final carry digit r[2*len-1] are written in this phase.
/
/ Callee-saved registers %rbx, %rbp and %r12-%r15 are saved and
/ restored.  Nine quadwords are pushed before the calls, so with the
/ usual ABI stack alignment on entry %rsp is 16-byte aligned at each
/ call.
/
	ENTRY(big_sqr_vec)
	cmpl	$1, %edx		/ only %edx is defined for an int arg
	jg	.L30			/ len >= 2: general case
	jl	.L35			/ len <= 0: nothing to compute
	movq	0(%rsi), %rax		/ len == 1:
	mulq	%rax			/ %rdx:%rax = a[0]**2
	movq	%rax, 0(%rdi)		/ r[0] = lo64
	movq	%rdx, 8(%rdi)		/ r[1] = hi64
.L35:
	ret

.L30:
	movslq	%edx, %rdx		/ widen len to 64 bits before it is used as such
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	pushq	%rdx			/ save arg3, len
	pushq	%rsi			/ save arg2, a
	pushq	%rdi			/ save arg1, r

	/ Phase 1: cross products.  %r13 = tr, %r14 = ta, %r15 = tlen
	/ are kept in callee-saved registers so they survive the calls.
	leaq	8(%rdi), %r13		/ tr = r + 1
	movq	%rsi, %r14		/ ta = a
	movq	%rdx, %r15		/ tlen = len
	decq	%r15			/ tlen = len - 1
	movq	%r13, %rdi		/ arg1 = tr
	leaq	8(%r14), %rsi		/ arg2 = ta + 1
	movq	%r15, %rdx		/ arg3 = tlen
	movq	0(%r14), %rcx		/ arg4 = ta[0]
	call	big_mul_set_vec
	movq	%rax, 0(%r13, %r15, 8)	/ tr[tlen] = cy
.L31:
	decq	%r15			/ --tlen
	jz	.L32			/ while (--tlen != 0)

	addq	$16, %r13		/ tr += 2
	addq	$8, %r14		/ ++ta
	movq	%r13, %rdi		/ arg1 = tr
	leaq	8(%r14), %rsi		/ arg2 = ta + 1
	movq	%r15, %rdx		/ arg3 = tlen
	movq	0(%r14), %rcx		/ arg4 = ta[0]
	call	big_mul_add_vec
	movq	%rax, 0(%r13, %r15, 8)	/ tr[tlen] = cy
	jmp	.L31

.L32:

	/ Phase 2: double the cross products and add the squares.
	/ No more function calls after this.
	/ Restore arguments to registers.
	/ However, %rdx is not used for arg3, len, because it is heavily
	/ used by the hardware MUL instruction.  Use %r8, instead.
	movq	0(%rsp), %rdi		/ %rdi == arg1 == r
	movq	8(%rsp), %rsi		/ %rsi == arg2 == a
	movq	16(%rsp), %r8		/ %r8  == arg3 == len

	movq	0(%rsi), %rax		/ %rax = a[0];
	mulq	%rax			/ s = %rdx:%rax = a[0]**2
	movq	%rax, 0(%rdi)		/ r[0] = lo64(s)
	movq	%rdx, %r9		/ cy = hi64(s)
	xorq	%rdx, %rdx
	movq	8(%rdi), %rax		/ p = %rdx:%rax = r[1]
	addq	%rax, %rax
	adcq	$0, %rdx		/ p = p << 1
	addq	%r9, %rax
	adcq	$0, %rdx		/ p = (r[1] << 1) + cy
	movq	%rax, 8(%rdi)		/ r[1] = lo64(p)
	movq	%rdx, %r9		/ cy = hi64(p)
	movq	$1, %r11		/ row = 1
	movq	$2, %r12		/ col = 2
	movq	%r8, %r15
	decq	%r15			/ tlen = len - 1
.L33:
	cmpq	%r8, %r11		/ len - row
	jae	.L34			/ while (row < len)

	movq	0(%rsi, %r11, 8), %rax	/ s = (uint128_t)a[row]
	mulq	%rax			/ s = s * s
	xorq	%rbx, %rbx
	movq	0(%rdi, %r12, 8), %rcx	/ p = (uint128_t)r[col]
	addq	%rcx, %rcx
	adcq	$0, %rbx		/ p = p << 1
	addq	%rcx, %rax
	adcq	%rbx, %rdx		/ t = p + s
	xorq	%r10, %r10
	movq	%rax, %rbp		/ t2 = 0:lo64(t)
	addq	%r9, %rbp
	adcq	$0, %r10		/ t2 = %r10:%rbp = lo64(t) + cy
	movq	%rbp, 0(%rdi, %r12, 8)	/ r[col] = lo64(t2)
	xorq	%rcx, %rcx
	movq	%rdx, %r9
	addq	%r10, %r9
	adcq	$0, %rcx		/ cy = %rcx:%r9 = hi64(t) + hi64(t2)
	cmpq	%r11, %r15
	je	.L34			/ if (row == len - 1) break
	xorq	%rdx, %rdx
	movq	8(%rdi, %r12, 8), %rax
	addq	%rax, %rax
	adcq	$0, %rdx
	addq	%r9, %rax
	adcq	%rcx, %rdx		/ p = (lo64(r[col+1]) << 1) + cy
	movq	%rax, 8(%rdi, %r12, 8)	/ r[col+1] = lo64(p)
	movq	%rdx, %r9		/ cy = hi64(p)

	incq	%r11			/ ++row
	addq	$2, %r12		/ col += 2
	jmp	.L33

.L34:
	movq	%r9, 8(%rdi, %r12, 8)	/ r[col+1] = lo64(cy)

	addq	$24, %rsp		/ skip %rdi, %rsi, %rdx
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbp
	popq	%rbx

	ret

	SET_SIZE(big_sqr_vec)