8322 nl: misleading-indentation
[unleashed/tickless.git] / usr / src / common / bignum / amd64 / bignum_amd64_asm.s
blob411129dd4cb6fbfe4644ec78e6c8bff6ea7aa660
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 #include <sys/asm_linkage.h>
28 #if defined(lint) || defined(__lint)
30 #include <sys/types.h>
/*
 * lint-only C stand-ins for the assembly entry points below.
 * They exist so lint can type-check callers; they are never executed.
 * (The real implementations are in the #else branch of this file.)
 */
32 /* ARGSUSED */
33 uint64_t
34 big_mul_set_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
35 { return (0); }
37 /* ARGSUSED */
38 uint64_t
39 big_mul_add_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
40 { return (0); }
42 /* ARGSUSED */
43 void
44 big_sqr_vec(uint64_t *r, uint64_t *a, int len)
47 #else /* lint */
49 / ------------------------------------------------------------------------
51 / Implementation of big_mul_set_vec which exploits
52 / the 64X64->128 bit unsigned multiply instruction.
54 / As defined in Sun's bignum library for pkcs11, bignums are
55 / composed of an array of 64-bit "digits" or "chunks" along with
56 / descriptive information.
58 / ------------------------------------------------------------------------
60 / r = a * digit, r and a are vectors of length len
61 / returns the carry digit
62 / r and a are 64 bit aligned.
64 / uint64_t
65 / big_mul_set_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/ big_mul_set_vec(r=%rdi, a=%rsi, len=%rdx, digit=%rcx)
/ Computes r[i] = a[i] * digit for i in [0, len) with full carry
/ propagation; returns (in %rax) the final carry digit.
/ Uses the unsigned MULQ instruction, which writes the 128-bit
/ product of %rax * operand into %rdx:%rax -- hence len is moved
/ out of %rdx into %r8 before the loop.
67 ENTRY(big_mul_set_vec)
68 xorq %rax, %rax / if (len == 0) return (0)
69 testq %rdx, %rdx
70 jz .L17
72 movq %rdx, %r8 / Use r8 for len; %rdx is used by mul
73 xorq %r9, %r9 / cy = 0
/ Main loop, unrolled 8x: each iteration multiplies eight digits,
/ threading the carry (%r9) through the chain; the next a[] word is
/ loaded into %r11 ahead of the mulq that will consume it.
75 .L15:
76 cmpq $8, %r8 / 8 - len
77 jb .L16
78 movq 0(%rsi), %rax / rax = a[0]
79 movq 8(%rsi), %r11 / prefetch a[1]
80 mulq %rcx / p = a[0] * digit
81 addq %r9, %rax
82 adcq $0, %rdx / p += cy
83 movq %rax, 0(%rdi) / r[0] = lo(p)
84 movq %rdx, %r9 / cy = hi(p)
86 movq %r11, %rax
87 movq 16(%rsi), %r11 / prefetch a[2]
88 mulq %rcx / p = a[1] * digit
89 addq %r9, %rax
90 adcq $0, %rdx / p += cy
91 movq %rax, 8(%rdi) / r[1] = lo(p)
92 movq %rdx, %r9 / cy = hi(p)
94 movq %r11, %rax
95 movq 24(%rsi), %r11 / prefetch a[3]
96 mulq %rcx / p = a[2] * digit
97 addq %r9, %rax
98 adcq $0, %rdx / p += cy
99 movq %rax, 16(%rdi) / r[2] = lo(p)
100 movq %rdx, %r9 / cy = hi(p)
102 movq %r11, %rax
103 movq 32(%rsi), %r11 / prefetch a[4]
104 mulq %rcx / p = a[3] * digit
105 addq %r9, %rax
106 adcq $0, %rdx / p += cy
107 movq %rax, 24(%rdi) / r[3] = lo(p)
108 movq %rdx, %r9 / cy = hi(p)
110 movq %r11, %rax
111 movq 40(%rsi), %r11 / prefetch a[5]
112 mulq %rcx / p = a[4] * digit
113 addq %r9, %rax
114 adcq $0, %rdx / p += cy
115 movq %rax, 32(%rdi) / r[4] = lo(p)
116 movq %rdx, %r9 / cy = hi(p)
118 movq %r11, %rax
119 movq 48(%rsi), %r11 / prefetch a[6]
120 mulq %rcx / p = a[5] * digit
121 addq %r9, %rax
122 adcq $0, %rdx / p += cy
123 movq %rax, 40(%rdi) / r[5] = lo(p)
124 movq %rdx, %r9 / cy = hi(p)
126 movq %r11, %rax
127 movq 56(%rsi), %r11 / prefetch a[7]
128 mulq %rcx / p = a[6] * digit
129 addq %r9, %rax
130 adcq $0, %rdx / p += cy
131 movq %rax, 48(%rdi) / r[6] = lo(p)
132 movq %rdx, %r9 / cy = hi(p)
134 movq %r11, %rax
135 mulq %rcx / p = a[7] * digit
136 addq %r9, %rax
137 adcq $0, %rdx / p += cy
138 movq %rax, 56(%rdi) / r[7] = lo(p)
139 movq %rdx, %r9 / cy = hi(p)
/ Advance both vectors by 8 digits (64 bytes) and loop.
141 addq $64, %rsi
142 addq $64, %rdi
143 subq $8, %r8
145 jz .L17
146 jmp .L15
/ Tail: fewer than 8 digits remain; handle up to 7 one at a time,
/ falling out to .L17 as soon as %r8 reaches zero.
148 .L16:
149 movq 0(%rsi), %rax
150 mulq %rcx / p = a[0] * digit
151 addq %r9, %rax
152 adcq $0, %rdx / p += cy
153 movq %rax, 0(%rdi) / r[0] = lo(p)
154 movq %rdx, %r9 / cy = hi(p)
155 decq %r8
156 jz .L17
158 movq 8(%rsi), %rax
159 mulq %rcx / p = a[1] * digit
160 addq %r9, %rax
161 adcq $0, %rdx / p += cy
162 movq %rax, 8(%rdi) / r[1] = lo(p)
163 movq %rdx, %r9 / cy = hi(p)
164 decq %r8
165 jz .L17
167 movq 16(%rsi), %rax
168 mulq %rcx / p = a[2] * digit
169 addq %r9, %rax
170 adcq $0, %rdx / p += cy
171 movq %rax, 16(%rdi) / r[2] = lo(p)
172 movq %rdx, %r9 / cy = hi(p)
173 decq %r8
174 jz .L17
176 movq 24(%rsi), %rax
177 mulq %rcx / p = a[3] * digit
178 addq %r9, %rax
179 adcq $0, %rdx / p += cy
180 movq %rax, 24(%rdi) / r[3] = lo(p)
181 movq %rdx, %r9 / cy = hi(p)
182 decq %r8
183 jz .L17
185 movq 32(%rsi), %rax
186 mulq %rcx / p = a[4] * digit
187 addq %r9, %rax
188 adcq $0, %rdx / p += cy
189 movq %rax, 32(%rdi) / r[4] = lo(p)
190 movq %rdx, %r9 / cy = hi(p)
191 decq %r8
192 jz .L17
194 movq 40(%rsi), %rax
195 mulq %rcx / p = a[5] * digit
196 addq %r9, %rax
197 adcq $0, %rdx / p += cy
198 movq %rax, 40(%rdi) / r[5] = lo(p)
199 movq %rdx, %r9 / cy = hi(p)
200 decq %r8
201 jz .L17
203 movq 48(%rsi), %rax
204 mulq %rcx / p = a[6] * digit
205 addq %r9, %rax
206 adcq $0, %rdx / p += cy
207 movq %rax, 48(%rdi) / r[6] = lo(p)
208 movq %rdx, %r9 / cy = hi(p)
209 decq %r8
210 jz .L17
/ Common exit: return the final carry digit.
213 .L17:
214 movq %r9, %rax
/ NOTE(review): no `ret` is visible between the movq above and
/ SET_SIZE; the skipped original line number (215) suggests the
/ `ret` was lost in extraction -- restore it before assembling.
216 SET_SIZE(big_mul_set_vec)
219 / ------------------------------------------------------------------------
221 / Implementation of big_mul_add_vec which exploits
222 / the 64X64->128 bit unsigned multiply instruction.
224 / As defined in Sun's bignum library for pkcs11, bignums are
225 / composed of an array of 64-bit "digits" or "chunks" along with
226 / descriptive information.
228 / ------------------------------------------------------------------------
230 / r += a * digit, r and a are vectors of length len
231 / returns the carry digit
232 / r and a are 64 bit aligned.
234 / uint64_t
235 / big_mul_add_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/ big_mul_add_vec(r=%rdi, a=%rsi, len=%rdx, digit=%rcx)
/ Computes r[i] += a[i] * digit for i in [0, len) with full carry
/ propagation; returns (in %rax) the final carry digit.
/ Identical structure to big_mul_set_vec above, except each step
/ also accumulates the existing r[i] (held in %r10) into the
/ 128-bit product before the carry is folded in.
237 ENTRY(big_mul_add_vec)
238 xorq %rax, %rax / if (len == 0) return (0)
239 testq %rdx, %rdx
240 jz .L27
242 movq %rdx, %r8 / Use r8 for len; %rdx is used by mul
243 xorq %r9, %r9 / cy = 0
/ Main loop, unrolled 8x: %r11 holds the next a[] word and %r10
/ the next r[] word, loaded ahead of the mulq that consumes them.
245 .L25:
246 cmpq $8, %r8 / 8 - len
247 jb .L26
248 movq 0(%rsi), %rax / rax = a[0]
249 movq 0(%rdi), %r10 / r10 = r[0]
250 movq 8(%rsi), %r11 / prefetch a[1]
251 mulq %rcx / p = a[0] * digit
252 addq %r10, %rax
253 adcq $0, %rdx / p += r[0]
254 movq 8(%rdi), %r10 / prefetch r[1]
255 addq %r9, %rax
256 adcq $0, %rdx / p += cy
257 movq %rax, 0(%rdi) / r[0] = lo(p)
258 movq %rdx, %r9 / cy = hi(p)
260 movq %r11, %rax
261 movq 16(%rsi), %r11 / prefetch a[2]
262 mulq %rcx / p = a[1] * digit
263 addq %r10, %rax
264 adcq $0, %rdx / p += r[1]
265 movq 16(%rdi), %r10 / prefetch r[2]
266 addq %r9, %rax
267 adcq $0, %rdx / p += cy
268 movq %rax, 8(%rdi) / r[1] = lo(p)
269 movq %rdx, %r9 / cy = hi(p)
271 movq %r11, %rax
272 movq 24(%rsi), %r11 / prefetch a[3]
273 mulq %rcx / p = a[2] * digit
274 addq %r10, %rax
275 adcq $0, %rdx / p += r[2]
276 movq 24(%rdi), %r10 / prefetch r[3]
277 addq %r9, %rax
278 adcq $0, %rdx / p += cy
279 movq %rax, 16(%rdi) / r[2] = lo(p)
280 movq %rdx, %r9 / cy = hi(p)
282 movq %r11, %rax
283 movq 32(%rsi), %r11 / prefetch a[4]
284 mulq %rcx / p = a[3] * digit
285 addq %r10, %rax
286 adcq $0, %rdx / p += r[3]
287 movq 32(%rdi), %r10 / prefetch r[4]
288 addq %r9, %rax
289 adcq $0, %rdx / p += cy
290 movq %rax, 24(%rdi) / r[3] = lo(p)
291 movq %rdx, %r9 / cy = hi(p)
293 movq %r11, %rax
294 movq 40(%rsi), %r11 / prefetch a[5]
295 mulq %rcx / p = a[4] * digit
296 addq %r10, %rax
297 adcq $0, %rdx / p += r[4]
298 movq 40(%rdi), %r10 / prefetch r[5]
299 addq %r9, %rax
300 adcq $0, %rdx / p += cy
301 movq %rax, 32(%rdi) / r[4] = lo(p)
302 movq %rdx, %r9 / cy = hi(p)
304 movq %r11, %rax
305 movq 48(%rsi), %r11 / prefetch a[6]
306 mulq %rcx / p = a[5] * digit
307 addq %r10, %rax
308 adcq $0, %rdx / p += r[5]
309 movq 48(%rdi), %r10 / prefetch r[6]
310 addq %r9, %rax
311 adcq $0, %rdx / p += cy
312 movq %rax, 40(%rdi) / r[5] = lo(p)
313 movq %rdx, %r9 / cy = hi(p)
315 movq %r11, %rax
316 movq 56(%rsi), %r11 / prefetch a[7]
317 mulq %rcx / p = a[6] * digit
318 addq %r10, %rax
319 adcq $0, %rdx / p += r[6]
320 movq 56(%rdi), %r10 / prefetch r[7]
321 addq %r9, %rax
322 adcq $0, %rdx / p += cy
323 movq %rax, 48(%rdi) / r[6] = lo(p)
324 movq %rdx, %r9 / cy = hi(p)
326 movq %r11, %rax
327 mulq %rcx / p = a[7] * digit
328 addq %r10, %rax
329 adcq $0, %rdx / p += r[7]
330 addq %r9, %rax
331 adcq $0, %rdx / p += cy
332 movq %rax, 56(%rdi) / r[7] = lo(p)
333 movq %rdx, %r9 / cy = hi(p)
/ Advance both vectors by 8 digits (64 bytes) and loop.
335 addq $64, %rsi
336 addq $64, %rdi
337 subq $8, %r8
339 jz .L27
340 jmp .L25
/ Tail: fewer than 8 digits remain; handle up to 7 one at a time,
/ falling out to .L27 as soon as %r8 reaches zero.
342 .L26:
343 movq 0(%rsi), %rax
344 movq 0(%rdi), %r10
345 mulq %rcx / p = a[0] * digit
346 addq %r10, %rax
347 adcq $0, %rdx / p += r[0]
348 addq %r9, %rax
349 adcq $0, %rdx / p += cy
350 movq %rax, 0(%rdi) / r[0] = lo(p)
351 movq %rdx, %r9 / cy = hi(p)
352 decq %r8
353 jz .L27
355 movq 8(%rsi), %rax
356 movq 8(%rdi), %r10
357 mulq %rcx / p = a[1] * digit
358 addq %r10, %rax
359 adcq $0, %rdx / p += r[1]
360 addq %r9, %rax
361 adcq $0, %rdx / p += cy
362 movq %rax, 8(%rdi) / r[1] = lo(p)
363 movq %rdx, %r9 / cy = hi(p)
364 decq %r8
365 jz .L27
367 movq 16(%rsi), %rax
368 movq 16(%rdi), %r10
369 mulq %rcx / p = a[2] * digit
370 addq %r10, %rax
371 adcq $0, %rdx / p += r[2]
372 addq %r9, %rax
373 adcq $0, %rdx / p += cy
374 movq %rax, 16(%rdi) / r[2] = lo(p)
375 movq %rdx, %r9 / cy = hi(p)
376 decq %r8
377 jz .L27
379 movq 24(%rsi), %rax
380 movq 24(%rdi), %r10
381 mulq %rcx / p = a[3] * digit
382 addq %r10, %rax
383 adcq $0, %rdx / p += r[3]
384 addq %r9, %rax
385 adcq $0, %rdx / p += cy
386 movq %rax, 24(%rdi) / r[3] = lo(p)
387 movq %rdx, %r9 / cy = hi(p)
388 decq %r8
389 jz .L27
391 movq 32(%rsi), %rax
392 movq 32(%rdi), %r10
393 mulq %rcx / p = a[4] * digit
394 addq %r10, %rax
395 adcq $0, %rdx / p += r[4]
396 addq %r9, %rax
397 adcq $0, %rdx / p += cy
398 movq %rax, 32(%rdi) / r[4] = lo(p)
399 movq %rdx, %r9 / cy = hi(p)
400 decq %r8
401 jz .L27
403 movq 40(%rsi), %rax
404 movq 40(%rdi), %r10
405 mulq %rcx / p = a[5] * digit
406 addq %r10, %rax
407 adcq $0, %rdx / p += r[5]
408 addq %r9, %rax
409 adcq $0, %rdx / p += cy
410 movq %rax, 40(%rdi) / r[5] = lo(p)
411 movq %rdx, %r9 / cy = hi(p)
412 decq %r8
413 jz .L27
415 movq 48(%rsi), %rax
416 movq 48(%rdi), %r10
417 mulq %rcx / p = a[6] * digit
418 addq %r10, %rax
419 adcq $0, %rdx / p += r[6]
420 addq %r9, %rax
421 adcq $0, %rdx / p += cy
422 movq %rax, 48(%rdi) / r[6] = lo(p)
423 movq %rdx, %r9 / cy = hi(p)
424 decq %r8
425 jz .L27
/ Common exit: return the final carry digit.
428 .L27:
429 movq %r9, %rax
/ NOTE(review): no `ret` is visible between the movq above and
/ SET_SIZE; the skipped original line number (430) suggests the
/ `ret` was lost in extraction -- restore it before assembling.
431 SET_SIZE(big_mul_add_vec)
434 / void
435 / big_sqr_vec(uint64_t *r, uint64_t *a, int len)
/ big_sqr_vec(r=%rdi, a=%rsi, len=%rdx)
/ Computes r = a * a (a 2*len-digit square of a len-digit bignum).
/ Strategy visible below: first accumulate all cross products
/ a[row] * a[col] (row < col) into r[1..] via big_mul_set_vec /
/ big_mul_add_vec, then double that partial result while adding in
/ the diagonal squares a[i]**2, carrying through in a single pass.
/ Saves the callee-saved registers it uses plus the three incoming
/ arguments (reloaded from the stack after the calls).
437 ENTRY(big_sqr_vec)
438 pushq %rbx
439 pushq %rbp
440 pushq %r12
441 pushq %r13
442 pushq %r14
443 pushq %r15
444 pushq %rdx / save arg3, len
445 pushq %rsi / save arg2, a
446 pushq %rdi / save arg1, r
/ Cross-product phase: tr[0..] = a[1..] * a[0], then for each
/ subsequent row accumulate a[row+1..] * a[row] two digits up.
448 leaq 8(%rdi), %r13 / tr = r + 1
449 movq %rsi, %r14 / ta = a
450 movq %rdx, %r15 / tlen = len
451 decq %r15 / tlen = len - 1
452 movq %r13, %rdi / arg1 = tr
453 leaq 8(%r14), %rsi / arg2 = ta + 1
454 movq %r15, %rdx / arg3 = tlen
455 movq 0(%r14), %rcx / arg4 = ta[0]
456 call big_mul_set_vec
457 movq %rax, 0(%r13, %r15, 8) / tr[tlen] = cy
458 .L31:
459 decq %r15 / --tlen
460 jz .L32 / while (--tlen != 0)
462 addq $16, %r13 / tr += 2
463 addq $8, %r14 / ++ta
464 movq %r13, %rdi / arg1 = tr
465 leaq 8(%r14), %rsi / arg2 = ta + 1
466 movq %r15, %rdx / arg3 = tlen
467 movq 0(%r14), %rcx / arg4 = ta[0]
468 call big_mul_add_vec
469 movq %rax, 0(%r13, %r15, 8) / tr[tlen] = cy
470 jmp .L31
472 .L32:
474 / No more function calls after this.
475 / Restore arguments to registers.
476 / However, don't use %rdx for arg3, len, because it is heavily
477 / used by the hardware MUL instruction. Use %r8, instead.
478 movq 0(%rsp), %rdi / %rdi == arg1 == r
479 movq 8(%rsp), %rsi / %rsi == arg2 == a
480 movq 16(%rsp), %r8 / %r8 == arg3 == len
/ Diagonal phase, first digit: r[0] = lo(a[0]**2) and
/ r[1] = (r[1] << 1) + hi(a[0]**2), tracking carry in %r9.
482 movq 0(%rsi), %rax / %rax = a[0];
483 mulq %rax / s = %edx:%eax = a[0]**2
484 movq %rax, 0(%rdi) / r[0] = lo64(s)
485 movq %rdx, %r9 / cy = hi64(s)
486 xorq %rdx, %rdx
487 movq 8(%rdi), %rax / p = %rdx:%rax = r[1]
488 addq %rax, %rax
489 adcq $0, %rdx / p = p << 1
490 addq %r9, %rax
491 adcq $0, %rdx / p = (r[1] << 1) + cy
492 movq %rax, 8(%rdi) / r[1] = lo64(p)
493 movq %rdx, %r9 / cy = hi64(p)
494 movq $1, %r11 / row = 1
495 movq $2, %r12 / col = 2
496 movq %r8, %r15
497 decq %r15 / tlen = len - 1
/ Per-row loop: r[col] = (r[col] << 1) + a[row]**2 + cy, then
/ r[col+1] = (r[col+1] << 1) + cy, advancing row by 1, col by 2.
498 .L33:
499 cmpq %r8, %r11 / len - row
500 jae .L34 / while (row < len)
502 movq 0(%rsi, %r11, 8), %rax / s = (uint128_t)a[row]
503 mulq %rax / s = s * s
504 xorq %rbx, %rbx
505 movq 0(%rdi, %r12, 8), %rcx / p = (uint128_t)r[col]
506 addq %rcx, %rcx
507 adcq $0, %rbx / p = p << 1
508 addq %rcx, %rax
509 adcq %rbx, %rdx / t = p + s
510 xorq %r10, %r10
511 movq %rax, %rbp / t2 = 0:lo64(t)
512 addq %r9, %rbp
513 adcq $0, %r10 / t2 = %r10:%rbp = lo64(t) + cy
514 movq %rbp, 0(%rdi, %r12, 8) / r[col] = lo64(t2)
515 xorq %rcx, %rcx
516 movq %rdx, %r9
517 addq %r10, %r9
518 adcq $0, %rcx / cy = hi64(t) + hi64(t2)
519 cmpq %r11, %r15
520 je .L34 / if (row == len - 1) break
521 xorq %rdx, %rdx
522 movq 8(%rdi, %r12, 8), %rax
523 addq %rax, %rax
524 adcq $0, %rdx
525 addq %r9, %rax
526 adcq %rcx, %rdx / p = (lo64(r[col+1]) << 1) + cy
527 movq %rax, 8(%rdi, %r12, 8) / r[col+1] = lo64(p)
528 movq %rdx, %r9 / cy = hi64(p)
530 incq %r11 / ++row
531 addq $2, %r12 / col += 2
532 jmp .L33
/ Store the final carry, drop the 3 saved args, restore
/ callee-saved registers in reverse push order.
534 .L34:
535 movq %r9, 8(%rdi, %r12, 8) / r[col+1] = lo64(cy)
537 addq $24, %rsp / skip %rdi, %rsi, %rdx
538 popq %r15
539 popq %r14
540 popq %r13
541 popq %r12
542 popq %rbp
543 popq %rbx
/ NOTE(review): no `ret` is visible after the pops; the skipped
/ original line numbers (544-546) suggest the `ret` was lost in
/ extraction -- restore it before assembling.
547 SET_SIZE(big_sqr_vec)
549 #endif /* lint */