/ ***** BEGIN LICENSE BLOCK *****
/ Version: MPL 1.1/GPL 2.0/LGPL 2.1
/
/ The contents of this file are subject to the Mozilla Public License Version
/ 1.1 (the "License"); you may not use this file except in compliance with
/ the License. You may obtain a copy of the License at
/ http://www.mozilla.org/MPL/
/
/ Software distributed under the License is distributed on an "AS IS" basis,
/ WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
/ for the specific language governing rights and limitations under the
/ License.
/
/ The Original Code is the Solaris software cryptographic token.
/
/ The Initial Developer of the Original Code is
/ Sun Microsystems, Inc.
/ Portions created by the Initial Developer are Copyright (C) 2005
/ the Initial Developer. All Rights Reserved.
/
/ Contributor(s):
/   Sun Microsystems, Inc.
/
/ Alternatively, the contents of this file may be used under the terms of
/ either the GNU General Public License Version 2 or later (the "GPL"), or
/ the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
/ in which case the provisions of the GPL or the LGPL are applicable instead
/ of those above. If you wish to allow use of your version of this file only
/ under the terms of either the GPL or the LGPL, and not to allow others to
/ use your version of this file under the terms of the MPL, indicate your
/ decision by deleting the provisions above and replace them with the notice
/ and other provisions required by the GPL or the LGPL. If you do not delete
/ the provisions above, a recipient may use your version of this file under
/ the terms of any one of the MPL, the GPL or the LGPL.
/
/ ***** END LICENSE BLOCK ***** */
/ ------------------------------------------------------------------------
/
/  Implementation of s_mpv_mul_set_vec which exploits
/  the 64x64 -> 128-bit unsigned multiply instruction.
/
/ ------------------------------------------------------------------------

/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ r and a are 64-bit aligned.
/
/ uint64_t
/ s_mpv_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
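/
/ For reference, a minimal C sketch (not part of the original source) of the
/ computation this routine performs; the name s_mpv_mul_set_vec64_ref is
/ hypothetical and the sketch assumes a compiler providing the GCC/Clang
/ unsigned __int128 extension:
/
/   #include <stdint.h>
/
/   uint64_t
/   s_mpv_mul_set_vec64_ref(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/   {
/       uint64_t cy = 0;                          /* running carry */
/       for (int i = 0; i < len; i++) {
/           /* 64x64 -> 128-bit multiply, plus the carry in */
/           unsigned __int128 p = (unsigned __int128)a[i] * digit + cy;
/           r[i] = (uint64_t)p;                   /* low 64 bits */
/           cy = (uint64_t)(p >> 64);             /* high 64 bits */
/       }
/       return (cy);
/   }
/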
        .text
        .align  16
        .globl  s_mpv_mul_set_vec64
        .type   s_mpv_mul_set_vec64, @function
s_mpv_mul_set_vec64:
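/ System V AMD64 ABI argument registers on entry:
/ %rdi = r, %rsi = a, %rdx = len, %rcx = digit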
        xorq    %rax, %rax              / if (len == 0) return (0)
        testq   %rdx, %rdx
        jz      .L17

        movq    %rdx, %r8               / Use r8 for len; %rdx is used by mul
        xorq    %r9, %r9                / cy = 0

.L15:
        cmpq    $8, %r8                 / len < 8 ?
        jb      .L16
        movq    0(%rsi), %rax           / rax = a[0]
        movq    8(%rsi), %r11           / prefetch a[1]
        mulq    %rcx                    / p = a[0] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 0(%rdi)           / r[0] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        movq    16(%rsi), %r11          / prefetch a[2]
        mulq    %rcx                    / p = a[1] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 8(%rdi)           / r[1] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        movq    24(%rsi), %r11          / prefetch a[3]
        mulq    %rcx                    / p = a[2] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 16(%rdi)          / r[2] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        movq    32(%rsi), %r11          / prefetch a[4]
        mulq    %rcx                    / p = a[3] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 24(%rdi)          / r[3] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        movq    40(%rsi), %r11          / prefetch a[5]
        mulq    %rcx                    / p = a[4] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 32(%rdi)          / r[4] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        movq    48(%rsi), %r11          / prefetch a[6]
        mulq    %rcx                    / p = a[5] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 40(%rdi)          / r[5] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        movq    56(%rsi), %r11          / prefetch a[7]
        mulq    %rcx                    / p = a[6] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 48(%rdi)          / r[6] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        mulq    %rcx                    / p = a[7] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 56(%rdi)          / r[7] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        addq    $64, %rsi
        addq    $64, %rdi
        subq    $8, %r8

        jz      .L17
        jmp     .L15
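
/ Tail: multiply the remaining 1 to 7 limbs one at a time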
.L16:
        movq    0(%rsi), %rax
        mulq    %rcx                    / p = a[0] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 0(%rdi)           / r[0] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L17

        movq    8(%rsi), %rax
        mulq    %rcx                    / p = a[1] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 8(%rdi)           / r[1] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L17

        movq    16(%rsi), %rax
        mulq    %rcx                    / p = a[2] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 16(%rdi)          / r[2] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L17

        movq    24(%rsi), %rax
        mulq    %rcx                    / p = a[3] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 24(%rdi)          / r[3] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L17

        movq    32(%rsi), %rax
        mulq    %rcx                    / p = a[4] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 32(%rdi)          / r[4] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L17

        movq    40(%rsi), %rax
        mulq    %rcx                    / p = a[5] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 40(%rdi)          / r[5] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L17

        movq    48(%rsi), %rax
        mulq    %rcx                    / p = a[6] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 48(%rdi)          / r[6] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L17
.L17:
        movq    %r9, %rax
        ret
        .size   s_mpv_mul_set_vec64, [.-s_mpv_mul_set_vec64]

/ ------------------------------------------------------------------------
/
/  Implementation of s_mpv_mul_add_vec which exploits
/  the 64x64 -> 128-bit unsigned multiply instruction.
/
/ ------------------------------------------------------------------------

/ r += a * digit, r and a are vectors of length len
/ returns the carry digit
/ r and a are 64-bit aligned.
/
/ uint64_t
/ s_mpv_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
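/
/ For reference, a minimal C sketch (not part of the original source) of the
/ computation this routine performs; the name s_mpv_mul_add_vec64_ref is
/ hypothetical and the sketch assumes the GCC/Clang unsigned __int128
/ extension. The two add/adc pairs per limb in the assembly below correspond
/ to the "+ r[i]" and "+ cy" terms:
/
/   #include <stdint.h>
/
/   uint64_t
/   s_mpv_mul_add_vec64_ref(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/   {
/       uint64_t cy = 0;                          /* running carry */
/       for (int i = 0; i < len; i++) {
/           unsigned __int128 p =
/               (unsigned __int128)a[i] * digit + r[i] + cy;
/           r[i] = (uint64_t)p;                   /* low 64 bits */
/           cy = (uint64_t)(p >> 64);             /* high 64 bits */
/       }
/       return (cy);
/   }
/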
        .text
        .align  16
        .globl  s_mpv_mul_add_vec64
        .type   s_mpv_mul_add_vec64, @function
s_mpv_mul_add_vec64:
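/ Arguments arrive in the same System V AMD64 registers as above
/ (%rdi = r, %rsi = a, %rdx = len, %rcx = digit); here each limb of r[]
/ is loaded into %r10 before being updated, since r is both read and written.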
        xorq    %rax, %rax              / if (len == 0) return (0)
        testq   %rdx, %rdx
        jz      .L27

        movq    %rdx, %r8               / Use r8 for len; %rdx is used by mul
        xorq    %r9, %r9                / cy = 0

.L25:
        cmpq    $8, %r8                 / len < 8 ?
        jb      .L26
        movq    0(%rsi), %rax           / rax = a[0]
        movq    0(%rdi), %r10           / r10 = r[0]
        movq    8(%rsi), %r11           / prefetch a[1]
        mulq    %rcx                    / p = a[0] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[0]
        movq    8(%rdi), %r10           / prefetch r[1]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 0(%rdi)           / r[0] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        movq    16(%rsi), %r11          / prefetch a[2]
        mulq    %rcx                    / p = a[1] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[1]
        movq    16(%rdi), %r10          / prefetch r[2]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 8(%rdi)           / r[1] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        movq    24(%rsi), %r11          / prefetch a[3]
        mulq    %rcx                    / p = a[2] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[2]
        movq    24(%rdi), %r10          / prefetch r[3]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 16(%rdi)          / r[2] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        movq    32(%rsi), %r11          / prefetch a[4]
        mulq    %rcx                    / p = a[3] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[3]
        movq    32(%rdi), %r10          / prefetch r[4]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 24(%rdi)          / r[3] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        movq    40(%rsi), %r11          / prefetch a[5]
        mulq    %rcx                    / p = a[4] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[4]
        movq    40(%rdi), %r10          / prefetch r[5]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 32(%rdi)          / r[4] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        movq    48(%rsi), %r11          / prefetch a[6]
        mulq    %rcx                    / p = a[5] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[5]
        movq    48(%rdi), %r10          / prefetch r[6]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 40(%rdi)          / r[5] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        movq    56(%rsi), %r11          / prefetch a[7]
        mulq    %rcx                    / p = a[6] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[6]
        movq    56(%rdi), %r10          / prefetch r[7]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 48(%rdi)          / r[6] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        movq    %r11, %rax
        mulq    %rcx                    / p = a[7] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[7]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 56(%rdi)          / r[7] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)

        addq    $64, %rsi
        addq    $64, %rdi
        subq    $8, %r8

        jz      .L27
        jmp     .L25
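
/ Tail: multiply-and-add the remaining 1 to 7 limbs one at a time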
.L26:
        movq    0(%rsi), %rax
        movq    0(%rdi), %r10
        mulq    %rcx                    / p = a[0] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[0]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 0(%rdi)           / r[0] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L27

        movq    8(%rsi), %rax
        movq    8(%rdi), %r10
        mulq    %rcx                    / p = a[1] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[1]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 8(%rdi)           / r[1] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L27

        movq    16(%rsi), %rax
        movq    16(%rdi), %r10
        mulq    %rcx                    / p = a[2] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[2]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 16(%rdi)          / r[2] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L27

        movq    24(%rsi), %rax
        movq    24(%rdi), %r10
        mulq    %rcx                    / p = a[3] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[3]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 24(%rdi)          / r[3] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L27

        movq    32(%rsi), %rax
        movq    32(%rdi), %r10
        mulq    %rcx                    / p = a[4] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[4]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 32(%rdi)          / r[4] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L27

        movq    40(%rsi), %rax
        movq    40(%rdi), %r10
        mulq    %rcx                    / p = a[5] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[5]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 40(%rdi)          / r[5] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L27

        movq    48(%rsi), %rax
        movq    48(%rdi), %r10
        mulq    %rcx                    / p = a[6] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                / p += r[6]
        addq    %r9, %rax
        adcq    $0, %rdx                / p += cy
        movq    %rax, 48(%rdi)          / r[6] = lo(p)
        movq    %rdx, %r9               / cy = hi(p)
        decq    %r8
        jz      .L27
.L27:
        movq    %r9, %rax
        ret
        .size   s_mpv_mul_add_vec64, [.-s_mpv_mul_add_vec64]