/ ***** BEGIN LICENSE BLOCK *****
/ Version: MPL 1.1/GPL 2.0/LGPL 2.1
/
/ The contents of this file are subject to the Mozilla Public License Version
/ 1.1 (the "License"); you may not use this file except in compliance with
/ the License. You may obtain a copy of the License at
/ http://www.mozilla.org/MPL/
/
/ Software distributed under the License is distributed on an "AS IS" basis,
/ WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
/ for the specific language governing rights and limitations under the
/ License.
/
/ The Original Code is the Solaris software cryptographic token.
/
/ The Initial Developer of the Original Code is
/ Sun Microsystems, Inc.
/ Portions created by the Initial Developer are Copyright (C) 2005
/ the Initial Developer. All Rights Reserved.
/
/ Contributor(s):
/   Sun Microsystems, Inc.
/
/ Alternatively, the contents of this file may be used under the terms of
/ either the GNU General Public License Version 2 or later (the "GPL"), or
/ the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
/ in which case the provisions of the GPL or the LGPL are applicable instead
/ of those above. If you wish to allow use of your version of this file only
/ under the terms of either the GPL or the LGPL, and not to allow others to
/ use your version of this file under the terms of the MPL, indicate your
/ decision by deleting the provisions above and replace them with the notice
/ and other provisions required by the GPL or the LGPL. If you do not delete
/ the provisions above, a recipient may use your version of this file under
/ the terms of any one of the MPL, the GPL or the LGPL.
/
/ ***** END LICENSE BLOCK ***** */
39 / ------------------------------------------------------------------------
41 / Implementation of s_mpv_mul_set_vec which exploits
42 / the
64X64-
>128 bit unsigned multiply instruction.
44 / ------------------------------------------------------------------------
46 / r
= a * digit
, r
and a are vectors of length len
47 / returns the carry digit
48 / r
and a are
64 bit aligned.
51 / s_mpv_mul_set_vec64
(uint64_t
*r
, uint64_t
*a, int len
, uint64_t digit
)
54 .text; .align 16; .globl s_mpv_mul_set_vec64; .type s_mpv_mul_set_vec64, @function; s_mpv_mul_set_vec64:
56 xorq
%rax
, %rax
/ if
(len
== 0) return
(0)
60 movq
%rdx
, %r8 / Use
r8 for len;
%rdx is used by
mul
61 xorq
%r9, %r9 / cy
= 0
64 cmpq $
8, %r8 / 8 - len
66 movq
0(%rsi
), %rax
/ rax
= a[0]
67 movq
8(%rsi
), %r11 / prefetch
a[1]
68 mulq
%rcx
/ p
= a[0] * digit
70 adcq $
0, %rdx
/ p
+= cy
71 movq
%rax
, 0(%rdi
) / r
[0] = lo
(p
)
72 movq
%rdx
, %r9 / cy
= hi
(p
)
75 movq
16(%rsi
), %r11 / prefetch
a[2]
76 mulq
%rcx
/ p
= a[1] * digit
78 adcq $
0, %rdx
/ p
+= cy
79 movq
%rax
, 8(%rdi
) / r
[1] = lo
(p
)
80 movq
%rdx
, %r9 / cy
= hi
(p
)
83 movq
24(%rsi
), %r11 / prefetch
a[3]
84 mulq
%rcx
/ p
= a[2] * digit
86 adcq $
0, %rdx
/ p
+= cy
87 movq
%rax
, 16(%rdi
) / r
[2] = lo
(p
)
88 movq
%rdx
, %r9 / cy
= hi
(p
)
91 movq
32(%rsi
), %r11 / prefetch
a[4]
92 mulq
%rcx
/ p
= a[3] * digit
94 adcq $
0, %rdx
/ p
+= cy
95 movq
%rax
, 24(%rdi
) / r
[3] = lo
(p
)
96 movq
%rdx
, %r9 / cy
= hi
(p
)
99 movq
40(%rsi
), %r11 / prefetch
a[5]
100 mulq
%rcx
/ p
= a[4] * digit
102 adcq $
0, %rdx
/ p
+= cy
103 movq
%rax
, 32(%rdi
) / r
[4] = lo
(p
)
104 movq
%rdx
, %r9 / cy
= hi
(p
)
107 movq
48(%rsi
), %r11 / prefetch
a[6]
108 mulq
%rcx
/ p
= a[5] * digit
110 adcq $
0, %rdx
/ p
+= cy
111 movq
%rax
, 40(%rdi
) / r
[5] = lo
(p
)
112 movq
%rdx
, %r9 / cy
= hi
(p
)
115 movq
56(%rsi
), %r11 / prefetch
a[7]
116 mulq
%rcx
/ p
= a[6] * digit
118 adcq $
0, %rdx
/ p
+= cy
119 movq
%rax
, 48(%rdi
) / r
[6] = lo
(p
)
120 movq
%rdx
, %r9 / cy
= hi
(p
)
123 mulq
%rcx
/ p
= a[7] * digit
125 adcq $
0, %rdx
/ p
+= cy
126 movq
%rax
, 56(%rdi
) / r
[7] = lo
(p
)
127 movq
%rdx
, %r9 / cy
= hi
(p
)
138 mulq
%rcx
/ p
= a[0] * digit
140 adcq $
0, %rdx
/ p
+= cy
141 movq
%rax
, 0(%rdi
) / r
[0] = lo
(p
)
142 movq
%rdx
, %r9 / cy
= hi
(p
)
147 mulq
%rcx
/ p
= a[1] * digit
149 adcq $
0, %rdx
/ p
+= cy
150 movq
%rax
, 8(%rdi
) / r
[1] = lo
(p
)
151 movq
%rdx
, %r9 / cy
= hi
(p
)
156 mulq
%rcx
/ p
= a[2] * digit
158 adcq $
0, %rdx
/ p
+= cy
159 movq
%rax
, 16(%rdi
) / r
[2] = lo
(p
)
160 movq
%rdx
, %r9 / cy
= hi
(p
)
165 mulq
%rcx
/ p
= a[3] * digit
167 adcq $
0, %rdx
/ p
+= cy
168 movq
%rax
, 24(%rdi
) / r
[3] = lo
(p
)
169 movq
%rdx
, %r9 / cy
= hi
(p
)
174 mulq
%rcx
/ p
= a[4] * digit
176 adcq $
0, %rdx
/ p
+= cy
177 movq
%rax
, 32(%rdi
) / r
[4] = lo
(p
)
178 movq
%rdx
, %r9 / cy
= hi
(p
)
183 mulq
%rcx
/ p
= a[5] * digit
185 adcq $
0, %rdx
/ p
+= cy
186 movq
%rax
, 40(%rdi
) / r
[5] = lo
(p
)
187 movq
%rdx
, %r9 / cy
= hi
(p
)
192 mulq
%rcx
/ p
= a[6] * digit
194 adcq $
0, %rdx
/ p
+= cy
195 movq
%rax
, 48(%rdi
) / r
[6] = lo
(p
)
196 movq
%rdx
, %r9 / cy
= hi
(p
)
205 .size s_mpv_mul_set_vec64, [.-s_mpv_mul_set_vec64]
/ ------------------------------------------------------------------------
/
/  Implementation of s_mpv_mul_add_vec which exploits
/  the 64X64->128 bit unsigned multiply instruction.
/
/ ------------------------------------------------------------------------

/ r += a * digit, r and a are vectors of length len
/ returns the carry digit
/ r and a are 64 bit aligned.
/
/ uint64_t
/ s_mpv_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/
/ ABI: System V AMD64.  In: rdi = r, rsi = a, rdx = len, rcx = digit.
/ Out: rax = final carry digit.  Clobbers: rdx (by mul), r8-r11, flags.
/ Leaf function, no stack usage.  Unrolled 8x like s_mpv_mul_set_vec64,
/ but each product also accumulates the existing r[i] (held in %r10),
/ so each digit needs two add/adc carry-propagation steps.

.text; .align 16; .globl s_mpv_mul_add_vec64; .type s_mpv_mul_add_vec64, @function; s_mpv_mul_add_vec64:

	xorq	%rax, %rax		/ if (len == 0) return (0)
	testq	%rdx, %rdx
	jz	.L27

	movq	%rdx, %r8		/ Use r8 for len; %rdx is used by mul
	xorq	%r9, %r9		/ cy = 0

.L25:
	cmpq	$8, %r8			/ 8 - len
	jb	.L26
	movq	0(%rsi), %rax		/ rax = a[0]
	movq	0(%rdi), %r10		/ r10 = r[0]
	movq	8(%rsi), %r11		/ prefetch a[1]
	mulq	%rcx			/ p = a[0] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[0]
	movq	8(%rdi), %r10		/ prefetch r[1]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	16(%rsi), %r11		/ prefetch a[2]
	mulq	%rcx			/ p = a[1] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[1]
	movq	16(%rdi), %r10		/ prefetch r[2]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	24(%rsi), %r11		/ prefetch a[3]
	mulq	%rcx			/ p = a[2] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[2]
	movq	24(%rdi), %r10		/ prefetch r[3]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	32(%rsi), %r11		/ prefetch a[4]
	mulq	%rcx			/ p = a[3] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[3]
	movq	32(%rdi), %r10		/ prefetch r[4]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	40(%rsi), %r11		/ prefetch a[5]
	mulq	%rcx			/ p = a[4] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[4]
	movq	40(%rdi), %r10		/ prefetch r[5]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	48(%rsi), %r11		/ prefetch a[6]
	mulq	%rcx			/ p = a[5] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[5]
	movq	48(%rdi), %r10		/ prefetch r[6]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	56(%rsi), %r11		/ prefetch a[7]
	mulq	%rcx			/ p = a[6] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[6]
	movq	56(%rdi), %r10		/ prefetch r[7]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	mulq	%rcx			/ p = a[7] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[7]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 56(%rdi)		/ r[7] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	addq	$64, %rsi		/ a += 8
	addq	$64, %rdi		/ r += 8
	subq	$8, %r8			/ len -= 8
	jz	.L27			/ done if len was a multiple of 8
	jmp	.L25

/ Tail: 1 <= len <= 7 digits remain; handle one digit per group.
.L26:
	movq	0(%rsi), %rax
	movq	0(%rdi), %r10
	mulq	%rcx			/ p = a[0] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[0]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	8(%rsi), %rax
	movq	8(%rdi), %r10
	mulq	%rcx			/ p = a[1] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[1]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	16(%rsi), %rax
	movq	16(%rdi), %r10
	mulq	%rcx			/ p = a[2] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[2]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	24(%rsi), %rax
	movq	24(%rdi), %r10
	mulq	%rcx			/ p = a[3] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[3]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	32(%rsi), %rax
	movq	32(%rdi), %r10
	mulq	%rcx			/ p = a[4] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[4]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	40(%rsi), %rax
	movq	40(%rdi), %r10
	mulq	%rcx			/ p = a[5] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[5]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	48(%rsi), %rax
	movq	48(%rdi), %r10
	mulq	%rcx			/ p = a[6] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[6]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

.L27:
	movq	%r9, %rax		/ return (cy)
	ret

.size s_mpv_mul_add_vec64, [.-s_mpv_mul_add_vec64]