# ***** BEGIN LICENSE BLOCK *****
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
#
# The contents of this file are subject to the Mozilla Public License Version
# 1.1 (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
# http://www.mozilla.org/MPL/
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
# for the specific language governing rights and limitations under the
# License.
#
# The Original Code is the Solaris software cryptographic token.
#
# The Initial Developer of the Original Code is
# Sun Microsystems, Inc.
# Portions created by the Initial Developer are Copyright (C) 2005
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
#   Sun Microsystems, Inc.
#
# Alternatively, the contents of this file may be used under the terms of
# either the GNU General Public License Version 2 or later (the "GPL"), or
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
# in which case the provisions of the GPL or the LGPL are applicable instead
# of those above. If you wish to allow use of your version of this file only
# under the terms of either the GPL or the LGPL, and not to allow others to
# use your version of this file under the terms of the MPL, indicate your
# decision by deleting the provisions above and replace them with the notice
# and other provisions required by the GPL or the LGPL. If you do not delete
# the provisions above, a recipient may use your version of this file under
# the terms of any one of the MPL, the GPL or the LGPL.
#
# ***** END LICENSE BLOCK *****
# ------------------------------------------------------------------------
#
#  Implementation of s_mpv_mul_set_vec which exploits
#  the 64X64->128 bit unsigned multiply instruction.
#
# ------------------------------------------------------------------------
#
# r = a * digit, r and a are vectors of length len
# returns the carry digit
# r and a are 64 bit aligned.
#
# s_mpv_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
#
# Syntax: GNU as, AT&T operand order.
# ABI:    System V AMD64 (leaf function, no stack usage).
# In:     %rdi = r, %rsi = a, %edx = len, %rcx = digit
# Out:    %rax = carry digit (0 when len == 0)
# Clobb:  %rdx, %rsi, %rdi, %r8, %r9, %r11, flags
#
# Register roles inside the function:
#   %r8  = digits still to process
#   %r9  = cy, the carry digit between steps
#   %r11 = next a[i], loaded ahead of the multiply that uses it
#
# Each step computes p = a[i] * digit + cy, which always fits in 128 bits:
# (2^64 - 1)^2 + (2^64 - 1) < 2^128.
#
	.text
	.align	16
	.globl	s_mpv_mul_set_vec64
	.type	s_mpv_mul_set_vec64, @function
s_mpv_mul_set_vec64:

	xorl	%r9d, %r9d		# cy = 0 (also the result if len == 0)
	movl	%edx, %r8d		# r8 = len; %rdx is used by mul.  len is an
					# int, so only %edx is defined: zero-extend
	testl	%r8d, %r8d		# if (len == 0) return (0)
	jz	.Lmul_set_done

.Lmul_set_by8:
	cmpq	$8, %r8			# fewer than 8 digits left?
	jb	.Lmul_set_tail

	movq	0(%rsi), %rax		# rax = a[0]
	movq	8(%rsi), %r11		# prefetch a[1]
	mulq	%rcx			# p = a[0] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 0(%rdi)		# r[0] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	movq	%r11, %rax		# rax = a[1]
	movq	16(%rsi), %r11		# prefetch a[2]
	mulq	%rcx			# p = a[1] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 8(%rdi)		# r[1] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	movq	%r11, %rax		# rax = a[2]
	movq	24(%rsi), %r11		# prefetch a[3]
	mulq	%rcx			# p = a[2] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 16(%rdi)		# r[2] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	movq	%r11, %rax		# rax = a[3]
	movq	32(%rsi), %r11		# prefetch a[4]
	mulq	%rcx			# p = a[3] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 24(%rdi)		# r[3] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	movq	%r11, %rax		# rax = a[4]
	movq	40(%rsi), %r11		# prefetch a[5]
	mulq	%rcx			# p = a[4] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 32(%rdi)		# r[4] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	movq	%r11, %rax		# rax = a[5]
	movq	48(%rsi), %r11		# prefetch a[6]
	mulq	%rcx			# p = a[5] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 40(%rdi)		# r[5] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	movq	%r11, %rax		# rax = a[6]
	movq	56(%rsi), %r11		# prefetch a[7]
	mulq	%rcx			# p = a[6] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 48(%rdi)		# r[6] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	movq	%r11, %rax		# rax = a[7]
	mulq	%rcx			# p = a[7] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 56(%rdi)		# r[7] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	addq	$64, %rsi		# a += 8
	addq	$64, %rdi		# r += 8
	subq	$8, %r8			# len -= 8
	jnz	.Lmul_set_by8
	jmp	.Lmul_set_done

.Lmul_set_tail:				# invariant: 1 <= r8 <= 7
	movq	0(%rsi), %rax		# rax = a[0]
	mulq	%rcx			# p = a[0] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 0(%rdi)		# r[0] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)
	decq	%r8
	jz	.Lmul_set_done

	movq	8(%rsi), %rax		# rax = a[1]
	mulq	%rcx			# p = a[1] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 8(%rdi)		# r[1] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)
	decq	%r8
	jz	.Lmul_set_done

	movq	16(%rsi), %rax		# rax = a[2]
	mulq	%rcx			# p = a[2] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 16(%rdi)		# r[2] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)
	decq	%r8
	jz	.Lmul_set_done

	movq	24(%rsi), %rax		# rax = a[3]
	mulq	%rcx			# p = a[3] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 24(%rdi)		# r[3] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)
	decq	%r8
	jz	.Lmul_set_done

	movq	32(%rsi), %rax		# rax = a[4]
	mulq	%rcx			# p = a[4] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 32(%rdi)		# r[4] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)
	decq	%r8
	jz	.Lmul_set_done

	movq	40(%rsi), %rax		# rax = a[5]
	mulq	%rcx			# p = a[5] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 40(%rdi)		# r[5] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)
	decq	%r8
	jz	.Lmul_set_done

	movq	48(%rsi), %rax		# rax = a[6]
	mulq	%rcx			# p = a[6] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 48(%rdi)		# r[6] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)
					# r8 was <= 7, so a[6] was the last digit

.Lmul_set_done:
	movq	%r9, %rax		# return cy
	ret

	.size	s_mpv_mul_set_vec64, .-s_mpv_mul_set_vec64
# ------------------------------------------------------------------------
#
#  Implementation of s_mpv_mul_add_vec which exploits
#  the 64X64->128 bit unsigned multiply instruction.
#
# ------------------------------------------------------------------------
#
# r += a * digit, r and a are vectors of length len
# returns the carry digit
# r and a are 64 bit aligned.
#
# s_mpv_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
#
# Syntax: GNU as, AT&T operand order.
# ABI:    System V AMD64 (leaf function, no stack usage).
# In:     %rdi = r, %rsi = a, %edx = len, %rcx = digit
# Out:    %rax = carry digit (0 when len == 0)
# Clobb:  %rdx, %rsi, %rdi, %r8, %r9, %r10, %r11, flags
#
# Register roles inside the function:
#   %r8  = digits still to process
#   %r9  = cy, the carry digit between steps
#   %r10 = current r[i], loaded ahead of the add that uses it
#   %r11 = next a[i], loaded ahead of the multiply that uses it
#
# Each step computes p = a[i] * digit + r[i] + cy, which always fits in
# 128 bits: (2^64 - 1)^2 + 2 * (2^64 - 1) = 2^128 - 1.
#
	.text
	.align	16
	.globl	s_mpv_mul_add_vec64
	.type	s_mpv_mul_add_vec64, @function
s_mpv_mul_add_vec64:

	xorl	%r9d, %r9d		# cy = 0 (also the result if len == 0)
	movl	%edx, %r8d		# r8 = len; %rdx is used by mul.  len is an
					# int, so only %edx is defined: zero-extend
	testl	%r8d, %r8d		# if (len == 0) return (0)
	jz	.Lmul_add_done

.Lmul_add_by8:
	cmpq	$8, %r8			# fewer than 8 digits left?
	jb	.Lmul_add_tail

	movq	0(%rsi), %rax		# rax = a[0]
	movq	0(%rdi), %r10		# r10 = r[0]
	movq	8(%rsi), %r11		# prefetch a[1]
	mulq	%rcx			# p = a[0] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[0]
	movq	8(%rdi), %r10		# prefetch r[1]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 0(%rdi)		# r[0] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	movq	%r11, %rax		# rax = a[1]
	movq	16(%rsi), %r11		# prefetch a[2]
	mulq	%rcx			# p = a[1] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[1]
	movq	16(%rdi), %r10		# prefetch r[2]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 8(%rdi)		# r[1] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	movq	%r11, %rax		# rax = a[2]
	movq	24(%rsi), %r11		# prefetch a[3]
	mulq	%rcx			# p = a[2] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[2]
	movq	24(%rdi), %r10		# prefetch r[3]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 16(%rdi)		# r[2] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	movq	%r11, %rax		# rax = a[3]
	movq	32(%rsi), %r11		# prefetch a[4]
	mulq	%rcx			# p = a[3] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[3]
	movq	32(%rdi), %r10		# prefetch r[4]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 24(%rdi)		# r[3] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	movq	%r11, %rax		# rax = a[4]
	movq	40(%rsi), %r11		# prefetch a[5]
	mulq	%rcx			# p = a[4] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[4]
	movq	40(%rdi), %r10		# prefetch r[5]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 32(%rdi)		# r[4] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	movq	%r11, %rax		# rax = a[5]
	movq	48(%rsi), %r11		# prefetch a[6]
	mulq	%rcx			# p = a[5] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[5]
	movq	48(%rdi), %r10		# prefetch r[6]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 40(%rdi)		# r[5] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	movq	%r11, %rax		# rax = a[6]
	movq	56(%rsi), %r11		# prefetch a[7]
	mulq	%rcx			# p = a[6] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[6]
	movq	56(%rdi), %r10		# prefetch r[7]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 48(%rdi)		# r[6] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	movq	%r11, %rax		# rax = a[7]
	mulq	%rcx			# p = a[7] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[7]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 56(%rdi)		# r[7] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	addq	$64, %rsi		# a += 8
	addq	$64, %rdi		# r += 8
	subq	$8, %r8			# len -= 8
	jnz	.Lmul_add_by8
	jmp	.Lmul_add_done

.Lmul_add_tail:				# invariant: 1 <= r8 <= 7
	movq	0(%rsi), %rax		# rax = a[0]
	movq	0(%rdi), %r10		# r10 = r[0]
	mulq	%rcx			# p = a[0] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[0]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 0(%rdi)		# r[0] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)
	decq	%r8
	jz	.Lmul_add_done

	movq	8(%rsi), %rax		# rax = a[1]
	movq	8(%rdi), %r10		# r10 = r[1]
	mulq	%rcx			# p = a[1] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[1]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 8(%rdi)		# r[1] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)
	decq	%r8
	jz	.Lmul_add_done

	movq	16(%rsi), %rax		# rax = a[2]
	movq	16(%rdi), %r10		# r10 = r[2]
	mulq	%rcx			# p = a[2] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[2]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 16(%rdi)		# r[2] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)
	decq	%r8
	jz	.Lmul_add_done

	movq	24(%rsi), %rax		# rax = a[3]
	movq	24(%rdi), %r10		# r10 = r[3]
	mulq	%rcx			# p = a[3] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[3]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 24(%rdi)		# r[3] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)
	decq	%r8
	jz	.Lmul_add_done

	movq	32(%rsi), %rax		# rax = a[4]
	movq	32(%rdi), %r10		# r10 = r[4]
	mulq	%rcx			# p = a[4] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[4]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 32(%rdi)		# r[4] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)
	decq	%r8
	jz	.Lmul_add_done

	movq	40(%rsi), %rax		# rax = a[5]
	movq	40(%rdi), %r10		# r10 = r[5]
	mulq	%rcx			# p = a[5] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[5]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 40(%rdi)		# r[5] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)
	decq	%r8
	jz	.Lmul_add_done

	movq	48(%rsi), %rax		# rax = a[6]
	movq	48(%rdi), %r10		# r10 = r[6]
	mulq	%rcx			# p = a[6] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[6]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 48(%rdi)		# r[6] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)
					# r8 was <= 7, so a[6] was the last digit

.Lmul_add_done:
	movq	%r9, %rax		# return cy
	ret

	.size	s_mpv_mul_add_vec64, .-s_mpv_mul_add_vec64
# Magic indicating no need for an executable stack
	.section .note.GNU-stack, "", @progbits