3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
12 # Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
13 # for undertaking this effort are multiple. First of all, UltraSPARC is not
14 # the whole SPARCv9 universe and other VIS-free implementations deserve
15 # optimized code as much. Secondly, newly introduced UltraSPARC T1,
16 # a.k.a. Niagara, has a shared FPU, and concurrent FPU-intensive paths,
17 # such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
18 # several integrated RSA/DSA accelerator circuits accessible through
19 # kernel driver [only(*)], but having decent user-land software
20 # implementation is important too. Finally, there was the desire to
21 # experiment with a dedicated squaring procedure. Yes, this module
22 # implements one, because it was easiest to draft it in SPARCv9
25 # (*) Engine accessing the driver in question is on my TODO list.
26 # For reference, the accelerator is estimated to give a 6 to 10 times
27 # improvement on single-threaded RSA sign. It should be noted
28 # that 6-10x improvement coefficient does not actually mean
29 # something extraordinary in terms of absolute [single-threaded]
30 # performance, as SPARCv9 instruction set is by all means least
31 # suitable for high performance crypto among other 64 bit
32 # platforms. 6-10x factor simply places T1 in same performance
33 # domain as say AMD64 and IA-64. The improvement of RSA verify doesn't
34 # appear impressive at all, but it's the sign operation which is
35 # far more critical/interesting.
37 # You might notice that inner loops are modulo-scheduled:-) This has
38 # essentially negligible impact on UltraSPARC performance, it's
39 # Fujitsu SPARC64 V users who should notice and hopefully appreciate
40 # the advantage... Currently this module surpasses sparcv9a-mont.pl
41 # by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
42 # module still has hidden potential [see TODO list there], which is
43 # estimated to be larger than 20%...
# Register aliases for the incoming arguments (%i0-%i5); the trailing
# comments give the C type and name of each argument.
46 $rp="%i0"; # BN_ULONG *rp,
47 $ap="%i1"; # const BN_ULONG *ap,
48 $bp="%i2"; # const BN_ULONG *bp,
49 $np="%i3"; # const BN_ULONG *np,
50 $n0="%i4"; # const BN_ULONG *n0,
51 $num="%i5"; # int num);
# Select the target ABI from the command line. Default to 32 bits so that
# $bits is always defined: the generated branch to .Lbn_sqr_mont tests
# $bits==32 to choose between %icc and %xcc, and an undefined $bits would
# select %xcc even though the 32-bit $bias/$frame values are used below.
$bits=32 unless defined($bits);
54 for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
# $bias and $frame are added to %sp wherever the generated code addresses
# its stack area (see the "%sp,$bias+$frame" operands).
55 if ($bits==64) { $bias=2047; $frame=192; }
56 else { $bias=0; $frame=128; }
# Register that is loaded with 0xffffffff and used to truncate to 32 bits.
63 $mask="%g1"; # 32 bits, what a waste...
# Symbol name used by the .type/.size directives of the generated code.
76 $fname="bn_mul_mont_int";
! ---- Entry, constants and set-up of the first column (word 0 of bp) ----
! NOTE(review): the function label and several instructions (including the
! ones that consume the condition codes set here) are not visible in this
! excerpt; only the visible instructions are described.
79 .section
".text",#alloc,#execinstr
! Compare the value in %o5 with 4; the existing remark calls this the
! 128-bit minimum.
84 cmp %o5,4 ! 128 bits minimum
! Build 0xffffffff in the mask register: upper bits with sethi here, the
! low bits with the "or" two instructions below.
86 sethi
%hi(0xffffffff),$mask
! Shift num left by two (multiply by 4).
92 sll
$num,2,$num ! num
*=4
93 or $mask,%lo(0xffffffff),$mask
! Load word 0 of bp into the multiplier register mul0.
97 ld
[$bp],$mul0 ! bp
[0]
! o7 = sp + bias. NOTE(review): whatever adjusts o7 before it is turned
! back into sp below is not visible here.
100 add
%sp,$bias,%o7 ! real top of stack
101 ld
[$ap],$car0 ! ap
[0] ! redundant
in squaring context
103 ld
[$ap+4],$apj ! ap
[1]
105 ld
[$np],$car1 ! np
[0]
! sp = o7 - bias.
106 sub %o7,$bias,%sp ! alloca
107 ld
[$np+4],$npj ! np
[1]
! Branch to the squaring code on an "equal" condition. The quoted Perl
! expression is replaced by the script with %icc when bits is 32 and with
! %xcc otherwise. NOTE(review): the compare feeding this branch is not
! visible in this excerpt.
108 be
,pt
`$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont
! car0 = ap[0]*bp[0]; the next product ap[1]*bp[0] is started early in tmp0.
111 mulx
$car0,$mul0,$car0 ! ap
[0]*bp
[0]
112 mulx
$apj,$mul0,$tmp0 !prologue
! ap
[1]*bp
[0]
! acc0 = low 32 bits of car0.
113 and $car0,$mask,$acc0
! tp = sp + bias + frame.
114 add
%sp,$bias+$frame,$tp
115 ld
[$ap+8],$apj !prologue
!
! mul1 = (n0 * acc0) truncated to 32 bits.
117 mulx
$n0,$acc0,$mul1 ! "t[0]"*n0
118 and $mul1,$mask,$mul1
! car1 = np[0]*mul1; acc1 = np[1]*mul1 is started early.
120 mulx
$car1,$mul1,$car1 ! np
[0]*"t[0]"*n0
121 mulx
$npj,$mul1,$acc1 !prologue
! np
[1]*"t[0]"*n0
! car1 += acc0.
123 add
$acc0,$car1,$car1
124 ld
[$np+8],$npj !prologue
!
! Move the early product from tmp0 into acc0.
126 mov
$tmp0,$acc0 !prologue
!
! ---- First pass: steady state and tail ----
! NOTE(review): loop labels, counter updates, shifts and stores belonging to
! this section are on lines that are not visible in this excerpt.
! Start the next two products (apj*mul0 and npj*mul1) while the previous
! ones, held in acc0 and acc1, are added into car0 and car1.
129 mulx
$apj,$mul0,$tmp0
130 mulx
$npj,$mul1,$tmp1
131 add
$acc0,$car0,$car0
! Fetch the next word of ap at byte offset j.
132 ld
[$ap+$j],$apj ! ap
[j
]
! acc0 = low 32 bits of car0.
133 and $car0,$mask,$acc0
134 add
$acc1,$car1,$car1
! Fetch the next word of np at byte offset j.
135 ld
[$np+$j],$npj ! np
[j
]
! car1 += acc0.
137 add
$acc0,$car1,$car1
! Tail: same multiply/add pattern as above, without further loads.
148 mulx
$apj,$mul0,$tmp0 !epilogue
!
149 mulx
$npj,$mul1,$tmp1
150 add
$acc0,$car0,$car0
151 and $car0,$mask,$acc0
152 add
$acc1,$car1,$car1
154 add
$acc0,$car1,$car1
! Add the last pair of products (tmp0, tmp1) into car0 and car1.
158 add
$tmp0,$car0,$car0
159 and $car0,$mask,$acc0
160 add
$tmp1,$car1,$car1
162 add
$acc0,$car1,$car1
! car1 += car0.
166 add
$car0,$car1,$car1
! ---- Outer pass: following words of bp, accumulating into tp ----
! NOTE(review): labels, branches, shifts and index updates of this section
! are on lines that are not visible in this excerpt.
! mul0 = bp[1].
171 ld
[$bp+4],$mul0 ! bp
[1]
! tp = sp + bias + frame.
173 add
%sp,$bias+$frame,$tp
174 ld
[$ap],$car0 ! ap
[0]
175 ld
[$ap+4],$apj ! ap
[1]
176 ld
[$np],$car1 ! np
[0]
177 ld
[$np+4],$npj ! np
[1]
178 ld
[$tp],$tmp1 ! tp
[0]
179 ld
[$tp+4],$tpj ! tp
[1]
! car0 = ap[0]*mul0 + tp[0]; the product apj*mul0 is started early in tmp0.
182 mulx
$car0,$mul0,$car0
183 mulx
$apj,$mul0,$tmp0 !prologue
!
184 add
$tmp1,$car0,$car0
185 ld
[$ap+8],$apj !prologue
!
! acc0 = low 32 bits of car0.
186 and $car0,$mask,$acc0
! Truncate mul1 to 32 bits. NOTE(review): the instruction that computes
! mul1 for this pass is not visible here.
189 and $mul1,$mask,$mul1
! car1 = np[0]*mul1 + acc0; the product npj*mul1 is started early in acc1.
191 mulx
$car1,$mul1,$car1
192 mulx
$npj,$mul1,$acc1 !prologue
!
194 add
$acc0,$car1,$car1
195 ld
[$np+8],$npj !prologue
!
197 mov
$tmp0,$acc0 !prologue
!
! Steady state: start the next two products, add the previous ones into
! car0 and car1, load the next words of ap, np and tp, and store car1 to tp.
200 mulx
$apj,$mul0,$tmp0
201 mulx
$npj,$mul1,$tmp1
203 ld
[$ap+$j],$apj ! ap
[j
]
204 add
$acc0,$car0,$car0
205 add
$acc1,$car1,$car1
206 ld
[$np+$j],$npj ! np
[j
]
207 and $car0,$mask,$acc0
208 ld
[$tp+8],$tpj ! tp
[j
]
210 add
$acc0,$car1,$car1
213 st
$car1,[$tp] ! tp
[j
-1]
! Tail: same pattern without the loads from ap and np.
221 mulx
$apj,$mul0,$tmp0 !epilogue
!
222 mulx
$npj,$mul1,$tmp1
224 add
$acc0,$car0,$car0
225 ld
[$tp+8],$tpj ! tp
[j
]
226 and $car0,$mask,$acc0
227 add
$acc1,$car1,$car1
229 add
$acc0,$car1,$car1
230 st
$car1,[$tp] ! tp
[j
-1]
! Add the last pair of products and store the result at tp+4.
234 add
$tmp0,$car0,$car0
235 and $car0,$mask,$acc0
236 add
$tmp1,$car1,$car1
237 add
$acc0,$car1,$car1
238 st
$car1,[$tp+4] ! tp
[j
-1]
! car1 += car0, then car1 += car2.
243 add
$car0,$car1,$car1
245 add
$car2,$car1,$car1
! mul0 = word of bp at byte offset i.
250 ld
[$bp+$i],$mul0 ! bp
[i
]
! ---- Final subtraction, copy and wipe ----
! NOTE(review): the loop labels, the loads feeding %o0/%o1, the stores of
! the result and the return sequence are not visible in this excerpt.
! o7 = 0 - num.
259 sub %g0,$num,%o7 ! k
=-num
! Subtracting zero from zero with condition-code update clears the carry.
261 subcc
%g0,%g0,%g0 ! clear
%icc.c
! o1 = o0 - o1 - carry, updating the condition codes.
266 subccc
%o0,%o1,%o1 ! tp
[j
]-np
[j
]
! car2 = car2 - 0 - carry.
271 subc
$car2,0,$car2 ! handle upmost overflow bit
! Load the word at byte offset o7 from the address held in the ap register.
278 ld
[$ap+%o7],%o0 ! copy
or in-place refresh
! Store zero (%g0) at byte offset o7 from tp.
279 st
%g0,[$tp+%o7] ! zap tp
290 ######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
291 ######## code without following dedicated squaring procedure.
# $sbit names %i2, the same register that $bp is bound to above.
# NOTE(review): the instructions that use $sbit are not visible in this excerpt.
293 $sbit="%i2"; # re-use $bp!
! ---- Squaring, first pass ----
! NOTE(review): the entry label, loop labels, shifts and stores of this
! section are on lines that are not visible in this excerpt.
! car0 = mul0 squared (remarked as ap[0]*ap[0]); the product apj*mul0 is
! started early in tmp0.
298 mulx
$mul0,$mul0,$car0 ! ap
[0]*ap
[0]
299 mulx
$apj,$mul0,$tmp0 !prologue
!
! acc0 = low 32 bits of car0.
300 and $car0,$mask,$acc0
! tp = sp + bias + frame.
301 add
%sp,$bias+$frame,$tp
302 ld
[$ap+8],$apj !prologue
!
! mul1 = (n0 * acc0) truncated to 32 bits.
304 mulx
$n0,$acc0,$mul1 ! "t[0]"*n0
306 and $mul1,$mask,$mul1
! car1 = np[0]*mul1; the product npj*mul1 is started early in acc1.
308 mulx
$car1,$mul1,$car1 ! np
[0]*"t[0]"*n0
309 mulx
$npj,$mul1,$acc1 !prologue
!
311 ld
[$np+8],$npj !prologue
!
313 add
$acc0,$car1,$car1
315 mov
$tmp0,$acc0 !prologue
!
! Steady state: start the next products, add the previous ones into car0
! and car1, and load the next words of ap and np.
318 mulx
$apj,$mul0,$tmp0
319 mulx
$npj,$mul1,$tmp1
320 add
$acc0,$car0,$car0 ! ap
[j
]*a0
+c0
321 add
$acc1,$car1,$car1
322 ld
[$ap+$j],$apj ! ap
[j
]
323 and $car0,$mask,$acc0
324 ld
[$np+$j],$npj ! np
[j
]
! Double the masked value in acc0, mask it again and add it into car1.
326 add
$acc0,$acc0,$acc0
331 and $acc0,$mask,$acc0
333 add
$acc0,$car1,$car1
! Tail: same pattern without the loads.
341 mulx
$apj,$mul0,$tmp0 ! epilogue
342 mulx
$npj,$mul1,$tmp1
343 add
$acc0,$car0,$car0 ! ap
[j
]*a0
+c0
344 add
$acc1,$car1,$car1
345 and $car0,$mask,$acc0
347 add
$acc0,$acc0,$acc0
350 and $acc0,$mask,$acc0
351 add
$acc0,$car1,$car1
! Add the last pair of products (tmp0, tmp1).
355 add
$tmp0,$car0,$car0 ! ap
[j
]*a0
+c0
356 add
$tmp1,$car1,$car1
357 and $car0,$mask,$acc0
359 add
$acc0,$acc0,$acc0
362 and $acc0,$mask,$acc0
363 add
$acc0,$car1,$car1
! Double car0 and add it into car1.
367 add
$car0,$car0,$car0
369 add
$car0,$car1,$car1
! ---- Squaring, pass for ap[1] ----
! NOTE(review): labels, branches, shifts and the computation of mul1 for
! this pass are on lines that are not visible in this excerpt.
! Load tp[0..2] from the stack area at sp + bias + frame.
373 ld
[%sp+$bias+$frame],$tmp0 ! tp
[0]
374 ld
[%sp+$bias+$frame+4],$tmp1 ! tp
[1]
375 ld
[%sp+$bias+$frame+8],$tpj ! tp
[2]
! mul0 = ap[1], apj = ap[2].
376 ld
[$ap+4],$mul0 ! ap
[1]
377 ld
[$ap+8],$apj ! ap
[2]
378 ld
[$np],$car1 ! np
[0]
379 ld
[$np+4],$npj ! np
[1]
! car0 = mul0 squared.
382 mulx
$mul0,$mul0,$car0
! Truncate mul1 to 32 bits.
383 and $mul1,$mask,$mul1
! car1 = np[0]*mul1, acc1 = np[1]*mul1.
385 mulx
$car1,$mul1,$car1
386 mulx
$npj,$mul1,$acc1
387 add
$tmp0,$car1,$car1
! acc0 = low 32 bits of car0.
388 and $car0,$mask,$acc0
389 ld
[$np+8],$npj ! np
[2]
! Add tmp1, acc0 and acc1 into car1, then store car1 as the new tp[0].
391 add
$tmp1,$car1,$car1
393 add
$acc0,$car1,$car1
395 add
$acc1,$car1,$car1
398 st
$car1,[%sp+$bias+$frame] ! tp
[0]=
! tp = sp + bias + frame + 4.
400 add
%sp,$bias+$frame+4,$tp
! Steady state: multiply, add into car0/car1, load the next words of ap,
! np and tp, double the masked acc0, and store car1 to tp.
403 mulx
$apj,$mul0,$acc0
404 mulx
$npj,$mul1,$acc1
405 add
$acc0,$car0,$car0
407 ld
[$ap+$j],$apj ! ap
[j
]
408 and $car0,$mask,$acc0
409 ld
[$np+$j],$npj ! np
[j
]
411 add
$acc1,$car1,$car1
412 ld
[$tp+8],$tpj ! tp
[j
]
413 add
$acc0,$acc0,$acc0
417 and $acc0,$mask,$acc0
419 add
$acc0,$car1,$car1
420 st
$car1,[$tp] ! tp
[j
-1]
! Tail: same pattern without the loads.
426 mulx
$apj,$mul0,$acc0
427 mulx
$npj,$mul1,$acc1
428 add
$acc0,$car0,$car0
430 and $car0,$mask,$acc0
432 add
$acc1,$car1,$car1
433 add
$acc0,$acc0,$acc0
436 and $acc0,$mask,$acc0
437 add
$acc0,$car1,$car1
438 st
$car1,[$tp] ! tp
[j
-1]
! Double car0, add it into car1, then add car2 into car1.
441 add
$car0,$car0,$car0
443 add
$car0,$car1,$car1
444 add
$car2,$car1,$car1
! ---- Squaring, pass for ap[2] ----
! NOTE(review): labels, most branches, shifts, index updates and the
! computation of mul1 for this pass are not visible in this excerpt.
! Load tp[0] and tp[1] from the stack area at sp + bias + frame.
448 ld
[%sp+$bias+$frame],$tmp1 ! tp
[0]
449 ld
[%sp+$bias+$frame+4],$tpj ! tp
[1]
! mul0 = ap[2].
450 ld
[$ap+8],$mul0 ! ap
[2]
451 ld
[$np],$car1 ! np
[0]
452 ld
[$np+4],$npj ! np
[1]
! Truncate mul1 to 32 bits.
454 and $mul1,$mask,$mul1
! car0 = mul0 squared; car1 = np[0]*mul1 + tp[0].
457 mulx
$mul0,$mul0,$car0
458 mulx
$car1,$mul1,$car1
! acc0 = low 32 bits of car0.
459 and $car0,$mask,$acc0
460 add
$tmp1,$car1,$car1
! tp = sp + bias + frame.
462 add
%sp,$bias+$frame,$tp
! acc1 = npj*mul1, added into car1.
470 mulx
$npj,$mul1,$acc1
475 add
$acc1,$car1,$car1
484 ld
[$ap+$j],$apj ! ap
[j
]
485 mulx
$npj,$mul1,$acc1
487 ld
[$np+$j],$npj ! np
[j
]
488 add
$acc0,$car1,$car1
489 ld
[$tp+8],$tpj ! tp
[j
]
490 add
$acc1,$car1,$car1
! Skip to .Lsqr_no_inner2 on an "equal" condition in %icc.
! NOTE(review): the compare feeding this branch is not visible here.
496 be
,pn
%icc,.Lsqr_no_inner2
! Inner steady state: multiply, add into car0/car1, load the next words of
! ap, np and tp, double the masked acc0, and store car1 to tp.
500 mulx
$apj,$mul0,$acc0
501 mulx
$npj,$mul1,$acc1
503 add
$acc0,$car0,$car0
504 ld
[$ap+$j],$apj ! ap
[j
]
505 and $car0,$mask,$acc0
506 ld
[$np+$j],$npj ! np
[j
]
508 add
$acc0,$acc0,$acc0
509 ld
[$tp+8],$tpj ! tp
[j
]
513 and $acc0,$mask,$acc0
515 add
$acc0,$car1,$car1
516 add
$acc1,$car1,$car1
517 st
$car1,[$tp] ! tp
[j
-1]
! Tail: same pattern without the loads.
523 mulx
$apj,$mul0,$acc0
524 mulx
$npj,$mul1,$acc1
526 add
$acc0,$car0,$car0
527 and $car0,$mask,$acc0
529 add
$acc0,$acc0,$acc0
532 and $acc0,$mask,$acc0
533 add
$acc0,$car1,$car1
534 add
$acc1,$car1,$car1
535 st
$car1,[$tp] ! tp
[j
-1]
! Double car0, add it into car1, then add car2 into car1.
538 add
$car0,$car0,$car0
540 add
$car0,$car1,$car1
541 add
$car2,$car1,$car1
! ---- Squaring, pass indexed by i ----
! NOTE(review): labels, branches, shifts, stores and the computation of
! mul1 for this pass are on lines that are not visible in this excerpt.
! Load tp[0] and tp[1] from the stack area at sp + bias + frame.
546 ld
[%sp+$bias+$frame],$tmp1 ! tp
[0]
547 ld
[%sp+$bias+$frame+4],$tpj ! tp
[1]
! mul0 = word of ap at byte offset i.
! NOTE(review): the address uses index register i although the remark on
! this load says ap[j].
548 ld
[$ap+$i],$mul0 ! ap
[j
]
549 ld
[$np],$car1 ! np
[0]
550 ld
[$np+4],$npj ! np
[1]
! Truncate mul1 to 32 bits.
552 and $mul1,$mask,$mul1
! car0 = mul0 squared; car1 = np[0]*mul1 + tp[0].
555 mulx
$mul0,$mul0,$car0
556 mulx
$car1,$mul1,$car1
! acc0 = low 32 bits of car0.
557 and $car0,$mask,$acc0
558 add
$tmp1,$car1,$car1
! tp = sp + bias + frame.
560 add
%sp,$bias+$frame,$tp
! Compare tmp0 with num; the branch using the result is not visible here.
565 cmp $tmp0,$num ! i
<num
-1
! acc1 = npj*mul1, added into car1.
570 mulx
$npj,$mul1,$acc1
575 add
$acc1,$car1,$car1
583 mulx
$npj,$mul1,$acc1
585 add
$acc0,$car1,$car1
586 add
$acc1,$car1,$car1
! Double car0, add it into car1, then add car2 into car1.
590 add
$car0,$car0,$car0 ! recover
$car0
592 add
$car0,$car1,$car1
593 add
$car2,$car1,$car1
! Symbol type and size for the routine, followed by an identification
! string embedded in the object file.
599 .type
$fname,#function
600 .size
$fname,(.-$fname)
601 .asciz
"Montgomery Multiplication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
# Replace every backtick-delimited fragment of $code with the result of
# evaluating that fragment as Perl (/e), for all occurrences (/g).
604 $code =~ s/\`([^\`]*)\`/eval($1)/gem;