! bn_mul_mont_fpu: code goes into an allocatable, executable .text section
! and the symbol is exported.
1 .section ".text",#alloc,#execinstr
3 .global bn_mul_mont_fpu
! Argument check: test bit 0 of %i5. The result is discarded into %g0, so
! only the condition codes are produced.
11 andcc %i5,1,%g0 ! %i5 has to be even...
13 clr %i0 ! signal "unsupported input value" (%i0 = 0)
! %g4 receives the 32-bit word at [%i4+0] and is then merged with %o0.
17 ld [%i4+0],%g4 ! %g4 reassigned, remember?
18 or %l7,%lo(0xffff),%l7 ! OR in the low bits of 0xffff; presumably completes a mask started elsewhere - confirm
21 or %o0,%g4,%g4 ! %g4=n0[1].n0[0]
23 sll %i5,3,%i5 ! num*=8, i.e. %i5 is scaled by 8 from here on
! Stack carving: 2047 is added before the alignment step and subtracted
! again afterwards, so the 2048-byte rounding is applied to %sp+2047.
25 add %sp,2047,%o0 ! real top of stack
27 add %o1,%i5,%o1 ! %o1=num*5
29 and %o0,-2048,%o0 ! optimize TLB utilization (round down to a multiple of 2048)
30 sub %o0,2047,%sp ! alloca(5*num*8)
32 rd %asi,%o7 ! save %asi
33 add %sp,2047+192+64,%l0 ! %l0 = %sp+2047+192+64; used as the tp pointer by the stx [%l0] stores below
35 add %l1,%i5,%l1 ! [an]p_[lh] point at the vectors' ends !
40 wr %g0,210,%asi ! setup %asi for 16-bit FP loads (210 = 0xD2)
42 add %i0,%i5,%i0 ! readjust input pointers to point
43 add %i1,%i5,%i1 ! at the ends too...
47 stx %o7,[%sp+2047+192+48] ! save %asi in the scratch slot at +48
! Both counters start at -num (already scaled by 8). They are used as the
! index register in [%l1+%l6]-style addressing against the end pointers.
49 sub %g0,%i5,%l5 ! i=-num
50 sub %g0,%i5,%l6 ! j=-num
! Integer side: two 64-bit multiplies produce ap[0]*bp[0]*n0, which is
! stored in the scratch slot at %sp+2047+192+0.
! NOTE(review): %o3 is labelled bp[0] here but is the a[j] base in the FP
! load a few lines below; unless %o3 is rewritten in between, one of the two
! comments looks inaccurate - confirm.
55 ld [%o3+4],%g1 ! bp[0]
57 ld [%o4+4],%g5 ! ap[0]
66 mulx %o1,%o0,%o0 ! ap[0]*bp[0]
67 mulx %g4,%o0,%o0 ! ap[0]*bp[0]*n0
68 stx %o0,[%sp+2047+192+0]
! FP side: a 32-bit word is loaded into an odd single register and the even
! partner is zeroed, so the even/odd pair holds the word zero-extended to
! 64 bits. fzeros is emitted as a raw opcode word; the mnemonic is given in
! the trailing comment.
70 ld [%o3+0],%f17 ! load a[j] as pair of 32-bit words
71 .word 0xa1b00c20 ! fzeros %f16
73 .word 0xa5b00c20 ! fzeros %f18
74 ld [%o5+0],%f21 ! load n[j] as pair of 32-bit words
75 .word 0xa9b00c20 ! fzeros %f20
77 .word 0xadb00c20 ! fzeros %f22
79 ! transfer b[i] to FPU as 4x16-bit values
! Offsets +6,+4,+2,+0 address the four 16-bit fields of the 64-bit value
! stored at scratch +0 above. SPARC is big-endian, so +6 is the
! least-significant field. The loads go through the %asi set up earlier.
89 ! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
90 ldda [%sp+2047+192+6]%asi,%f8
92 ldda [%sp+2047+192+4]%asi,%f10
94 ldda [%sp+2047+192+2]%asi,%f12
96 ldda [%sp+2047+192+0]%asi,%f14
99 std %f16,[%l1+%l6] ! save smashed ap[j] in double format
103 std %f20,[%l3+%l6] ! save smashed np[j] in double format
131 faddd %f44,%f60,%f24 ! %f60: %f24 = %f44 + %f60
132 faddd %f46,%f62,%f26 ! %f62: %f26 = %f46 + %f62
! Spill %f48,%f50,%f52,%f54 to scratch slots +0..+24; the ldx sequence that
! follows reads these slots back into integer registers.
142 std %f48,[%sp+2047+192+0]
144 std %f50,[%sp+2047+192+8]
146 std %f52,[%sp+2047+192+16]
148 std %f54,[%sp+2047+192+24]
! Next a[j]/n[j] words: each 32-bit word goes into an odd single register
! with its even partner zeroed (fzeros is emitted as a raw opcode word).
150 ld [%o4+0],%f17 ! load a[j] as pair of 32-bit words
151 .word 0xa1b00c20 ! fzeros %f16
153 .word 0xa5b00c20 ! fzeros %f18
154 ld [%o5+0],%f21 ! load n[j] as pair of 32-bit words
155 .word 0xa9b00c20 ! fzeros %f20
157 .word 0xadb00c20 ! fzeros %f22
! Read scratch slots +0..+24 into %o0..%o3 as 64-bit integers.
164 ldx [%sp+2047+192+0],%o0
166 ldx [%sp+2047+192+8],%o1
168 ldx [%sp+2047+192+16],%o2
170 ldx [%sp+2047+192+24],%o3
174 std %f16,[%l1+%l6] ! save smashed ap[j] in double format
181 std %f20,[%l3+%l6] ! save smashed np[j] in double format
189 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
! The next instruction is commented out in the source, so it is not
! assembled; the later, otherwise similar sequences have it enabled.
200 !or %o7,%o0,%o0 ! 64-bit result
201 srlx %o3,16,%g1 ! 34-bit carry (%g1 = %o3 >> 16)
215 faddd %f44,%f60,%f24 ! %f60: %f24 = %f44 + %f60
216 faddd %f46,%f62,%f26 ! %f62: %f26 = %f46 + %f62
! Spill %f48..%f54 to scratch slots +0..+24 again.
226 std %f48,[%sp+2047+192+0]
227 std %f50,[%sp+2047+192+8]
229 std %f52,[%sp+2047+192+16]
231 std %f54,[%sp+2047+192+24]
! NOTE(review): the alignment directive suggests a loop entry point follows;
! the label itself is not visible here - confirm.
233 .align 32 ! incidentally already aligned !
! Load the next a[j]/n[j] words into odd single registers with the even
! partners zeroed (fzeros is emitted as a raw opcode word).
237 ld [%o4+0],%f17 ! load a[j] as pair of 32-bit words
238 .word 0xa1b00c20 ! fzeros %f16
240 .word 0xa5b00c20 ! fzeros %f18
241 ld [%o5+0],%f21 ! load n[j] as pair of 32-bit words
242 .word 0xa9b00c20 ! fzeros %f20
244 .word 0xadb00c20 ! fzeros %f22
! Read scratch slots +0..+24 into %o0..%o3 as 64-bit integers.
251 ldx [%sp+2047+192+0],%o0
253 ldx [%sp+2047+192+8],%o1
255 ldx [%sp+2047+192+16],%o2
257 ldx [%sp+2047+192+24],%o3
261 std %f16,[%l1+%l6] ! save smashed ap[j] in double format
268 std %f20,[%l3+%l6] ! save smashed np[j] in double format
276 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
294 or %o7,%o0,%o0 ! 64-bit result
299 srlx %o3,16,%g1 ! 34-bit carry (%g1 = %o3 >> 16)
304 stx %o0,[%l0] ! tp[j-1]=
306 faddd %f44,%f60,%f24 ! %f60: %f24 = %f44 + %f60
307 faddd %f46,%f62,%f26 ! %f62: %f26 = %f46 + %f62
! Spill %f48..%f54 to scratch slots +0..+24 for the next integer read-back.
317 std %f48,[%sp+2047+192+0]
318 std %f50,[%sp+2047+192+8]
319 std %f52,[%sp+2047+192+16]
320 std %f54,[%sp+2047+192+24]
! Tail of the pass: read scratch slots +0..+24 into %o0..%o3. There are no
! further a[j]/n[j] loads in this sequence.
330 ldx [%sp+2047+192+0],%o0
331 ldx [%sp+2047+192+8],%o1
332 ldx [%sp+2047+192+16],%o2
333 ldx [%sp+2047+192+24],%o3
! %f24 and %f26 go through scratch slots +32 and +40 and are read back into
! %o4 and %o5 below.
336 std %f24,[%sp+2047+192+32]
338 std %f26,[%sp+2047+192+40]
342 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
351 or %o7,%o0,%o0 ! 64-bit result
352 ldx [%sp+2047+192+32],%o4
354 ldx [%sp+2047+192+40],%o5
355 srlx %o3,16,%g1 ! 34-bit carry (%g1 = %o3 >> 16)
359 stx %o0,[%l0] ! tp[j-1]=
373 stx %o4,[%l0] ! tp[num-1]=
! Restart j and reset %l0 to %sp+2047+192+64, the same address it was
! initialised to at function setup.
379 sub %g0,%i5,%l6 ! j=-num
380 add %sp,2047+192+64,%l0
385 ld [%o3+4],%g1 ! bp[i]
387 ld [%o4+4],%g5 ! ap[0]
394 ldx [%l0],%o2 ! tp[0]
397 mulx %g4,%o0,%o0 ! (ap[0]*bp[i]+t[0])*n0
398 stx %o0,[%sp+2047+192+0] ! park the product in scratch slot +0
400 ! transfer b[i] to FPU as 4x16-bit values
! Offsets +6,+4,+2,+0 address the four 16-bit fields of the 64-bit value
! just stored at scratch +0. SPARC is big-endian, so +6 is the
! least-significant field.
406 ! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
407 ldda [%sp+2047+192+6]%asi,%f8
409 ldda [%sp+2047+192+4]%asi,%f10
411 ldda [%sp+2047+192+2]%asi,%f12
413 ldda [%sp+2047+192+0]%asi,%f14
! a[j] and n[j] are read here as ready-made doubles from the [%l1+%l6] and
! [%l3+%l6] locations that the earlier std instructions wrote.
415 ldd [%l1+%l6],%f16 ! load a[j] in double format
419 ldd [%l3+%l6],%f20 ! load n[j] in double format
447 faddd %f44,%f60,%f24 ! %f60: %f24 = %f44 + %f60
448 faddd %f46,%f62,%f26 ! %f62: %f26 = %f46 + %f62
! Spill %f48..%f54 to scratch slots +0..+24 for the integer read-back.
458 std %f48,[%sp+2047+192+0]
459 std %f50,[%sp+2047+192+8]
460 std %f52,[%sp+2047+192+16]
462 std %f54,[%sp+2047+192+24]
464 ldd [%l1+%l6],%f16 ! load a[j] in double format
466 ldd [%l3+%l6],%f20 ! load n[j] in double format
! Read scratch slots +0..+24 into %o0..%o3 as 64-bit integers.
474 ldx [%sp+2047+192+0],%o0
477 ldx [%sp+2047+192+8],%o1
479 ldx [%sp+2047+192+16],%o2
482 ldx [%sp+2047+192+24],%o3
498 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
512 faddd %f44,%f60,%f24 ! %f60: %f24 = %f44 + %f60
514 faddd %f46,%f62,%f26 ! %f62: %f26 = %f46 + %f62
515 or %o7,%o0,%o0 ! 64-bit result
521 srlx %o3,16,%g1 ! 34-bit carry (%g1 = %o3 >> 16)
! Spill %f48..%f54 to scratch slots +0..+24. The branch has no annul bit,
! so the std in its delay slot executes whether or not the branch is taken.
530 std %f48,[%sp+2047+192+0]
531 std %f50,[%sp+2047+192+8]
533 std %f52,[%sp+2047+192+16]
534 bz,pn %icc,.Linnerskip ! predicted not taken
535 std %f54,[%sp+2047+192+24] ! delay slot
541 ldd [%l1+%l6],%f16 ! load a[j] in double format
543 ldd [%l3+%l6],%f20 ! load n[j] in double format
! Read scratch slots +0..+24 into %o0..%o3 as 64-bit integers.
551 ldx [%sp+2047+192+0],%o0
554 ldx [%sp+2047+192+8],%o1
556 ldx [%sp+2047+192+16],%o2
559 ldx [%sp+2047+192+24],%o3
575 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
588 faddd %f44,%f60,%f24 ! %f60: %f24 = %f44 + %f60
590 faddd %f46,%f62,%f26 ! %f62: %f26 = %f46 + %f62
591 or %o7,%o0,%o0 ! 64-bit result
! tp[j] is read from [%l0+8] before the store to [%l0] (tp[j-1]) below.
594 ldx [%l0+8],%o7 ! tp[j]
596 srlx %o3,16,%g1 ! 34-bit carry (%g1 = %o3 >> 16)
606 stx %o0,[%l0] ! tp[j-1]
! Spill %f48..%f54 to scratch slots +0..+24 for the next integer read-back.
609 std %f48,[%sp+2047+192+0]
610 std %f50,[%sp+2047+192+8]
611 std %f52,[%sp+2047+192+16]
613 std %f54,[%sp+2047+192+24]
! Tail of the pass: read scratch slots +0..+24 into %o0..%o3. There are no
! further a[j]/n[j] loads in this sequence.
621 ldx [%sp+2047+192+0],%o0
622 ldx [%sp+2047+192+8],%o1
623 ldx [%sp+2047+192+16],%o2
624 ldx [%sp+2047+192+24],%o3
! %f24 and %f26 go through scratch slots +32 and +40 and are read back into
! %o4 and %o5 below.
627 std %f24,[%sp+2047+192+32]
629 std %f26,[%sp+2047+192+40]
633 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
642 ldx [%sp+2047+192+32],%o4
643 or %o7,%o0,%o0 ! 64-bit result
644 ldx [%sp+2047+192+40],%o5
646 ldx [%l0+8],%o7 ! tp[j]
647 srlx %o3,16,%g1 ! 34-bit carry (%g1 = %o3 >> 16)
655 stx %o0,[%l0] ! tp[j-1]
669 stx %o4,[%l0] ! tp[num-1]
678 add %l0,8,%l0 ! adjust tp to point at the end
! %o7 is reset to -num three times below.
! NOTE(review): presumably once per post-processing loop; the loop bodies
! are not visible here - confirm.
680 sub %g0,%i5,%o7 ! n=-num
682 subcc %g0,%g0,%g0 ! clear %icc.c (0-0 produces no borrow)
699 sub %g0,%i5,%o7 ! n=-num
720 sub %g0,%i5,%o7 ! n=-num
! Restore the caller's %asi from scratch slot +48, where it was saved
! during setup.
731 ldx [%sp+2047+192+48],%o7
732 wr %g0,%o7,%asi ! restore %asi
! Symbol metadata: mark bn_mul_mont_fpu as a function and record its size
! as the distance from its label to this point.
738 .type bn_mul_mont_fpu,#function
739 .size bn_mul_mont_fpu,(.-bn_mul_mont_fpu)
! NOTE(review): "Multipltication" is misspelled in the string below. It is
! emitted into the object file, so it is left untouched here.
740 .asciz "Montgomery Multipltication for UltraSPARC, CRYPTOGAMS by <appro@openssl.org>"