2 ; PA-RISC
2.0 implementation of bn_asm code
, based on the
3 ;
64-bit version of the code. This code is effectively the
4 ; same as the
64-bit version except the register model is
5 ; slightly different given all values must
be 32-bit between
6 ; function calls. Thus the
64-bit return values are returned
7 ; in
%ret0
and %ret1 vs just
%ret0 as is done in
64-bit
10 ; This code is approximately
2x faster than the C version
13 ; See http
://devresource.hp.com
/ for more details on the PA-RISC
14 ; architecture. Also see the book
"PA-RISC 2.0 Architecture"
15 ; by Gerry Kane for information on the instruction set architecture.
17 ; Code written by Chris Ruemmler
(with some help from the HP C
20 ; The code compiles with HP
's assembler
25 .subspa $CODE$,QUAD=0,ALIGN=8,ACCESS=0x2c,CODE_ONLY
28 ; Global Register definitions used for the routines.
30 ; Some information about HP's runtime architecture for
32-bits.
32 ;
"Caller save" means the calling function must save the register
33 ; if it wants the register to
be preserved.
34 ;
"Callee save" means if
a function uses the register
, it must save
35 ; the value before using it.
37 ; For the floating point registers
39 ;
"caller save" registers
: fr4-fr11
, fr22-fr31
40 ;
"callee save" registers
: fr12-fr21
41 ;
"special" registers
: fr0-fr3
(status
and exception registers
)
43 ; For the integer registers
45 ;
"caller save" registers
: r1,r19-
r26
46 ;
"callee save" registers
: r3-
r18
47 ; return register
: r2 (rp
)
48 ; return values ;
r28,r29 (ret0
,ret1
)
49 ; Stack pointer ;
r30 (sp
)
50 ; millicode return ptr ;
r31 (also
a caller save register
)
54 ; Arguments to the routines
63 ; Note that the
"w" argument for bn_mul_add_words
and bn_mul_words
64 ; is passed on the stack at
a delta of
-56 from the top of stack
65 ; as the routine is entered.
69 ; Globals used in some routines
; Scratch register aliases shared by the word-array routines below.
; %r23 and %r22 both fall in the caller-save range (r19-r26) listed above.
72 top_overflow
.reg %r23
73 high_mask
.reg %r22 ; value 0xffffffff80000000L
76 ;
------------------------------------------------------------------------------
80 ;BN_ULONG bn_mul_add_words
(BN_ULONG
*r_ptr
, BN_ULONG
*a_ptr
,
81 ; int num
, BN_ULONG w
)
; Going by the loop comments in this routine: rp[i] = rp[i] + a[i]*w + carry
; for num words. The running carry lives in %ret1 and is handed back split
; across %ret0/%ret1.
88 ; Local register definitions
126 .export bn_mul_add_words,entry,NO_RELOCATION,LONG_RETURN
; Prologue: spill r3-r9 (callee-save) into the 56 bytes starting at 0(%sp).
132 STD %r3,0(%sp
) ; save
r3
133 STD %r4,8(%sp
) ; save
r4
134 NOP ; Needed to make the loop
16-byte aligned
135 NOP ; needed to make the loop
16-byte aligned
137 STD %r5,16(%sp
) ; save
r5
139 STD %r6,24(%sp
) ; save
r6
140 STD %r7,32(%sp
) ; save
r7
142 STD %r8,40(%sp
) ; save
r8
143 STD %r9,48(%sp
) ; save
r9
144 COPY
%r0,%ret1 ; return
0 by default
145 DEPDI
,Z
1,31,1,top_overflow ; top_overflow
= 1 << 32
; NOTE(review): the LDO following this CMPIB presumably sits in its delay
; slot, so %sp is advanced by 128 even when the early exit is taken; the
; exit path restores from -128(%sp)..-80(%sp), which is consistent - confirm.
147 CMPIB
,>= 0,num
,bn_mul_add_words_exit ; if
(num
<= 0) then exit
148 LDO
128(%sp
),%sp ; bump stack
151 ; The loop is unrolled twice
, so if there is only
1 number
152 ; then go straight to the cleanup code.
; w was at -56(%sp) on entry; after the 128-byte bump it is at -184(%sp).
154 CMPIB
,= 1,num
,bn_mul_add_words_single_top
155 FLDD
-184(%sp
),fw ;
(-56-128) load up w into fw
(fw_h
/fw_l
)
158 ; This loop is unrolled
2 times
(64-byte aligned as well
)
160 ; PA-RISC
2.0 chips have two fully pipelined multipliers
, thus
161 ; two
32-bit multiplies can
be issued per cycle.
; Two-words-per-iteration loop. Each 64-bit a[i] is treated as 32-bit halves
; (ht = left half, lt = right half of the FP register), the four 32x32
; partial products against fw_h/fw_l are formed with XMPYU, spilled to the
; scratch slots at -8..-64(%sp), reloaded into integer registers and then
; combined with rp[i] and the carry held in %ret1.
163 bn_mul_add_words_unroll2
165 FLDD
0(a_ptr
),t_float_0 ; load up
64-bit value
(fr8L
) ht
(L)/lt
(R
)
166 FLDD
8(a_ptr
),t_float_1 ; load up
64-bit value
(fr8L
) ht
(L)/lt
(R
)
167 LDD
0(r_ptr
),rp_val ; rp
[0]
168 LDD
8(r_ptr
),rp_val_1 ; rp
[1]
; Cross products and high/low products, word 0 and word 1 interleaved.
170 XMPYU fht_0
,fw_l
,fm1 ; m1
[0] = fht_0
*fw_l
171 XMPYU fht_1
,fw_l
,fm1_1 ; m1
[1] = fht_1
*fw_l
172 FSTD fm1
,-16(%sp
) ;
-16(sp
) = m1
[0]
173 FSTD fm1_1
,-48(%sp
) ;
-48(sp
) = m1
[1]
175 XMPYU flt_0
,fw_h
,fm ; m
[0] = flt_0
*fw_h
176 XMPYU flt_1
,fw_h
,fm_1 ; m
[1] = flt_1
*fw_h
177 FSTD
fm,-8(%sp
) ;
-8(sp
) = m
[0]
178 FSTD fm_1
,-40(%sp
) ;
-40(sp
) = m
[1]
180 XMPYU fht_0
,fw_h
,ht_temp ; ht_temp
= fht_0
*fw_h
181 XMPYU fht_1
,fw_h
,ht_temp_1 ; ht_temp_1
= fht_1
*fw_h
182 FSTD ht_temp
,-24(%sp
) ;
-24(sp
) = ht_temp
183 FSTD ht_temp_1
,-56(%sp
) ;
-56(sp
) = ht_temp_1
185 XMPYU flt_0
,fw_l
,lt_temp ; lt_temp
= lt
*fw_l
186 XMPYU flt_1
,fw_l
,lt_temp_1 ; lt_temp_1
= lt
*fw_l
187 FSTD lt_temp
,-32(%sp
) ;
-32(sp
) = lt_temp
188 FSTD lt_temp_1
,-64(%sp
) ;
-64(sp
) = lt_temp_1
; Reload the spilled products into integer registers.
; NOTE(review): no integer load of lt_0/lt_1 from -32/-64(%sp) is visible in
; this view before they are used below - confirm against the full file.
190 LDD
-8(%sp
),m_0 ; m
[0]
191 LDD
-40(%sp
),m_1 ; m
[1]
192 LDD
-16(%sp
),m1_0 ; m1
[0]
193 LDD
-48(%sp
),m1_1 ; m1
[1]
195 LDD
-24(%sp
),ht_0 ; ht
[0]
196 LDD
-56(%sp
),ht_1 ; ht
[1]
197 ADD,L m1_0
,m_0
,tmp_0 ; tmp_0
= m
[0] + m1
[0];
198 ADD,L m1_1
,m_1
,tmp_1 ; tmp_1
= m
[1] + m1
[1];
; Each CMPCLR guards the ADD,L that follows it: per the existing if(...)
; comments, top_overflow (1<<32) is added to ht only when the sum of the two
; cross products wrapped (sum < m1).
202 CMPCLR
,*>>= tmp_0
,m1_0
, %r0 ; if
(m
[0] < m1
[0])
203 ADD,L ht_0
,top_overflow
,ht_0 ; ht
[0] += (1<<32)
205 CMPCLR
,*>>= tmp_1
,m1_1
,%r0 ; if
(m
[1] < m1
[1])
206 ADD,L ht_1
,top_overflow
,ht_1 ; ht
[1] += (1<<32)
; Split the cross-product sum: upper 32 bits go into ht, lower 32 bits
; (shifted up by 32) go into lt.
207 EXTRD
,U tmp_0
,31,32,m_0 ; m
[0]>>32
208 DEPD
,Z tmp_0
,31,32,m1_0 ; m1
[0] = m
[0]<<32
210 EXTRD
,U tmp_1
,31,32,m_1 ; m
[1]>>32
211 DEPD
,Z tmp_1
,31,32,m1_1 ; m1
[1] = m
[1]<<32
212 ADD,L ht_0
,m_0
,ht_0 ; ht
[0]+= (m
[0]>>32)
213 ADD,L ht_1
,m_1
,ht_1 ; ht
[1]+= (m
[1]>>32)
; From here on every plain ADD is paired with an ADD,DC that folds the
; resulting carry into the matching high word.
215 ADD lt_0
,m1_0
,lt_0 ; lt
[0] = lt
[0]+m1
[0];
216 ADD,DC ht_0
,%r0,ht_0 ; ht
[0]++
217 ADD lt_1
,m1_1
,lt_1 ; lt
[1] = lt
[1]+m1
[1];
218 ADD,DC ht_1
,%r0,ht_1 ; ht
[1]++
220 ADD %ret1
,lt_0
,lt_0 ; lt
[0] = lt
[0] + c;
221 ADD,DC ht_0
,%r0,ht_0 ; ht
[0]++
222 ADD lt_0
,rp_val
,lt_0 ; lt
[0] = lt
[0]+rp
[0]
223 ADD,DC ht_0
,%r0,ht_0 ; ht
[0]++
; ht_0 is the carry into word 1; the final ADD,DC leaves the carry out of
; word 1 in %ret1 for the next iteration.
225 LDO
-2(num
),num ; num
= num
- 2;
226 ADD ht_0
,lt_1
,lt_1 ; lt
[1] = lt
[1] + ht_0
(c
);
227 ADD,DC ht_1
,%r0,ht_1 ; ht
[1]++
228 STD lt_0
,0(r_ptr
) ; rp
[0] = lt
[0]
230 ADD lt_1
,rp_val_1
,lt_1 ; lt
[1] = lt
[1]+rp
[1]
231 ADD,DC ht_1
,%r0,%ret1 ; ht
[1]++
232 LDO
16(a_ptr
),a_ptr ; a_ptr
+= 2
234 STD lt_1
,8(r_ptr
) ; rp
[1] = lt
[1]
; Loop while at least two words remain (2 <= num).
235 CMPIB
,<= 2,num
,bn_mul_add_words_unroll2 ; go again if more to do
236 LDO
16(r_ptr
),r_ptr ; r_ptr
+= 2
238 CMPIB
,=,N
0,num
,bn_mul_add_words_exit ; are we done
, or cleanup last one
241 ; Top of loop aligned on
64-byte boundary
; Single-word tail: same partial-product scheme as the unrolled loop, for
; one remaining a[0]/rp[0]; the carry out is left in %ret1.
; NOTE(review): in this view only m1_0 is reloaded from the stack; loads of
; m_0, ht_0 and lt_0 are not visible before their first use - confirm against
; the full file.
243 bn_mul_add_words_single_top
244 FLDD
0(a_ptr
),t_float_0 ; load up
64-bit value
(fr8L
) ht
(L)/lt
(R
)
245 LDD
0(r_ptr
),rp_val ; rp
[0]
246 LDO
8(a_ptr
),a_ptr ; a_ptr+
+
247 XMPYU fht_0
,fw_l
,fm1 ; m1
= ht
*fw_l
248 FSTD fm1
,-16(%sp
) ;
-16(sp
) = m1
249 XMPYU flt_0
,fw_h
,fm ; m
= lt
*fw_h
250 FSTD
fm,-8(%sp
) ;
-8(sp
) = m
251 XMPYU fht_0
,fw_h
,ht_temp ; ht_temp
= ht
*fw_h
252 FSTD ht_temp
,-24(%sp
) ;
-24(sp
) = ht
253 XMPYU flt_0
,fw_l
,lt_temp ; lt_temp
= lt
*fw_l
254 FSTD lt_temp
,-32(%sp
) ;
-32(sp
) = lt
257 LDD
-16(%sp
),m1_0 ; m1
= temp1
258 ADD,L m_0
,m1_0
,tmp_0 ; tmp_0
= m
+ m1;
; CMPCLR guards the next ADD,L: 1<<32 is added to ht only if m+m1 wrapped.
262 CMPCLR
,*>>= tmp_0
,m1_0
,%r0 ; if
(m
< m1
)
263 ADD,L ht_0
,top_overflow
,ht_0 ; ht
+= (1<<32)
265 EXTRD
,U tmp_0
,31,32,m_0 ; m
>>32
266 DEPD
,Z tmp_0
,31,32,m1_0 ; m1
= m
<<32
268 ADD,L ht_0
,m_0
,ht_0 ; ht+
= (m
>>32)
; Three add/carry pairs: + (m<<32), + incoming carry, + rp[0].
269 ADD lt_0
,m1_0
,tmp_0 ; tmp_0
= lt+m1;
270 ADD,DC ht_0
,%r0,ht_0 ; ht+
+
271 ADD %ret1
,tmp_0
,lt_0 ; lt
= lt
+ c;
272 ADD,DC ht_0
,%r0,ht_0 ; ht+
+
273 ADD lt_0
,rp_val
,lt_0 ; lt
= lt+rp
[0]
274 ADD,DC ht_0
,%r0,%ret1 ; ht+
+
275 STD lt_0
,0(r_ptr
) ; rp
[0] = lt
; Common exit. The carry is a 64-bit value in %ret1; its upper 32 bits are
; copied into %ret0 so the caller sees it as the ret0/ret1 pair. r9..r3 are
; then reloaded from the slots written in the prologue (offsets are relative
; to the bumped %sp); the LDD,MB form presumably also steps %sp back by 128
; - confirm.
277 bn_mul_add_words_exit
280 EXTRD
,U
%ret1
,31,32,%ret0 ; for
32-bit
, return in ret0
/ret1
281 LDD
-80(%sp
),%r9 ; restore
r9
282 LDD
-88(%sp
),%r8 ; restore
r8
283 LDD
-96(%sp
),%r7 ; restore
r7
284 LDD
-104(%sp
),%r6 ; restore
r6
285 LDD
-112(%sp
),%r5 ; restore
r5
286 LDD
-120(%sp
),%r4 ; restore
r4
288 LDD
,MB
-128(%sp
),%r3 ; restore
r3
289 .PROCEND ;in=23,24,25,26,29;out=28;
291 ;
----------------------------------------------------------------------------
293 ;BN_ULONG bn_mul_words
(BN_ULONG
*rp
, BN_ULONG
*ap
, int num
, BN_ULONG w
)
; Going by the loop comments in this routine: rp[i] = a[i]*w + carry for num
; words (no read of the old rp[i]); the carry travels in %ret1 and is
; returned split across %ret0/%ret1.
298 ; w on stack at
-56(sp
)
304 .EXPORT bn_mul_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
; Prologue: spill callee-save r3-r7 at 0..32(%sp).
307 STD %r3,0(%sp
) ; save
r3
308 STD %r4,8(%sp
) ; save
r4
310 STD %r5,16(%sp
) ; save
r5
312 STD %r6,24(%sp
) ; save
r6
313 STD %r7,32(%sp
) ; save
r7
314 COPY
%r0,%ret1 ; return
0 by default
315 DEPDI
,Z
1,31,1,top_overflow ; top_overflow
= 1 << 32
; num <= 0: nothing to do. NOTE(review): the LDO after the CMPIB presumably
; runs in the delay slot, so %sp is bumped on the exit path too - confirm.
317 CMPIB
,>= 0,num
,bn_mul_words_exit
318 LDO
128(%sp
),%sp ; bump stack
321 ; See if only
1 word to do
, thus just do cleanup
; w was at -56(%sp) on entry; after the 128-byte bump it is at -184(%sp).
323 CMPIB
,= 1,num
,bn_mul_words_single_top
324 FLDD
-184(%sp
),fw ;
(-56-128) load up w into fw
(fw_h
/fw_l
)
327 ; This loop is unrolled
2 times
(64-byte aligned as well
)
329 ; PA-RISC
2.0 chips have two fully pipelined multipliers
, thus
330 ; two
32-bit multiplies can
be issued per cycle.
; Two-words-per-iteration loop of bn_mul_words (branch target
; bn_mul_words_unroll2; the label line itself is not visible in this view).
; Same scheme as bn_mul_add_words: four 32x32 XMPYU products per word,
; spilled to -8..-64(%sp), then combined in integer registers.
; NOTE(review): the integer reloads (LDD) of m/m1/ht/lt from those slots are
; not visible here between the FSTDs and the first ADD,L - confirm against
; the full file.
334 FLDD
0(a_ptr
),t_float_0 ; load up
64-bit value
(fr8L
) ht
(L)/lt
(R
)
335 FLDD
8(a_ptr
),t_float_1 ; load up
64-bit value
(fr8L
) ht
(L)/lt
(R
)
336 XMPYU fht_0
,fw_l
,fm1 ; m1
[0] = fht_0
*fw_l
337 XMPYU fht_1
,fw_l
,fm1_1 ; m1
[1] = ht
*fw_l
339 FSTD fm1
,-16(%sp
) ;
-16(sp
) = m1
340 FSTD fm1_1
,-48(%sp
) ;
-48(sp
) = m1
341 XMPYU flt_0
,fw_h
,fm ; m
= lt
*fw_h
342 XMPYU flt_1
,fw_h
,fm_1 ; m
= lt
*fw_h
344 FSTD
fm,-8(%sp
) ;
-8(sp
) = m
345 FSTD fm_1
,-40(%sp
) ;
-40(sp
) = m
346 XMPYU fht_0
,fw_h
,ht_temp ; ht_temp
= fht_0
*fw_h
347 XMPYU fht_1
,fw_h
,ht_temp_1 ; ht_temp
= ht
*fw_h
349 FSTD ht_temp
,-24(%sp
) ;
-24(sp
) = ht
350 FSTD ht_temp_1
,-56(%sp
) ;
-56(sp
) = ht
351 XMPYU flt_0
,fw_l
,lt_temp ; lt_temp
= lt
*fw_l
352 XMPYU flt_1
,fw_l
,lt_temp_1 ; lt_temp
= lt
*fw_l
354 FSTD lt_temp
,-32(%sp
) ;
-32(sp
) = lt
355 FSTD lt_temp_1
,-64(%sp
) ;
-64(sp
) = lt
364 ADD,L m1_0
,m_0
,tmp_0 ; tmp_0
= m
+ m1;
365 ADD,L m1_1
,m_1
,tmp_1 ; tmp_1
= m
+ m1;
; Each CMPCLR guards the ADD,L after it: 1<<32 goes into ht only when the
; cross-product sum wrapped (sum < m1), per the existing if(...) comments.
369 CMPCLR
,*>>= tmp_0
,m1_0
, %r0 ; if
(m
< m1
)
370 ADD,L ht_0
,top_overflow
,ht_0 ; ht
+= (1<<32)
371 CMPCLR
,*>>= tmp_1
,m1_1
,%r0 ; if
(m
< m1
)
372 ADD,L ht_1
,top_overflow
,ht_1 ; ht
+= (1<<32)
374 EXTRD
,U tmp_0
,31,32,m_0 ; m
>>32
375 DEPD
,Z tmp_0
,31,32,m1_0 ; m1
= m
<<32
376 EXTRD
,U tmp_1
,31,32,m_1 ; m
>>32
377 DEPD
,Z tmp_1
,31,32,m1_1 ; m1
= m
<<32
379 ADD,L ht_0
,m_0
,ht_0 ; ht+
= (m
>>32)
380 ADD,L ht_1
,m_1
,ht_1 ; ht+
= (m
>>32)
; Plain ADD / ADD,DC pairs: the ADD,DC folds the carry of the preceding ADD
; into the matching high word.
381 ADD lt_0
,m1_0
,lt_0 ; lt
= lt+m1;
382 ADD,DC ht_0
,%r0,ht_0 ; ht+
+
384 ADD lt_1
,m1_1
,lt_1 ; lt
= lt+m1;
385 ADD,DC ht_1
,%r0,ht_1 ; ht+
+
386 ADD %ret1
,lt_0
,lt_0 ; lt
= lt
+ c
(ret1
);
387 ADD,DC ht_0
,%r0,ht_0 ; ht+
+
; ht_0 is the carry from word 0 into word 1.
389 ADD ht_0
,lt_1
,lt_1 ; lt
= lt
+ c
(ht_0
)
390 ADD,DC ht_1
,%r0,ht_1 ; ht+
+
391 STD lt_0
,0(r_ptr
) ; rp
[0] = lt
392 STD lt_1
,8(r_ptr
) ; rp
[1] = lt
394 COPY ht_1
,%ret1 ; carry
= ht
395 LDO
-2(num
),num ; num
= num
- 2;
396 LDO
16(a_ptr
),a_ptr ; ap
+= 2
; Loop while at least two words remain (2 <= num).
397 CMPIB
,<= 2,num
,bn_mul_words_unroll2
398 LDO
16(r_ptr
),r_ptr ; rp+
+
400 CMPIB
,=,N
0,num
,bn_mul_words_exit ; are we done?
403 ; Top of loop aligned on
64-byte boundary
; Single-word tail of bn_mul_words: rp[0] = a[0]*w + carry, with the new
; carry (ht) copied into %ret1.
; NOTE(review): the integer reloads of m/m1/ht/lt from the stack slots are
; not visible in this view - confirm against the full file.
405 bn_mul_words_single_top
406 FLDD
0(a_ptr
),t_float_0 ; load up
64-bit value
(fr8L
) ht
(L)/lt
(R
)
408 XMPYU fht_0
,fw_l
,fm1 ; m1
= ht
*fw_l
409 FSTD fm1
,-16(%sp
) ;
-16(sp
) = m1
410 XMPYU flt_0
,fw_h
,fm ; m
= lt
*fw_h
411 FSTD
fm,-8(%sp
) ;
-8(sp
) = m
412 XMPYU fht_0
,fw_h
,ht_temp ; ht_temp
= ht
*fw_h
413 FSTD ht_temp
,-24(%sp
) ;
-24(sp
) = ht
414 XMPYU flt_0
,fw_l
,lt_temp ; lt_temp
= lt
*fw_l
415 FSTD lt_temp
,-32(%sp
) ;
-32(sp
) = lt
419 ADD,L m_0
,m1_0
,tmp_0 ; tmp_0
= m
+ m1;
; CMPCLR guards the next ADD,L: 1<<32 is added to ht only if m+m1 wrapped.
423 CMPCLR
,*>>= tmp_0
,m1_0
,%r0 ; if
(m
< m1
)
424 ADD,L ht_0
,top_overflow
,ht_0 ; ht
+= (1<<32)
426 EXTRD
,U tmp_0
,31,32,m_0 ; m
>>32
427 DEPD
,Z tmp_0
,31,32,m1_0 ; m1
= m
<<32
429 ADD,L ht_0
,m_0
,ht_0 ; ht+
= (m
>>32)
430 ADD lt_0
,m1_0
,lt_0 ; lt
= lt+m1;
431 ADD,DC ht_0
,%r0,ht_0 ; ht+
+
433 ADD %ret1
,lt_0
,lt_0 ; lt
= lt
+ c;
434 ADD,DC ht_0
,%r0,ht_0 ; ht+
+
436 COPY ht_0
,%ret1 ; copy carry
437 STD lt_0
,0(r_ptr
) ; rp
[0] = lt
; Exit path of bn_mul_words (the bn_mul_words_exit label line is not visible
; in this view). Upper 32 bits of the 64-bit carry go to %ret0; r7..r3 are
; reloaded from the prologue's save slots, addressed relative to the bumped
; %sp. The LDD,MB form presumably also steps %sp back by 128 - confirm.
441 EXTRD
,U
%ret1
,31,32,%ret0 ; for
32-bit
, return in ret0
/ret1
442 LDD
-96(%sp
),%r7 ; restore
r7
443 LDD
-104(%sp
),%r6 ; restore
r6
444 LDD
-112(%sp
),%r5 ; restore
r5
445 LDD
-120(%sp
),%r4 ; restore
r4
447 LDD
,MB
-128(%sp
),%r3 ; restore
r3
450 ;
----------------------------------------------------------------------------
452 ;void bn_sqr_words
(BN_ULONG
*rp
, BN_ULONG
*ap
, int num
)
; Going by the store comments in this routine, each input word a[i] yields
; two output words: rp[2i] = low half and rp[2i+1] = high half of a[i]^2.
461 .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
462 .EXPORT bn_sqr_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
; Prologue: spill callee-save r3-r5 at 0..16(%sp).
466 STD %r3,0(%sp
) ; save
r3
467 STD %r4,8(%sp
) ; save
r4
469 STD %r5,16(%sp
) ; save
r5
; num <= 0: nothing to do. NOTE(review): the LDO after the CMPIB presumably
; runs in the delay slot, so %sp is bumped on the exit path too - confirm.
471 CMPIB
,>= 0,num
,bn_sqr_words_exit
472 LDO
128(%sp
),%sp ; bump stack
475 ; If only
1, then go straight to cleanup
; The mask is built in the instruction that follows the CMPIB; the
; single-word tail also uses high_mask, so that instruction presumably
; executes (delay slot) even when the branch is taken - confirm.
477 CMPIB
,= 1,num
,bn_sqr_words_single_top
478 DEPDI
,Z
-1,32,33,high_mask ; Create Mask
0xffffffff80000000L
481 ; This loop is unrolled
2 times
(64-byte aligned as well
)
; Per word: m = ht*lt (the cross product, needed twice in the square),
; lt = lt*lt, ht = ht*ht, each spilled to a stack scratch slot.
; NOTE(review): the integer reloads (LDD) of m/lt/ht from those slots are not
; visible in this view before the AND below - confirm against the full file.
485 FLDD
0(a_ptr
),t_float_0 ;
a[0]
486 FLDD
8(a_ptr
),t_float_1 ;
a[1]
487 XMPYU fht_0
,flt_0
,fm ; m
[0]
488 XMPYU fht_1
,flt_1
,fm_1 ; m
[1]
490 FSTD
fm,-24(%sp
) ; store m
[0]
491 FSTD fm_1
,-56(%sp
) ; store m
[1]
492 XMPYU flt_0
,flt_0
,lt_temp ; lt
[0]
493 XMPYU flt_1
,flt_1
,lt_temp_1 ; lt
[1]
495 FSTD lt_temp
,-16(%sp
) ; store lt
[0]
496 FSTD lt_temp_1
,-48(%sp
) ; store lt
[1]
497 XMPYU fht_0
,fht_0
,ht_temp ; ht
[0]
498 XMPYU fht_1
,fht_1
,ht_temp_1 ; ht
[1]
500 FSTD ht_temp
,-8(%sp
) ; store ht
[0]
501 FSTD ht_temp_1
,-40(%sp
) ; store ht
[1]
; 2*m*2^32 is split in two: the top 33 bits of m (selected by high_mask and
; shifted down by 31) are added to ht, the low 31 bits (shifted up by 33)
; are added to lt.
505 AND m_0
,high_mask
,tmp_0 ; m
[0] & Mask
506 AND m_1
,high_mask
,tmp_1 ; m
[1] & Mask
507 DEPD
,Z m_0
,30,31,m_0 ; m
[0] << 32+1
508 DEPD
,Z m_1
,30,31,m_1 ; m
[1] << 32+1
512 EXTRD
,U tmp_0
,32,33,tmp_0 ; tmp_0
= m
[0]&Mask
>> 32-1
513 EXTRD
,U tmp_1
,32,33,tmp_1 ; tmp_1
= m
[1]&Mask
>> 32-1
517 ADD,L ht_0
,tmp_0
,ht_0 ; ht
[0] += tmp_0
518 ADD,L ht_1
,tmp_1
,ht_1 ; ht
[1] += tmp_1
520 ADD lt_0
,m_0
,lt_0 ; lt
= lt+m
521 ADD,DC ht_0
,%r0,ht_0 ; ht
[0]++
522 STD lt_0
,0(r_ptr
) ; rp
[0] = lt
[0]
523 STD ht_0
,8(r_ptr
) ; rp
[1] = ht
[0]
; Second word of the unrolled pair: finish lt[1]/ht[1], store them as
; rp[2]/rp[3], then advance a_ptr by 2 words and r_ptr by 4 words.
525 ADD lt_1
,m_1
,lt_1 ; lt
= lt+m
526 ADD,DC ht_1
,%r0,ht_1 ; ht
[1]++
527 STD lt_1
,16(r_ptr
) ; rp
[2] = lt
[1]
528 STD ht_1
,24(r_ptr
) ; rp
[3] = ht
[1]
530 LDO
-2(num
),num ; num
= num
- 2;
531 LDO
16(a_ptr
),a_ptr ; ap
+= 2
; Loop while at least two words remain (2 <= num).
532 CMPIB
,<= 2,num
,bn_sqr_words_unroll2
533 LDO
32(r_ptr
),r_ptr ; rp
+= 4
535 CMPIB
,=,N
0,num
,bn_sqr_words_exit ; are we done?
538 ; Top of loop aligned on
64-byte boundary
; Single-word tail of bn_sqr_words: rp[0]/rp[1] = low/high half of a[0]^2,
; computed as lt*lt + 2*(ht*lt)<<32 + (ht*ht)<<64 via the stack scratch
; slots at -24/-16/-8(%sp). Relies on high_mask having been set already.
540 bn_sqr_words_single_top
541 FLDD
0(a_ptr
),t_float_0 ; load up
64-bit value
(fr8L
) ht
(L)/lt
(R
)
543 XMPYU fht_0
,flt_0
,fm ; m
544 FSTD
fm,-24(%sp
) ; store m
546 XMPYU flt_0
,flt_0
,lt_temp ; lt
547 FSTD lt_temp
,-16(%sp
) ; store lt
549 XMPYU fht_0
,fht_0
,ht_temp ; ht
550 FSTD ht_temp
,-8(%sp
) ; store ht
552 LDD
-24(%sp
),m_0 ; load m
553 AND m_0
,high_mask
,tmp_0 ; m
& Mask
554 DEPD
,Z m_0
,30,31,m_0 ; m
<< 32+1
555 LDD
-16(%sp
),lt_0 ; lt
557 LDD
-8(%sp
),ht_0 ; ht
558 EXTRD
,U tmp_0
,32,33,tmp_0 ; tmp_0
= m
&Mask
>> 32-1
; The carry from the ADD into lt is consumed by the ADD,DC two instructions
; later; the ADD,L in between is the variant that is expected to leave the
; carry untouched.
559 ADD m_0
,lt_0
,lt_0 ; lt
= lt+m
560 ADD,L ht_0
,tmp_0
,ht_0 ; ht
+= tmp_0
561 ADD,DC ht_0
,%r0,ht_0 ; ht+
+
563 STD lt_0
,0(r_ptr
) ; rp
[0] = lt
564 STD ht_0
,8(r_ptr
) ; rp
[1] = ht
; Epilogue (the bn_sqr_words_exit label line is not visible in this view).
; NOTE(review): only the r5/r4 restores are visible here; r3 was also saved
; in the prologue - confirm its restore exists in the full file.
568 LDD
-112(%sp
),%r5 ; restore
r5
569 LDD
-120(%sp
),%r4 ; restore
r4
572 .PROCEND ;in=23,24,25,26,29;out=28;
575 ;
----------------------------------------------------------------------------
577 ;BN_ULONG bn_add_words
(BN_ULONG
*r
, BN_ULONG
*a, BN_ULONG
*b, int n
)
; Word-wise addition with carry propagation. The carry c is kept in %ret1:
; each step does t = a + c, records that carry, then l = t + b and adds the
; second carry in. The loads of t/b, the stores of l and the pointer updates
; are not visible in this view.
592 .EXPORT bn_add_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
; n <= 0: return with carry 0. NOTE(review): the COPY after the CMPIB
; presumably runs in the delay slot on both paths - confirm.
595 CMPIB
,>= 0,n
,bn_add_words_exit
596 COPY
%r0,%ret1 ; return
0 by default
599 ; If
2 or more numbers do the loop
601 CMPIB
,= 1,n
,bn_add_words_single_top
605 ; This loop is unrolled
2 times
(64-byte aligned as well
)
; First word of the unrolled pair.
610 ADD t,%ret1
,t ;
t = t+c;
611 ADD,DC
%r0,%r0,%ret1 ; set c to carry
612 ADD t,b,l ;
l = t + b[0]
613 ADD,DC
%ret1
,%r0,%ret1 ; c+
= carry
; Second word of the unrolled pair (same sequence).
618 ADD t,%ret1
,t ;
t = t+c;
619 ADD,DC
%r0,%r0,%ret1 ; set c to carry
620 ADD t,b,l ;
l = t + b[0]
621 ADD,DC
%ret1
,%r0,%ret1 ; c+
= carry
; Loop while at least two words remain (2 <= n).
628 CMPIB
,<= 2,n
,bn_add_words_unroll2
631 CMPIB
,=,N
0,n
,bn_add_words_exit ; are we done?
633 bn_add_words_single_top
637 ADD t,%ret1
,t ;
t = t+c;
638 ADD,DC
%r0,%r0,%ret1 ; set c to carry
(could use CMPCLR??
)
639 ADD t,b,l ;
l = t + b[0]
640 ADD,DC
%ret1
,%r0,%ret1 ; c+
= carry
; Upper 32 bits of the 64-bit carry go to %ret0 for the 32-bit return pair.
646 EXTRD
,U
%ret1
,31,32,%ret0 ; for
32-bit
, return in ret0
/ret1
647 .PROCEND ;in=23,24,25,26,29;out=28;
649 ;
----------------------------------------------------------------------------
651 ;BN_ULONG bn_sub_words
(BN_ULONG
*r
, BN_ULONG
*a, BN_ULONG
*b, int n
)
; Word-wise subtraction with borrow. The borrow c is kept in %ret1: each
; step computes (t1 - t2) - c into sub_tmp1 and stores it to r. The loads of
; t1/t2 and the instructions that derive the next borrow after each CMPCLR
; are not visible in this view.
667 .EXPORT bn_sub_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
; n <= 0: return with borrow 0. NOTE(review): the COPY after the CMPIB
; presumably runs in the delay slot on both paths - confirm.
671 CMPIB
,>= 0,n
,bn_sub_words_exit
672 COPY
%r0,%ret1 ; return
0 by default
675 ; If
2 or more numbers do the loop
677 CMPIB
,= 1,n
,bn_sub_words_single_top
681 ; This loop is unrolled
2 times
(64-byte aligned as well
)
; First word of the unrolled pair -> r[0].
686 SUB t1
,t2
,sub_tmp1 ; t3
= t1-t2;
687 SUB sub_tmp1
,%ret1
,sub_tmp1 ; t3
= t3- c;
689 CMPCLR
,*>> t1
,t2
,sub_tmp2 ; clear if t1
> t2
694 STD sub_tmp1
,0(r_ptr
)
; Second word of the unrolled pair -> r[1].
698 SUB t1
,t2
,sub_tmp1 ; t3
= t1-t2;
699 SUB sub_tmp1
,%ret1
,sub_tmp1 ; t3
= t3- c;
700 CMPCLR
,*>> t1
,t2
,sub_tmp2 ; clear if t1
> t2
705 STD sub_tmp1
,8(r_ptr
)
; Loop while at least two words remain (2 <= n).
711 CMPIB
,<= 2,n
,bn_sub_words_unroll2
714 CMPIB
,=,N
0,n
,bn_sub_words_exit ; are we done?
716 bn_sub_words_single_top
719 SUB t1
,t2
,sub_tmp1 ; t3
= t1-t2;
720 SUB sub_tmp1
,%ret1
,sub_tmp1 ; t3
= t3- c;
721 CMPCLR
,*>> t1
,t2
,sub_tmp2 ; clear if t1
> t2
727 STD sub_tmp1
,0(r_ptr
)
; Upper 32 bits of the 64-bit borrow go to %ret0 for the 32-bit return pair.
732 EXTRD
,U
%ret1
,31,32,%ret0 ; for
32-bit
, return in ret0
/ret1
733 .PROCEND ;in=23,24,25,26,29;out=28;
735 ;
------------------------------------------------------------------------------
737 ; unsigned long bn_div_words
(unsigned long h
, unsigned long
l, unsigned long d
)
743 ; This is mainly just output from the HP C compiler.
745 ;
------------------------------------------------------------------------------
; Linkage for bn_div_words. The body calls BN_num_bits_word and the $$div2U
; millicode routine, hence the two live imports. The "--- not PIC" entries
; are the disabled __iob/fprintf imports of the original diagnostic path.
748 .EXPORT bn_div_words,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,RTNVAL=GR,LONG_RETURN
749 .IMPORT BN_num_bits_word,CODE
750 ;
--- not PIC
.IMPORT __iob,DATA
751 ;
--- not PIC
.IMPORT fprintf,CODE
753 .IMPORT $$div2U,MILLICODE
754 .CALLINFO CALLER,FRAME=144,ENTRY_GR=%r9,SAVE_RP,ARGS_SAVED,ORDERING_AWARE
; Compiler-generated body of bn_div_words; the $000600xx / $Dn branch-target
; labels are not visible in this view, so control flow cannot be fully
; reconstructed here.
; Prologue: rp (%r2) is stored at -20(%r30); %r3 is stored while %r30 is
; advanced by 192; r4 and r9 are stored as words; r5/r6 and r7/r8 are packed
; pairwise into one 64-bit register each (DEPD) and stored with STD.
756 STW %r2,-20(%r30) ;offset
0x8ec
757 STW,MA
%r3,192(%r30) ;offset
0x8f0
758 STW %r4,-188(%r30) ;offset
0x8f4
759 DEPD
%r5,31,32,%r6 ;offset
0x8f8
760 STD %r6,-184(%r30) ;offset
0x8fc
761 DEPD
%r7,31,32,%r8 ;offset
0x900
762 STD %r8,-176(%r30) ;offset
0x904
763 STW %r9,-168(%r30) ;offset
0x908
; -248 = -56 - 192: presumably the 64-bit third argument (d) taken from the
; caller's frame - confirm. %r4 and %r5 are each assembled from a pair of
; 32-bit argument registers (r25:r26 and r23:r24).
764 LDD
-248(%r30),%r3 ;offset
0x90c
765 COPY
%r26,%r4 ;offset
0x910
766 COPY
%r24,%r5 ;offset
0x914
767 DEPD
%r25,31,32,%r4 ;offset
0x918
; %r3 == 0 falls through to the MOVIB, which sets %r29 = -1 before branching.
768 CMPB
,*<> %r3,%r0,$
0006000C ;offset
0x91c
769 DEPD
%r23,31,32,%r5 ;offset
0x920
770 MOVIB
,TR
-1,%r29,$
00060002 ;offset
0x924
771 EXTRD
,U
%r29,31,32,%r28 ;offset
0x928
773 LDO
-1(%r29),%r29 ;offset
0x92c
774 SUB %r23,%r7,%r23 ;offset
0x930
776 SUB %r4,%r31,%r25 ;offset
0x934
777 AND %r25,%r19,%r26 ;offset
0x938
778 CMPB
,*<>,N
%r0,%r26,$
00060046 ;offset
0x93c
779 DEPD
,Z
%r25,31,32,%r20 ;offset
0x940
780 OR %r20,%r24,%r21 ;offset
0x944
781 CMPB
,*<<,N
%r21,%r23,$
0006002A ;offset
0x948
782 SUB %r31,%r2,%r31 ;offset
0x94c
785 DEPD
,Z
%r23,31,32,%r25 ;offset
0x950
786 EXTRD
,U
%r23,31,32,%r26 ;offset
0x954
787 AND %r25,%r19,%r24 ;offset
0x958
788 ADD,L %r31,%r26,%r31 ;offset
0x95c
789 CMPCLR
,*>>= %r5,%r24,%r0 ;offset
0x960
790 LDO
1(%r31),%r31 ;offset
0x964
792 CMPB
,*<<=,N
%r31,%r4,$
00060036 ;offset
0x968
793 LDO
-1(%r29),%r29 ;offset
0x96c
794 ADD,L %r4,%r3,%r4 ;offset
0x970
796 ADDIB
,=,N
-1,%r8,$D0 ;offset
0x974
797 SUB %r5,%r24,%r28 ;offset
0x978
799 SUB %r4,%r31,%r24 ;offset
0x97c
800 SHRPD
%r24,%r28,32,%r4 ;offset
0x980
801 DEPD
,Z
%r29,31,32,%r9 ;offset
0x984
802 DEPD
,Z
%r28,31,32,%r5 ;offset
0x988
804 EXTRD
,U
%r4,31,32,%r31 ;offset
0x98c
805 CMPB
,*<>,N
%r31,%r2,$
00060020 ;offset
0x990
806 MOVB
,TR
%r6,%r29,$D1 ;offset
0x994
807 STD %r29,-152(%r30) ;offset
0x998
; Call BN_num_bits_word(d). The 64-bit values in r3/r4/r5 have their upper
; halves parked in r9/r8/r7 before the call and re-deposited after it.
809 EXTRD
,U
%r3,31,32,%r25 ;offset
0x99c
810 COPY
%r3,%r26 ;offset
0x9a0
811 EXTRD
,U
%r3,31,32,%r9 ;offset
0x9a4
812 EXTRD
,U
%r4,31,32,%r8 ;offset
0x9a8
813 .CALL ARGW0=GR,ARGW1=GR,RTNVAL=GR ;in=25,26;out=28;
814 B,L BN_num_bits_word
,%r2 ;offset
0x9ac
815 EXTRD
,U
%r5,31,32,%r7 ;offset
0x9b0
816 LDI
64,%r20 ;offset
0x9b4
817 DEPD
%r7,31,32,%r5 ;offset
0x9b8
818 DEPD
%r8,31,32,%r4 ;offset
0x9bc
819 DEPD
%r9,31,32,%r3 ;offset
0x9c0
; The result (bit count, %r28) is compared against 64; it is also copied to
; %r24 and used to build a mask in %r19 that is compared with %r4.
820 CMPB
,= %r28,%r20,$
00060012 ;offset
0x9c4
821 COPY
%r28,%r24 ;offset
0x9c8
822 MTSARCM
%r24 ;offset
0x9cc
823 DEPDI
,Z
-1,%sar
,1,%r19 ;offset
0x9d0
824 CMPB
,*>>,N
%r4,%r19,$D2 ;offset
0x9d4
; %r31 = 64 - bit count; the SUB after the CMPCLR is skipped when r4 < r3.
826 SUBI 64,%r24,%r31 ;offset
0x9d8
827 CMPCLR
,*<< %r4,%r3,%r0 ;offset
0x9dc
828 SUB %r4,%r3,%r4 ;offset
0x9e0
; When %r31 is non-zero, r3, r4 (with bits shifted in from r5) and r5 are
; all shifted via %sar.
830 CMPB
,= %r31,%r0,$
0006001A ;offset
0x9e4
831 COPY
%r0,%r9 ;offset
0x9e8
832 MTSARCM
%r31 ;offset
0x9ec
833 DEPD
,Z
%r3,%sar
,64,%r3 ;offset
0x9f0
834 SUBI 64,%r31,%r26 ;offset
0x9f4
835 MTSAR
%r26 ;offset
0x9f8
836 SHRPD
%r4,%r5,%sar
,%r4 ;offset
0x9fc
837 MTSARCM
%r31 ;offset
0xa00
838 DEPD
,Z
%r5,%sar
,64,%r5 ;offset
0xa04
; %r19 = upper-32-bit mask, %r6 = lower-32-bit mask; %r2 and %r7 receive the
; upper and lower 32-bit halves of %r3; %r8 = 2 is set by the MOVIB and is
; decremented by the ADDIB earlier in this block.
840 DEPDI
,Z
-1,31,32,%r19 ;offset
0xa08
841 AND %r3,%r19,%r29 ;offset
0xa0c
842 EXTRD
,U
%r29,31,32,%r2 ;offset
0xa10
843 DEPDI
,Z
-1,63,32,%r6 ;offset
0xa14
844 MOVIB
,TR
2,%r8,$
0006001C ;offset
0xa18
845 EXTRD
,U
%r3,63,32,%r7 ;offset
0xa1c
847 ;
--- not PIC ADDIL
LR'__iob-$global$,%r27,%r1 ;offset 0xa20
848 ;--- not PIC LDIL LR'C$
7,%r21 ;offset
0xa24
849 ;
--- not PIC LDO RR
'__iob-$global$+32(%r1),%r26 ;offset 0xa28
850 ;--- not PIC .CALL ARGW0=GR,ARGW1=GR,ARGW2=GR,RTNVAL=GR ;in=24,25,26;out=28;
851 ;--- not PIC B,L fprintf,%r2 ;offset 0xa2c
852 ;--- not PIC LDO RR'C$
7(%r21),%r25 ;offset
0xa30
; Remainder of bn_div_words: the abort() call, a $$div2U millicode division,
; an XMPYU-based multiply of the result by the two 32-bit halves of the
; divisor, and the epilogue. Branch-target labels are not visible in this
; view.
; NOTE(review): no .IMPORT for abort is visible in this view - confirm it
; exists in the full file.
854 B,L abort
,%r2 ;offset
0xa34
857 LDW
-212(%r30),%r2 ;offset
0xa40
; Millicode divide: arguments in r26/r25/r24/r23, result in %r29
; (per the .CALL annotation).
859 COPY
%r4,%r26 ;offset
0xa44
860 EXTRD
,U
%r4,31,32,%r25 ;offset
0xa48
861 COPY
%r2,%r24 ;offset
0xa4c
862 .CALL ;in=23,24,25,26;out=20,21,22,28,29; (MILLICALL)
863 B,L $$div2U
,%r31 ;offset
0xa50
864 EXTRD
,U
%r2,31,32,%r23 ;offset
0xa54
865 DEPD
%r28,31,32,%r29 ;offset
0xa58
; The result and the two 32-bit divisor halves (%r2, %r7) go through stack
; scratch slots into FP registers; the XMPYU products are stored back and
; reloaded into integer registers.
867 STD %r29,-152(%r30) ;offset
0xa5c
869 AND %r5,%r19,%r24 ;offset
0xa60
870 EXTRD
,U
%r24,31,32,%r24 ;offset
0xa64
871 STW %r2,-160(%r30) ;offset
0xa68
872 STW %r7,-128(%r30) ;offset
0xa6c
873 FLDD
-152(%r30),%fr4 ;offset
0xa70
874 FLDD
-152(%r30),%fr7 ;offset
0xa74
875 FLDW
-160(%r30),%fr8L ;offset
0xa78
876 FLDW
-128(%r30),%fr5L ;offset
0xa7c
877 XMPYU
%fr8L
,%fr7L
,%fr10 ;offset
0xa80
878 FSTD
%fr10
,-136(%r30) ;offset
0xa84
879 XMPYU
%fr8L
,%fr7R
,%fr22 ;offset
0xa88
880 FSTD
%fr22
,-144(%r30) ;offset
0xa8c
881 XMPYU
%fr5L
,%fr4L
,%fr11 ;offset
0xa90
882 XMPYU
%fr5L
,%fr4R
,%fr23 ;offset
0xa94
883 FSTD
%fr11
,-112(%r30) ;offset
0xa98
884 FSTD
%fr23
,-120(%r30) ;offset
0xa9c
885 LDD
-136(%r30),%r28 ;offset
0xaa0
886 DEPD
,Z
%r28,31,32,%r31 ;offset
0xaa4
887 LDD
-144(%r30),%r20 ;offset
0xaa8
888 ADD,L %r20,%r31,%r31 ;offset
0xaac
889 LDD
-112(%r30),%r22 ;offset
0xab0
890 DEPD
,Z
%r22,31,32,%r22 ;offset
0xab4
891 LDD
-120(%r30),%r21 ;offset
0xab8
892 B $
00060024 ;offset
0xabc
893 ADD,L %r21,%r22,%r23 ;offset
0xac0
; Final result: OR %r9 into %r29, with the upper 32 bits copied to %r28 for
; the 32-bit return pair.
895 OR %r9,%r29,%r29 ;offset
0xac4
897 EXTRD
,U
%r29,31,32,%r28 ;offset
0xac8
; Epilogue: mirror of the prologue. rp comes back from -212 (= -20 - 192);
; r7 and r5 are unpacked from the upper halves of the 64-bit r8/r6 slots;
; the LDW,MB of %r3 presumably also restores %r30 - confirm.
900 LDW
-212(%r30),%r2 ;offset
0xacc
902 LDW
-168(%r30),%r9 ;offset
0xad0
903 LDD
-176(%r30),%r8 ;offset
0xad4
904 EXTRD
,U
%r8,31,32,%r7 ;offset
0xad8
905 LDD
-184(%r30),%r6 ;offset
0xadc
906 EXTRD
,U
%r6,31,32,%r5 ;offset
0xae0
907 LDW
-188(%r30),%r4 ;offset
0xae4
908 BVE
(%r2) ;offset
0xae8
910 LDW
,MB
-192(%r30),%r3 ;offset
0xaec
911 .PROCEND ;in=23,25;out=28,29;fpin=105,107;
916 ;
----------------------------------------------------------------------------
918 ; Registers to hold
64-bit values to manipulate. The
"L" part
919 ; of the register corresponds to the upper
32-bits
, while the
"R"
920 ; part corresponds to the lower
32-bits
922 ; Note
, that when using b6
and b7
, the code must save these before
923 ; using them because they are callee save registers
926 ; Floating point registers to use to save values that
927 ; are manipulated. These don
't collide with ftemp1-6 and
928 ; are all caller save registers
971 ; Temporary floating point variables, these are all caller save
980 ; The B set of registers when used.
; Integer register aliases used by the comba macros and routines below:
; c1 is one of the three rotating column accumulators, temp1-temp3 are
; scratch inside the macros. r19-r21 and r31 are all caller-save.
1007 c1 .reg %r21 ; only reg
1008 temp1 .reg %r20 ; only reg
1009 temp2 .reg %r19 ; only reg
1010 temp3 .reg %r31 ; only reg
; SQR_ADD_C A0L,A0R,C1,C2,C3
;   Adds the square of the 64-bit value held as FP halves A0L (upper 32 bits)
;   and A0R (lower 32 bits) into the three-word accumulator C3:C2:C1.
;   Requires high_mask = 0xffffffff80000000.
;   Uses ftemp1-ftemp3, m, lt, ht, temp1-temp3 and the scratch slots at
;   -24/-16/-8(%sp).
; NOTE(review): the integer load of ht from -8(%sp) is not visible in this
; view before ht is first used - confirm against the full file.
1020 SQR_ADD_C .macro A0L,A0R,C1,C2,C3
1021 XMPYU A0L,A0R,ftemp1 ; m
1022 FSTD ftemp1,-24(%sp) ; store m
1024 XMPYU A0R,A0R,ftemp2 ; lt
1025 FSTD ftemp2,-16(%sp) ; store lt
1027 XMPYU A0L,A0L,ftemp3 ; ht
1028 FSTD ftemp3,-8(%sp) ; store ht
; 2*m*2^32 is split: the top 33 bits of m (shifted down by 31) go into ht,
; the low 31 bits (shifted up by 33) go into lt.
1030 LDD -24(%sp),m ; load m
1031 AND m,high_mask,temp2 ; m & Mask
1032 DEPD,Z m,30,31,temp3 ; m << 32+1
1033 LDD -16(%sp),lt ; lt
1036 EXTRD,U temp2,32,33,temp1 ; temp1 = m&Mask >> 32-1
1037 ADD temp3,lt,lt ; lt = lt+m
1038 ADD,L ht,temp1,ht ; ht += temp1
1039 ADD,DC ht,%r0,ht ; ht++
; Fold lt:ht into the accumulator, rippling the carry up to C3.
1041 ADD C1,lt,C1 ; c1=c1+lt
1042 ADD,DC ht,%r0,ht ; ht++
1044 ADD C2,ht,C2 ; c2=c2+ht
1045 ADD,DC C3,%r0,C3 ; c3++
; SQR_ADD_C2 A0L,A0R,A1L,A1R,C1,C2,C3
;   Adds twice the 128-bit product (A0L:A0R) * (A1L:A1R) into the three-word
;   accumulator C3:C2:C1 (the off-diagonal term of a square).
;   Requires high_one = 1 << 32.
;   Uses ftemp1-ftemp4, m, m1, lt, ht, temp1, temp3 and the scratch slots at
;   -32/-24/-16/-8(%sp).
; NOTE(review): the sum m = m + m1, implied by the "(m+m1<<32)" and
; "if (m < m1)" comments, is not visible in this view between the two LDDs
; and the DEPD,Z - confirm against the full file.
1048 SQR_ADD_C2 .macro A0L,A0R,A1L,A1R,C1,C2,C3
1049 XMPYU A0L,A1R,ftemp1 ; m1 = bl*ht
1050 FSTD ftemp1,-16(%sp) ;
1051 XMPYU A0R,A1L,ftemp2 ; m = bh*lt
1052 FSTD ftemp2,-8(%sp) ;
1053 XMPYU A0R,A1R,ftemp3 ; lt = bl*lt
1054 FSTD ftemp3,-32(%sp)
1055 XMPYU A0L,A1L,ftemp4 ; ht = bh*ht
1056 FSTD ftemp4,-24(%sp) ;
1058 LDD -8(%sp),m ; r21 = m
1059 LDD -16(%sp),m1 ; r19 = m1
1062 DEPD,Z m,31,32,temp3 ; (m+m1<<32)
1063 LDD -24(%sp),ht ; r24 = ht
; CMPCLR guards the next ADD,L: high_one is added to ht only when m < m1.
1065 CMPCLR,*>>= m,m1,%r0 ; if (m < m1)
1066 ADD,L ht,high_one,ht ; ht+=high_one
1068 EXTRD,U m,31,32,temp1 ; m >> 32
1069 LDD -32(%sp),lt ; lt
1070 ADD,L ht,temp1,ht ; ht+= m>>32
1071 ADD lt,temp3,lt ; lt = lt+m1
1072 ADD,DC ht,%r0,ht ; ht++
; Double the 128-bit product ht:lt; the bit shifted out of ht goes into C3.
1074 ADD ht,ht,ht ; ht=ht+ht;
1075 ADD,DC C3,%r0,C3 ; add in carry (c3++)
1077 ADD lt,lt,lt ; lt=lt+lt;
1078 ADD,DC ht,%r0,ht ; add in carry (ht++)
; Fold into the accumulator. The ADD,DC,*NUV / LDO pair bumps C3 only when
; adding the carry overflowed ht; otherwise the LDO is nullified.
1080 ADD C1,lt,C1 ; c1=c1+lt
1081 ADD,DC,*NUV ht,%r0,ht ; add in carry (ht++)
1082 LDO 1(C3),C3 ; bump c3 if overflow,nullify otherwise
1084 ADD C2,ht,C2 ; c2 = c2 + ht
1085 ADD,DC C3,%r0,C3 ; add in carry (c3++)
1089 ;void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
; Column-wise ("comba") square of the 8-word a[] into the 16-word r[].
; For column k, every a[i]*a[j] with i+j == k is accumulated: SQR_ADD_C for
; the diagonal term a[i]^2, SQR_ADD_C2 for each doubled off-diagonal pair.
; The three accumulators rotate roles (c1,c2,c3 -> c2,c3,c1 -> c3,c1,c2)
; from one column to the next, and the lowest one is stored to r[k].
; NOTE(review): the loads of a0..a7 and any zeroing of the recycled top
; accumulator between columns are not visible in this view - confirm against
; the full file.
1096 .CALLINFO FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
1097 .EXPORT bn_sqr_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
; Prologue: spill callee-save r3-r6, then advance %sp by 128.
1101 STD %r3,0(%sp) ; save r3
1102 STD %r4,8(%sp) ; save r4
1103 STD %r5,16(%sp) ; save r5
1104 STD %r6,24(%sp) ; save r6
1113 LDO 128(%sp),%sp ; bump stack
1114 DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L
1115 DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
1118 ; Load up all of the values we are going to use
; column 0: a0^2
1129 SQR_ADD_C a0L,a0R,c1,c2,c3
1130 STD c1,0(r_ptr) ; r[0] = c1;
; column 1: 2*a1*a0
1133 SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1
1134 STD c2,8(r_ptr) ; r[1] = c2;
; column 2: a1^2 + 2*a2*a0
1137 SQR_ADD_C a1L,a1R,c3,c1,c2
1138 SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2
1139 STD c3,16(r_ptr) ; r[2] = c3;
; column 3: 2*(a3*a0 + a2*a1)
1142 SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
1143 SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3
1144 STD c1,24(r_ptr) ; r[3] = c1;
; column 4: a2^2 + 2*(a3*a1 + a4*a0)
1147 SQR_ADD_C a2L,a2R,c2,c3,c1
1148 SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1
1149 SQR_ADD_C2 a4L,a4R,a0L,a0R,c2,c3,c1
1150 STD c2,32(r_ptr) ; r[4] = c2;
; column 5: 2*(a5*a0 + a4*a1 + a3*a2)
1153 SQR_ADD_C2 a5L,a5R,a0L,a0R,c3,c1,c2
1154 SQR_ADD_C2 a4L,a4R,a1L,a1R,c3,c1,c2
1155 SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
1156 STD c3,40(r_ptr) ; r[5] = c3;
; column 6: a3^2 + 2*(a4*a2 + a5*a1 + a6*a0)
1159 SQR_ADD_C a3L,a3R,c1,c2,c3
1160 SQR_ADD_C2 a4L,a4R,a2L,a2R,c1,c2,c3
1161 SQR_ADD_C2 a5L,a5R,a1L,a1R,c1,c2,c3
1162 SQR_ADD_C2 a6L,a6R,a0L,a0R,c1,c2,c3
1163 STD c1,48(r_ptr) ; r[6] = c1;
; column 7: 2*(a7*a0 + a6*a1 + a5*a2 + a4*a3)
1166 SQR_ADD_C2 a7L,a7R,a0L,a0R,c2,c3,c1
1167 SQR_ADD_C2 a6L,a6R,a1L,a1R,c2,c3,c1
1168 SQR_ADD_C2 a5L,a5R,a2L,a2R,c2,c3,c1
1169 SQR_ADD_C2 a4L,a4R,a3L,a3R,c2,c3,c1
1170 STD c2,56(r_ptr) ; r[7] = c2;
; column 8: a4^2 + 2*(a5*a3 + a6*a2 + a7*a1)
1173 SQR_ADD_C a4L,a4R,c3,c1,c2
1174 SQR_ADD_C2 a5L,a5R,a3L,a3R,c3,c1,c2
1175 SQR_ADD_C2 a6L,a6R,a2L,a2R,c3,c1,c2
1176 SQR_ADD_C2 a7L,a7R,a1L,a1R,c3,c1,c2
1177 STD c3,64(r_ptr) ; r[8] = c3;
; column 9: 2*(a7*a2 + a6*a3 + a5*a4)
1180 SQR_ADD_C2 a7L,a7R,a2L,a2R,c1,c2,c3
1181 SQR_ADD_C2 a6L,a6R,a3L,a3R,c1,c2,c3
1182 SQR_ADD_C2 a5L,a5R,a4L,a4R,c1,c2,c3
1183 STD c1,72(r_ptr) ; r[9] = c1;
; column 10: a5^2 + 2*(a6*a4 + a7*a3)
1186 SQR_ADD_C a5L,a5R,c2,c3,c1
1187 SQR_ADD_C2 a6L,a6R,a4L,a4R,c2,c3,c1
1188 SQR_ADD_C2 a7L,a7R,a3L,a3R,c2,c3,c1
1189 STD c2,80(r_ptr) ; r[10] = c2;
; column 11: 2*(a7*a4 + a6*a5)
1192 SQR_ADD_C2 a7L,a7R,a4L,a4R,c3,c1,c2
1193 SQR_ADD_C2 a6L,a6R,a5L,a5R,c3,c1,c2
1194 STD c3,88(r_ptr) ; r[11] = c3;
; column 12: a6^2 + 2*a7*a5
1197 SQR_ADD_C a6L,a6R,c1,c2,c3
1198 SQR_ADD_C2 a7L,a7R,a5L,a5R,c1,c2,c3
1199 STD c1,96(r_ptr) ; r[12] = c1;
; column 13: 2*a7*a6
1202 SQR_ADD_C2 a7L,a7R,a6L,a6R,c2,c3,c1
1203 STD c2,104(r_ptr) ; r[13] = c2;
; column 14: a7^2; the remaining high word becomes r[15].
1206 SQR_ADD_C a7L,a7R,c3,c1,c2
1207 STD c3, 112(r_ptr) ; r[14] = c3
1208 STD c1, 120(r_ptr) ; r[15] = c1
; Epilogue: reload r6..r3 from the prologue slots (relative to the bumped
; %sp); the LDD,MB form presumably also steps %sp back by 128 - confirm.
1211 LDD -104(%sp),%r6 ; restore r6
1212 LDD -112(%sp),%r5 ; restore r5
1213 LDD -120(%sp),%r4 ; restore r4
1215 LDD,MB -128(%sp),%r3
1219 ;-----------------------------------------------------------------------------
1221 ;void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
; Column-wise ("comba") square of the 4-word a[] into the 8-word r[].
; Same scheme as bn_sqr_comba8: SQR_ADD_C for diagonal terms, SQR_ADD_C2 for
; doubled off-diagonal pairs, with the three accumulators rotating roles
; from one column to the next.
; NOTE(review): the loads of a0..a3 and any zeroing of the recycled top
; accumulator between columns are not visible in this view - confirm against
; the full file.
1228 .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
1229 .EXPORT bn_sqr_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
; Prologue: spill callee-save r3-r6, then advance %sp by 128.
1232 STD %r3,0(%sp) ; save r3
1233 STD %r4,8(%sp) ; save r4
1234 STD %r5,16(%sp) ; save r5
1235 STD %r6,24(%sp) ; save r6
1244 LDO 128(%sp),%sp ; bump stack
1245 DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L
1246 DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
1249 ; Load up all of the values we are going to use
; column 0: a0^2
1260 SQR_ADD_C a0L,a0R,c1,c2,c3
1262 STD c1,0(r_ptr) ; r[0] = c1;
; column 1: 2*a1*a0
1265 SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1
1267 STD c2,8(r_ptr) ; r[1] = c2;
; column 2: a1^2 + 2*a2*a0
1270 SQR_ADD_C a1L,a1R,c3,c1,c2
1271 SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2
1273 STD c3,16(r_ptr) ; r[2] = c3;
; column 3: 2*(a3*a0 + a2*a1)
1276 SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
1277 SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3
1279 STD c1,24(r_ptr) ; r[3] = c1;
; column 4: a2^2 + 2*a3*a1
1282 SQR_ADD_C a2L,a2R,c2,c3,c1
1283 SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1
1285 STD c2,32(r_ptr) ; r[4] = c2;
; column 5: 2*a3*a2
1288 SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
1289 STD c3,40(r_ptr) ; r[5] = c3;
; column 6: a3^2; the remaining high word becomes r[7].
1292 SQR_ADD_C a3L,a3R,c1,c2,c3
1293 STD c1,48(r_ptr) ; r[6] = c1;
1294 STD c2,56(r_ptr) ; r[7] = c2;
; Epilogue: reload r6..r3 from the prologue slots (relative to the bumped
; %sp); the LDD,MB form presumably also steps %sp back by 128 - confirm.
1297 LDD -104(%sp),%r6 ; restore r6
1298 LDD -112(%sp),%r5 ; restore r5
1299 LDD -120(%sp),%r4 ; restore r4
1301 LDD,MB -128(%sp),%r3
1306 ;---------------------------------------------------------------------------
; MUL_ADD_C A0L,A0R,B0L,B0R,C1,C2,C3
;   Adds the 128-bit product (A0L:A0R) * (B0L:B0R) into the three-word
;   accumulator C3:C2:C1. xL is the upper and xR the lower 32-bit half of
;   each FP-held operand.
;   Requires high_one = 1 << 32.
;   Uses ftemp1-ftemp4, m, m1, lt, ht, temp1, temp3 and the scratch slots at
;   -32/-24/-16/-8(%sp).
; NOTE(review): the sum m = m + m1, implied by the "(m+m1<<32)" and
; "if (m < m1)" comments, is not visible in this view between the two LDDs
; and the DEPD,Z - confirm against the full file.
1308 MUL_ADD_C .macro A0L,A0R,B0L,B0R,C1,C2,C3
1309 XMPYU A0L,B0R,ftemp1 ; m1 = bl*ht
1310 FSTD ftemp1,-16(%sp) ;
1311 XMPYU A0R,B0L,ftemp2 ; m = bh*lt
1312 FSTD ftemp2,-8(%sp) ;
1313 XMPYU A0R,B0R,ftemp3 ; lt = bl*lt
1314 FSTD ftemp3,-32(%sp)
1315 XMPYU A0L,B0L,ftemp4 ; ht = bh*ht
1316 FSTD ftemp4,-24(%sp) ;
1318 LDD -8(%sp),m ; r21 = m
1319 LDD -16(%sp),m1 ; r19 = m1
1322 DEPD,Z m,31,32,temp3 ; (m+m1<<32)
1323 LDD -24(%sp),ht ; r24 = ht
; CMPCLR guards the next ADD,L: high_one is added to ht only when m < m1.
1325 CMPCLR,*>>= m,m1,%r0 ; if (m < m1)
1326 ADD,L ht,high_one,ht ; ht+=high_one
1328 EXTRD,U m,31,32,temp1 ; m >> 32
1329 LDD -32(%sp),lt ; lt
1330 ADD,L ht,temp1,ht ; ht+= m>>32
1331 ADD lt,temp3,lt ; lt = lt+m1
1332 ADD,DC ht,%r0,ht ; ht++
; Fold lt:ht into the accumulator, rippling the carry up to C3.
1334 ADD C1,lt,C1 ; c1=c1+lt
1335 ADD,DC ht,%r0,ht ; add in carry (ht++)
1337 ADD C2,ht,C2 ; c2 = c2 + ht
1338 ADD,DC C3,%r0,C3 ; add in carry (c3++)
1343 ;void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
; Column-wise ("comba") product of the 8-word a[] and b[]. For column k,
; every a[i]*b[j] with i+j == k is accumulated with MUL_ADD_C; the three
; accumulators rotate roles (c1,c2,c3 -> c2,c3,c1 -> c3,c1,c2) from one
; column to the next.
; NOTE(review): the operand loads, the per-column stores to r[] and any
; zeroing of the recycled top accumulator are not visible in this view, and
; neither is a reload of fr12/fr13 in the epilogue (both are callee-save per
; the register notes in this file) - confirm against the full file.
1351 .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
1352 .EXPORT bn_mul_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
; Prologue: spill callee-save r3-r6 and fr12/fr13, then advance %sp by 128.
1356 STD %r3,0(%sp) ; save r3
1357 STD %r4,8(%sp) ; save r4
1358 STD %r5,16(%sp) ; save r5
1359 STD %r6,24(%sp) ; save r6
1360 FSTD %fr12,32(%sp) ; save fr12
1361 FSTD %fr13,40(%sp) ; save fr13
1370 LDO 128(%sp),%sp ; bump stack
1371 DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
1374 ; Load up all of the values we are going to use
; column 0
1394 MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
; column 1
1398 MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
1399 MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
; column 2
1403 MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
1404 MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
1405 MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
; column 3
1409 MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
1410 MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
1411 MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
1412 MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
; column 4
1416 MUL_ADD_C a4L,a4R,b0L,b0R,c2,c3,c1
1417 MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
1418 MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
1419 MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
1420 MUL_ADD_C a0L,a0R,b4L,b4R,c2,c3,c1
; column 5
1424 MUL_ADD_C a0L,a0R,b5L,b5R,c3,c1,c2
1425 MUL_ADD_C a1L,a1R,b4L,b4R,c3,c1,c2
1426 MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
1427 MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
1428 MUL_ADD_C a4L,a4R,b1L,b1R,c3,c1,c2
1429 MUL_ADD_C a5L,a5R,b0L,b0R,c3,c1,c2
; column 6
1433 MUL_ADD_C a6L,a6R,b0L,b0R,c1,c2,c3
1434 MUL_ADD_C a5L,a5R,b1L,b1R,c1,c2,c3
1435 MUL_ADD_C a4L,a4R,b2L,b2R,c1,c2,c3
1436 MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
1437 MUL_ADD_C a2L,a2R,b4L,b4R,c1,c2,c3
1438 MUL_ADD_C a1L,a1R,b5L,b5R,c1,c2,c3
1439 MUL_ADD_C a0L,a0R,b6L,b6R,c1,c2,c3
; column 7
1443 MUL_ADD_C a0L,a0R,b7L,b7R,c2,c3,c1
1444 MUL_ADD_C a1L,a1R,b6L,b6R,c2,c3,c1
1445 MUL_ADD_C a2L,a2R,b5L,b5R,c2,c3,c1
1446 MUL_ADD_C a3L,a3R,b4L,b4R,c2,c3,c1
1447 MUL_ADD_C a4L,a4R,b3L,b3R,c2,c3,c1
1448 MUL_ADD_C a5L,a5R,b2L,b2R,c2,c3,c1
1449 MUL_ADD_C a6L,a6R,b1L,b1R,c2,c3,c1
1450 MUL_ADD_C a7L,a7R,b0L,b0R,c2,c3,c1
; column 8
1454 MUL_ADD_C a7L,a7R,b1L,b1R,c3,c1,c2
1455 MUL_ADD_C a6L,a6R,b2L,b2R,c3,c1,c2
1456 MUL_ADD_C a5L,a5R,b3L,b3R,c3,c1,c2
1457 MUL_ADD_C a4L,a4R,b4L,b4R,c3,c1,c2
1458 MUL_ADD_C a3L,a3R,b5L,b5R,c3,c1,c2
1459 MUL_ADD_C a2L,a2R,b6L,b6R,c3,c1,c2
1460 MUL_ADD_C a1L,a1R,b7L,b7R,c3,c1,c2
; column 9
1464 MUL_ADD_C a2L,a2R,b7L,b7R,c1,c2,c3
1465 MUL_ADD_C a3L,a3R,b6L,b6R,c1,c2,c3
1466 MUL_ADD_C a4L,a4R,b5L,b5R,c1,c2,c3
1467 MUL_ADD_C a5L,a5R,b4L,b4R,c1,c2,c3
1468 MUL_ADD_C a6L,a6R,b3L,b3R,c1,c2,c3
1469 MUL_ADD_C a7L,a7R,b2L,b2R,c1,c2,c3
; column 10
1473 MUL_ADD_C a7L,a7R,b3L,b3R,c2,c3,c1
1474 MUL_ADD_C a6L,a6R,b4L,b4R,c2,c3,c1
1475 MUL_ADD_C a5L,a5R,b5L,b5R,c2,c3,c1
1476 MUL_ADD_C a4L,a4R,b6L,b6R,c2,c3,c1
1477 MUL_ADD_C a3L,a3R,b7L,b7R,c2,c3,c1
; column 11
1481 MUL_ADD_C a4L,a4R,b7L,b7R,c3,c1,c2
1482 MUL_ADD_C a5L,a5R,b6L,b6R,c3,c1,c2
1483 MUL_ADD_C a6L,a6R,b5L,b5R,c3,c1,c2
1484 MUL_ADD_C a7L,a7R,b4L,b4R,c3,c1,c2
; column 12
1488 MUL_ADD_C a7L,a7R,b5L,b5R,c1,c2,c3
1489 MUL_ADD_C a6L,a6R,b6L,b6R,c1,c2,c3
1490 MUL_ADD_C a5L,a5R,b7L,b7R,c1,c2,c3
; column 13
1494 MUL_ADD_C a6L,a6R,b7L,b7R,c2,c3,c1
1495 MUL_ADD_C a7L,a7R,b6L,b6R,c2,c3,c1
; column 14
1499 MUL_ADD_C a7L,a7R,b7L,b7R,c3,c1,c2
; Epilogue: reload r6..r3 from the prologue slots (relative to the bumped
; %sp); the LDD,MB form presumably also steps %sp back by 128 - confirm.
1506 LDD -104(%sp),%r6 ; restore r6
1507 LDD -112(%sp),%r5 ; restore r5
1508 LDD -120(%sp),%r4 ; restore r4
1510 LDD,MB -128(%sp),%r3
1514 ;-----------------------------------------------------------------------------
1516 ;void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
; Column-wise ("comba") product of the 4-word a[] and b[]. For column k,
; every a[i]*b[j] with i+j == k is accumulated with MUL_ADD_C; the three
; accumulators rotate roles from one column to the next.
; NOTE(review): the operand loads, the per-column stores to r[] and any
; zeroing of the recycled top accumulator are not visible in this view, and
; neither is a reload of fr12/fr13 in the epilogue (both are callee-save per
; the register notes in this file) - confirm against the full file.
1524 .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
1525 .EXPORT bn_mul_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
; Prologue: spill callee-save r3-r6 and fr12/fr13, then advance %sp by 128.
1529 STD %r3,0(%sp) ; save r3
1530 STD %r4,8(%sp) ; save r4
1531 STD %r5,16(%sp) ; save r5
1532 STD %r6,24(%sp) ; save r6
1533 FSTD %fr12,32(%sp) ; save fr12
1534 FSTD %fr13,40(%sp) ; save fr13
1543 LDO 128(%sp),%sp ; bump stack
1544 DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
1547 ; Load up all of the values we are going to use
; column 0
1559 MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
; column 1
1563 MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
1564 MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
; column 2
1568 MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
1569 MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
1570 MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
; column 3
1574 MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
1575 MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
1576 MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
1577 MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
; column 4
1581 MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
1582 MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
1583 MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
; column 5
1587 MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
1588 MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
; column 6
1592 MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
; Epilogue: reload r6..r3 from the prologue slots (relative to the bumped
; %sp); the LDD,MB form presumably also steps %sp back by 128 - confirm.
1599 LDD -104(%sp),%r6 ; restore r6
1600 LDD -112(%sp),%r5 ; restore r5
1601 LDD -120(%sp),%r4 ; restore r4
1603 LDD,MB -128(%sp),%r3
1608 ;--- not PIC .SPACE $TEXT$
1609 ;--- not PIC .SUBSPA $CODE$
1610 ;--- not PIC .SPACE $PRIVATE$,SORT=16
1611 ;--- not PIC .IMPORT $global$,DATA
1612 ;--- not PIC .SPACE $TEXT$
1613 ;--- not PIC .SUBSPA $CODE$
1614 ;--- not PIC .SUBSPA $LIT$,ACCESS=0x2c
1616 ;--- not PIC .ALIGN 8
1617 ;--- not PIC .STRINGZ "Division would overflow (%d)\n"