3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
12 # "Teaser" Montgomery multiplication module for IA-64. There are
13 # several possibilities for improvement:
15 # - modulo-scheduling outer loop would eliminate quite a number of
16 # stalls after ldf8, xma and getf.sig outside inner loop and
17 # improve shorter key performance;
18 # - shorter vector support [with input vectors being fetched only
19 # once] should be added;
20 # - 2x unroll with help of n0[1] would make the code scalable on
21 # "wider" IA-64, "wider" than Itanium 2 that is, which is not of
22 # acute interest, because upcoming Tukwila's individual cores are
23 # reportedly based on Itanium 2 design;
24 # - dedicated squaring procedure(?);
28 # Shorter vector support is implemented by zero-padding ap and np
29 # vectors up to 8 elements, or 512 bits. This means that 256-bit
30 # inputs will be processed only 2 times faster than 512-bit inputs,
31 # not 4 [as one would expect, because algorithm complexity is n^2].
32 # The reason for padding is that inputs shorter than 512 bits won't
33 # be processed faster anyway, because minimal critical path of the
34 # core loop happens to match 512-bit timing. Either way, it resulted
35 # in a >100% improvement of the 512-bit RSA sign benchmark and a 50%
36 # improvement of the 1024-bit one [in comparison to the original version of *this* module].
38 # So far 'openssl speed rsa dsa' output on 900MHz Itanium 2 *with* this module is:
40 # sign verify sign/s verify/s
41 # rsa 512 bits 0.000290s 0.000024s 3452.8 42031.4
42 # rsa 1024 bits 0.000793s 0.000058s 1261.7 17172.0
43 # rsa 2048 bits 0.005908s 0.000148s 169.3 6754.0
44 # rsa 4096 bits 0.033456s 0.000469s 29.9 2133.6
45 # dsa 512 bits 0.000253s 0.000198s 3949.9 5057.0
46 # dsa 1024 bits 0.000585s 0.000607s 1708.4 1647.4
47 # dsa 2048 bits 0.001453s 0.001703s 688.1 587.4
49 # ... and *without* (but still with ia64.S):
51 # rsa 512 bits 0.000670s 0.000041s 1491.8 24145.5
52 # rsa 1024 bits 0.001988s 0.000080s 502.9 12499.3
53 # rsa 2048 bits 0.008702s 0.000189s 114.9 5293.9
54 # rsa 4096 bits 0.043860s 0.000533s 22.8 1875.9
55 # dsa 512 bits 0.000441s 0.000427s 2265.3 2340.6
56 # dsa 1024 bits 0.000823s 0.000867s 1215.6 1153.2
57 # dsa 2048 bits 0.001894s 0.002179s 528.1 458.9
59 # As can be seen, RSA sign performance improves by 130-30%
60 # (less for longer keys), while verify improves by 74-13%.
61 # DSA performance improves by 115-30%.
# Select plain "add" when any argument contains +DD64 or -mlp64.
# The pattern must be an alternation: the former bracketed form
# [\+DD|\-mlp] was a character class that matched any single one of
# the characters + D | - m l p followed by "64".
65 for (@ARGV) { $ADDP="add" if (/(?:\+DD|\-mlp)64/); }
66 } else { $ADDP="add"; }
72 // int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap,
73 //                  const BN_ULONG *bp,const BN_ULONG *np,
74 //                  const BN_ULONG *n0p,int num);
81 { .mmi
; cmp4
.le p6
,p7
=2,r37
;;
82 (p6
) cmp4
.lt.unc p8
,p9
=8,r37
85 (p9
) br
.cond
.dptk
.many bn_mul_mont_8
86 (p8
) br
.cond
.dpnt
.many bn_mul_mont_general
87 (p7
) br
.ret
.spnt
.many b0
};;
90 prevfs
=r2
; prevpr
=r3
; prevlc
=r10
; prevsp
=r11
;
92 rptr
=r8
; aptr
=r9
; bptr
=r14
; nptr
=r15
;
95 num
=r18
; len
=r19
; lc=r20
;
96 topbit
=r21
; // carry bit from tmp
[num
]
103 .local bn_mul_mont_general
#
104 .proc bn_mul_mont_general
#
107 { .mmi
; .save ar
.pfs
,prevfs
108 alloc prevfs
=ar
.pfs
,6,2,0,8
112 { .mmi
; .vframe prevsp
119 .rotf alo
[6],nlo
[4],ahi
[8],nhi
[6]
122 { .mmi
; ldf8 bi
=[bptr
],8 // (*bp
++)
123 ldf8 alo
[4]=[aptr
],16 // ap
[0]
125 { .mmi
; ldf8 alo
[3]=[r30
],16 // ap
[1]
126 ldf8 alo
[2]=[aptr
],16 // ap
[2]
128 { .mmi
; ldf8 alo
[1]=[r30
] // ap
[3]
131 { .mmi
; $ADDP nptr
=0,in3
134 { .mmi
; ldf8 nlo
[2]=[nptr
],8 // np
[0]
136 shladd r31
=num
,3,r31
};;
137 { .mmi
; ldf8 nlo
[1]=[nptr
],8 // np
[1]
140 { .mfb
; and sp
=-16,r31
// alloca
141 xmpy
.hu ahi
[2]=alo
[4],bi
// ap
[0]*bp
[0]
144 xmpy
.lu alo
[4]=alo
[4],bi
145 brp
.loop.imp
.L1st_ctop
,.L1st_cend
-16
148 xma
.hu ahi
[1]=alo
[3],bi
,ahi
[2] // ap
[1]*bp
[0]
151 xma
.lu alo
[3]=alo
[3],bi
,ahi
[2]
152 mov pr
.rot
=0x20001f<<16
153 // ------^----- (p40) at first (p23)
154 // ----------^^ p[16:20]=1
157 xmpy
.lu m0
=alo
[4],n0
// (ap
[0]*bp
[0])*n0
160 fcvt
.fxu
.s1 nhi
[1]=f0
165 .pred
.rel
"mutex",p40
,p42
166 { .mfi
; (p16
) ldf8 alo
[0]=[aptr
],8 // *(aptr
++)
167 (p18
) xma
.hu ahi
[0]=alo
[2],bi
,ahi
[1]
168 (p40
) add n
[2]=n
[2],a
[2] } // (p23
) }
169 { .mfi
; (p18
) ldf8 nlo
[0]=[nptr
],8 // *(nptr
++)(p16
)
170 (p18
) xma
.lu alo
[2]=alo
[2],bi
,ahi
[1]
171 (p42
) add n
[2]=n
[2],a
[2],1 };; // (p23
)
172 { .mfi
; (p21
) getf
.sig a
[0]=alo
[5]
173 (p20
) xma
.hu nhi
[0]=nlo
[2],m0
,nhi
[1]
174 (p42
) cmp.leu p41
,p39
=n
[2],a
[2] } // (p23
)
175 { .mfi
; (p23
) st8
[tp_1
]=n
[2],8
176 (p20
) xma
.lu nlo
[2]=nlo
[2],m0
,nhi
[1]
177 (p40
) cmp.ltu p41
,p39
=n
[2],a
[2] } // (p23
)
178 { .mmb
; (p21
) getf
.sig n
[0]=nlo
[3]
180 br
.ctop
.sptk
.L1st_ctop
};;
183 { .mmi
; getf
.sig a
[0]=ahi
[6] // (p24
)
185 add num
=-1,num
};; // num
--
186 { .mmi
; .pred
.rel
"mutex",p40
,p42
187 (p40
) add n
[0]=n
[0],a
[0]
188 (p42
) add n
[0]=n
[0],a
[0],1
189 sub aptr
=aptr
,len
};; // rewind
190 { .mmi
; .pred
.rel
"mutex",p40
,p42
191 (p40
) cmp.ltu p41
,p39
=n
[0],a
[0]
192 (p42
) cmp.leu p41
,p39
=n
[0],a
[0]
193 sub nptr
=nptr
,len
};;
194 { .mmi
; .pred
.rel
"mutex",p39
,p41
195 (p39
) add topbit
=r0
,r0
196 (p41
) add topbit
=r0
,r0
,1
198 { .mmi
; st8
[tp_1
]=n
[0]
203 { .mmi
; ldf8 bi
=[bptr
],8 // (*bp
++)
204 ldf8 ahi
[3]=[tptr
] // tp
[0]
206 { .mmi
; ldf8 alo
[4]=[aptr
],16 // ap
[0]
207 ldf8 alo
[3]=[r30
],16 // ap
[1]
209 { .mfb
; ldf8 alo
[2]=[aptr
],16 // ap
[2]
210 xma
.hu ahi
[2]=alo
[4],bi
,ahi
[3] // ap
[0]*bp
[i
]+tp
[0]
211 brp
.loop.imp
.Linner_ctop
,.Linner_cend
-16
213 { .mfb
; ldf8 alo
[1]=[r30
] // ap
[3]
214 xma
.lu alo
[4]=alo
[4],bi
,ahi
[3]
216 { .mfi
; ldf8 nlo
[2]=[nptr
],16 // np
[0]
217 xma
.hu ahi
[1]=alo
[3],bi
,ahi
[2] // ap
[1]*bp
[i
]
219 { .mfi
; ldf8 nlo
[1]=[r31
] // np
[1]
220 xma
.lu alo
[3]=alo
[3],bi
,ahi
[2]
221 mov pr
.rot
=0x20101f<<16
222 // ------^----- (p40) at first (p23)
223 // --------^--- (p30) at first (p22)
224 // ----------^^ p[16:20]=1
226 { .mfi
; st8
[tptr
]=r0
// tp
[0] is already accounted
227 xmpy
.lu m0
=alo
[4],n0
// (ap
[0]*bp
[i
]+tp
[0])*n0
230 fcvt
.fxu
.s1 nhi
[1]=f0
233 // This loop spins in 4*(n+7) ticks on Itanium 2 and should spin in
234 // 7*(n+7) ticks on Itanium (the one codenamed Merced). Factor of 7
235 // in latter case accounts for two-tick pipeline stall, which means
236 // that its performance would be ~20% lower than optimal one. No
237 // attempt was made to address this, because original Itanium is
238 // hardly represented out in the wild...
241 .pred
.rel
"mutex",p40
,p42
242 .pred
.rel
"mutex",p30
,p32
243 { .mfi
; (p16
) ldf8 alo
[0]=[aptr
],8 // *(aptr
++)
244 (p18
) xma
.hu ahi
[0]=alo
[2],bi
,ahi
[1]
245 (p40
) add n
[2]=n
[2],a
[2] } // (p23
)
246 { .mfi
; (p16
) nop
.m
0
247 (p18
) xma
.lu alo
[2]=alo
[2],bi
,ahi
[1]
248 (p42
) add n
[2]=n
[2],a
[2],1 };; // (p23
)
249 { .mfi
; (p21
) getf
.sig a
[0]=alo
[5]
251 (p40
) cmp.ltu p41
,p39
=n
[2],a
[2] } // (p23
)
252 { .mfi
; (p21
) ld8 t
[0]=[tptr
],8
254 (p42
) cmp.leu p41
,p39
=n
[2],a
[2] };; // (p23
)
255 { .mfi
; (p18
) ldf8 nlo
[0]=[nptr
],8 // *(nptr
++)
256 (p20
) xma
.hu nhi
[0]=nlo
[2],m0
,nhi
[1]
257 (p30
) add a
[1]=a
[1],t
[1] } // (p22
)
258 { .mfi
; (p16
) nop
.m
0
259 (p20
) xma
.lu nlo
[2]=nlo
[2],m0
,nhi
[1]
260 (p32
) add a
[1]=a
[1],t
[1],1 };; // (p22
)
261 { .mmi
; (p21
) getf
.sig n
[0]=nlo
[3]
263 (p30
) cmp.ltu p31
,p29
=a
[1],t
[1] } // (p22
)
264 { .mmb
; (p23
) st8
[tp_1
]=n
[2],8
265 (p32
) cmp.leu p31
,p29
=a
[1],t
[1] // (p22
)
266 br
.ctop
.sptk
.Linner_ctop
};;
269 { .mmi
; getf
.sig a
[0]=ahi
[6] // (p24
)
273 { .mmi
; .pred
.rel
"mutex",p31
,p33
274 (p31
) add a
[0]=a
[0],topbit
275 (p33
) add a
[0]=a
[0],topbit
,1
277 { .mfi
; .pred
.rel
"mutex",p31
,p33
278 (p31
) cmp.ltu p32
,p30
=a
[0],topbit
279 (p33
) cmp.leu p32
,p30
=a
[0],topbit
281 { .mfi
; .pred
.rel
"mutex",p40
,p42
282 (p40
) add n
[0]=n
[0],a
[0]
283 (p42
) add n
[0]=n
[0],a
[0],1
285 { .mmi
; .pred
.rel
"mutex",p44
,p46
286 (p40
) cmp.ltu p41
,p39
=n
[0],a
[0]
287 (p42
) cmp.leu p41
,p39
=n
[0],a
[0]
288 (p32
) add topbit
=r0
,r0
,1 }
290 { .mmi
; st8
[tp_1
]=n
[0],8
292 sub aptr
=aptr
,len
};; // rewind
293 { .mmi
; sub nptr
=nptr
,len
294 (p41
) add topbit
=r0
,r0
,1
296 { .mmb
; add tp_1
=8,sp
297 add num
=-1,num
// num
--
298 (p6
) br
.cond
.sptk
.many
.Louter
};;
301 brp
.loop.imp
.Lsub_ctop
,.Lsub_cend
-16
304 mov pr
.rot
=0x10001<<16
305 // ------^---- (p33) at first (p17)
312 .pred
.rel
"mutex",p33
,p35
313 { .mfi
; (p16
) ld8 t
[0]=[tptr
],8 // t
=*(tp
++)
315 (p33
) sub n
[1]=t
[1],n
[1] } // (p17
)
316 { .mfi
; (p16
) ld8 n
[0]=[nptr
],8 // n
=*(np
++)
318 (p35
) sub n
[1]=t
[1],n
[1],1 };; // (p17
)
319 { .mib
; (p18
) st8
[rptr
]=n
[2],8 // *(rp
++)=r
320 (p33
) cmp.gtu p34
,p32
=n
[1],t
[1] // (p17
)
322 { .mib
; (p18
) nop
.m
0
323 (p35
) cmp.geu p34
,p32
=n
[1],t
[1] // (p17
)
324 br
.ctop
.sptk
.Lsub_ctop
};;
327 { .mmb
; .pred
.rel
"mutex",p34
,p36
328 (p34
) sub topbit
=topbit
,r0
// (p19
)
329 (p36
) sub topbit
=topbit
,r0
,1
330 brp
.loop.imp
.Lcopy_ctop
,.Lcopy_cend
-16
332 { .mmb
; sub rptr
=rptr
,len
// rewind
335 { .mmi
; and aptr
=tptr
,topbit
336 andcm bptr
=rptr
,topbit
338 { .mii
; or nptr
=aptr
,bptr
343 { .mmb
; (p16
) ld8 n
[0]=[nptr
],8
344 (p18
) st8
[tptr
]=r0
,8
346 { .mmb
; (p16
) nop
.m
0
347 (p18
) st8
[rptr
]=n
[2],8
348 br
.ctop
.sptk
.Lcopy_ctop
};;
351 { .mmi
; mov ret0
=1 // signal
"handled"
352 rum
1<<5 // clear um
.mfh
356 mov pr
=prevpr
,0x1ffff
357 br
.ret
.sptk
.many b0
};;
358 .endp bn_mul_mont_general
#
360 a1
=r16
; a2
=r17
; a3
=r18
; a4
=r19
; a5
=r20
; a6
=r21
; a7
=r22
; a8
=r23
;
361 n1
=r24
; n2
=r25
; n3
=r26
; n4
=r27
; n5
=r28
; n6
=r29
; n7
=r30
; n8
=r31
;
364 ai0
=f8
; ai1
=f9
; ai2
=f10
; ai3
=f11
; ai4
=f12
; ai5
=f13
; ai6
=f14
; ai7
=f15
;
365 ni0
=f16
; ni1
=f17
; ni2
=f18
; ni3
=f19
; ni4
=f20
; ni5
=f21
; ni6
=f22
; ni7
=f23
;
368 .skip
48 // aligns
loop body
369 .local bn_mul_mont_8
#
373 { .mmi
; .save ar
.pfs
,prevfs
374 alloc prevfs
=ar
.pfs
,6,2,0,8
379 { .mmi
; add r17
=-6*16,sp
384 { .mmi
; .save
.gf
0,0x10
385 stf
.spill
[sp
]=f16
,-16
387 stf
.spill
[r17
]=f17
,32
388 add r16
=-5*16,prevsp
};;
389 { .mmi
; .save
.gf
0,0x40
390 stf
.spill
[r16
]=f18
,32
392 stf
.spill
[r17
]=f19
,32
394 { .mmi
; .save
.gf
0,0x100
395 stf
.spill
[r16
]=f20
,32
397 stf
.spill
[r17
]=f21
,32
399 { .mmi
; .save
.gf
0,0x400
406 .rotf bj
[8],mj
[2],tf
[2],alo
[10],ahi
[10],nlo
[10],nhi
[10]
409 // load input vectors padding them to 8 elements
410 { .mmi
; ldf8 ai0
=[aptr
],16 // ap
[0]
411 ldf8 ai1
=[r29
],16 // ap
[1]
413 { .mmi
; $ADDP r30
=8,in2
416 { .mmi
; ldf8 bj
[7]=[bptr
],16 // bp
[0]
417 ldf8 bj
[6]=[r30
],16 // bp
[1]
418 cmp4
.le p4
,p5
=3,in5
}
419 { .mmi
; ldf8 ni0
=[nptr
],16 // np
[0]
420 ldf8 ni1
=[r31
],16 // np
[1]
421 cmp4
.le p6
,p7
=4,in5
};;
423 { .mfi
; (p4
)ldf8 ai2
=[aptr
],16 // ap
[2]
425 cmp4
.le p8
,p9
=5,in5
}
426 { .mfi
; (p6
)ldf8 ai3
=[r29
],16 // ap
[3]
428 cmp4
.le p10
,p11
=6,in5
}
429 { .mfi
; (p4
)ldf8 bj
[5]=[bptr
],16 // bp
[2]
430 (p5
)fcvt
.fxu bj
[5]=f0
431 cmp4
.le p12
,p13
=7,in5
}
432 { .mfi
; (p6
)ldf8 bj
[4]=[r30
],16 // bp
[3]
433 (p7
)fcvt
.fxu bj
[4]=f0
434 cmp4
.le p14
,p15
=8,in5
}
435 { .mfi
; (p4
)ldf8 ni2
=[nptr
],16 // np
[2]
438 { .mfi
; (p6
)ldf8 ni3
=[r31
],16 // np
[3]
442 { .mfi
; ldf8 n0
=[in4
]
446 { .mfi
; (p8
)ldf8 ai4
=[aptr
],16 // ap
[4]
449 { .mfi
; (p10
)ldf8 ai5
=[r29
],16 // ap
[5]
452 { .mfi
; (p8
)ldf8 bj
[3]=[bptr
],16 // bp
[4]
453 (p9
)fcvt
.fxu bj
[3]=f0
455 { .mfi
; (p10
)ldf8 bj
[2]=[r30
],16 // bp
[5]
456 (p11
)fcvt
.fxu bj
[2]=f0
458 { .mfi
; (p8
)ldf8 ni4
=[nptr
],16 // np
[4]
461 { .mfi
; (p10
)ldf8 ni5
=[r31
],16 // np
[5]
465 { .mfi
; (p12
)ldf8 ai6
=[aptr
],16 // ap
[6]
468 { .mfi
; (p14
)ldf8 ai7
=[r29
],16 // ap
[7]
471 { .mfi
; (p12
)ldf8 bj
[1]=[bptr
],16 // bp
[6]
472 (p13
)fcvt
.fxu bj
[1]=f0
474 { .mfi
; (p14
)ldf8 bj
[0]=[r30
],16 // bp
[7]
475 (p15
)fcvt
.fxu bj
[0]=f0
477 { .mfi
; (p12
)ldf8 ni6
=[nptr
],16 // np
[6]
480 { .mfb
; (p14
)ldf8 ni7
=[r31
],16 // np
[7]
482 brp
.loop.imp
.Louter_8_ctop
,.Louter_8_cend
-16
485 // The loop is scheduled for 32*n ticks on Itanium 2. Actual attempt
486 // to measure with help of Interval Time Counter indicated that the
487 // factor is a tad higher: 33 or 34, if not 35. Exact measurement and
488 // addressing the issue is problematic, because I don't have access
489 // to platform-specific instruction-level profiler. On Itanium it
490 // should run in 56*n ticks, because of higher xma latency...
// Body of the counted loop closed by "br.ctop ... .Louter_8_ctop" at the
// bottom. The "N:" remarks in the comments number the instruction groups
// from 0 to 31.
//   (p16) instructions start the work for the current b[i]: ap[0..7]*b[i]
//         through the xma chain on bj[7], m0=(ap[0]*b[i]+t[0])*n0 into
//         mj[0], and np[0..5]*m0;
//   (p17) instructions complete it: np[6]*m0 and np[7]*m0 on mj[1], and
//         the getf.sig transfers into a7/a8 and n5..n8.
// Additions are issued in pairs under predicates declared "mutex": the
// first form adds without carry-in, the second (trailing ",1") adds with
// carry-in. The cmp.ltu/cmp.leu pair that follows each addition sets the
// predicate pair consumed by the next addition in the chain.
// NOTE(review): the last compares (groups 30-31) write p41,p39 and
// p49,p47, whereas the additions in groups 0-1 read p40/p42 and p48/p50 -
// presumably this accounts for predicate rotation done by br.ctop; confirm.
492 .pred.rel "mutex",p40,p42
493 .pred.rel "mutex",p48,p50
494 { .mfi; (p16) nop.m 0 // 0:
495 (p16) xma.hu ahi[0]=ai0,bj[7],tf[1] // ap[0]*b[i]+t[0]
496 (p40) add a3=a3,n3 } // (p17) a3+=n3
497 { .mfi; (p42) add a3=a3,n3,1
498 (p16) xma.lu alo[0]=ai0,bj[7],tf[1]
500 { .mii; (p17) getf.sig a7=alo[8] // 1:
501 (p48) add t[6]=t[6],a3 // (p17) t[6]+=a3
502 (p50) add t[6]=t[6],a3,1 };;
503 { .mfi; (p17) getf.sig a8=ahi[8] // 2:
504 (p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0
505 (p40) cmp.ltu p43,p41=a3,n3 }
506 { .mfi; (p42) cmp.leu p43,p41=a3,n3
507 (p17) xma.lu nlo[7]=ni6,mj[1],nhi[6]
509 { .mii; (p17) getf.sig n5=nlo[6] // 3:
510 (p48) cmp.ltu p51,p49=t[6],a3
511 (p50) cmp.leu p51,p49=t[6],a3 };;
512 .pred.rel "mutex",p41,p43
513 .pred.rel "mutex",p49,p51
514 { .mfi; (p16) nop.m 0 // 4:
515 (p16) xma.hu ahi[1]=ai1,bj[7],ahi[0] // ap[1]*b[i]
516 (p41) add a4=a4,n4 } // (p17) a4+=n4
517 { .mfi; (p43) add a4=a4,n4,1
518 (p16) xma.lu alo[1]=ai1,bj[7],ahi[0]
520 { .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4
521 (p16) xmpy.lu mj[0]=alo[0],n0 // (ap[0]*b[i]+t[0])*n0
522 (p51) add t[5]=t[5],a4,1 };;
523 { .mfi; (p16) nop.m 0 // 6:
524 (p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0
525 (p41) cmp.ltu p42,p40=a4,n4 }
526 { .mfi; (p43) cmp.leu p42,p40=a4,n4
527 (p17) xma.lu nlo[8]=ni7,mj[1],nhi[7]
529 { .mii; (p17) getf.sig n6=nlo[7] // 7:
530 (p49) cmp.ltu p50,p48=t[5],a4
531 (p51) cmp.leu p50,p48=t[5],a4 };;
532 .pred.rel "mutex",p40,p42
533 .pred.rel "mutex",p48,p50
534 { .mfi; (p16) nop.m 0 // 8:
535 (p16) xma.hu ahi[2]=ai2,bj[7],ahi[1] // ap[2]*b[i]
536 (p40) add a5=a5,n5 } // (p17) a5+=n5
537 { .mfi; (p42) add a5=a5,n5,1
538 (p16) xma.lu alo[2]=ai2,bj[7],ahi[1]
540 { .mii; (p16) getf.sig a1=alo[1] // 9:
541 (p48) add t[4]=t[4],a5 // (p17) t[4]+=a5
542 (p50) add t[4]=t[4],a5,1 };;
543 { .mfi; (p16) nop.m 0 // 10:
544 (p16) xma.hu nhi[0]=ni0,mj[0],alo[0] // np[0]*m0
545 (p40) cmp.ltu p43,p41=a5,n5 }
546 { .mfi; (p42) cmp.leu p43,p41=a5,n5
547 (p16) xma.lu nlo[0]=ni0,mj[0],alo[0]
549 { .mii; (p17) getf.sig n7=nlo[8] // 11:
550 (p48) cmp.ltu p51,p49=t[4],a5
551 (p50) cmp.leu p51,p49=t[4],a5 };;
552 .pred.rel "mutex",p41,p43
553 .pred.rel "mutex",p49,p51
554 { .mfi; (p17) getf.sig n8=nhi[8] // 12:
555 (p16) xma.hu ahi[3]=ai3,bj[7],ahi[2] // ap[3]*b[i]
556 (p41) add a6=a6,n6 } // (p17) a6+=n6
557 { .mfi; (p43) add a6=a6,n6,1
558 (p16) xma.lu alo[3]=ai3,bj[7],ahi[2]
560 { .mii; (p16) getf.sig a2=alo[2] // 13:
561 (p49) add t[3]=t[3],a6 // (p17) t[3]+=a6
562 (p51) add t[3]=t[3],a6,1 };;
563 { .mfi; (p16) nop.m 0 // 14:
564 (p16) xma.hu nhi[1]=ni1,mj[0],nhi[0] // np[1]*m0
565 (p41) cmp.ltu p42,p40=a6,n6 }
566 { .mfi; (p43) cmp.leu p42,p40=a6,n6
567 (p16) xma.lu nlo[1]=ni1,mj[0],nhi[0]
569 { .mii; (p16) nop.m 0 // 15:
570 (p49) cmp.ltu p50,p48=t[3],a6
571 (p51) cmp.leu p50,p48=t[3],a6 };;
572 .pred.rel "mutex",p40,p42
573 .pred.rel "mutex",p48,p50
574 { .mfi; (p16) nop.m 0 // 16:
575 (p16) xma.hu ahi[4]=ai4,bj[7],ahi[3] // ap[4]*b[i]
576 (p40) add a7=a7,n7 } // (p17) a7+=n7
577 { .mfi; (p42) add a7=a7,n7,1
578 (p16) xma.lu alo[4]=ai4,bj[7],ahi[3]
580 { .mii; (p16) getf.sig a3=alo[3] // 17:
581 (p48) add t[2]=t[2],a7 // (p17) t[2]+=a7
582 (p50) add t[2]=t[2],a7,1 };;
583 { .mfi; (p16) nop.m 0 // 18:
584 (p16) xma.hu nhi[2]=ni2,mj[0],nhi[1] // np[2]*m0
585 (p40) cmp.ltu p43,p41=a7,n7 }
586 { .mfi; (p42) cmp.leu p43,p41=a7,n7
587 (p16) xma.lu nlo[2]=ni2,mj[0],nhi[1]
589 { .mii; (p16) getf.sig n1=nlo[1] // 19:
590 (p48) cmp.ltu p51,p49=t[2],a7
591 (p50) cmp.leu p51,p49=t[2],a7 };;
592 .pred.rel "mutex",p41,p43
593 .pred.rel "mutex",p49,p51
594 { .mfi; (p16) nop.m 0 // 20:
595 (p16) xma.hu ahi[5]=ai5,bj[7],ahi[4] // ap[5]*b[i]
596 (p41) add a8=a8,n8 } // (p17) a8+=n8
597 { .mfi; (p43) add a8=a8,n8,1
598 (p16) xma.lu alo[5]=ai5,bj[7],ahi[4]
600 { .mii; (p16) getf.sig a4=alo[4] // 21:
601 (p49) add t[1]=t[1],a8 // (p17) t[1]+=a8
602 (p51) add t[1]=t[1],a8,1 };;
603 { .mfi; (p16) nop.m 0 // 22:
604 (p16) xma.hu nhi[3]=ni3,mj[0],nhi[2] // np[3]*m0
605 (p41) cmp.ltu p42,p40=a8,n8 }
606 { .mfi; (p43) cmp.leu p42,p40=a8,n8
607 (p16) xma.lu nlo[3]=ni3,mj[0],nhi[2]
609 { .mii; (p16) getf.sig n2=nlo[2] // 23:
610 (p49) cmp.ltu p50,p48=t[1],a8
611 (p51) cmp.leu p50,p48=t[1],a8 };;
612 { .mfi; (p16) nop.m 0 // 24:
613 (p16) xma.hu ahi[6]=ai6,bj[7],ahi[5] // ap[6]*b[i]
614 (p16) add a1=a1,n1 } // (p16) a1+=n1
615 { .mfi; (p16) nop.m 0
616 (p16) xma.lu alo[6]=ai6,bj[7],ahi[5]
617 (p17) mov t[0]=r0 };;
618 { .mii; (p16) getf.sig a5=alo[5] // 25:
619 (p16) add t0=t[7],a1 // (p16) t[7]+=a1
620 (p42) add t[0]=t[0],r0,1 };;
621 { .mfi; (p16) setf.sig tf[0]=t0 // 26:
622 (p16) xma.hu nhi[4]=ni4,mj[0],nhi[3] // np[4]*m0
623 (p50) add t[0]=t[0],r0,1 }
624 { .mfi; (p16) cmp.ltu.unc p42,p40=a1,n1
625 (p16) xma.lu nlo[4]=ni4,mj[0],nhi[3]
627 { .mii; (p16) getf.sig n3=nlo[3] // 27:
628 (p16) cmp.ltu.unc p50,p48=t0,a1
630 .pred.rel "mutex",p40,p42
631 .pred.rel "mutex",p48,p50
632 { .mfi; (p16) nop.m 0 // 28:
633 (p16) xma.hu ahi[7]=ai7,bj[7],ahi[6] // ap[7]*b[i]
634 (p40) add a2=a2,n2 } // (p16) a2+=n2
635 { .mfi; (p42) add a2=a2,n2,1
636 (p16) xma.lu alo[7]=ai7,bj[7],ahi[6]
638 { .mii; (p16) getf.sig a6=alo[6] // 29:
639 (p48) add t[6]=t[6],a2 // (p16) t[6]+=a2
640 (p50) add t[6]=t[6],a2,1 };;
641 { .mfi; (p16) nop.m 0 // 30:
642 (p16) xma.hu nhi[5]=ni5,mj[0],nhi[4] // np[5]*m0
643 (p40) cmp.ltu p41,p39=a2,n2 }
644 { .mfi; (p42) cmp.leu p41,p39=a2,n2
645 (p16) xma.lu nlo[5]=ni5,mj[0],nhi[4]
647 { .mfi; (p16) getf.sig n4=nlo[4] // 31:
649 (p48) cmp.ltu p49,p47=t[6],a2 }
650 { .mfb; (p50) cmp.leu p49,p47=t[6],a2
652 br.ctop.sptk.many .Louter_8_ctop };;
655 // above loop has to execute one more time, without (p16), which is
656 // replaced with merged move of np[8] to GPR bank
// In this pass the (p0) getf.sig instructions copy ni0..ni7 into
// n1..n8; n5..n8 are overwritten only after the (p17) additions and
// compares that consume their previous values. The carry chain uses
// the same add/add-with-carry and cmp.ltu/cmp.leu pairing as the loop.
// The last bundle loads r16/r17 with offsets from prevsp; they serve as
// addresses for the ldf.fill of f16/f17 further down.
657 .pred.rel "mutex",p40,p42
658 .pred.rel "mutex",p48,p50
659 { .mmi; (p0) getf.sig n1=ni0 // 0:
660 (p40) add a3=a3,n3 // (p17) a3+=n3
661 (p42) add a3=a3,n3,1 };;
662 { .mii; (p17) getf.sig a7=alo[8] // 1:
663 (p48) add t[6]=t[6],a3 // (p17) t[6]+=a3
664 (p50) add t[6]=t[6],a3,1 };;
665 { .mfi; (p17) getf.sig a8=ahi[8] // 2:
666 (p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0
667 (p40) cmp.ltu p43,p41=a3,n3 }
668 { .mfi; (p42) cmp.leu p43,p41=a3,n3
669 (p17) xma.lu nlo[7]=ni6,mj[1],nhi[6]
671 { .mii; (p17) getf.sig n5=nlo[6] // 3:
672 (p48) cmp.ltu p51,p49=t[6],a3
673 (p50) cmp.leu p51,p49=t[6],a3 };;
674 .pred.rel "mutex",p41,p43
675 .pred.rel "mutex",p49,p51
676 { .mmi; (p0) getf.sig n2=ni1 // 4:
677 (p41) add a4=a4,n4 // (p17) a4+=n4
678 (p43) add a4=a4,n4,1 };;
679 { .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4
681 (p51) add t[5]=t[5],a4,1 };;
682 { .mfi; (p0) getf.sig n3=ni2 // 6:
683 (p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0
684 (p41) cmp.ltu p42,p40=a4,n4 }
685 { .mfi; (p43) cmp.leu p42,p40=a4,n4
686 (p17) xma.lu nlo[8]=ni7,mj[1],nhi[7]
688 { .mii; (p17) getf.sig n6=nlo[7] // 7:
689 (p49) cmp.ltu p50,p48=t[5],a4
690 (p51) cmp.leu p50,p48=t[5],a4 };;
691 .pred.rel "mutex",p40,p42
692 .pred.rel "mutex",p48,p50
693 { .mii; (p0) getf.sig n4=ni3 // 8:
694 (p40) add a5=a5,n5 // (p17) a5+=n5
695 (p42) add a5=a5,n5,1 };;
696 { .mii; (p0) nop.m 0 // 9:
697 (p48) add t[4]=t[4],a5 // (p17) t[4]+=a5
698 (p50) add t[4]=t[4],a5,1 };;
699 { .mii; (p0) nop.m 0 // 10:
700 (p40) cmp.ltu p43,p41=a5,n5
701 (p42) cmp.leu p43,p41=a5,n5 };;
702 { .mii; (p17) getf.sig n7=nlo[8] // 11:
703 (p48) cmp.ltu p51,p49=t[4],a5
704 (p50) cmp.leu p51,p49=t[4],a5 };;
705 .pred.rel "mutex",p41,p43
706 .pred.rel "mutex",p49,p51
707 { .mii; (p17) getf.sig n8=nhi[8] // 12:
708 (p41) add a6=a6,n6 // (p17) a6+=n6
709 (p43) add a6=a6,n6,1 };;
710 { .mii; (p0) getf.sig n5=ni4 // 13:
711 (p49) add t[3]=t[3],a6 // (p17) t[3]+=a6
712 (p51) add t[3]=t[3],a6,1 };;
713 { .mii; (p0) nop.m 0 // 14:
714 (p41) cmp.ltu p42,p40=a6,n6
715 (p43) cmp.leu p42,p40=a6,n6 };;
716 { .mii; (p0) getf.sig n6=ni5 // 15:
717 (p49) cmp.ltu p50,p48=t[3],a6
718 (p51) cmp.leu p50,p48=t[3],a6 };;
719 .pred.rel "mutex",p40,p42
720 .pred.rel "mutex",p48,p50
721 { .mii; (p0) nop.m 0 // 16:
722 (p40) add a7=a7,n7 // (p17) a7+=n7
723 (p42) add a7=a7,n7,1 };;
724 { .mii; (p0) nop.m 0 // 17:
725 (p48) add t[2]=t[2],a7 // (p17) t[2]+=a7
726 (p50) add t[2]=t[2],a7,1 };;
727 { .mii; (p0) nop.m 0 // 18:
728 (p40) cmp.ltu p43,p41=a7,n7
729 (p42) cmp.leu p43,p41=a7,n7 };;
730 { .mii; (p0) getf.sig n7=ni6 // 19:
731 (p48) cmp.ltu p51,p49=t[2],a7
732 (p50) cmp.leu p51,p49=t[2],a7 };;
733 .pred.rel "mutex",p41,p43
734 .pred.rel "mutex",p49,p51
735 { .mii; (p0) nop.m 0 // 20:
736 (p41) add a8=a8,n8 // (p17) a8+=n8
737 (p43) add a8=a8,n8,1 };;
738 { .mmi; (p0) nop.m 0 // 21:
739 (p49) add t[1]=t[1],a8 // (p17) t[1]+=a8
740 (p51) add t[1]=t[1],a8,1 }
741 { .mmi; (p17) mov t[0]=r0
742 (p41) cmp.ltu p42,p40=a8,n8
743 (p43) cmp.leu p42,p40=a8,n8 };;
744 { .mmi; (p0) getf.sig n8=ni7 // 22:
745 (p49) cmp.ltu p50,p48=t[1],a8
746 (p51) cmp.leu p50,p48=t[1],a8 }
747 { .mmi; (p42) add t[0]=t[0],r0,1
748 (p0) add r16=-7*16,prevsp
749 (p0) add r17=-6*16,prevsp };;
751 // subtract np[8] from carrybit|tmp[8]
752 // carrybit|tmp[8] layout upon exit from above loop is:
753 // t[0]|t[1]|t[2]|t[3]|t[4]|t[5]|t[6]|t[7]|t0 (least significant)
// The borrow chain alternates between predicate pairs p32/p34 and
// p33/p35: a limb is subtracted under the "no borrow" predicate and,
// with a trailing ",1", under the "borrow" predicate; the cmp.gtu/cmp.geu
// pair that follows compares the difference with the minuend and sets
// the pair used by the next limb. The final step (a8=t[0] minus borrow)
// produces p34,p32, which the store sequence below uses to choose
// between n1..n8 (p32) and the t values (p34).
754 { .mmi; (p50)add t[0]=t[0],r0,1
757 { .mmi; cmp.gtu p34,p32=n1,t0;;
758 .pred.rel "mutex",p32,p34
760 (p34)sub n2=t[7],n2,1 };;
761 { .mii; (p32)cmp.gtu p35,p33=n2,t[7]
762 (p34)cmp.geu p35,p33=n2,t[7];;
763 .pred.rel "mutex",p33,p35
764 (p33)sub n3=t[6],n3 }
765 { .mmi; (p35)sub n3=t[6],n3,1;;
766 (p33)cmp.gtu p34,p32=n3,t[6]
767 (p35)cmp.geu p34,p32=n3,t[6] };;
768 .pred.rel "mutex",p32,p34
769 { .mii; (p32)sub n4=t[5],n4
770 (p34)sub n4=t[5],n4,1;;
771 (p32)cmp.gtu p35,p33=n4,t[5] }
772 { .mmi; (p34)cmp.geu p35,p33=n4,t[5];;
773 .pred.rel "mutex",p33,p35
775 (p35)sub n5=t[4],n5,1 };;
776 { .mii; (p33)cmp.gtu p34,p32=n5,t[4]
777 (p35)cmp.geu p34,p32=n5,t[4];;
778 .pred.rel "mutex",p32,p34
779 (p32)sub n6=t[3],n6 }
780 { .mmi; (p34)sub n6=t[3],n6,1;;
781 (p32)cmp.gtu p35,p33=n6,t[3]
782 (p34)cmp.geu p35,p33=n6,t[3] };;
783 .pred.rel "mutex",p33,p35
784 { .mii; (p33)sub n7=t[2],n7
785 (p35)sub n7=t[2],n7,1;;
786 (p33)cmp.gtu p34,p32=n7,t[2] }
787 { .mmi; (p35)cmp.geu p34,p32=n7,t[2];;
788 .pred.rel "mutex",p32,p34
790 (p34)sub n8=t[1],n8,1 };;
791 { .mii; (p32)cmp.gtu p35,p33=n8,t[1]
792 (p34)cmp.geu p35,p33=n8,t[1];;
793 .pred.rel "mutex",p33,p35
794 (p33)sub a8=t[0],r0 }
795 { .mmi; (p35)sub a8=t[0],r0,1;;
796 (p33)cmp.gtu p34,p32=a8,t[0]
797 (p35)cmp.geu p34,p32=a8,t[0] };;
799 // save the result, either tmp[num] or tmp[num]-np[num]
// Stores under p32 write the subtracted limbs n1..n8; stores under p34
// write the unreduced t values. The (p5)/(p7)/.../(p15) branches leave
// for .Ldone once enough limbs have been written; those predicates are
// the complements set by the "cmp4.le ...=3..8,in5" instructions earlier
// in this procedure (in5 is presumably num, the sixth argument - confirm).
// The tail reloads floating-point registers with ldf.fill from the
// addresses in r16..r19, restores predicates from prevpr under mask
// 0x1ffff, sets ret0=1 and returns through b0.
800 .pred.rel "mutex",p32,p34
801 { .mmi; (p32)st8 [rptr]=n1,8
803 add r19=-4*16,prevsp};;
804 { .mmb; (p32)st8 [rptr]=n2,8
805 (p34)st8 [rptr]=t[7],8
806 (p5)br.cond.dpnt.few .Ldone };;
807 { .mmb; (p32)st8 [rptr]=n3,8
808 (p34)st8 [rptr]=t[6],8
809 (p7)br.cond.dpnt.few .Ldone };;
810 { .mmb; (p32)st8 [rptr]=n4,8
811 (p34)st8 [rptr]=t[5],8
812 (p9)br.cond.dpnt.few .Ldone };;
813 { .mmb; (p32)st8 [rptr]=n5,8
814 (p34)st8 [rptr]=t[4],8
815 (p11)br.cond.dpnt.few .Ldone };;
816 { .mmb; (p32)st8 [rptr]=n6,8
817 (p34)st8 [rptr]=t[3],8
818 (p13)br.cond.dpnt.few .Ldone };;
819 { .mmb; (p32)st8 [rptr]=n7,8
820 (p34)st8 [rptr]=t[2],8
821 (p15)br.cond.dpnt.few .Ldone };;
822 { .mmb; (p32)st8 [rptr]=n8,8
823 (p34)st8 [rptr]=t[1],8
826 { .mmi; ldf.fill f16=[r16],64
827 ldf.fill f17=[r17],64
829 { .mmi; ldf.fill f18=[r18],64
830 ldf.fill f19=[r19],64
831 mov pr=prevpr,0x1ffff };;
832 { .mmi; ldf.fill f20=[r16]
835 { .mmi; ldf.fill f22=[r18]
837 mov ret0=1 } // signal "handled"
841 br.ret.sptk.many b0 };;
844 .type copyright#,\@object
846 stringz "Montgomery multiplication for IA-64, CRYPTOGAMS by <appro\@openssl.org>"
849 $output=shift and open STDOUT,">$output";