3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
12 # "Teaser" Montgomery multiplication module for PowerPC. It's possible
13 # to gain a bit more by modulo-scheduling outer loop, then dedicated
14 # squaring procedure should give further 20% and code can be adapted
# for 32-bit applications running on 64-bit CPUs. As for the latter,
# it won't be able to achieve "native" 64-bit performance, because in
17 # 32-bit application context every addc instruction will have to be
18 # expanded as addc, twice right shift by 32 and finally adde, etc.
19 # So far RSA *sign* performance improvement over pre-bn_mul_mont asm
20 # for 64-bit application running on PPC970/G5 is:
# Select instruction mnemonics matching the word size requested by
# $flavour: "32" selects 32-bit PowerPC mnemonics, "64" the 64-bit
# ones.  Any other flavour is a hard error.
if ($flavour =~ /32/) {
	$LDU=	"lwzu";		# load and update
	$LDX=	"lwzx";		# load indexed

	$STU=	"stwu";		# store and update
	$STX=	"stwx";		# store indexed
	$STUX=	"stwux";	# store indexed and update
	$UMULL=	"mullw";	# unsigned multiply low
	$UMULH=	"mulhwu";	# unsigned multiply high
	$UCMP=	"cmplw";	# unsigned compare
	$SHRI=	"srwi";		# unsigned shift right by immediate
} elsif ($flavour =~ /64/) {
	# same as above, but 64-bit mnemonics...
	$LDU=	"ldu";		# load and update
	$LDX=	"ldx";		# load indexed

	$STU=	"stdu";		# store and update
	$STX=	"stdx";		# store indexed
	$STUX=	"stdux";	# store indexed and update
	$UMULL=	"mulld";	# unsigned multiply low
	$UMULH=	"mulhdu";	# unsigned multiply high
	$UCMP=	"cmpld";	# unsigned compare
	$SHRI=	"srdi";		# unsigned shift right by immediate
} else { die "nonsense $flavour"; }
70 $FRAME=8*$SIZE_T+$RZONE;
# Locate the ppc-xlate.pl assembler translator relative to this
# script: first alongside it, then in the shared perlasm directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";
# Pipe all generated code through the translator.  Low-precedence
# "or" is required here: with "||" the die would bind to the
# concatenated (always-true) command string, so a failed open()
# would go completely unreported.
open STDOUT,"| $^X $xlate $flavour ".shift
	or die "can't call $xlate: $!";
88 $rp="r9"; # $rp is reassigned
92 # non-volatile registers
112 .globl
.bn_mul_mont_int
116 mr
$rp,r3
; $rp is reassigned
120 $code.=<<___
if ($BNSZ==4);
121 cmpwi
$num,32 ; longer key performance is
not better
125 slwi
$num,$num,`log($BNSZ)/log(2)`
127 addi
$ovf,$num,$FRAME
128 subf
$ovf,$ovf,$sp ; $sp-$ovf
129 and $ovf,$ovf,$tj ; minimize TLB usage
130 subf
$ovf,$sp,$ovf ; $ovf-$sp
132 srwi
$num,$num,`log($BNSZ)/log(2)`
135 $PUSH r20
,`-12*$SIZE_T`($tj)
136 $PUSH r21
,`-11*$SIZE_T`($tj)
137 $PUSH r22
,`-10*$SIZE_T`($tj)
138 $PUSH r23
,`-9*$SIZE_T`($tj)
139 $PUSH r24
,`-8*$SIZE_T`($tj)
140 $PUSH r25
,`-7*$SIZE_T`($tj)
141 $PUSH r26
,`-6*$SIZE_T`($tj)
142 $PUSH r27
,`-5*$SIZE_T`($tj)
143 $PUSH r28
,`-4*$SIZE_T`($tj)
144 $PUSH r29
,`-3*$SIZE_T`($tj)
145 $PUSH r30
,`-2*$SIZE_T`($tj)
146 $PUSH r31
,`-1*$SIZE_T`($tj)
148 $LD $n0,0($n0) ; pull n0
[0] value
149 addi
$num,$num,-2 ; adjust
$num for counter register
151 $LD $m0,0($bp) ; m0
=bp
[0]
152 $LD $aj,0($ap) ; ap
[0]
154 $UMULL $lo0,$aj,$m0 ; ap
[0]*bp
[0]
157 $LD $aj,$BNSZ($ap) ; ap
[1]
158 $LD $nj,0($np) ; np
[0]
160 $UMULL $m1,$lo0,$n0 ; "tp[0]"*n0
162 $UMULL $alo,$aj,$m0 ; ap
[1]*bp
[0]
165 $UMULL $lo1,$nj,$m1 ; np
[0]*m1
167 $LD $nj,$BNSZ($np) ; np
[1]
171 $UMULL $nlo,$nj,$m1 ; np
[1]*m1
178 $LDX $aj,$ap,$j ; ap
[j
]
180 $LDX $nj,$np,$j ; np
[j
]
182 $UMULL $alo,$aj,$m0 ; ap
[j
]*bp
[0]
186 $UMULL $nlo,$nj,$m1 ; np
[j
]*m1
187 addc
$lo1,$lo1,$lo0 ; np
[j
]*m1
+ap
[j
]*bp
[0]
190 $ST $lo1,0($tp) ; tp
[j
-1]
192 addi
$j,$j,$BNSZ ; j
++
193 addi
$tp,$tp,$BNSZ ; tp
++
201 addc
$lo1,$lo1,$lo0 ; np
[j
]*m1
+ap
[j
]*bp
[0]
203 $ST $lo1,0($tp) ; tp
[j
-1]
207 addze
$ovf,$ovf ; upmost overflow bit
213 $LDX $m0,$bp,$i ; m0
=bp
[i
]
214 $LD $aj,0($ap) ; ap
[0]
216 $LD $tj,$LOCALS($sp); tp
[0]
217 $UMULL $lo0,$aj,$m0 ; ap
[0]*bp
[i
]
219 $LD $aj,$BNSZ($ap) ; ap
[1]
220 $LD $nj,0($np) ; np
[0]
221 addc
$lo0,$lo0,$tj ; ap
[0]*bp
[i
]+tp
[0]
222 $UMULL $alo,$aj,$m0 ; ap
[j
]*bp
[i
]
224 $UMULL $m1,$lo0,$n0 ; tp
[0]*n0
226 $UMULL $lo1,$nj,$m1 ; np
[0]*m1
228 $LD $nj,$BNSZ($np) ; np
[1]
230 $UMULL $nlo,$nj,$m1 ; np
[1]*m1
238 $LDX $aj,$ap,$j ; ap
[j
]
240 $LD $tj,$BNSZ($tp) ; tp
[j
]
242 $LDX $nj,$np,$j ; np
[j
]
244 $UMULL $alo,$aj,$m0 ; ap
[j
]*bp
[i
]
247 addc
$lo0,$lo0,$tj ; ap
[j
]*bp
[i
]+tp
[j
]
248 $UMULL $nlo,$nj,$m1 ; np
[j
]*m1
251 addc
$lo1,$lo1,$lo0 ; np
[j
]*m1
+ap
[j
]*bp
[i
]+tp
[j
]
252 addi
$j,$j,$BNSZ ; j
++
254 $ST $lo1,0($tp) ; tp
[j
-1]
255 addi
$tp,$tp,$BNSZ ; tp
++
258 $LD $tj,$BNSZ($tp) ; tp
[j
]
261 addc
$lo0,$lo0,$tj ; ap
[j
]*bp
[i
]+tp
[j
]
266 addc
$lo1,$lo1,$lo0 ; np
[j
]*m1
+ap
[j
]*bp
[i
]+tp
[j
]
268 $ST $lo1,0($tp) ; tp
[j
-1]
270 addic
$ovf,$ovf,-1 ; move upmost overflow to XER
[CA
]
276 slwi
$tj,$num,`log($BNSZ)/log(2)`
281 addi
$num,$num,2 ; restore
$num
282 subfc
$j,$j,$j ; j
=0 and "clear" XER
[CA
]
287 Lsub
: $LDX $tj,$tp,$j
289 subfe
$aj,$nj,$tj ; tp
[j
]-np
[j
]
296 subfe
$ovf,$j,$ovf ; handle upmost overflow bit
299 or $ap,$ap,$np ; ap
=borrow?tp
:rp
302 Lcopy
: ; copy
or in-place refresh
305 $STX $j,$tp,$j ; zap at once
311 $POP r20
,`-12*$SIZE_T`($tj)
312 $POP r21
,`-11*$SIZE_T`($tj)
313 $POP r22
,`-10*$SIZE_T`($tj)
314 $POP r23
,`-9*$SIZE_T`($tj)
315 $POP r24
,`-8*$SIZE_T`($tj)
316 $POP r25
,`-7*$SIZE_T`($tj)
317 $POP r26
,`-6*$SIZE_T`($tj)
318 $POP r27
,`-5*$SIZE_T`($tj)
319 $POP r28
,`-4*$SIZE_T`($tj)
320 $POP r29
,`-3*$SIZE_T`($tj)
321 $POP r30
,`-2*$SIZE_T`($tj)
322 $POP r31
,`-1*$SIZE_T`($tj)
326 .byte
0,12,4,0,0x80,12,6,0
329 .asciz
"Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@openssl.org>"
332 $code =~ s/\`([^\`]*)\`/eval $1/gem;