3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
14 # Performance is 33% better than gcc 3.2 generated code on PA-7100LC.
15 # For reference, [4x] unrolled loop is >40% faster than folded one.
16 # It's possible to unroll loop 8 times on PA-RISC 2.0, but improvement
17 # is believed to be not sufficient to justify the effort...
19 # Special thanks to polarhome.com for providing HP-UX account.
# Derive the directory part of this script's own path (everything up to and
# including the last '/' or '\') so that files can be opened relative to it.
# When $0 has no directory component the match fails; fall back to an empty
# string so $dir is never left undefined or holding a stale capture.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
# Redirect STDOUT to the requested output file.  The three-argument form
# keeps the file name from being parsed for a mode, and a failure to open a
# named file is reported instead of being silently ignored.  When no output
# name was supplied STDOUT is left untouched.
if (defined($output) && length($output)) {
    open STDOUT, '>', $output
        or die "can't open $output: $!";
}
27 if ($flavour =~ /64/) {
47 $FRAME=4*$SIZE_T+$FRAME_MARKER; # 4 saved regs + frame marker
48 # [+ argument transfer]
49 $SZ=1; # defaults to RC4_CHAR
50 if (open CONF
,"<${dir}../../opensslconf.h") {
52 if (m/#\s*define\s+RC4_INT\s+(.*)/) {
53 $SZ = ($1=~/char$/) ?
1 : 4;
60 if ($SZ==1) { # RC4_CHAR
65 } else { # RC4_INT (~5% faster than RC4_CHAR on PA-7100LC)
90 sub unrolledloopbody
{
91 for ($i=0;$i<4;$i++) {
94 `sprintf("$LDX %$TY(%$key),%$dat1") if ($i>0)`
95 and $mask,$XX[1],$XX[1]
98 $LDX $XX[1]($key),$TX[1]
101 comclr
,<> $XX[1],$YY,%r0 ; conditional
102 copy
$TX[0],$TX[1] ; move
103 `sprintf("%sdep %$dat1,%d,8,%$acc",$i==1?"z":"",8*($i-1)+7) if ($i>0)`
110 push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
114 my ($label,$count)=@_;
124 ldbx
$inp($out),$dat1
126 and $mask,$XX[0],$XX[0]
128 $LDX $XX[0]($key),$TX[0]
133 addib
,<> -1,$count,$label ; $count is always small
141 .SUBSPA \
$CODE\
$,QUAD
=0,ALIGN
=8,ACCESS
=0x2C,CODE_ONLY
143 .EXPORT RC4
,ENTRY
,ARGW0
=GR
,ARGW1
=GR
,ARGW2
=GR
,ARGW3
=GR
146 .CALLINFO FRAME
=`$FRAME-4*$SIZE_T`,NO_CALLS
,SAVE_RP
,ENTRY_GR
=6
148 $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
149 $PUSHMA %r3,$FRAME(%sp)
150 $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
151 $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
152 $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
154 cmpib
,*= 0,$len,L\
$abort
155 sub $inp,$out,$inp ; distance between
$inp and $out
157 $LD `0*$SZ`($key),$XX[0]
158 $LD `1*$SZ`($key),$YY
159 ldo
`2*$SZ`($key),$key
164 ldo
1($XX[0]),$XX[0] ; warm up
loop
165 and $mask,$XX[0],$XX[0]
166 $LDX $XX[0]($key),$TX[0]
168 cmpib
,*>>= 6,$len,L\
$oop1 ; is
$len large enough to bother?
171 and,<> $out,$dat0,$rem ; is
$out aligned?
# Invoke foldedloop with label "L$alignout" and $rem as its counter; a plain
# call is used rather than the legacy &name(...) form.
foldedloop("L\$alignout",$rem);	# process till $out is aligned
179 L\
$alignedout ; $len is at least
4 here
180 and,<> $inp,$dat0,$acc ; is
$inp aligned?
182 sub $inp,$acc,$rem ; align
$inp
184 sh3addl
$acc,%r0,$acc
186 mtctl
$acc,%cr11 ; load
%sar with vshd align factor
187 ldwx
$rem($out),$dat0
194 ldwx
$rem($out),$dat1
196 or $ix,$acc,$acc ; last piece
, no need to dep
197 vshd
$dat0,$dat1,$iy ; align data
201 cmpib
,*<< 3,$len,L\
$oop4misalignedinp
203 cmpib
,*= 0,$len,L\
$done
214 ldwx
$inp($out),$dat0
216 or $ix,$acc,$acc ; last piece
, no need to dep
219 cmpib
,*<< 3,$len,L\
$oop4
221 cmpib
,*= 0,$len,L\
$done
# Invoke foldedloop with label "L$oop1" and $len as its counter; a plain call
# is used rather than the legacy &name(...) form.
foldedloop("L\$oop1",$len);
227 $POP `-$FRAME-$SAVED_RP`(%sp),%r2
228 ldo
-1($XX[0]),$XX[0] ; chill out
loop
230 and $mask,$XX[0],$XX[0]
232 $ST $XX[0],`-2*$SZ`($key)
233 $ST $YY,`-1*$SZ`($key)
234 $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
235 $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
236 $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
240 $POPMB -$FRAME(%sp),%r3
246 .EXPORT private_RC4_set_key
,ENTRY
,ARGW0
=GR
,ARGW1
=GR
,ARGW2
=GR
252 $ST %r0,`0*$SZ`($key)
253 $ST %r0,`1*$SZ`($key)
254 ldo
`2*$SZ`($key),$key
259 bb
,>= @XX[0],`31-8`,L\
$1st ; @XX[0]<256
262 ldo
`-256*$SZ`($key),$key ; rewind
$key
263 addl
$len,$inp,$inp ; $inp to point at the end
264 sub %r0,$len,%r23 ; inverse
index
270 $LDX @XX[0]($key),@TX[0]
271 ldbx
%r23($inp),@TX[1]
272 addi
,nuv
1,%r23,%r23 ; increment
and conditional
273 sub %r0,$len,%r23 ; inverse
index
274 addl
@TX[0],@XX[1],@XX[1]
275 addl
@TX[1],@XX[1],@XX[1]
276 and $mask,@XX[1],@XX[1]
278 $LDX @XX[1]($key),@TX[1]
282 bb
,>= @XX[0],`31-8`,L\
$2nd ; @XX[0]<256
290 .EXPORT RC4_options
,ENTRY
302 ldo L\
$opts-L\
$pic(%r28),%r28
306 .STRINGZ
"rc4(4x,`$SZ==1?"char
":"int"`)"
307 .STRINGZ
"RC4 for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
# Expand every `...` span embedded in $code: the text between a pair of
# backticks is eval'ed as a Perl expression (/e) and the whole span is
# replaced by the result, for all occurrences (/g).
309 $code =~ s/\`([^\`]*)\`/eval $1/gem;
# When $SIZE_T is 4, rewrite each "cmpib,*" into "comib,".
# NOTE(review): presumably the 32-bit flavour needs the older comib mnemonic
# without the '*' completer -- confirm against the assembler documentation.
310 $code =~ s/cmpib,\*/comib,/gm if ($SIZE_T==4);