2 # SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
4 # ====================================================================
5 # Written by Andy Polyakov, @dot-asm, originally for the OpenSSL
7 # ====================================================================
9 # Poly1305 hash for MIPS.
13 # Numbers are cycles per processed byte with poly1305_blocks alone.
16 # R1x000 ~5.5/+130% (big-endian)
17 # Octeon II 2.50/+70% (little-endian)
21 # Add 32-bit code path.
25 # Modulo-scheduling the reduction allows omitting the dependency chain at
26 # the end of the inner loop, which improves performance. Also optimize the
27 # MIPS32R2 code path for the MIPS 1004K core. Per René von Dorst's suggestions.
30 # R1x000 ~9.8/? (big-endian)
31 # Octeon II 3.65/+140% (little-endian)
32 # MT7621/1004K 4.75/? (little-endian)
34 ######################################################################
35 # There are a number of MIPS ABIs in use; O32 and N32/64 are the most
36 # widely used. Then there is a new contender: NUBI. It appears that if
37 # one picks the latter, it's possible to arrange code in an ABI-neutral
38 # manner. Therefore let's stick to the NUBI register layout:
# Each scalar below holds the literal register name "$<n>" (for example
# $a0 is the string "$4"), ready to be interpolated into the assembly text.
40 ($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));	# $0-$2, $24, $25
41 ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));	# $4-$11
42 ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));	# $12-$23
43 ($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));	# $3, $28-$31
45 # The return value is placed in $a0. Following coding rules facilitate
48 # - never ever touch $tp, "thread pointer", former $gp [o32 can be
49 # excluded from the rule, because it's specified volatile];
50 # - copy return value to $t0, former $v0 [or to $a0 if you're adapting
52 # - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
54 # For reference here is register layout for N32/64 MIPS ABIs:
56 # ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
57 # ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
58 # ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
59 # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
60 # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
64 ######################################################################
# Target ABI flavour is taken from the first command-line argument; "64"
# is used when none is given.
66 $flavour = shift || "64"; # supported flavours are o32,n32,64,nubi32,nubi64
# Register that carries the return value: $a0 for the NUBI flavours,
# otherwise $t0, which is register $2 in this file's register layout
# (the "former $v0").
68 $v0 = ($flavour =~ /nubi/i) ?
$a0 : $t0;
70 if ($flavour =~ /64|n32/i) {{{
71 ######################################################################
75 my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
76 my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1);
79 #if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \\
80 defined(_MIPS_ARCH_MIPS64R6
)) \\
81 && !defined(_MIPS_ARCH_MIPS64R2
)
82 # define _MIPS_ARCH_MIPS64R2
85 #if defined(_MIPS_ARCH_MIPS64R6)
86 # define dmultu(rs,rt)
87 # define mflo(rd,rs,rt) dmulu rd,rs,rt
88 # define mfhi(rd,rs,rt) dmuhu rd,rs,rt
90 # define dmultu(rs,rt) dmultu rs,rt
91 # define mflo(rd,rs,rt) mflo rd
92 # define mfhi(rd,rs,rt) mfhi rd
96 # define poly1305_init poly1305_init_mips
97 # define poly1305_blocks poly1305_blocks_mips
98 # define poly1305_emit poly1305_emit_mips
101 #if defined(__MIPSEB__) && !defined(MIPSEB)
130 #if defined(_MIPS_ARCH_MIPS64R6)
131 andi
$tmp0,$inp,7 # $inp % 8
132 dsubu
$inp,$inp,$tmp0 # align $inp
133 sll
$tmp0,$tmp0,3 # byte to bit offset
136 beqz
$tmp0,.Laligned_key
139 subu
$tmp1,$zero,$tmp0
141 dsllv
$in0,$in0,$tmp0
142 dsrlv
$tmp3,$in1,$tmp1
143 dsllv
$in1,$in1,$tmp0
144 dsrlv
$tmp2,$tmp2,$tmp1
146 dsrlv
$in0,$in0,$tmp0
147 dsllv
$tmp3,$in1,$tmp1
148 dsrlv
$in1,$in1,$tmp0
149 dsllv
$tmp2,$tmp2,$tmp1
161 # if defined(_MIPS_ARCH_MIPS64R2)
162 dsbh
$in0,$in0 # byte swap
169 or $tmp0,$tmp2 # 0x000000FF000000FF
171 and $tmp1,$in0,$tmp0 # byte swap
179 dsll
$tmp0,8 # 0x0000FF000000FF00
203 dsll
$tmp0,32 # 0x0000000100000000
204 daddiu
$tmp0,-63 # 0x00000000ffffffc1
205 dsll
$tmp0,28 # 0x0ffffffc10000000
206 daddiu
$tmp0,-1 # 0x0ffffffc0fffffff
209 daddiu
$tmp0,-3 # 0x0ffffffc0ffffffc
215 daddu
$tmp0,$in1 # s1 = r1 + (r1 >> 2)
# Bitmask handed to the .mask directive of poly1305_blocks_internal:
# bits 12-17 are set for the NUBI flavours, bits 16-17 otherwise.
# NOTE(review): presumably bit n stands for register $n, i.e. $s0-$s5
# versus $s4-$s5 in the NUBI numbering used by this file - confirm
# against the registers the prologue actually stores.
224 my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ?
"0x0003f000" : "0x00030000";
226 my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =
227 ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);
228 my ($shr,$shl) = ($s6,$s7); # used on R6
232 .globl poly1305_blocks
236 dsrl
$len,4 # number of complete blocks
237 bnez
$len,poly1305_blocks_internal
244 .ent poly1305_blocks_internal
245 poly1305_blocks_internal
:
247 #if defined(_MIPS_ARCH_MIPS64R6)
249 .mask
$SAVED_REGS_MASK|0x000c0000,-8
255 .mask
$SAVED_REGS_MASK,-8
261 $code.=<<___
if ($flavour =~ /nubi/i); # optimize non-nubi prologue
270 #if defined(_MIPS_ARCH_MIPS64R6)
272 dsubu
$inp,$inp,$shr # align $inp
273 sll
$shr,$shr,3 # byte to bit offset
277 ld
$h0,0($ctx) # load hash value
281 ld
$r0,24($ctx) # load key
286 daddu
$len,$inp # end of buffer
291 #if defined(_MIPS_ARCH_MIPS64R6)
292 ld
$in0,0($inp) # load input
294 beqz
$shr,.Laligned_inp
299 dsrlv
$tmp3,$in1,$shl
301 dsrlv
$tmp2,$tmp2,$shl
304 dsllv
$tmp3,$in1,$shl
306 dsllv
$tmp2,$tmp2,$shl
312 ldl
$in0,0+MSB
($inp) # load input
319 # if defined(_MIPS_ARCH_MIPS64R2)
320 dsbh
$in0,$in0 # byte swap
327 or $tmp0,$tmp2 # 0x000000FF000000FF
329 and $tmp1,$in0,$tmp0 # byte swap
337 dsll
$tmp0,8 # 0x0000FF000000FF00
360 dsrl
$tmp1,$h2,2 # modulo-scheduled reduction
364 daddu
$d0,$h0,$in0 # accumulate input
367 daddu
$d0,$d0,$tmp1 # ... and residue
374 dmultu
($r0,$d0) # h0*r0
375 daddu
$d2,$h2,$padbit
380 dmultu
($rs1,$d1) # h1*5*r1
383 mflo
($tmp0,$rs1,$d1)
384 mfhi
($tmp1,$rs1,$d1)
386 dmultu
($r1,$d0) # h0*r1
393 dmultu
($r0,$d1) # h1*r0
399 dmultu
($rs1,$d2) # h2*5*r1
402 mflo
($tmp2,$rs1,$d2)
404 dmultu
($r0,$d2) # h2*r0
418 sd
$h0,0($ctx) # store hash value
423 #if defined(_MIPS_ARCH_MIPS64R6)
427 ld
$s5,40($sp) # epilogue
430 $code.=<<___
if ($flavour =~ /nubi/i); # optimize non-nubi epilogue
438 #if defined(_MIPS_ARCH_MIPS64R6)
443 .end poly1305_blocks_internal
447 my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);
461 li
$in0,-4 # final reduction
467 daddu
$tmp0,$tmp0,$in0
469 daddiu
$in0,$tmp0,5 # compare to modulus
470 daddu
$tmp1,$tmp1,$in1
472 sltu
$tmp4,$tmp1,$in1
473 daddu
$in1,$tmp1,$tmp3
474 daddu
$tmp2,$tmp2,$tmp4
475 sltu
$tmp3,$in1,$tmp3
476 daddu
$tmp2,$tmp2,$tmp3
478 dsrl
$tmp2,2 # see if it carried/borrowed
479 dsubu
$tmp2,$zero,$tmp2
488 lwu
$tmp0,0($nonce) # load nonce
497 daddu
$in0,$tmp0 # accumulate nonce
499 sltu
$tmp0,$in0,$tmp0
502 dsrl
$tmp0,$in0,8 # write mac value
537 .asciiz
"Poly1305 for MIPS64, CRYPTOGAMS by \@dot-asm"
542 ######################################################################
546 my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
547 my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) =
548 ($a4,$a5,$a6,$a7,$at,$t0,$t1,$t2);
551 #if (defined(_MIPS_ARCH_MIPS32R3) || defined(_MIPS_ARCH_MIPS32R5) || \\
552 defined(_MIPS_ARCH_MIPS32R6
)) \\
553 && !defined(_MIPS_ARCH_MIPS32R2
)
554 # define _MIPS_ARCH_MIPS32R2
557 #if defined(_MIPS_ARCH_MIPS32R6)
558 # define multu(rs,rt)
559 # define mflo(rd,rs,rt) mulu rd,rs,rt
560 # define mfhi(rd,rs,rt) muhu rd,rs,rt
562 # define multu(rs,rt) multu rs,rt
563 # define mflo(rd,rs,rt) mflo rd
564 # define mfhi(rd,rs,rt) mfhi rd
568 # define poly1305_init poly1305_init_mips
569 # define poly1305_blocks poly1305_blocks_mips
570 # define poly1305_emit poly1305_emit_mips
573 #if defined(__MIPSEB__) && !defined(MIPSEB)
604 #if defined(_MIPS_ARCH_MIPS32R6)
605 andi
$tmp0,$inp,3 # $inp % 4
606 subu
$inp,$inp,$tmp0 # align $inp
607 sll
$tmp0,$tmp0,3 # byte to bit offset
612 beqz
$tmp0,.Laligned_key
615 subu
$tmp1,$zero,$tmp0
618 srlv
$tmp3,$in1,$tmp1
621 srlv
$tmp3,$in2,$tmp1
624 srlv
$tmp3,$in3,$tmp1
627 srlv
$tmp2,$tmp2,$tmp1
631 sllv
$tmp3,$in1,$tmp1
634 sllv
$tmp3,$in2,$tmp1
637 sllv
$tmp3,$in3,$tmp1
640 sllv
$tmp2,$tmp2,$tmp1
648 lwl
$in3,12+MSB
($inp)
652 lwr
$in3,12+LSB
($inp)
655 # if defined(_MIPS_ARCH_MIPS32R2)
656 wsbh
$in0,$in0 # byte swap
665 srl
$tmp0,$in0,24 # byte swap
667 andi
$tmp2,$in0,0xFF00
676 andi
$tmp1,$in1,0xFF00
685 andi
$tmp2,$in2,0xFF00
694 andi
$tmp1,$in3,0xFF00
704 ori
$tmp0,0xffff # 0x0fffffff
706 subu
$tmp0,3 # 0x0ffffffc
719 addu
$in1,$in1,$tmp1 # s1 = r1 + (r1 >> 2)
# Bitmask handed to the .mask directive of the 32-bit poly1305_blocks:
# bits 12-23 are set for the NUBI flavours, bits 16-23 otherwise.
# NOTE(review): presumably bit n stands for register $n, i.e. $s0-$s11
# versus $s4-$s11 in the NUBI numbering used by this file - confirm
# against the registers the prologue actually stores.
731 my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ?
"0x00fff000" : "0x00ff0000";
733 my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) =
734 ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $s9,$s10,$s11);
735 my ($d0,$d1,$d2,$d3) =
737 my $shr = $t2; # used on R6
738 my $one = $t2; # used on R2
741 .globl poly1305_blocks
746 .mask
$SAVED_REGS_MASK,-4
758 $code.=<<___
if ($flavour =~ /nubi/i); # optimize non-nubi prologue
767 srl
$len,4 # number of complete blocks
771 #if defined(_MIPS_ARCH_MIPS32R6)
773 subu
$inp,$inp,$shr # align $inp
774 sll
$shr,$shr,3 # byte to bit offset
777 lw
$h0,0($ctx) # load hash value
783 lw
$r0,20($ctx) # load key
792 addu
$len,$len,$inp # end of buffer
797 #if defined(_MIPS_ARCH_MIPS32R6)
798 lw
$d0,0($inp) # load input
802 beqz
$shr,.Laligned_inp
835 lwl
$d0,0+MSB
($inp) # load input
845 # if defined(_MIPS_ARCH_MIPS32R2)
846 wsbh
$d0,$d0 # byte swap
855 srl
$at,$d0,24 # byte swap
893 srl
$t0,$h4,2 # modulo-scheduled reduction
897 addu
$d0,$d0,$h0 # accumulate input
900 addu
$d0,$d0,$t0 # ... and residue
904 addu
$h0,$h0,$at # carry
910 addu
$h1,$h1,$h0 # carry
916 addu
$h2,$h2,$h1 # carry
920 #if defined(_MIPS_ARCH_MIPS32R2) && !defined(_MIPS_ARCH_MIPS32R6)
921 multu
$r0,$d0 # d0*r0
923 maddu
$rs3,$d1 # d1*s3
924 addu
$h3,$h3,$h2 # carry
925 maddu
$rs2,$d2 # d2*s2
927 maddu
$rs1,$d3 # d3*s1
932 multu
$r1,$d0 # d0*r1
933 maddu
$r0,$d1 # d1*r0
934 maddu
$rs3,$d2 # d2*s3
935 maddu
$rs2,$d3 # d3*s2
936 maddu
$rs1,$h4 # h4*s1
937 maddu
$at,$one # hi*1
941 multu
$r2,$d0 # d0*r2
942 maddu
$r1,$d1 # d1*r1
943 maddu
$r0,$d2 # d2*r0
944 maddu
$rs3,$d3 # d3*s3
945 maddu
$rs2,$h4 # h4*s2
946 maddu
$at,$one # hi*1
950 mul
$t0,$r0,$h4 # h4*r0
952 multu
$r3,$d0 # d0*r3
953 maddu
$r2,$d1 # d1*r2
954 maddu
$r1,$d2 # d2*r1
955 maddu
$r0,$d3 # d3*r0
956 maddu
$rs3,$h4 # h4*s3
957 maddu
$at,$one # hi*1
965 multu
($r0,$d0) # d0*r0
970 addu
$h3,$h3,$h2 # carry
972 multu
($rs3,$d1) # d1*s3
980 multu
($rs2,$d2) # d2*s2
985 multu
($rs1,$d3) # d3*s1
993 multu
($r1,$d0) # d0*r1
1002 multu
($r0,$d1) # d1*r0
1010 multu
($rs3,$d2) # d2*s3
1017 multu
($rs2,$d3) # d3*s2
1025 multu
($rs1,$h4) # h4*s1
1032 multu
($r2,$d0) # d0*r2
1041 multu
($r1,$d1) # d1*r1
1048 multu
($r0,$d2) # d2*r0
1055 multu
($rs3,$d3) # d3*s3
1063 multu
($rs2,$h4) # h4*s2
1070 multu
($r3,$d0) # d0*r3
1079 multu
($r2,$d1) # d1*r2
1086 multu
($r0,$d3) # d3*r0
1093 multu
($r1,$d2) # d2*r1
1101 multu
($rs3,$h4) # h4*s3
1108 multu
($r0,$h4) # h4*r0
1119 li
$padbit,1 # if we loop, padbit is 1
1123 sw
$h0,0($ctx) # store hash value
1140 $code.=<<___
if ($flavour =~ /nubi/i); # optimize non-nubi epilogue
1149 .end poly1305_blocks
1153 my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3);
1157 .globl poly1305_emit
1169 li
$in0,-4 # final reduction
1175 addu
$tmp0,$tmp0,$ctx
1176 sltu
$ctx,$tmp0,$ctx
1177 addiu
$in0,$tmp0,5 # compare to modulus
1178 addu
$tmp1,$tmp1,$ctx
1180 sltu
$ctx,$tmp1,$ctx
1181 addu
$in1,$in1,$tmp1
1182 addu
$tmp2,$tmp2,$ctx
1183 sltu
$in2,$in1,$tmp1
1184 sltu
$ctx,$tmp2,$ctx
1185 addu
$in2,$in2,$tmp2
1186 addu
$tmp3,$tmp3,$ctx
1187 sltu
$in3,$in2,$tmp2
1188 sltu
$ctx,$tmp3,$ctx
1189 addu
$in3,$in3,$tmp3
1190 addu
$tmp4,$tmp4,$ctx
1191 sltu
$ctx,$in3,$tmp3
1194 srl
$ctx,2 # see if it carried/borrowed
1195 subu
$ctx,$zero,$ctx
1210 lw
$tmp0,0($nonce) # load nonce
1215 addu
$in0,$tmp0 # accumulate nonce
1216 sltu
$ctx,$in0,$tmp0
1219 sltu
$tmp1,$in1,$tmp1
1225 sltu
$tmp2,$in2,$tmp2
1233 srl
$tmp0,$in0,8 # write mac value
1265 .asciiz
"Poly1305 for MIPS32, CRYPTOGAMS by \@dot-asm"
# The optional last command-line argument names the output file; when it
# is present, STDOUT is redirected to it. Three-argument open keeps the
# file name from being parsed as part of the mode, and the script dies if
# the file cannot be opened instead of silently losing the generated code.
# The parentheses matter: without them "or die" would also fire when no
# output argument was given at all.
1271 $output=pop and (open(STDOUT,">",$output) or die "can't open $output: $!");