3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
7 # Rights for redistribution and usage in source and binary forms are
8 # granted according to the OpenSSL license. Warranty of any kind is
10 # ====================================================================
15 # This is drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c.
17 # The module is designed to work with either of the "new" MIPS ABI(5),
# namely N32 or N64, offered by IRIX 6.x. It's not meant to work under
19 # IRIX 5.x not only because it doesn't support new ABIs but also
20 # because 5.x kernels put R4x00 CPU into 32-bit mode and all those
21 # 64-bit instructions (daddu, dmultu, etc.) found below gonna only
22 # cause illegal instruction exception:-(
24 # In addition the code depends on preprocessor flags set up by MIPSpro
25 # compiler driver (either as or cc) and therefore (probably?) can't be
26 # compiled by the GNU assembler. GNU C driver manages fine though...
27 # I mean as long as -mmips-as is specified or is the default option,
28 # because then it simply invokes /usr/bin/as which in turn takes
29 # perfect care of the preprocessor definitions. Another neat feature
30 # offered by the MIPSpro assembler is an optimization pass. This gave
31 # me the opportunity to have the code looking more regular as all those
32 # architecture dependent instruction rescheduling details were left to
33 # the assembler. Cool, huh?
35 # Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
36 # goes way over 3 times faster!
38 # <appro@fy.chalmers.se>
42 # Adapt the module even for 32-bit ABIs and other OSes. The former was
43 # achieved by mechanical replacement of 64-bit arithmetic instructions
44 # such as dmultu, daddu, etc. with their 32-bit counterparts and
45 # adjusting offsets denoting multiples of BN_ULONG. Above mentioned
46 # >3x performance improvement naturally does not apply to 32-bit code
47 # [because there is no instruction 32-bit compiler can't use], one
# has to be content with 40-85% improvement depending on benchmark and
49 # key length, more for longer keys.
# Locate the output file on the command line.
#
# Arguments are consumed one at a time until the list runs out or one of
# them looks like a plain file name with an extension (word characters and
# dashes, a dot, then a word-character suffix). That argument, if there is
# one, is left in $output; otherwise $output ends up false.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

# Send everything printed from here on to that file. Nothing is opened when
# no file name was given, so the generated code keeps going to the existing
# STDOUT. The three-argument form keeps the name from ever being read as
# part of the open mode, and a failed open aborts the run instead of being
# silently ignored.
if ($output) {
	open(STDOUT, '>', $output) or die "can't open $output: $!";
}
55 if ($flavour =~ /64|n32/i) {
# N32/64 register layout, as used by the original module: every symbolic
# name below is bound to the text of its numbered register ("$0".."$31").
($zero, $at, $v0, $v1)                     = map { '$' . $_ } 0 .. 3;
($a0, $a1, $a2, $a3, $a4, $a5, $a6, $a7)   = map { '$' . $_ } 4 .. 11;
($t0, $t1, $t2, $t3, $t8, $t9)             = map { '$' . $_ } 12 .. 15, 24, 25;
($s0, $s1, $s2, $s3, $s4, $s5, $s6, $s7)   = map { '$' . $_ } 16 .. 23;
($gp, $sp, $fp, $ra)                       = map { '$' . $_ } 28 .. 31;

# $ta0..$ta3 are second names for $a4..$a7.
($ta0, $ta1, $ta2, $ta3) = ($a4, $a5, $a6, $a7);

# O32 needs no special adaptation. NUBI is handled by saving/restoring
# ($v1,$t0..$t3); for that flavour $gp is made another name for $v1.
if ($flavour =~ /nubi/i) {
	$gp = $v1;
}
106 .asciiz
"mips3.s, Version 1.2"
107 .asciiz
"MIPS II/III/IV ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>"
113 .globl bn_mul_add_words
114 .ent bn_mul_add_words
117 bgtz
$a2,bn_mul_add_words_internal
121 .end bn_mul_add_words
124 .ent bn_mul_add_words_internal
125 bn_mul_add_words_internal
:
127 $code.=<<___
if ($flavour =~ /nubi/i);
128 .frame
$sp,6*$SZREG,$ra
129 .mask
0x8000f008,-$SZREG
131 $PTR_SUB $sp,6*$SZREG
132 $REG_S $ra,5*$SZREG($sp)
133 $REG_S $t3,4*$SZREG($sp)
134 $REG_S $t2,3*$SZREG($sp)
135 $REG_S $t1,2*$SZREG($sp)
136 $REG_S $t0,1*$SZREG($sp)
137 $REG_S $gp,0*$SZREG($sp)
144 beqz
$ta0,.L_bn_mul_add_words_tail
146 .L_bn_mul_add_words_loop
:
151 $LD $ta0,2*$BNSZ($a1)
152 $LD $ta1,2*$BNSZ($a0)
154 sltu
$v0,$t1,$v0 # All manuals say it "compares 32-bit
155 # values", but it seems to work fine
156 # even on 64-bit registers.
166 $LD $ta2,3*$BNSZ($a1)
167 $LD $ta3,3*$BNSZ($a0)
190 $ST $ta1,-2*$BNSZ($a0)
205 bgtzl
$ta0,.L_bn_mul_add_words_loop
208 beqz
$a2,.L_bn_mul_add_words_return
211 .L_bn_mul_add_words_tail
:
226 beqz
$a2,.L_bn_mul_add_words_return
241 beqz
$a2,.L_bn_mul_add_words_return
256 .L_bn_mul_add_words_return
:
259 $code.=<<___
if ($flavour =~ /nubi/i);
260 $REG_L $t3,4*$SZREG($sp)
261 $REG_L $t2,3*$SZREG($sp)
262 $REG_L $t1,2*$SZREG($sp)
263 $REG_L $t0,1*$SZREG($sp)
264 $REG_L $gp,0*$SZREG($sp)
265 $PTR_ADD $sp,6*$SZREG
270 .end bn_mul_add_words_internal
277 bgtz
$a2,bn_mul_words_internal
284 .ent bn_mul_words_internal
285 bn_mul_words_internal
:
287 $code.=<<___
if ($flavour =~ /nubi/i);
288 .frame
$sp,6*$SZREG,$ra
289 .mask
0x8000f008,-$SZREG
291 $PTR_SUB $sp,6*$SZREG
292 $REG_S $ra,5*$SZREG($sp)
293 $REG_S $t3,4*$SZREG($sp)
294 $REG_S $t2,3*$SZREG($sp)
295 $REG_S $t1,2*$SZREG($sp)
296 $REG_S $t0,1*$SZREG($sp)
297 $REG_S $gp,0*$SZREG($sp)
304 beqz
$ta0,.L_bn_mul_words_tail
306 .L_bn_mul_words_loop
:
309 $LD $ta0,2*$BNSZ($a1)
310 $LD $ta2,3*$BNSZ($a1)
327 $ST $v0,-3*$BNSZ($a0)
335 $ST $v0,-2*$BNSZ($a0)
346 bgtzl
$ta0,.L_bn_mul_words_loop
349 beqz
$a2,.L_bn_mul_words_return
352 .L_bn_mul_words_tail
:
363 beqz
$a2,.L_bn_mul_words_return
374 beqz
$a2,.L_bn_mul_words_return
385 .L_bn_mul_words_return
:
388 $code.=<<___
if ($flavour =~ /nubi/i);
389 $REG_L $t3,4*$SZREG($sp)
390 $REG_L $t2,3*$SZREG($sp)
391 $REG_L $t1,2*$SZREG($sp)
392 $REG_L $t0,1*$SZREG($sp)
393 $REG_L $gp,0*$SZREG($sp)
394 $PTR_ADD $sp,6*$SZREG
399 .end bn_mul_words_internal
406 bgtz
$a2,bn_sqr_words_internal
413 .ent bn_sqr_words_internal
414 bn_sqr_words_internal
:
416 $code.=<<___
if ($flavour =~ /nubi/i);
417 .frame
$sp,6*$SZREG,$ra
418 .mask
0x8000f008,-$SZREG
420 $PTR_SUB $sp,6*$SZREG
421 $REG_S $ra,5*$SZREG($sp)
422 $REG_S $t3,4*$SZREG($sp)
423 $REG_S $t2,3*$SZREG($sp)
424 $REG_S $t1,2*$SZREG($sp)
425 $REG_S $t0,1*$SZREG($sp)
426 $REG_S $gp,0*$SZREG($sp)
433 beqz
$ta0,.L_bn_sqr_words_tail
435 .L_bn_sqr_words_loop
:
438 $LD $ta0,2*$BNSZ($a1)
439 $LD $ta2,3*$BNSZ($a1)
451 $ST $t3,-6*$BNSZ($a0)
452 $ST $t2,-5*$BNSZ($a0)
457 $ST $ta1,-4*$BNSZ($a0)
458 $ST $ta0,-3*$BNSZ($a0)
465 $ST $ta3,-2*$BNSZ($a0)
469 bgtzl
$ta0,.L_bn_sqr_words_loop
472 beqz
$a2,.L_bn_sqr_words_return
475 .L_bn_sqr_words_tail
:
484 beqz
$a2,.L_bn_sqr_words_return
493 beqz
$a2,.L_bn_sqr_words_return
502 .L_bn_sqr_words_return
:
505 $code.=<<___
if ($flavour =~ /nubi/i);
506 $REG_L $t3,4*$SZREG($sp)
507 $REG_L $t2,3*$SZREG($sp)
508 $REG_L $t1,2*$SZREG($sp)
509 $REG_L $t0,1*$SZREG($sp)
510 $REG_L $gp,0*$SZREG($sp)
511 $PTR_ADD $sp,6*$SZREG
517 .end bn_sqr_words_internal
524 bgtz
$a3,bn_add_words_internal
531 .ent bn_add_words_internal
532 bn_add_words_internal
:
534 $code.=<<___
if ($flavour =~ /nubi/i);
535 .frame
$sp,6*$SZREG,$ra
536 .mask
0x8000f008,-$SZREG
538 $PTR_SUB $sp,6*$SZREG
539 $REG_S $ra,5*$SZREG($sp)
540 $REG_S $t3,4*$SZREG($sp)
541 $REG_S $t2,3*$SZREG($sp)
542 $REG_S $t1,2*$SZREG($sp)
543 $REG_S $t0,1*$SZREG($sp)
544 $REG_S $gp,0*$SZREG($sp)
551 beqz
$at,.L_bn_add_words_tail
553 .L_bn_add_words_loop
:
562 $LD $ta1,-3*$BNSZ($a2)
564 $LD $ta2,-2*$BNSZ($a2)
570 $ST $t0,-4*$BNSZ($a0)
577 $ST $t1,-3*$BNSZ($a0)
584 $ST $t2,-2*$BNSZ($a0)
595 bgtzl
$at,.L_bn_add_words_loop
598 beqz
$a3,.L_bn_add_words_return
601 .L_bn_add_words_tail
:
612 beqz
$a3,.L_bn_add_words_return
623 beqz
$a3,.L_bn_add_words_return
626 $LD $ta2,2*$BNSZ($a2)
634 .L_bn_add_words_return
:
637 $code.=<<___
if ($flavour =~ /nubi/i);
638 $REG_L $t3,4*$SZREG($sp)
639 $REG_L $t2,3*$SZREG($sp)
640 $REG_L $t1,2*$SZREG($sp)
641 $REG_L $t0,1*$SZREG($sp)
642 $REG_L $gp,0*$SZREG($sp)
643 $PTR_ADD $sp,6*$SZREG
649 .end bn_add_words_internal
656 bgtz
$a3,bn_sub_words_internal
663 .ent bn_sub_words_internal
664 bn_sub_words_internal
:
666 $code.=<<___
if ($flavour =~ /nubi/i);
667 .frame
$sp,6*$SZREG,$ra
668 .mask
0x8000f008,-$SZREG
670 $PTR_SUB $sp,6*$SZREG
671 $REG_S $ra,5*$SZREG($sp)
672 $REG_S $t3,4*$SZREG($sp)
673 $REG_S $t2,3*$SZREG($sp)
674 $REG_S $t1,2*$SZREG($sp)
675 $REG_S $t0,1*$SZREG($sp)
676 $REG_S $gp,0*$SZREG($sp)
683 beqz
$at,.L_bn_sub_words_tail
685 .L_bn_sub_words_loop
:
694 $LD $ta1,-3*$BNSZ($a2)
696 $LD $ta2,-2*$BNSZ($a2)
702 $ST $t0,-4*$BNSZ($a0)
709 $ST $t1,-3*$BNSZ($a0)
717 $ST $t2,-2*$BNSZ($a0)
728 bgtzl
$at,.L_bn_sub_words_loop
731 beqz
$a3,.L_bn_sub_words_return
734 .L_bn_sub_words_tail
:
745 beqz
$a3,.L_bn_sub_words_return
756 beqz
$a3,.L_bn_sub_words_return
759 $LD $ta2,2*$BNSZ($a2)
767 .L_bn_sub_words_return
:
770 $code.=<<___
if ($flavour =~ /nubi/i);
771 $REG_L $t3,4*$SZREG($sp)
772 $REG_L $t2,3*$SZREG($sp)
773 $REG_L $t1,2*$SZREG($sp)
774 $REG_L $t0,1*$SZREG($sp)
775 $REG_L $gp,0*$SZREG($sp)
776 $PTR_ADD $sp,6*$SZREG
781 .end bn_sub_words_internal
784 .globl bn_div_3_words
788 move
$a3,$a0 # we know that bn_div_words does not
789 # touch $a3, $ta2, $ta3 and preserves $a2
790 # so that we can save two arguments
791 # and return address in registers
792 # instead of stack:-)
796 bne
$a0,$a2,bn_div_3_words_internal
804 .ent bn_div_3_words_internal
805 bn_div_3_words_internal
:
807 $code.=<<___
if ($flavour =~ /nubi/i);
808 .frame
$sp,6*$SZREG,$ra
809 .mask
0x8000f008,-$SZREG
811 $PTR_SUB $sp,6*$SZREG
812 $REG_S $ra,5*$SZREG($sp)
813 $REG_S $t3,4*$SZREG($sp)
814 $REG_S $t2,3*$SZREG($sp)
815 $REG_S $t1,2*$SZREG($sp)
816 $REG_S $t0,1*$SZREG($sp)
817 $REG_S $gp,0*$SZREG($sp)
825 $LD $t2,-2*$BNSZ($a3)
830 .L_bn_div_3_words_inner_loop
:
831 bnez
$t8,.L_bn_div_3_words_inner_loop_done
843 beqzl
$at,.L_bn_div_3_words_inner_loop
846 .L_bn_div_3_words_inner_loop_done
:
849 $code.=<<___
if ($flavour =~ /nubi/i);
850 $REG_L $t3,4*$SZREG($sp)
851 $REG_L $t2,3*$SZREG($sp)
852 $REG_L $t1,2*$SZREG($sp)
853 $REG_L $t0,1*$SZREG($sp)
854 $REG_L $gp,0*$SZREG($sp)
855 $PTR_ADD $sp,6*$SZREG
860 .end bn_div_3_words_internal
867 bnez
$a2,bn_div_words_internal
868 li
$v0,-1 # I would rather signal div-by-zero
869 # which can be done with 'break 7'
875 .ent bn_div_words_internal
876 bn_div_words_internal
:
878 $code.=<<___
if ($flavour =~ /nubi/i);
879 .frame
$sp,6*$SZREG,$ra
880 .mask
0x8000f008,-$SZREG
882 $PTR_SUB $sp,6*$SZREG
883 $REG_S $ra,5*$SZREG($sp)
884 $REG_S $t3,4*$SZREG($sp)
885 $REG_S $t2,3*$SZREG($sp)
886 $REG_S $t1,2*$SZREG($sp)
887 $REG_S $t0,1*$SZREG($sp)
888 $REG_S $gp,0*$SZREG($sp)
892 bltz
$a2,.L_bn_div_words_body
906 break 6 # signal overflow
916 .L_bn_div_words_body
:
917 $SRL $DH,$a2,4*$BNSZ # bits
925 $SRL $HH,$a0,4*$BNSZ # bits
926 $SRL $QT,4*$BNSZ # q=0xffffffff
927 beq
$DH,$HH,.L_bn_div_words_skip_div1
930 .L_bn_div_words_skip_div1
:
932 $SLL $t3,$a0,4*$BNSZ # bits
933 $SRL $at,$a1,4*$BNSZ # bits
937 .L_bn_div_words_inner_loop1
:
945 beqz
$at,.L_bn_div_words_inner_loop1_done
948 b
.L_bn_div_words_inner_loop1
951 .L_bn_div_words_inner_loop1_done
:
953 $SLL $a1,4*$BNSZ # bits
955 $SLL $v0,$QT,4*$BNSZ # bits
958 $SRL $HH,$a0,4*$BNSZ # bits
959 $SRL $QT,4*$BNSZ # q=0xffffffff
960 beq
$DH,$HH,.L_bn_div_words_skip_div2
963 .L_bn_div_words_skip_div2
:
965 $SLL $t3,$a0,4*$BNSZ # bits
966 $SRL $at,$a1,4*$BNSZ # bits
970 .L_bn_div_words_inner_loop2
:
978 beqz
$at,.L_bn_div_words_inner_loop2_done
981 b
.L_bn_div_words_inner_loop2
984 .L_bn_div_words_inner_loop2_done
:
988 $SRL $v1,$a0,$t9 # $v1 contains remainder if anybody wants it
989 $SRL $a2,$t9 # restore $a2
994 $code.=<<___
if ($flavour =~ /nubi/i);
995 $REG_L $t3,4*$SZREG($sp)
996 $REG_L $t2,3*$SZREG($sp)
997 $REG_L $t1,2*$SZREG($sp)
998 $REG_L $t0,1*$SZREG($sp)
999 $REG_L $gp,0*$SZREG($sp)
1000 $PTR_ADD $sp,6*$SZREG
1005 .end bn_div_words_internal
# $HH, $QT and $DH were interpolated into the bn_div_words code above;
# clear them before the comba routines are generated.
undef $_ for ($HH, $QT, $DH);

# Names used by the comba routines for the words of a[] and b[].
# Words 0..3 live in the $t0..$t3 / $ta0..$ta3 registers.
$a_0 = $t0;   $a_1 = $t1;   $a_2 = $t2;   $a_3 = $t3;
$b_0 = $ta0;  $b_1 = $ta1;  $b_2 = $ta2;  $b_3 = $ta3;

# Words 4..6 alternate through $s0..$s5; word 7 takes over the pointer
# register it was loaded through.
$a_4 = $s0;  $a_5 = $s2;  $a_6 = $s4;
$a_7 = $a1;	# once we load a[7], no use for $a1
$b_4 = $s1;  $b_5 = $s3;  $b_6 = $s5;
$b_7 = $a2;	# once we load b[7], no use for $a2

# $t_1/$t_2 and the three column accumulators c1..c3.
$t_1 = $t8;  $t_2 = $t9;
$c_1 = $v0;  $c_2 = $v1;  $c_3 = $a3;
1020 .globl bn_mul_comba8
1025 $code.=<<___
if ($flavour =~ /nubi/i);
1026 .frame
$sp,12*$SZREG,$ra
1027 .mask
0x803ff008,-$SZREG
1028 $PTR_SUB $sp,12*$SZREG
1029 $REG_S $ra,11*$SZREG($sp)
1030 $REG_S $s5,10*$SZREG($sp)
1031 $REG_S $s4,9*$SZREG($sp)
1032 $REG_S $s3,8*$SZREG($sp)
1033 $REG_S $s2,7*$SZREG($sp)
1034 $REG_S $s1,6*$SZREG($sp)
1035 $REG_S $s0,5*$SZREG($sp)
1036 $REG_S $t3,4*$SZREG($sp)
1037 $REG_S $t2,3*$SZREG($sp)
1038 $REG_S $t1,2*$SZREG($sp)
1039 $REG_S $t0,1*$SZREG($sp)
1040 $REG_S $gp,0*$SZREG($sp)
1042 $code.=<<___
if ($flavour !~ /nubi/i);
1043 .frame
$sp,6*$SZREG,$ra
1044 .mask
0x003f0000,-$SZREG
1045 $PTR_SUB $sp,6*$SZREG
1046 $REG_S $s5,5*$SZREG($sp)
1047 $REG_S $s4,4*$SZREG($sp)
1048 $REG_S $s3,3*$SZREG($sp)
1049 $REG_S $s2,2*$SZREG($sp)
1050 $REG_S $s1,1*$SZREG($sp)
1051 $REG_S $s0,0*$SZREG($sp)
1056 $LD $a_0,0($a1) # If compiled with -mips3 option on
1057 # R5000 box assembler barks on this
1058 # 1ine with "should not have mult/div
1059 # as last instruction in bb (R10K
1060 # bug)" warning. If anybody out there
1061 # has a clue about how to circumvent
1062 # this do send me a note.
1063 # <appro\@fy.chalmers.se>
1067 $LD $a_2,2*$BNSZ($a1)
1068 $MULTU $a_0,$b_0 # mul_add_c(a[0],b[0],c1,c2,c3);
1069 $LD $a_3,3*$BNSZ($a1)
1071 $LD $b_2,2*$BNSZ($a2)
1072 $LD $b_3,3*$BNSZ($a2)
1076 $LD $a_4,4*$BNSZ($a1)
1077 $LD $a_5,5*$BNSZ($a1)
1078 $MULTU $a_0,$b_1 # mul_add_c(a[0],b[1],c2,c3,c1);
1079 $LD $a_6,6*$BNSZ($a1)
1080 $LD $a_7,7*$BNSZ($a1)
1081 $LD $b_4,4*$BNSZ($a2)
1082 $LD $b_5,5*$BNSZ($a2)
1087 $MULTU $a_1,$b_0 # mul_add_c(a[1],b[0],c2,c3,c1);
1089 $LD $b_6,6*$BNSZ($a2)
1090 $LD $b_7,7*$BNSZ($a2)
1091 $ST $c_1,0($a0) # r[0]=c1;
1096 $MULTU $a_2,$b_0 # mul_add_c(a[2],b[0],c3,c1,c2);
1100 $ST $c_2,$BNSZ($a0) # r[1]=c2;
1106 $MULTU $a_1,$b_1 # mul_add_c(a[1],b[1],c3,c1,c2);
1113 $MULTU $a_0,$b_2 # mul_add_c(a[0],b[2],c3,c1,c2);
1121 $MULTU $a_0,$b_3 # mul_add_c(a[0],b[3],c1,c2,c3);
1126 $ST $c_3,2*$BNSZ($a0) # r[2]=c3;
1132 $MULTU $a_1,$b_2 # mul_add_c(a[1],b[2],c1,c2,c3);
1140 $MULTU $a_2,$b_1 # mul_add_c(a[2],b[1],c1,c2,c3);
1149 $MULTU $a_3,$b_0 # mul_add_c(a[3],b[0],c1,c2,c3);
1158 $MULTU $a_4,$b_0 # mul_add_c(a[4],b[0],c2,c3,c1);
1163 $ST $c_1,3*$BNSZ($a0) # r[3]=c1;
1169 $MULTU $a_3,$b_1 # mul_add_c(a[3],b[1],c2,c3,c1);
1177 $MULTU $a_2,$b_2 # mul_add_c(a[2],b[2],c2,c3,c1);
1186 $MULTU $a_1,$b_3 # mul_add_c(a[1],b[3],c2,c3,c1);
1195 $MULTU $a_0,$b_4 # mul_add_c(a[0],b[4],c2,c3,c1);
1204 $MULTU $a_0,$b_5 # mul_add_c(a[0],b[5],c3,c1,c2);
1209 $ST $c_2,4*$BNSZ($a0) # r[4]=c2;
1215 $MULTU $a_1,$b_4 # mul_add_c(a[1],b[4],c3,c1,c2);
1223 $MULTU $a_2,$b_3 # mul_add_c(a[2],b[3],c3,c1,c2);
1232 $MULTU $a_3,$b_2 # mul_add_c(a[3],b[2],c3,c1,c2);
1241 $MULTU $a_4,$b_1 # mul_add_c(a[4],b[1],c3,c1,c2);
1250 $MULTU $a_5,$b_0 # mul_add_c(a[5],b[0],c3,c1,c2);
1259 $MULTU $a_6,$b_0 # mul_add_c(a[6],b[0],c1,c2,c3);
1264 $ST $c_3,5*$BNSZ($a0) # r[5]=c3;
1270 $MULTU $a_5,$b_1 # mul_add_c(a[5],b[1],c1,c2,c3);
1278 $MULTU $a_4,$b_2 # mul_add_c(a[4],b[2],c1,c2,c3);
1287 $MULTU $a_3,$b_3 # mul_add_c(a[3],b[3],c1,c2,c3);
1296 $MULTU $a_2,$b_4 # mul_add_c(a[2],b[4],c1,c2,c3);
1305 $MULTU $a_1,$b_5 # mul_add_c(a[1],b[5],c1,c2,c3);
1314 $MULTU $a_0,$b_6 # mul_add_c(a[0],b[6],c1,c2,c3);
1323 $MULTU $a_0,$b_7 # mul_add_c(a[0],b[7],c2,c3,c1);
1328 $ST $c_1,6*$BNSZ($a0) # r[6]=c1;
1334 $MULTU $a_1,$b_6 # mul_add_c(a[1],b[6],c2,c3,c1);
1342 $MULTU $a_2,$b_5 # mul_add_c(a[2],b[5],c2,c3,c1);
1351 $MULTU $a_3,$b_4 # mul_add_c(a[3],b[4],c2,c3,c1);
1360 $MULTU $a_4,$b_3 # mul_add_c(a[4],b[3],c2,c3,c1);
1369 $MULTU $a_5,$b_2 # mul_add_c(a[5],b[2],c2,c3,c1);
1378 $MULTU $a_6,$b_1 # mul_add_c(a[6],b[1],c2,c3,c1);
1387 $MULTU $a_7,$b_0 # mul_add_c(a[7],b[0],c2,c3,c1);
1396 $MULTU $a_7,$b_1 # mul_add_c(a[7],b[1],c3,c1,c2);
1401 $ST $c_2,7*$BNSZ($a0) # r[7]=c2;
1407 $MULTU $a_6,$b_2 # mul_add_c(a[6],b[2],c3,c1,c2);
1415 $MULTU $a_5,$b_3 # mul_add_c(a[5],b[3],c3,c1,c2);
1424 $MULTU $a_4,$b_4 # mul_add_c(a[4],b[4],c3,c1,c2);
1433 $MULTU $a_3,$b_5 # mul_add_c(a[3],b[5],c3,c1,c2);
1442 $MULTU $a_2,$b_6 # mul_add_c(a[2],b[6],c3,c1,c2);
1451 $MULTU $a_1,$b_7 # mul_add_c(a[1],b[7],c3,c1,c2);
1460 $MULTU $a_2,$b_7 # mul_add_c(a[2],b[7],c1,c2,c3);
1465 $ST $c_3,8*$BNSZ($a0) # r[8]=c3;
1471 $MULTU $a_3,$b_6 # mul_add_c(a[3],b[6],c1,c2,c3);
1479 $MULTU $a_4,$b_5 # mul_add_c(a[4],b[5],c1,c2,c3);
1488 $MULTU $a_5,$b_4 # mul_add_c(a[5],b[4],c1,c2,c3);
1497 $MULTU $a_6,$b_3 # mul_add_c(a[6],b[3],c1,c2,c3);
1506 $MULTU $a_7,$b_2 # mul_add_c(a[7],b[2],c1,c2,c3);
1515 $MULTU $a_7,$b_3 # mul_add_c(a[7],b[3],c2,c3,c1);
1520 $ST $c_1,9*$BNSZ($a0) # r[9]=c1;
1526 $MULTU $a_6,$b_4 # mul_add_c(a[6],b[4],c2,c3,c1);
1534 $MULTU $a_5,$b_5 # mul_add_c(a[5],b[5],c2,c3,c1);
1543 $MULTU $a_4,$b_6 # mul_add_c(a[4],b[6],c2,c3,c1);
1552 $MULTU $a_3,$b_7 # mul_add_c(a[3],b[7],c2,c3,c1);
1561 $MULTU $a_4,$b_7 # mul_add_c(a[4],b[7],c3,c1,c2);
1566 $ST $c_2,10*$BNSZ($a0) # r[10]=c2;
1572 $MULTU $a_5,$b_6 # mul_add_c(a[5],b[6],c3,c1,c2);
1580 $MULTU $a_6,$b_5 # mul_add_c(a[6],b[5],c3,c1,c2);
1589 $MULTU $a_7,$b_4 # mul_add_c(a[7],b[4],c3,c1,c2);
1598 $MULTU $a_7,$b_5 # mul_add_c(a[7],b[5],c1,c2,c3);
1603 $ST $c_3,11*$BNSZ($a0) # r[11]=c3;
1609 $MULTU $a_6,$b_6 # mul_add_c(a[6],b[6],c1,c2,c3);
1617 $MULTU $a_5,$b_7 # mul_add_c(a[5],b[7],c1,c2,c3);
1626 $MULTU $a_6,$b_7 # mul_add_c(a[6],b[7],c2,c3,c1);
1631 $ST $c_1,12*$BNSZ($a0) # r[12]=c1;
1637 $MULTU $a_7,$b_6 # mul_add_c(a[7],b[6],c2,c3,c1);
1645 $MULTU $a_7,$b_7 # mul_add_c(a[7],b[7],c3,c1,c2);
1650 $ST $c_2,13*$BNSZ($a0) # r[13]=c2;
1658 $ST $c_3,14*$BNSZ($a0) # r[14]=c3;
1659 $ST $c_1,15*$BNSZ($a0) # r[15]=c1;
1663 $code.=<<___
if ($flavour =~ /nubi/i);
1664 $REG_L $s5,10*$SZREG($sp)
1665 $REG_L $s4,9*$SZREG($sp)
1666 $REG_L $s3,8*$SZREG($sp)
1667 $REG_L $s2,7*$SZREG($sp)
1668 $REG_L $s1,6*$SZREG($sp)
1669 $REG_L $s0,5*$SZREG($sp)
1670 $REG_L $t3,4*$SZREG($sp)
1671 $REG_L $t2,3*$SZREG($sp)
1672 $REG_L $t1,2*$SZREG($sp)
1673 $REG_L $t0,1*$SZREG($sp)
1674 $REG_L $gp,0*$SZREG($sp)
1676 $PTR_ADD $sp,12*$SZREG
1678 $code.=<<___
if ($flavour !~ /nubi/i);
1679 $REG_L $s5,5*$SZREG($sp)
1680 $REG_L $s4,4*$SZREG($sp)
1681 $REG_L $s3,3*$SZREG($sp)
1682 $REG_L $s2,2*$SZREG($sp)
1683 $REG_L $s1,1*$SZREG($sp)
1684 $REG_L $s0,0*$SZREG($sp)
1686 $PTR_ADD $sp,6*$SZREG
1692 .globl bn_mul_comba4
1696 $code.=<<___
if ($flavour =~ /nubi/i);
1697 .frame
$sp,6*$SZREG,$ra
1698 .mask
0x8000f008,-$SZREG
1700 $PTR_SUB $sp,6*$SZREG
1701 $REG_S $ra,5*$SZREG($sp)
1702 $REG_S $t3,4*$SZREG($sp)
1703 $REG_S $t2,3*$SZREG($sp)
1704 $REG_S $t1,2*$SZREG($sp)
1705 $REG_S $t0,1*$SZREG($sp)
1706 $REG_S $gp,0*$SZREG($sp)
1713 $LD $a_2,2*$BNSZ($a1)
1714 $MULTU $a_0,$b_0 # mul_add_c(a[0],b[0],c1,c2,c3);
1715 $LD $a_3,3*$BNSZ($a1)
1717 $LD $b_2,2*$BNSZ($a2)
1718 $LD $b_3,3*$BNSZ($a2)
1723 $MULTU $a_0,$b_1 # mul_add_c(a[0],b[1],c2,c3,c1);
1728 $MULTU $a_1,$b_0 # mul_add_c(a[1],b[0],c2,c3,c1);
1734 $MULTU $a_2,$b_0 # mul_add_c(a[2],b[0],c3,c1,c2);
1744 $MULTU $a_1,$b_1 # mul_add_c(a[1],b[1],c3,c1,c2);
1751 $MULTU $a_0,$b_2 # mul_add_c(a[0],b[2],c3,c1,c2);
1759 $MULTU $a_0,$b_3 # mul_add_c(a[0],b[3],c1,c2,c3);
1764 $ST $c_3,2*$BNSZ($a0)
1770 $MULTU $a_1,$b_2 # mul_add_c(a[1],b[2],c1,c2,c3);
1778 $MULTU $a_2,$b_1 # mul_add_c(a[2],b[1],c1,c2,c3);
1787 $MULTU $a_3,$b_0 # mul_add_c(a[3],b[0],c1,c2,c3);
1796 $MULTU $a_3,$b_1 # mul_add_c(a[3],b[1],c2,c3,c1);
1801 $ST $c_1,3*$BNSZ($a0)
1807 $MULTU $a_2,$b_2 # mul_add_c(a[2],b[2],c2,c3,c1);
1815 $MULTU $a_1,$b_3 # mul_add_c(a[1],b[3],c2,c3,c1);
1824 $MULTU $a_2,$b_3 # mul_add_c(a[2],b[3],c3,c1,c2);
1829 $ST $c_2,4*$BNSZ($a0)
1835 $MULTU $a_3,$b_2 # mul_add_c(a[3],b[2],c3,c1,c2);
1843 $MULTU $a_3,$b_3 # mul_add_c(a[3],b[3],c1,c2,c3);
1848 $ST $c_3,5*$BNSZ($a0)
1856 $ST $c_1,6*$BNSZ($a0)
1857 $ST $c_2,7*$BNSZ($a0)
1861 $code.=<<___
if ($flavour =~ /nubi/i);
1862 $REG_L $t3,4*$SZREG($sp)
1863 $REG_L $t2,3*$SZREG($sp)
1864 $REG_L $t1,2*$SZREG($sp)
1865 $REG_L $t0,1*$SZREG($sp)
1866 $REG_L $gp,0*$SZREG($sp)
1867 $PTR_ADD $sp,6*$SZREG
# The squaring routines that follow read a single input vector, so the
# names that stood for b[0..3] are handed over to a[4..7].
$a_4 = $b_0;
$a_5 = $b_1;
$a_6 = $b_2;
$a_7 = $b_3;
1880 .globl bn_sqr_comba8
1884 $code.=<<___
if ($flavour =~ /nubi/i);
1885 .frame
$sp,6*$SZREG,$ra
1886 .mask
0x8000f008,-$SZREG
1888 $PTR_SUB $sp,6*$SZREG
1889 $REG_S $ra,5*$SZREG($sp)
1890 $REG_S $t3,4*$SZREG($sp)
1891 $REG_S $t2,3*$SZREG($sp)
1892 $REG_S $t1,2*$SZREG($sp)
1893 $REG_S $t0,1*$SZREG($sp)
1894 $REG_S $gp,0*$SZREG($sp)
1900 $LD $a_2,2*$BNSZ($a1)
1901 $LD $a_3,3*$BNSZ($a1)
1903 $MULTU $a_0,$a_0 # mul_add_c(a[0],b[0],c1,c2,c3);
1904 $LD $a_4,4*$BNSZ($a1)
1905 $LD $a_5,5*$BNSZ($a1)
1906 $LD $a_6,6*$BNSZ($a1)
1907 $LD $a_7,7*$BNSZ($a1)
1912 $MULTU $a_0,$a_1 # mul_add_c2(a[0],b[1],c2,c3,c1);
1917 $MULTU $a_2,$a_0 # mul_add_c2(a[2],b[0],c3,c1,c2);
1930 $MULTU $a_1,$a_1 # mul_add_c(a[1],b[1],c3,c1,c2);
1944 $MULTU $a_0,$a_3 # mul_add_c2(a[0],b[3],c1,c2,c3);
1949 $ST $c_3,2*$BNSZ($a0)
1955 $MULTU $a_1,$a_2 # mul_add_c2(a[1],b[2],c1,c2,c3);
1969 $MULTU $a_4,$a_0 # mul_add_c2(a[4],b[0],c2,c3,c1);
1980 $ST $c_1,3*$BNSZ($a0)
1986 $MULTU $a_3,$a_1 # mul_add_c2(a[3],b[1],c2,c3,c1);
2000 $MULTU $a_2,$a_2 # mul_add_c(a[2],b[2],c2,c3,c1);
2015 $MULTU $a_0,$a_5 # mul_add_c2(a[0],b[5],c3,c1,c2);
2020 $ST $c_2,4*$BNSZ($a0)
2026 $MULTU $a_1,$a_4 # mul_add_c2(a[1],b[4],c3,c1,c2);
2040 $MULTU $a_2,$a_3 # mul_add_c2(a[2],b[3],c3,c1,c2);
2054 $MULTU $a_6,$a_0 # mul_add_c2(a[6],b[0],c1,c2,c3);
2066 $ST $c_3,5*$BNSZ($a0)
2072 $MULTU $a_5,$a_1 # mul_add_c2(a[5],b[1],c1,c2,c3);
2086 $MULTU $a_4,$a_2 # mul_add_c2(a[4],b[2],c1,c2,c3);
2101 $MULTU $a_3,$a_3 # mul_add_c(a[3],b[3],c1,c2,c3);
2116 $MULTU $a_0,$a_7 # mul_add_c2(a[0],b[7],c2,c3,c1);
2121 $ST $c_1,6*$BNSZ($a0)
2127 $MULTU $a_1,$a_6 # mul_add_c2(a[1],b[6],c2,c3,c1);
2141 $MULTU $a_2,$a_5 # mul_add_c2(a[2],b[5],c2,c3,c1);
2156 $MULTU $a_3,$a_4 # mul_add_c2(a[3],b[4],c2,c3,c1);
2171 $MULTU $a_7,$a_1 # mul_add_c2(a[7],b[1],c3,c1,c2);
2182 $ST $c_2,7*$BNSZ($a0)
2188 $MULTU $a_6,$a_2 # mul_add_c2(a[6],b[2],c3,c1,c2);
2202 $MULTU $a_5,$a_3 # mul_add_c2(a[5],b[3],c3,c1,c2);
2217 $MULTU $a_4,$a_4 # mul_add_c(a[4],b[4],c3,c1,c2);
2232 $MULTU $a_2,$a_7 # mul_add_c2(a[2],b[7],c1,c2,c3);
2237 $ST $c_3,8*$BNSZ($a0)
2243 $MULTU $a_3,$a_6 # mul_add_c2(a[3],b[6],c1,c2,c3);
2257 $MULTU $a_4,$a_5 # mul_add_c2(a[4],b[5],c1,c2,c3);
2272 $MULTU $a_7,$a_3 # mul_add_c2(a[7],b[3],c2,c3,c1);
2283 $ST $c_1,9*$BNSZ($a0)
2289 $MULTU $a_6,$a_4 # mul_add_c2(a[6],b[4],c2,c3,c1);
2303 $MULTU $a_5,$a_5 # mul_add_c(a[5],b[5],c2,c3,c1);
2318 $MULTU $a_4,$a_7 # mul_add_c2(a[4],b[7],c3,c1,c2);
2323 $ST $c_2,10*$BNSZ($a0)
2329 $MULTU $a_5,$a_6 # mul_add_c2(a[5],b[6],c3,c1,c2);
2343 $MULTU $a_7,$a_5 # mul_add_c2(a[7],b[5],c1,c2,c3);
2354 $ST $c_3,11*$BNSZ($a0)
2360 $MULTU $a_6,$a_6 # mul_add_c(a[6],b[6],c1,c2,c3);
2374 $MULTU $a_6,$a_7 # mul_add_c2(a[6],b[7],c2,c3,c1);
2379 $ST $c_1,12*$BNSZ($a0)
2385 $MULTU $a_7,$a_7 # mul_add_c(a[7],b[7],c3,c1,c2);
2395 $ST $c_2,13*$BNSZ($a0)
2403 $ST $c_3,14*$BNSZ($a0)
2404 $ST $c_1,15*$BNSZ($a0)
2408 $code.=<<___
if ($flavour =~ /nubi/i);
2409 $REG_L $t3,4*$SZREG($sp)
2410 $REG_L $t2,3*$SZREG($sp)
2411 $REG_L $t1,2*$SZREG($sp)
2412 $REG_L $t0,1*$SZREG($sp)
2413 $REG_L $gp,0*$SZREG($sp)
2414 $PTR_ADD $sp,6*$SZREG
2422 .globl bn_sqr_comba4
2426 $code.=<<___
if ($flavour =~ /nubi/i);
2427 .frame
$sp,6*$SZREG,$ra
2428 .mask
0x8000f008,-$SZREG
2430 $PTR_SUB $sp,6*$SZREG
2431 $REG_S $ra,5*$SZREG($sp)
2432 $REG_S $t3,4*$SZREG($sp)
2433 $REG_S $t2,3*$SZREG($sp)
2434 $REG_S $t1,2*$SZREG($sp)
2435 $REG_S $t0,1*$SZREG($sp)
2436 $REG_S $gp,0*$SZREG($sp)
2442 $MULTU $a_0,$a_0 # mul_add_c(a[0],b[0],c1,c2,c3);
2443 $LD $a_2,2*$BNSZ($a1)
2444 $LD $a_3,3*$BNSZ($a1)
2449 $MULTU $a_0,$a_1 # mul_add_c2(a[0],b[1],c2,c3,c1);
2454 $MULTU $a_2,$a_0 # mul_add_c2(a[2],b[0],c3,c1,c2);
2467 $MULTU $a_1,$a_1 # mul_add_c(a[1],b[1],c3,c1,c2);
2481 $MULTU $a_0,$a_3 # mul_add_c2(a[0],b[3],c1,c2,c3);
2486 $ST $c_3,2*$BNSZ($a0)
2492 $MULTU $a_1,$a_2 # mul_add_c(a2[1],b[2],c1,c2,c3);
2506 $MULTU $a_3,$a_1 # mul_add_c2(a[3],b[1],c2,c3,c1);
2517 $ST $c_1,3*$BNSZ($a0)
2523 $MULTU $a_2,$a_2 # mul_add_c(a[2],b[2],c2,c3,c1);
2537 $MULTU $a_2,$a_3 # mul_add_c2(a[2],b[3],c3,c1,c2);
2542 $ST $c_2,4*$BNSZ($a0)
2548 $MULTU $a_3,$a_3 # mul_add_c(a[3],b[3],c1,c2,c3);
2558 $ST $c_3,5*$BNSZ($a0)
2566 $ST $c_1,6*$BNSZ($a0)
2567 $ST $c_2,7*$BNSZ($a0)
2571 $code.=<<___
if ($flavour =~ /nubi/i);
2572 $REG_L $t3,4*$SZREG($sp)
2573 $REG_L $t2,3*$SZREG($sp)
2574 $REG_L $t1,2*$SZREG($sp)
2575 $REG_L $t0,1*$SZREG($sp)
2576 $REG_L $gp,0*$SZREG($sp)
2577 $PTR_ADD $sp,6*$SZREG