3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
7 # Rights for redistribution and usage in source and binary forms are
8 # granted according to the OpenSSL license. Warranty of any kind is
10 # ====================================================================
15 # This is drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c.
17 # The module is designed to work with either of the "new" MIPS ABI(5),
18 # namely N32 or N64, offered by IRIX 6.x. It's not meant to work under
19 # IRIX 5.x not only because it doesn't support new ABIs but also
20 # because 5.x kernels put R4x00 CPU into 32-bit mode and all those
21 # 64-bit instructions (daddu, dmultu, etc.) found below gonna only
22 # cause illegal instruction exception:-(
24 # In addition the code depends on preprocessor flags set up by MIPSpro
25 # compiler driver (either as or cc) and therefore (probably?) can't be
26 # compiled by the GNU assembler. GNU C driver manages fine though...
27 # I mean as long as -mmips-as is specified or is the default option,
28 # because then it simply invokes /usr/bin/as which in turn takes
29 # perfect care of the preprocessor definitions. Another neat feature
30 # offered by the MIPSpro assembler is an optimization pass. This gave
31 # me the opportunity to have the code looking more regular as all those
32 # architecture dependent instruction rescheduling details were left to
33 # the assembler. Cool, huh?
35 # Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
36 # goes way over 3 times faster!
38 # <appro@fy.chalmers.se>
42 # Adapt the module even for 32-bit ABIs and other OSes. The former was
43 # achieved by mechanical replacement of 64-bit arithmetic instructions
44 # such as dmultu, daddu, etc. with their 32-bit counterparts and
45 # adjusting offsets denoting multiples of BN_ULONG. Above mentioned
46 # >3x performance improvement naturally does not apply to 32-bit code
47 # [because there is no instruction 32-bit compiler can't use], one
48 # has to be content with 40-85% improvement depending on benchmark and
49 # key length, more for longer keys.
# Select the output file from the command line: arguments are consumed
# until one looks like a plain "name.ext" file name (word characters and
# dashes, exactly one extension) or the argument list runs out.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT only when a name was actually selected.  Re-opening
# STDOUT closes the current handle first, so an open() attempted with an
# empty name, or one that fails for any other reason, would silently
# throw away all generated code; fail loudly instead.
if (defined($output) && length($output)) {
    open(STDOUT, '>', $output)
        or die "can't open $output for writing: $!";
}
55 if ($flavour =~ /64|n32/i) {
#
# Symbolic register names, following the N32/64 layout used in the
# original module.  Every name holds the assembler spelling "$<number>"
# of the register it stands for.
#
($zero,$at,$v0,$v1)               = map { '$' . $_ } 0 .. 3;
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7) = map { '$' . $_ } 4 .. 11;
($t0,$t1,$t2,$t3)                 = map { '$' . $_ } 12 .. 15;
($t8,$t9)                         = map { '$' . $_ } 24, 25;
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7) = map { '$' . $_ } 16 .. 23;
($gp,$sp,$fp,$ra)                 = map { '$' . $_ } 28 .. 31;

# $ta0..$ta3 are second names for $a4..$a7.
($ta0,$ta1,$ta2,$ta3) = ($a4,$a5,$a6,$a7);

#
# O32 needs no special adaptation.  NUBI is handled by saving and
# restoring ($v1,$t0..$t3); for that flavour the name $gp is pointed
# at $v1.
#
if ($flavour =~ /nubi/i) {
    $gp = $v1;
}
106 .asciiz
"mips3.s, Version 1.2"
107 .asciiz
"MIPS II/III/IV ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>"
113 .globl bn_mul_add_words
114 .ent bn_mul_add_words
117 bgtz
$a2,bn_mul_add_words_internal
121 .end bn_mul_add_words
124 .ent bn_mul_add_words_internal
125 bn_mul_add_words_internal
:
127 $code.=<<___
if ($flavour =~ /nubi/i);
128 .frame
$sp,6*$SZREG,$ra
129 .mask
0x8000f008,-$SZREG
131 $PTR_SUB $sp,6*$SZREG
132 $REG_S $ra,5*$SZREG($sp)
133 $REG_S $t3,4*$SZREG($sp)
134 $REG_S $t2,3*$SZREG($sp)
135 $REG_S $t1,2*$SZREG($sp)
136 $REG_S $t0,1*$SZREG($sp)
137 $REG_S $gp,0*$SZREG($sp)
143 beqz
$ta0,.L_bn_mul_add_words_tail
145 .L_bn_mul_add_words_loop
:
151 $LD $ta0,2*$BNSZ($a1)
152 $LD $ta1,2*$BNSZ($a0)
154 sltu
$v0,$t1,$v0 # All manuals say it "compares 32-bit
155 # values", but it seems to work fine
156 # even on 64-bit registers.
166 $LD $ta2,3*$BNSZ($a1)
167 $LD $ta3,3*$BNSZ($a0)
190 $ST $ta1,-2*$BNSZ($a0)
204 bgtz
$ta0,.L_bn_mul_add_words_loop
207 beqz
$a2,.L_bn_mul_add_words_return
210 .L_bn_mul_add_words_tail
:
225 beqz
$a2,.L_bn_mul_add_words_return
240 beqz
$a2,.L_bn_mul_add_words_return
255 .L_bn_mul_add_words_return
:
258 $code.=<<___
if ($flavour =~ /nubi/i);
259 $REG_L $t3,4*$SZREG($sp)
260 $REG_L $t2,3*$SZREG($sp)
261 $REG_L $t1,2*$SZREG($sp)
262 $REG_L $t0,1*$SZREG($sp)
263 $REG_L $gp,0*$SZREG($sp)
264 $PTR_ADD $sp,6*$SZREG
269 .end bn_mul_add_words_internal
276 bgtz
$a2,bn_mul_words_internal
283 .ent bn_mul_words_internal
284 bn_mul_words_internal
:
286 $code.=<<___
if ($flavour =~ /nubi/i);
287 .frame
$sp,6*$SZREG,$ra
288 .mask
0x8000f008,-$SZREG
290 $PTR_SUB $sp,6*$SZREG
291 $REG_S $ra,5*$SZREG($sp)
292 $REG_S $t3,4*$SZREG($sp)
293 $REG_S $t2,3*$SZREG($sp)
294 $REG_S $t1,2*$SZREG($sp)
295 $REG_S $t0,1*$SZREG($sp)
296 $REG_S $gp,0*$SZREG($sp)
302 beqz
$ta0,.L_bn_mul_words_tail
304 .L_bn_mul_words_loop
:
308 $LD $ta0,2*$BNSZ($a1)
309 $LD $ta2,3*$BNSZ($a1)
326 $ST $v0,-3*$BNSZ($a0)
334 $ST $v0,-2*$BNSZ($a0)
344 bgtz
$ta0,.L_bn_mul_words_loop
347 beqz
$a2,.L_bn_mul_words_return
350 .L_bn_mul_words_tail
:
361 beqz
$a2,.L_bn_mul_words_return
372 beqz
$a2,.L_bn_mul_words_return
383 .L_bn_mul_words_return
:
386 $code.=<<___
if ($flavour =~ /nubi/i);
387 $REG_L $t3,4*$SZREG($sp)
388 $REG_L $t2,3*$SZREG($sp)
389 $REG_L $t1,2*$SZREG($sp)
390 $REG_L $t0,1*$SZREG($sp)
391 $REG_L $gp,0*$SZREG($sp)
392 $PTR_ADD $sp,6*$SZREG
397 .end bn_mul_words_internal
404 bgtz
$a2,bn_sqr_words_internal
411 .ent bn_sqr_words_internal
412 bn_sqr_words_internal
:
414 $code.=<<___
if ($flavour =~ /nubi/i);
415 .frame
$sp,6*$SZREG,$ra
416 .mask
0x8000f008,-$SZREG
418 $PTR_SUB $sp,6*$SZREG
419 $REG_S $ra,5*$SZREG($sp)
420 $REG_S $t3,4*$SZREG($sp)
421 $REG_S $t2,3*$SZREG($sp)
422 $REG_S $t1,2*$SZREG($sp)
423 $REG_S $t0,1*$SZREG($sp)
424 $REG_S $gp,0*$SZREG($sp)
430 beqz
$ta0,.L_bn_sqr_words_tail
432 .L_bn_sqr_words_loop
:
436 $LD $ta0,2*$BNSZ($a1)
437 $LD $ta2,3*$BNSZ($a1)
449 $ST $t3,-6*$BNSZ($a0)
450 $ST $t2,-5*$BNSZ($a0)
455 $ST $ta1,-4*$BNSZ($a0)
456 $ST $ta0,-3*$BNSZ($a0)
463 $ST $ta3,-2*$BNSZ($a0)
466 bgtz
$ta0,.L_bn_sqr_words_loop
469 beqz
$a2,.L_bn_sqr_words_return
472 .L_bn_sqr_words_tail
:
481 beqz
$a2,.L_bn_sqr_words_return
490 beqz
$a2,.L_bn_sqr_words_return
499 .L_bn_sqr_words_return
:
502 $code.=<<___
if ($flavour =~ /nubi/i);
503 $REG_L $t3,4*$SZREG($sp)
504 $REG_L $t2,3*$SZREG($sp)
505 $REG_L $t1,2*$SZREG($sp)
506 $REG_L $t0,1*$SZREG($sp)
507 $REG_L $gp,0*$SZREG($sp)
508 $PTR_ADD $sp,6*$SZREG
514 .end bn_sqr_words_internal
521 bgtz
$a3,bn_add_words_internal
528 .ent bn_add_words_internal
529 bn_add_words_internal
:
531 $code.=<<___
if ($flavour =~ /nubi/i);
532 .frame
$sp,6*$SZREG,$ra
533 .mask
0x8000f008,-$SZREG
535 $PTR_SUB $sp,6*$SZREG
536 $REG_S $ra,5*$SZREG($sp)
537 $REG_S $t3,4*$SZREG($sp)
538 $REG_S $t2,3*$SZREG($sp)
539 $REG_S $t1,2*$SZREG($sp)
540 $REG_S $t0,1*$SZREG($sp)
541 $REG_S $gp,0*$SZREG($sp)
547 beqz
$at,.L_bn_add_words_tail
549 .L_bn_add_words_loop
:
559 $LD $ta1,-3*$BNSZ($a2)
561 $LD $ta2,-2*$BNSZ($a2)
567 $ST $t0,-4*$BNSZ($a0)
574 $ST $t1,-3*$BNSZ($a0)
581 $ST $t2,-2*$BNSZ($a0)
591 bgtz
$at,.L_bn_add_words_loop
594 beqz
$a3,.L_bn_add_words_return
597 .L_bn_add_words_tail
:
608 beqz
$a3,.L_bn_add_words_return
619 beqz
$a3,.L_bn_add_words_return
622 $LD $ta2,2*$BNSZ($a2)
630 .L_bn_add_words_return
:
633 $code.=<<___
if ($flavour =~ /nubi/i);
634 $REG_L $t3,4*$SZREG($sp)
635 $REG_L $t2,3*$SZREG($sp)
636 $REG_L $t1,2*$SZREG($sp)
637 $REG_L $t0,1*$SZREG($sp)
638 $REG_L $gp,0*$SZREG($sp)
639 $PTR_ADD $sp,6*$SZREG
645 .end bn_add_words_internal
652 bgtz
$a3,bn_sub_words_internal
659 .ent bn_sub_words_internal
660 bn_sub_words_internal
:
662 $code.=<<___
if ($flavour =~ /nubi/i);
663 .frame
$sp,6*$SZREG,$ra
664 .mask
0x8000f008,-$SZREG
666 $PTR_SUB $sp,6*$SZREG
667 $REG_S $ra,5*$SZREG($sp)
668 $REG_S $t3,4*$SZREG($sp)
669 $REG_S $t2,3*$SZREG($sp)
670 $REG_S $t1,2*$SZREG($sp)
671 $REG_S $t0,1*$SZREG($sp)
672 $REG_S $gp,0*$SZREG($sp)
678 beqz
$at,.L_bn_sub_words_tail
680 .L_bn_sub_words_loop
:
690 $LD $ta1,-3*$BNSZ($a2)
692 $LD $ta2,-2*$BNSZ($a2)
698 $ST $t0,-4*$BNSZ($a0)
705 $ST $t1,-3*$BNSZ($a0)
713 $ST $t2,-2*$BNSZ($a0)
723 bgtz
$at,.L_bn_sub_words_loop
726 beqz
$a3,.L_bn_sub_words_return
729 .L_bn_sub_words_tail
:
740 beqz
$a3,.L_bn_sub_words_return
751 beqz
$a3,.L_bn_sub_words_return
754 $LD $ta2,2*$BNSZ($a2)
762 .L_bn_sub_words_return
:
765 $code.=<<___
if ($flavour =~ /nubi/i);
766 $REG_L $t3,4*$SZREG($sp)
767 $REG_L $t2,3*$SZREG($sp)
768 $REG_L $t1,2*$SZREG($sp)
769 $REG_L $t0,1*$SZREG($sp)
770 $REG_L $gp,0*$SZREG($sp)
771 $PTR_ADD $sp,6*$SZREG
776 .end bn_sub_words_internal
779 .globl bn_div_3_words
783 move
$a3,$a0 # we know that bn_div_words does not
784 # touch $a3, $ta2, $ta3 and preserves $a2
785 # so that we can save two arguments
786 # and return address in registers
787 # instead of stack:-)
791 bne
$a0,$a2,bn_div_3_words_internal
799 .ent bn_div_3_words_internal
800 bn_div_3_words_internal
:
802 $code.=<<___
if ($flavour =~ /nubi/i);
803 .frame
$sp,6*$SZREG,$ra
804 .mask
0x8000f008,-$SZREG
806 $PTR_SUB $sp,6*$SZREG
807 $REG_S $ra,5*$SZREG($sp)
808 $REG_S $t3,4*$SZREG($sp)
809 $REG_S $t2,3*$SZREG($sp)
810 $REG_S $t1,2*$SZREG($sp)
811 $REG_S $t0,1*$SZREG($sp)
812 $REG_S $gp,0*$SZREG($sp)
817 bal bn_div_words_internal
820 $LD $t2,-2*$BNSZ($a3)
825 .L_bn_div_3_words_inner_loop
:
826 bnez
$t8,.L_bn_div_3_words_inner_loop_done
838 beqz
$at,.L_bn_div_3_words_inner_loop
842 .L_bn_div_3_words_inner_loop_done
:
845 $code.=<<___
if ($flavour =~ /nubi/i);
846 $REG_L $t3,4*$SZREG($sp)
847 $REG_L $t2,3*$SZREG($sp)
848 $REG_L $t1,2*$SZREG($sp)
849 $REG_L $t0,1*$SZREG($sp)
850 $REG_L $gp,0*$SZREG($sp)
851 $PTR_ADD $sp,6*$SZREG
856 .end bn_div_3_words_internal
863 bnez
$a2,bn_div_words_internal
864 li
$v0,-1 # I would rather signal div-by-zero
865 # which can be done with 'break 7'
871 .ent bn_div_words_internal
872 bn_div_words_internal
:
874 $code.=<<___
if ($flavour =~ /nubi/i);
875 .frame
$sp,6*$SZREG,$ra
876 .mask
0x8000f008,-$SZREG
878 $PTR_SUB $sp,6*$SZREG
879 $REG_S $ra,5*$SZREG($sp)
880 $REG_S $t3,4*$SZREG($sp)
881 $REG_S $t2,3*$SZREG($sp)
882 $REG_S $t1,2*$SZREG($sp)
883 $REG_S $t0,1*$SZREG($sp)
884 $REG_S $gp,0*$SZREG($sp)
888 bltz
$a2,.L_bn_div_words_body
903 break 6 # signal overflow
913 .L_bn_div_words_body
:
914 $SRL $DH,$a2,4*$BNSZ # bits
923 $SRL $HH,$a0,4*$BNSZ # bits
924 $SRL $QT,4*$BNSZ # q=0xffffffff
925 beq
$DH,$HH,.L_bn_div_words_skip_div1
928 .L_bn_div_words_skip_div1
:
930 $SLL $t3,$a0,4*$BNSZ # bits
931 $SRL $at,$a1,4*$BNSZ # bits
935 .L_bn_div_words_inner_loop1
:
943 beqz
$at,.L_bn_div_words_inner_loop1_done
946 b
.L_bn_div_words_inner_loop1
949 .L_bn_div_words_inner_loop1_done
:
951 $SLL $a1,4*$BNSZ # bits
953 $SLL $v0,$QT,4*$BNSZ # bits
956 $SRL $HH,$a0,4*$BNSZ # bits
957 $SRL $QT,4*$BNSZ # q=0xffffffff
958 beq
$DH,$HH,.L_bn_div_words_skip_div2
961 .L_bn_div_words_skip_div2
:
963 $SLL $t3,$a0,4*$BNSZ # bits
964 $SRL $at,$a1,4*$BNSZ # bits
968 .L_bn_div_words_inner_loop2
:
976 beqz
$at,.L_bn_div_words_inner_loop2_done
979 b
.L_bn_div_words_inner_loop2
982 .L_bn_div_words_inner_loop2_done
:
986 $SRL $v1,$a0,$t9 # $v1 contains remainder if anybody wants it
987 $SRL $a2,$t9 # restore $a2
992 $code.=<<___
if ($flavour =~ /nubi/i);
993 $REG_L $t3,4*$SZREG($sp)
994 $REG_L $t2,3*$SZREG($sp)
995 $REG_L $t1,2*$SZREG($sp)
996 $REG_L $t0,1*$SZREG($sp)
997 $REG_L $gp,0*$SZREG($sp)
998 $PTR_ADD $sp,6*$SZREG
1003 .end bn_div_words_internal
# Clear the three names used by the division code above.
undef $_ for ($HH, $QT, $DH);

# Names for the words of a[] and b[] and for the accumulators used in
# the mul_add_c(...) sequences that follow.
($a_0,$a_1,$a_2,$a_3) = ($t0, $t1, $t2, $t3);
($a_4,$a_5,$a_6,$a_7) = ($s0, $s2, $s4, $a1);	# once we load a[7], no use for $a1

($b_0,$b_1,$b_2,$b_3) = ($ta0,$ta1,$ta2,$ta3);
($b_4,$b_5,$b_6,$b_7) = ($s1, $s3, $s5, $a2);	# once we load b[7], no use for $a2

# Two scratch names plus the three-word column accumulator c1,c2,c3.
($t_1,$t_2)      = ($t8,$t9);
($c_1,$c_2,$c_3) = ($v0,$v1,$a3);
1018 .globl bn_mul_comba8
1023 $code.=<<___
if ($flavour =~ /nubi/i);
1024 .frame
$sp,12*$SZREG,$ra
1025 .mask
0x803ff008,-$SZREG
1026 $PTR_SUB $sp,12*$SZREG
1027 $REG_S $ra,11*$SZREG($sp)
1028 $REG_S $s5,10*$SZREG($sp)
1029 $REG_S $s4,9*$SZREG($sp)
1030 $REG_S $s3,8*$SZREG($sp)
1031 $REG_S $s2,7*$SZREG($sp)
1032 $REG_S $s1,6*$SZREG($sp)
1033 $REG_S $s0,5*$SZREG($sp)
1034 $REG_S $t3,4*$SZREG($sp)
1035 $REG_S $t2,3*$SZREG($sp)
1036 $REG_S $t1,2*$SZREG($sp)
1037 $REG_S $t0,1*$SZREG($sp)
1038 $REG_S $gp,0*$SZREG($sp)
1040 $code.=<<___
if ($flavour !~ /nubi/i);
1041 .frame
$sp,6*$SZREG,$ra
1042 .mask
0x003f0000,-$SZREG
1043 $PTR_SUB $sp,6*$SZREG
1044 $REG_S $s5,5*$SZREG($sp)
1045 $REG_S $s4,4*$SZREG($sp)
1046 $REG_S $s3,3*$SZREG($sp)
1047 $REG_S $s2,2*$SZREG($sp)
1048 $REG_S $s1,1*$SZREG($sp)
1049 $REG_S $s0,0*$SZREG($sp)
1054 $LD $a_0,0($a1) # If compiled with -mips3 option on
1055 # R5000 box assembler barks on this
1056 # line with "should not have mult/div
1057 # as last instruction in bb (R10K
1058 # bug)" warning. If anybody out there
1059 # has a clue about how to circumvent
1060 # this do send me a note.
1061 # <appro\@fy.chalmers.se>
1065 $LD $a_2,2*$BNSZ($a1)
1066 $MULTU $a_0,$b_0 # mul_add_c(a[0],b[0],c1,c2,c3);
1067 $LD $a_3,3*$BNSZ($a1)
1069 $LD $b_2,2*$BNSZ($a2)
1070 $LD $b_3,3*$BNSZ($a2)
1074 $LD $a_4,4*$BNSZ($a1)
1075 $LD $a_5,5*$BNSZ($a1)
1076 $MULTU $a_0,$b_1 # mul_add_c(a[0],b[1],c2,c3,c1);
1077 $LD $a_6,6*$BNSZ($a1)
1078 $LD $a_7,7*$BNSZ($a1)
1079 $LD $b_4,4*$BNSZ($a2)
1080 $LD $b_5,5*$BNSZ($a2)
1085 $MULTU $a_1,$b_0 # mul_add_c(a[1],b[0],c2,c3,c1);
1087 $LD $b_6,6*$BNSZ($a2)
1088 $LD $b_7,7*$BNSZ($a2)
1089 $ST $c_1,0($a0) # r[0]=c1;
1094 $MULTU $a_2,$b_0 # mul_add_c(a[2],b[0],c3,c1,c2);
1098 $ST $c_2,$BNSZ($a0) # r[1]=c2;
1104 $MULTU $a_1,$b_1 # mul_add_c(a[1],b[1],c3,c1,c2);
1111 $MULTU $a_0,$b_2 # mul_add_c(a[0],b[2],c3,c1,c2);
1119 $MULTU $a_0,$b_3 # mul_add_c(a[0],b[3],c1,c2,c3);
1124 $ST $c_3,2*$BNSZ($a0) # r[2]=c3;
1130 $MULTU $a_1,$b_2 # mul_add_c(a[1],b[2],c1,c2,c3);
1138 $MULTU $a_2,$b_1 # mul_add_c(a[2],b[1],c1,c2,c3);
1147 $MULTU $a_3,$b_0 # mul_add_c(a[3],b[0],c1,c2,c3);
1156 $MULTU $a_4,$b_0 # mul_add_c(a[4],b[0],c2,c3,c1);
1161 $ST $c_1,3*$BNSZ($a0) # r[3]=c1;
1167 $MULTU $a_3,$b_1 # mul_add_c(a[3],b[1],c2,c3,c1);
1175 $MULTU $a_2,$b_2 # mul_add_c(a[2],b[2],c2,c3,c1);
1184 $MULTU $a_1,$b_3 # mul_add_c(a[1],b[3],c2,c3,c1);
1193 $MULTU $a_0,$b_4 # mul_add_c(a[0],b[4],c2,c3,c1);
1202 $MULTU $a_0,$b_5 # mul_add_c(a[0],b[5],c3,c1,c2);
1207 $ST $c_2,4*$BNSZ($a0) # r[4]=c2;
1213 $MULTU $a_1,$b_4 # mul_add_c(a[1],b[4],c3,c1,c2);
1221 $MULTU $a_2,$b_3 # mul_add_c(a[2],b[3],c3,c1,c2);
1230 $MULTU $a_3,$b_2 # mul_add_c(a[3],b[2],c3,c1,c2);
1239 $MULTU $a_4,$b_1 # mul_add_c(a[4],b[1],c3,c1,c2);
1248 $MULTU $a_5,$b_0 # mul_add_c(a[5],b[0],c3,c1,c2);
1257 $MULTU $a_6,$b_0 # mul_add_c(a[6],b[0],c1,c2,c3);
1262 $ST $c_3,5*$BNSZ($a0) # r[5]=c3;
1268 $MULTU $a_5,$b_1 # mul_add_c(a[5],b[1],c1,c2,c3);
1276 $MULTU $a_4,$b_2 # mul_add_c(a[4],b[2],c1,c2,c3);
1285 $MULTU $a_3,$b_3 # mul_add_c(a[3],b[3],c1,c2,c3);
1294 $MULTU $a_2,$b_4 # mul_add_c(a[2],b[4],c1,c2,c3);
1303 $MULTU $a_1,$b_5 # mul_add_c(a[1],b[5],c1,c2,c3);
1312 $MULTU $a_0,$b_6 # mul_add_c(a[0],b[6],c1,c2,c3);
1321 $MULTU $a_0,$b_7 # mul_add_c(a[0],b[7],c2,c3,c1);
1326 $ST $c_1,6*$BNSZ($a0) # r[6]=c1;
1332 $MULTU $a_1,$b_6 # mul_add_c(a[1],b[6],c2,c3,c1);
1340 $MULTU $a_2,$b_5 # mul_add_c(a[2],b[5],c2,c3,c1);
1349 $MULTU $a_3,$b_4 # mul_add_c(a[3],b[4],c2,c3,c1);
1358 $MULTU $a_4,$b_3 # mul_add_c(a[4],b[3],c2,c3,c1);
1367 $MULTU $a_5,$b_2 # mul_add_c(a[5],b[2],c2,c3,c1);
1376 $MULTU $a_6,$b_1 # mul_add_c(a[6],b[1],c2,c3,c1);
1385 $MULTU $a_7,$b_0 # mul_add_c(a[7],b[0],c2,c3,c1);
1394 $MULTU $a_7,$b_1 # mul_add_c(a[7],b[1],c3,c1,c2);
1399 $ST $c_2,7*$BNSZ($a0) # r[7]=c2;
1405 $MULTU $a_6,$b_2 # mul_add_c(a[6],b[2],c3,c1,c2);
1413 $MULTU $a_5,$b_3 # mul_add_c(a[5],b[3],c3,c1,c2);
1422 $MULTU $a_4,$b_4 # mul_add_c(a[4],b[4],c3,c1,c2);
1431 $MULTU $a_3,$b_5 # mul_add_c(a[3],b[5],c3,c1,c2);
1440 $MULTU $a_2,$b_6 # mul_add_c(a[2],b[6],c3,c1,c2);
1449 $MULTU $a_1,$b_7 # mul_add_c(a[1],b[7],c3,c1,c2);
1458 $MULTU $a_2,$b_7 # mul_add_c(a[2],b[7],c1,c2,c3);
1463 $ST $c_3,8*$BNSZ($a0) # r[8]=c3;
1469 $MULTU $a_3,$b_6 # mul_add_c(a[3],b[6],c1,c2,c3);
1477 $MULTU $a_4,$b_5 # mul_add_c(a[4],b[5],c1,c2,c3);
1486 $MULTU $a_5,$b_4 # mul_add_c(a[5],b[4],c1,c2,c3);
1495 $MULTU $a_6,$b_3 # mul_add_c(a[6],b[3],c1,c2,c3);
1504 $MULTU $a_7,$b_2 # mul_add_c(a[7],b[2],c1,c2,c3);
1513 $MULTU $a_7,$b_3 # mul_add_c(a[7],b[3],c2,c3,c1);
1518 $ST $c_1,9*$BNSZ($a0) # r[9]=c1;
1524 $MULTU $a_6,$b_4 # mul_add_c(a[6],b[4],c2,c3,c1);
1532 $MULTU $a_5,$b_5 # mul_add_c(a[5],b[5],c2,c3,c1);
1541 $MULTU $a_4,$b_6 # mul_add_c(a[4],b[6],c2,c3,c1);
1550 $MULTU $a_3,$b_7 # mul_add_c(a[3],b[7],c2,c3,c1);
1559 $MULTU $a_4,$b_7 # mul_add_c(a[4],b[7],c3,c1,c2);
1564 $ST $c_2,10*$BNSZ($a0) # r[10]=c2;
1570 $MULTU $a_5,$b_6 # mul_add_c(a[5],b[6],c3,c1,c2);
1578 $MULTU $a_6,$b_5 # mul_add_c(a[6],b[5],c3,c1,c2);
1587 $MULTU $a_7,$b_4 # mul_add_c(a[7],b[4],c3,c1,c2);
1596 $MULTU $a_7,$b_5 # mul_add_c(a[7],b[5],c1,c2,c3);
1601 $ST $c_3,11*$BNSZ($a0) # r[11]=c3;
1607 $MULTU $a_6,$b_6 # mul_add_c(a[6],b[6],c1,c2,c3);
1615 $MULTU $a_5,$b_7 # mul_add_c(a[5],b[7],c1,c2,c3);
1624 $MULTU $a_6,$b_7 # mul_add_c(a[6],b[7],c2,c3,c1);
1629 $ST $c_1,12*$BNSZ($a0) # r[12]=c1;
1635 $MULTU $a_7,$b_6 # mul_add_c(a[7],b[6],c2,c3,c1);
1643 $MULTU $a_7,$b_7 # mul_add_c(a[7],b[7],c3,c1,c2);
1648 $ST $c_2,13*$BNSZ($a0) # r[13]=c2;
1656 $ST $c_3,14*$BNSZ($a0) # r[14]=c3;
1657 $ST $c_1,15*$BNSZ($a0) # r[15]=c1;
1661 $code.=<<___
if ($flavour =~ /nubi/i);
1662 $REG_L $s5,10*$SZREG($sp)
1663 $REG_L $s4,9*$SZREG($sp)
1664 $REG_L $s3,8*$SZREG($sp)
1665 $REG_L $s2,7*$SZREG($sp)
1666 $REG_L $s1,6*$SZREG($sp)
1667 $REG_L $s0,5*$SZREG($sp)
1668 $REG_L $t3,4*$SZREG($sp)
1669 $REG_L $t2,3*$SZREG($sp)
1670 $REG_L $t1,2*$SZREG($sp)
1671 $REG_L $t0,1*$SZREG($sp)
1672 $REG_L $gp,0*$SZREG($sp)
1674 $PTR_ADD $sp,12*$SZREG
1676 $code.=<<___
if ($flavour !~ /nubi/i);
1677 $REG_L $s5,5*$SZREG($sp)
1678 $REG_L $s4,4*$SZREG($sp)
1679 $REG_L $s3,3*$SZREG($sp)
1680 $REG_L $s2,2*$SZREG($sp)
1681 $REG_L $s1,1*$SZREG($sp)
1682 $REG_L $s0,0*$SZREG($sp)
1684 $PTR_ADD $sp,6*$SZREG
1690 .globl bn_mul_comba4
1694 $code.=<<___
if ($flavour =~ /nubi/i);
1695 .frame
$sp,6*$SZREG,$ra
1696 .mask
0x8000f008,-$SZREG
1698 $PTR_SUB $sp,6*$SZREG
1699 $REG_S $ra,5*$SZREG($sp)
1700 $REG_S $t3,4*$SZREG($sp)
1701 $REG_S $t2,3*$SZREG($sp)
1702 $REG_S $t1,2*$SZREG($sp)
1703 $REG_S $t0,1*$SZREG($sp)
1704 $REG_S $gp,0*$SZREG($sp)
1711 $LD $a_2,2*$BNSZ($a1)
1712 $MULTU $a_0,$b_0 # mul_add_c(a[0],b[0],c1,c2,c3);
1713 $LD $a_3,3*$BNSZ($a1)
1715 $LD $b_2,2*$BNSZ($a2)
1716 $LD $b_3,3*$BNSZ($a2)
1721 $MULTU $a_0,$b_1 # mul_add_c(a[0],b[1],c2,c3,c1);
1726 $MULTU $a_1,$b_0 # mul_add_c(a[1],b[0],c2,c3,c1);
1732 $MULTU $a_2,$b_0 # mul_add_c(a[2],b[0],c3,c1,c2);
1742 $MULTU $a_1,$b_1 # mul_add_c(a[1],b[1],c3,c1,c2);
1749 $MULTU $a_0,$b_2 # mul_add_c(a[0],b[2],c3,c1,c2);
1757 $MULTU $a_0,$b_3 # mul_add_c(a[0],b[3],c1,c2,c3);
1762 $ST $c_3,2*$BNSZ($a0)
1768 $MULTU $a_1,$b_2 # mul_add_c(a[1],b[2],c1,c2,c3);
1776 $MULTU $a_2,$b_1 # mul_add_c(a[2],b[1],c1,c2,c3);
1785 $MULTU $a_3,$b_0 # mul_add_c(a[3],b[0],c1,c2,c3);
1794 $MULTU $a_3,$b_1 # mul_add_c(a[3],b[1],c2,c3,c1);
1799 $ST $c_1,3*$BNSZ($a0)
1805 $MULTU $a_2,$b_2 # mul_add_c(a[2],b[2],c2,c3,c1);
1813 $MULTU $a_1,$b_3 # mul_add_c(a[1],b[3],c2,c3,c1);
1822 $MULTU $a_2,$b_3 # mul_add_c(a[2],b[3],c3,c1,c2);
1827 $ST $c_2,4*$BNSZ($a0)
1833 $MULTU $a_3,$b_2 # mul_add_c(a[3],b[2],c3,c1,c2);
1841 $MULTU $a_3,$b_3 # mul_add_c(a[3],b[3],c1,c2,c3);
1846 $ST $c_3,5*$BNSZ($a0)
1854 $ST $c_1,6*$BNSZ($a0)
1855 $ST $c_2,7*$BNSZ($a0)
1859 $code.=<<___
if ($flavour =~ /nubi/i);
1860 $REG_L $t3,4*$SZREG($sp)
1861 $REG_L $t2,3*$SZREG($sp)
1862 $REG_L $t1,2*$SZREG($sp)
1863 $REG_L $t0,1*$SZREG($sp)
1864 $REG_L $gp,0*$SZREG($sp)
1865 $PTR_ADD $sp,6*$SZREG
# The squaring code that follows loads only a[] words, so the names that
# held b[0]..b[3] are reused for a[4]..a[7].
$a_4 = $b_0;
$a_5 = $b_1;
$a_6 = $b_2;
$a_7 = $b_3;
1878 .globl bn_sqr_comba8
1882 $code.=<<___
if ($flavour =~ /nubi/i);
1883 .frame
$sp,6*$SZREG,$ra
1884 .mask
0x8000f008,-$SZREG
1886 $PTR_SUB $sp,6*$SZREG
1887 $REG_S $ra,5*$SZREG($sp)
1888 $REG_S $t3,4*$SZREG($sp)
1889 $REG_S $t2,3*$SZREG($sp)
1890 $REG_S $t1,2*$SZREG($sp)
1891 $REG_S $t0,1*$SZREG($sp)
1892 $REG_S $gp,0*$SZREG($sp)
1898 $LD $a_2,2*$BNSZ($a1)
1899 $LD $a_3,3*$BNSZ($a1)
1901 $MULTU $a_0,$a_0 # mul_add_c(a[0],b[0],c1,c2,c3);
1902 $LD $a_4,4*$BNSZ($a1)
1903 $LD $a_5,5*$BNSZ($a1)
1904 $LD $a_6,6*$BNSZ($a1)
1905 $LD $a_7,7*$BNSZ($a1)
1910 $MULTU $a_0,$a_1 # mul_add_c2(a[0],b[1],c2,c3,c1);
1915 $MULTU $a_2,$a_0 # mul_add_c2(a[2],b[0],c3,c1,c2);
1928 $MULTU $a_1,$a_1 # mul_add_c(a[1],b[1],c3,c1,c2);
1942 $MULTU $a_0,$a_3 # mul_add_c2(a[0],b[3],c1,c2,c3);
1947 $ST $c_3,2*$BNSZ($a0)
1953 $MULTU $a_1,$a_2 # mul_add_c2(a[1],b[2],c1,c2,c3);
1967 $MULTU $a_4,$a_0 # mul_add_c2(a[4],b[0],c2,c3,c1);
1978 $ST $c_1,3*$BNSZ($a0)
1984 $MULTU $a_3,$a_1 # mul_add_c2(a[3],b[1],c2,c3,c1);
1998 $MULTU $a_2,$a_2 # mul_add_c(a[2],b[2],c2,c3,c1);
2013 $MULTU $a_0,$a_5 # mul_add_c2(a[0],b[5],c3,c1,c2);
2018 $ST $c_2,4*$BNSZ($a0)
2024 $MULTU $a_1,$a_4 # mul_add_c2(a[1],b[4],c3,c1,c2);
2038 $MULTU $a_2,$a_3 # mul_add_c2(a[2],b[3],c3,c1,c2);
2052 $MULTU $a_6,$a_0 # mul_add_c2(a[6],b[0],c1,c2,c3);
2064 $ST $c_3,5*$BNSZ($a0)
2070 $MULTU $a_5,$a_1 # mul_add_c2(a[5],b[1],c1,c2,c3);
2084 $MULTU $a_4,$a_2 # mul_add_c2(a[4],b[2],c1,c2,c3);
2099 $MULTU $a_3,$a_3 # mul_add_c(a[3],b[3],c1,c2,c3);
2114 $MULTU $a_0,$a_7 # mul_add_c2(a[0],b[7],c2,c3,c1);
2119 $ST $c_1,6*$BNSZ($a0)
2125 $MULTU $a_1,$a_6 # mul_add_c2(a[1],b[6],c2,c3,c1);
2139 $MULTU $a_2,$a_5 # mul_add_c2(a[2],b[5],c2,c3,c1);
2154 $MULTU $a_3,$a_4 # mul_add_c2(a[3],b[4],c2,c3,c1);
2169 $MULTU $a_7,$a_1 # mul_add_c2(a[7],b[1],c3,c1,c2);
2180 $ST $c_2,7*$BNSZ($a0)
2186 $MULTU $a_6,$a_2 # mul_add_c2(a[6],b[2],c3,c1,c2);
2200 $MULTU $a_5,$a_3 # mul_add_c2(a[5],b[3],c3,c1,c2);
2215 $MULTU $a_4,$a_4 # mul_add_c(a[4],b[4],c3,c1,c2);
2230 $MULTU $a_2,$a_7 # mul_add_c2(a[2],b[7],c1,c2,c3);
2235 $ST $c_3,8*$BNSZ($a0)
2241 $MULTU $a_3,$a_6 # mul_add_c2(a[3],b[6],c1,c2,c3);
2255 $MULTU $a_4,$a_5 # mul_add_c2(a[4],b[5],c1,c2,c3);
2270 $MULTU $a_7,$a_3 # mul_add_c2(a[7],b[3],c2,c3,c1);
2281 $ST $c_1,9*$BNSZ($a0)
2287 $MULTU $a_6,$a_4 # mul_add_c2(a[6],b[4],c2,c3,c1);
2301 $MULTU $a_5,$a_5 # mul_add_c(a[5],b[5],c2,c3,c1);
2316 $MULTU $a_4,$a_7 # mul_add_c2(a[4],b[7],c3,c1,c2);
2321 $ST $c_2,10*$BNSZ($a0)
2327 $MULTU $a_5,$a_6 # mul_add_c2(a[5],b[6],c3,c1,c2);
2341 $MULTU $a_7,$a_5 # mul_add_c2(a[7],b[5],c1,c2,c3);
2352 $ST $c_3,11*$BNSZ($a0)
2358 $MULTU $a_6,$a_6 # mul_add_c(a[6],b[6],c1,c2,c3);
2372 $MULTU $a_6,$a_7 # mul_add_c2(a[6],b[7],c2,c3,c1);
2377 $ST $c_1,12*$BNSZ($a0)
2383 $MULTU $a_7,$a_7 # mul_add_c(a[7],b[7],c3,c1,c2);
2393 $ST $c_2,13*$BNSZ($a0)
2401 $ST $c_3,14*$BNSZ($a0)
2402 $ST $c_1,15*$BNSZ($a0)
2406 $code.=<<___
if ($flavour =~ /nubi/i);
2407 $REG_L $t3,4*$SZREG($sp)
2408 $REG_L $t2,3*$SZREG($sp)
2409 $REG_L $t1,2*$SZREG($sp)
2410 $REG_L $t0,1*$SZREG($sp)
2411 $REG_L $gp,0*$SZREG($sp)
2412 $PTR_ADD $sp,6*$SZREG
2420 .globl bn_sqr_comba4
2424 $code.=<<___
if ($flavour =~ /nubi/i);
2425 .frame
$sp,6*$SZREG,$ra
2426 .mask
0x8000f008,-$SZREG
2428 $PTR_SUB $sp,6*$SZREG
2429 $REG_S $ra,5*$SZREG($sp)
2430 $REG_S $t3,4*$SZREG($sp)
2431 $REG_S $t2,3*$SZREG($sp)
2432 $REG_S $t1,2*$SZREG($sp)
2433 $REG_S $t0,1*$SZREG($sp)
2434 $REG_S $gp,0*$SZREG($sp)
2440 $MULTU $a_0,$a_0 # mul_add_c(a[0],b[0],c1,c2,c3);
2441 $LD $a_2,2*$BNSZ($a1)
2442 $LD $a_3,3*$BNSZ($a1)
2447 $MULTU $a_0,$a_1 # mul_add_c2(a[0],b[1],c2,c3,c1);
2452 $MULTU $a_2,$a_0 # mul_add_c2(a[2],b[0],c3,c1,c2);
2465 $MULTU $a_1,$a_1 # mul_add_c(a[1],b[1],c3,c1,c2);
2479 $MULTU $a_0,$a_3 # mul_add_c2(a[0],b[3],c1,c2,c3);
2484 $ST $c_3,2*$BNSZ($a0)
2490 $MULTU $a_1,$a_2 # mul_add_c2(a[1],b[2],c1,c2,c3);
2504 $MULTU $a_3,$a_1 # mul_add_c2(a[3],b[1],c2,c3,c1);
2515 $ST $c_1,3*$BNSZ($a0)
2521 $MULTU $a_2,$a_2 # mul_add_c(a[2],b[2],c2,c3,c1);
2535 $MULTU $a_2,$a_3 # mul_add_c2(a[2],b[3],c3,c1,c2);
2540 $ST $c_2,4*$BNSZ($a0)
2546 $MULTU $a_3,$a_3 # mul_add_c(a[3],b[3],c1,c2,c3);
2556 $ST $c_3,5*$BNSZ($a0)
2564 $ST $c_1,6*$BNSZ($a0)
2565 $ST $c_2,7*$BNSZ($a0)
2569 $code.=<<___
if ($flavour =~ /nubi/i);
2570 $REG_L $t3,4*$SZREG($sp)
2571 $REG_L $t2,3*$SZREG($sp)
2572 $REG_L $t1,2*$SZREG($sp)
2573 $REG_L $t0,1*$SZREG($sp)
2574 $REG_L $gp,0*$SZREG($sp)
2575 $PTR_ADD $sp,6*$SZREG