Allow IPv6 address entry in tools>ping - Loosens valid character check
[tomato/davidwu.git] / release / src / router / openssl / crypto / bn / asm / mips.pl
blobc162a3ec2304aaa62a32bccf2c15c8071495fbfe
1 #!/usr/bin/env perl
3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project.
7 # Rights for redistribution and usage in source and binary forms are
8 # granted according to the OpenSSL license. Warranty of any kind is
9 # disclaimed.
10 # ====================================================================
13 # July 1999
15 # This is a drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c.
17 # The module is designed to work with either of the "new" MIPS ABI(5),
18 # namely N32 or N64, offered by IRIX 6.x. It's not meant to work under
19 # IRIX 5.x not only because it doesn't support new ABIs but also
20 # because 5.x kernels put R4x00 CPU into 32-bit mode and all those
21 # 64-bit instructions (daddu, dmultu, etc.) found below will only
22 # cause an illegal instruction exception:-(
24 # In addition the code depends on preprocessor flags set up by MIPSpro
25 # compiler driver (either as or cc) and therefore (probably?) can't be
26 # compiled by the GNU assembler. GNU C driver manages fine though...
27 # I mean as long as -mmips-as is specified or is the default option,
28 # because then it simply invokes /usr/bin/as which in turn takes
29 # perfect care of the preprocessor definitions. Another neat feature
30 # offered by the MIPSpro assembler is an optimization pass. This gave
31 # me the opportunity to have the code looking more regular as all those
32 # architecture dependent instruction rescheduling details were left to
33 # the assembler. Cool, huh?
35 # Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
36 # goes way over 3 times faster!
38 # <appro@fy.chalmers.se>
40 # October 2010
42 # Adapt the module even for 32-bit ABIs and other OSes. The former was
43 # achieved by mechanical replacement of 64-bit arithmetic instructions
44 # such as dmultu, daddu, etc. with their 32-bit counterparts and
45 # adjusting offsets denoting multiples of BN_ULONG. Above mentioned
46 # >3x performance improvement naturally does not apply to 32-bit code
47 # [because there is no instruction 32-bit compiler can't use], one
48 # has to be content with 40-85% improvement depending on benchmark and
49 # key length, more for longer keys.
# Command line: the first argument is the ABI flavour. The remaining
# arguments are scanned until one looks like a plain file name
# ("name.ext"); that one becomes the output file.
51 $flavour = shift;
52 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
# Redirect STDOUT with a checked three-argument open: the file name is
# never parsed for a mode and a failure to create the file is fatal
# instead of silent. When no file name was recognised STDOUT is left
# untouched (a three-argument open on an undefined name would open an
# anonymous temporary file, hence the guard).
53 if ($output) { open(STDOUT,">",$output) or die "can't open $output: $!"; }
# Pick the instruction mnemonics and sizes for the requested flavour.
# A flavour containing "64" or "n32" gets the doubleword instructions with
# 8-byte words and register slots; anything else gets the 32-bit forms with
# 4-byte words, and the generated code then starts with ".set mips2".
# $BNSZ scales the load/store offsets used with $LD/$ST below; $SZREG
# scales the stack-slot offsets used with $REG_S/$REG_L.
55 if ($flavour =~ /64|n32/i) {
56 $LD="ld";
57 $ST="sd";
58 $MULTU="dmultu";
59 $DIVU="ddivu";
60 $ADDU="daddu";
61 $SUBU="dsubu";
62 $SRL="dsrl";
63 $SLL="dsll";
64 $BNSZ=8;
65 $PTR_ADD="daddu";
66 $PTR_SUB="dsubu";
67 $SZREG=8;
68 $REG_S="sd";
69 $REG_L="ld";
70 } else {
71 $LD="lw";
72 $ST="sw";
73 $MULTU="multu";
74 $DIVU="divu";
75 $ADDU="addu";
76 $SUBU="subu";
77 $SRL="srl";
78 $SLL="sll";
79 $BNSZ=4;
80 $PTR_ADD="addu";
81 $PTR_SUB="subu";
82 $SZREG=4;
83 $REG_S="sw";
84 $REG_L="lw";
85 $code=".set mips2\n";
# NOTE(review): no closing brace for this else-branch appears in this
# listing (the embedded numbering jumps from 85 to 88) - presumably it was
# lost together with the other dropped lines; confirm against upstream.
88 # Below is N32/64 register layout used in the original module.
# Each alias expands to a numeric register name "$N" that is interpolated
# into the assembly heredocs further down.
90 ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
91 ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
92 ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
93 ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
94 ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
# $ta0..$ta3 are second names for $a4..$a7 (registers 8..11).
95 ($ta0,$ta1,$ta2,$ta3)=($a4,$a5,$a6,$a7);
97 # No special adaptation is required for O32. NUBI on the other hand
98 # is treated by saving/restoring ($v1,$t0..$t3).
# For NUBI, $gp becomes another name for $v1, so the "$gp" slot in the
# NUBI prologues and epilogues below actually holds $v1.
100 $gp=$v1 if ($flavour =~ /nubi/i);
# $v1 is loaded with the constant -4 ("li $minus4,-4") in the word-loop
# routines below and ANDed with the word count.
102 $minus4=$v1;
# Emit the version strings and bn_mul_add_words.
# Register use as seen in the code below: words are loaded from ($a1),
# multiplied by $a3, added to the word loaded from ($a0) plus the running
# carry in $v0, and stored back to ($a0); $a2 is the word count.
# The public entry only branches to the _internal body when $a2 > 0,
# clearing $v0 in the delay slot; otherwise it returns with $v0 = $a0 = 0.
104 $code.=<<___;
105 .rdata
106 .asciiz "mips3.s, Version 1.2"
107 .asciiz "MIPS II/III/IV ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>"
109 .text
110 .set noat
112 .align 5
113 .globl bn_mul_add_words
114 .ent bn_mul_add_words
115 bn_mul_add_words:
116 .set noreorder
117 bgtz $a2,bn_mul_add_words_internal
118 move $v0,$zero
119 jr $ra
120 move $a0,$v0
121 .end bn_mul_add_words
123 .align 5
124 .ent bn_mul_add_words_internal
125 bn_mul_add_words_internal:
# NUBI only: open a 6-slot frame and save $ra, $t3..$t0 and $gp
# ($gp is an alias of $v1 for this flavour).
127 $code.=<<___ if ($flavour =~ /nubi/i);
128 .frame $sp,6*$SZREG,$ra
129 .mask 0x8000f008,-$SZREG
130 .set noreorder
131 $PTR_SUB $sp,6*$SZREG
132 $REG_S $ra,5*$SZREG($sp)
133 $REG_S $t3,4*$SZREG($sp)
134 $REG_S $t2,3*$SZREG($sp)
135 $REG_S $t1,2*$SZREG($sp)
136 $REG_S $t0,1*$SZREG($sp)
137 $REG_S $gp,0*$SZREG($sp)
# Body: the loop handles four words per iteration for as long as
# ($a2 & -4) is positive; the tail then handles up to three more words,
# leaving early as soon as $a2 reaches zero.
139 $code.=<<___;
140 .set reorder
141 li $minus4,-4
142 and $ta0,$a2,$minus4
143 $LD $t0,0($a1)
144 beqz $ta0,.L_bn_mul_add_words_tail
146 .L_bn_mul_add_words_loop:
147 $MULTU $t0,$a3
148 $LD $t1,0($a0)
149 $LD $t2,$BNSZ($a1)
150 $LD $t3,$BNSZ($a0)
151 $LD $ta0,2*$BNSZ($a1)
152 $LD $ta1,2*$BNSZ($a0)
153 $ADDU $t1,$v0
154 sltu $v0,$t1,$v0 # All manuals say it "compares 32-bit
155 # values", but it seems to work fine
156 # even on 64-bit registers.
157 mflo $at
158 mfhi $t0
159 $ADDU $t1,$at
160 $ADDU $v0,$t0
161 $MULTU $t2,$a3
162 sltu $at,$t1,$at
163 $ST $t1,0($a0)
164 $ADDU $v0,$at
166 $LD $ta2,3*$BNSZ($a1)
167 $LD $ta3,3*$BNSZ($a0)
168 $ADDU $t3,$v0
169 sltu $v0,$t3,$v0
170 mflo $at
171 mfhi $t2
172 $ADDU $t3,$at
173 $ADDU $v0,$t2
174 $MULTU $ta0,$a3
175 sltu $at,$t3,$at
176 $ST $t3,$BNSZ($a0)
177 $ADDU $v0,$at
179 subu $a2,4
180 $PTR_ADD $a0,4*$BNSZ
181 $PTR_ADD $a1,4*$BNSZ
182 $ADDU $ta1,$v0
183 sltu $v0,$ta1,$v0
184 mflo $at
185 mfhi $ta0
186 $ADDU $ta1,$at
187 $ADDU $v0,$ta0
188 $MULTU $ta2,$a3
189 sltu $at,$ta1,$at
190 $ST $ta1,-2*$BNSZ($a0)
191 $ADDU $v0,$at
194 and $ta0,$a2,$minus4
195 $ADDU $ta3,$v0
196 sltu $v0,$ta3,$v0
197 mflo $at
198 mfhi $ta2
199 $ADDU $ta3,$at
200 $ADDU $v0,$ta2
201 sltu $at,$ta3,$at
202 $ST $ta3,-$BNSZ($a0)
203 $ADDU $v0,$at
204 .set noreorder
205 bgtzl $ta0,.L_bn_mul_add_words_loop
206 $LD $t0,0($a1)
208 beqz $a2,.L_bn_mul_add_words_return
211 .L_bn_mul_add_words_tail:
212 .set reorder
213 $LD $t0,0($a1)
214 $MULTU $t0,$a3
215 $LD $t1,0($a0)
216 subu $a2,1
217 $ADDU $t1,$v0
218 sltu $v0,$t1,$v0
219 mflo $at
220 mfhi $t0
221 $ADDU $t1,$at
222 $ADDU $v0,$t0
223 sltu $at,$t1,$at
224 $ST $t1,0($a0)
225 $ADDU $v0,$at
226 beqz $a2,.L_bn_mul_add_words_return
228 $LD $t0,$BNSZ($a1)
229 $MULTU $t0,$a3
230 $LD $t1,$BNSZ($a0)
231 subu $a2,1
232 $ADDU $t1,$v0
233 sltu $v0,$t1,$v0
234 mflo $at
235 mfhi $t0
236 $ADDU $t1,$at
237 $ADDU $v0,$t0
238 sltu $at,$t1,$at
239 $ST $t1,$BNSZ($a0)
240 $ADDU $v0,$at
241 beqz $a2,.L_bn_mul_add_words_return
243 $LD $t0,2*$BNSZ($a1)
244 $MULTU $t0,$a3
245 $LD $t1,2*$BNSZ($a0)
246 $ADDU $t1,$v0
247 sltu $v0,$t1,$v0
248 mflo $at
249 mfhi $t0
250 $ADDU $t1,$at
251 $ADDU $v0,$t0
252 sltu $at,$t1,$at
253 $ST $t1,2*$BNSZ($a0)
254 $ADDU $v0,$at
256 .L_bn_mul_add_words_return:
257 .set noreorder
# NUBI only: reload $t3..$t0 and $gp from the frame and release it.
259 $code.=<<___ if ($flavour =~ /nubi/i);
260 $REG_L $t3,4*$SZREG($sp)
261 $REG_L $t2,3*$SZREG($sp)
262 $REG_L $t1,2*$SZREG($sp)
263 $REG_L $t0,1*$SZREG($sp)
264 $REG_L $gp,0*$SZREG($sp)
265 $PTR_ADD $sp,6*$SZREG
# Emit the return of bn_mul_add_words_internal ($v0 is copied to $a0 in
# the jr delay slot, .set noreorder being in effect), then bn_mul_words.
# Register use as seen below: each word loaded from ($a1) is multiplied by
# $a3, the low half plus the running carry $v0 is stored to ($a0), and the
# new carry is the high half plus the overflow of that addition; $a2 is
# the word count. The public entry returns 0 unless $a2 > 0.
267 $code.=<<___;
268 jr $ra
269 move $a0,$v0
270 .end bn_mul_add_words_internal
272 .align 5
273 .globl bn_mul_words
274 .ent bn_mul_words
275 bn_mul_words:
276 .set noreorder
277 bgtz $a2,bn_mul_words_internal
278 move $v0,$zero
279 jr $ra
280 move $a0,$v0
281 .end bn_mul_words
283 .align 5
284 .ent bn_mul_words_internal
285 bn_mul_words_internal:
# NUBI only: open a 6-slot frame and save $ra, $t3..$t0 and $gp
# ($gp is an alias of $v1 for this flavour).
287 $code.=<<___ if ($flavour =~ /nubi/i);
288 .frame $sp,6*$SZREG,$ra
289 .mask 0x8000f008,-$SZREG
290 .set noreorder
291 $PTR_SUB $sp,6*$SZREG
292 $REG_S $ra,5*$SZREG($sp)
293 $REG_S $t3,4*$SZREG($sp)
294 $REG_S $t2,3*$SZREG($sp)
295 $REG_S $t1,2*$SZREG($sp)
296 $REG_S $t0,1*$SZREG($sp)
297 $REG_S $gp,0*$SZREG($sp)
# Body: four words per loop iteration while ($a2 & -4) is positive, then
# a tail of up to three words.
299 $code.=<<___;
300 .set reorder
301 li $minus4,-4
302 and $ta0,$a2,$minus4
303 $LD $t0,0($a1)
304 beqz $ta0,.L_bn_mul_words_tail
306 .L_bn_mul_words_loop:
307 $MULTU $t0,$a3
308 $LD $t2,$BNSZ($a1)
309 $LD $ta0,2*$BNSZ($a1)
310 $LD $ta2,3*$BNSZ($a1)
311 mflo $at
312 mfhi $t0
313 $ADDU $v0,$at
314 sltu $t1,$v0,$at
315 $MULTU $t2,$a3
316 $ST $v0,0($a0)
317 $ADDU $v0,$t1,$t0
319 subu $a2,4
320 $PTR_ADD $a0,4*$BNSZ
321 $PTR_ADD $a1,4*$BNSZ
322 mflo $at
323 mfhi $t2
324 $ADDU $v0,$at
325 sltu $t3,$v0,$at
326 $MULTU $ta0,$a3
327 $ST $v0,-3*$BNSZ($a0)
328 $ADDU $v0,$t3,$t2
330 mflo $at
331 mfhi $ta0
332 $ADDU $v0,$at
333 sltu $ta1,$v0,$at
334 $MULTU $ta2,$a3
335 $ST $v0,-2*$BNSZ($a0)
336 $ADDU $v0,$ta1,$ta0
338 and $ta0,$a2,$minus4
339 mflo $at
340 mfhi $ta2
341 $ADDU $v0,$at
342 sltu $ta3,$v0,$at
343 $ST $v0,-$BNSZ($a0)
344 $ADDU $v0,$ta3,$ta2
345 .set noreorder
346 bgtzl $ta0,.L_bn_mul_words_loop
347 $LD $t0,0($a1)
349 beqz $a2,.L_bn_mul_words_return
352 .L_bn_mul_words_tail:
353 .set reorder
354 $LD $t0,0($a1)
355 $MULTU $t0,$a3
356 subu $a2,1
357 mflo $at
358 mfhi $t0
359 $ADDU $v0,$at
360 sltu $t1,$v0,$at
361 $ST $v0,0($a0)
362 $ADDU $v0,$t1,$t0
363 beqz $a2,.L_bn_mul_words_return
365 $LD $t0,$BNSZ($a1)
366 $MULTU $t0,$a3
367 subu $a2,1
368 mflo $at
369 mfhi $t0
370 $ADDU $v0,$at
371 sltu $t1,$v0,$at
372 $ST $v0,$BNSZ($a0)
373 $ADDU $v0,$t1,$t0
374 beqz $a2,.L_bn_mul_words_return
376 $LD $t0,2*$BNSZ($a1)
377 $MULTU $t0,$a3
378 mflo $at
379 mfhi $t0
380 $ADDU $v0,$at
381 sltu $t1,$v0,$at
382 $ST $v0,2*$BNSZ($a0)
383 $ADDU $v0,$t1,$t0
385 .L_bn_mul_words_return:
386 .set noreorder
# NUBI only: reload $t3..$t0 and $gp from the frame and release it.
388 $code.=<<___ if ($flavour =~ /nubi/i);
389 $REG_L $t3,4*$SZREG($sp)
390 $REG_L $t2,3*$SZREG($sp)
391 $REG_L $t1,2*$SZREG($sp)
392 $REG_L $t0,1*$SZREG($sp)
393 $REG_L $gp,0*$SZREG($sp)
394 $PTR_ADD $sp,6*$SZREG
# Emit the return of bn_mul_words_internal ($v0 copied to $a0 in the jr
# delay slot), then bn_sqr_words.
# Register use as seen below: each word loaded from ($a1) is multiplied by
# itself and the low and high halves of the product are stored as two
# consecutive words at ($a0), so $a0 advances by 8 words for every 4 input
# words; $a2 is the word count. The public entry only enters the body
# when $a2 > 0.
396 $code.=<<___;
397 jr $ra
398 move $a0,$v0
399 .end bn_mul_words_internal
401 .align 5
402 .globl bn_sqr_words
403 .ent bn_sqr_words
404 bn_sqr_words:
405 .set noreorder
406 bgtz $a2,bn_sqr_words_internal
407 move $v0,$zero
408 jr $ra
409 move $a0,$v0
410 .end bn_sqr_words
412 .align 5
413 .ent bn_sqr_words_internal
414 bn_sqr_words_internal:
# NUBI only: open a 6-slot frame and save $ra, $t3..$t0 and $gp
# ($gp is an alias of $v1 for this flavour).
416 $code.=<<___ if ($flavour =~ /nubi/i);
417 .frame $sp,6*$SZREG,$ra
418 .mask 0x8000f008,-$SZREG
419 .set noreorder
420 $PTR_SUB $sp,6*$SZREG
421 $REG_S $ra,5*$SZREG($sp)
422 $REG_S $t3,4*$SZREG($sp)
423 $REG_S $t2,3*$SZREG($sp)
424 $REG_S $t1,2*$SZREG($sp)
425 $REG_S $t0,1*$SZREG($sp)
426 $REG_S $gp,0*$SZREG($sp)
# Body: four squarings per loop iteration while ($a2 & -4) is positive,
# then a tail of up to three words.
428 $code.=<<___;
429 .set reorder
430 li $minus4,-4
431 and $ta0,$a2,$minus4
432 $LD $t0,0($a1)
433 beqz $ta0,.L_bn_sqr_words_tail
435 .L_bn_sqr_words_loop:
436 $MULTU $t0,$t0
437 $LD $t2,$BNSZ($a1)
438 $LD $ta0,2*$BNSZ($a1)
439 $LD $ta2,3*$BNSZ($a1)
440 mflo $t1
441 mfhi $t0
442 $ST $t1,0($a0)
443 $ST $t0,$BNSZ($a0)
445 $MULTU $t2,$t2
446 subu $a2,4
447 $PTR_ADD $a0,8*$BNSZ
448 $PTR_ADD $a1,4*$BNSZ
449 mflo $t3
450 mfhi $t2
451 $ST $t3,-6*$BNSZ($a0)
452 $ST $t2,-5*$BNSZ($a0)
454 $MULTU $ta0,$ta0
455 mflo $ta1
456 mfhi $ta0
457 $ST $ta1,-4*$BNSZ($a0)
458 $ST $ta0,-3*$BNSZ($a0)
461 $MULTU $ta2,$ta2
462 and $ta0,$a2,$minus4
463 mflo $ta3
464 mfhi $ta2
465 $ST $ta3,-2*$BNSZ($a0)
466 $ST $ta2,-$BNSZ($a0)
468 .set noreorder
469 bgtzl $ta0,.L_bn_sqr_words_loop
470 $LD $t0,0($a1)
472 beqz $a2,.L_bn_sqr_words_return
475 .L_bn_sqr_words_tail:
476 .set reorder
477 $LD $t0,0($a1)
478 $MULTU $t0,$t0
479 subu $a2,1
480 mflo $t1
481 mfhi $t0
482 $ST $t1,0($a0)
483 $ST $t0,$BNSZ($a0)
484 beqz $a2,.L_bn_sqr_words_return
486 $LD $t0,$BNSZ($a1)
487 $MULTU $t0,$t0
488 subu $a2,1
489 mflo $t1
490 mfhi $t0
491 $ST $t1,2*$BNSZ($a0)
492 $ST $t0,3*$BNSZ($a0)
493 beqz $a2,.L_bn_sqr_words_return
495 $LD $t0,2*$BNSZ($a1)
496 $MULTU $t0,$t0
497 mflo $t1
498 mfhi $t0
499 $ST $t1,4*$BNSZ($a0)
500 $ST $t0,5*$BNSZ($a0)
502 .L_bn_sqr_words_return:
503 .set noreorder
# NUBI only: reload $t3..$t0 and $gp from the frame and release it.
505 $code.=<<___ if ($flavour =~ /nubi/i);
506 $REG_L $t3,4*$SZREG($sp)
507 $REG_L $t2,3*$SZREG($sp)
508 $REG_L $t1,2*$SZREG($sp)
509 $REG_L $t0,1*$SZREG($sp)
510 $REG_L $gp,0*$SZREG($sp)
511 $PTR_ADD $sp,6*$SZREG
# Emit the return of bn_sqr_words_internal, then bn_add_words.
# Register use as seen below: words loaded from ($a1) and ($a2) are added
# together with the running carry in $v0 and stored to ($a0); the carry
# out of each word is the sum of the two sltu results; $a3 is the word
# count. The public entry returns 0 unless $a3 > 0.
513 $code.=<<___;
514 jr $ra
515 move $a0,$v0
517 .end bn_sqr_words_internal
519 .align 5
520 .globl bn_add_words
521 .ent bn_add_words
522 bn_add_words:
523 .set noreorder
524 bgtz $a3,bn_add_words_internal
525 move $v0,$zero
526 jr $ra
527 move $a0,$v0
528 .end bn_add_words
530 .align 5
531 .ent bn_add_words_internal
532 bn_add_words_internal:
# NUBI only: open a 6-slot frame and save $ra, $t3..$t0 and $gp
# ($gp is an alias of $v1 for this flavour).
534 $code.=<<___ if ($flavour =~ /nubi/i);
535 .frame $sp,6*$SZREG,$ra
536 .mask 0x8000f008,-$SZREG
537 .set noreorder
538 $PTR_SUB $sp,6*$SZREG
539 $REG_S $ra,5*$SZREG($sp)
540 $REG_S $t3,4*$SZREG($sp)
541 $REG_S $t2,3*$SZREG($sp)
542 $REG_S $t1,2*$SZREG($sp)
543 $REG_S $t0,1*$SZREG($sp)
544 $REG_S $gp,0*$SZREG($sp)
# Body: four words per loop iteration while ($a3 & -4) is positive, then
# a tail of up to three words.
546 $code.=<<___;
547 .set reorder
548 li $minus4,-4
549 and $at,$a3,$minus4
550 $LD $t0,0($a1)
551 beqz $at,.L_bn_add_words_tail
553 .L_bn_add_words_loop:
554 $LD $ta0,0($a2)
555 subu $a3,4
556 $LD $t1,$BNSZ($a1)
557 and $at,$a3,$minus4
558 $LD $t2,2*$BNSZ($a1)
559 $PTR_ADD $a2,4*$BNSZ
560 $LD $t3,3*$BNSZ($a1)
561 $PTR_ADD $a0,4*$BNSZ
562 $LD $ta1,-3*$BNSZ($a2)
563 $PTR_ADD $a1,4*$BNSZ
564 $LD $ta2,-2*$BNSZ($a2)
565 $LD $ta3,-$BNSZ($a2)
566 $ADDU $ta0,$t0
567 sltu $t8,$ta0,$t0
568 $ADDU $t0,$ta0,$v0
569 sltu $v0,$t0,$ta0
570 $ST $t0,-4*$BNSZ($a0)
571 $ADDU $v0,$t8
573 $ADDU $ta1,$t1
574 sltu $t9,$ta1,$t1
575 $ADDU $t1,$ta1,$v0
576 sltu $v0,$t1,$ta1
577 $ST $t1,-3*$BNSZ($a0)
578 $ADDU $v0,$t9
580 $ADDU $ta2,$t2
581 sltu $t8,$ta2,$t2
582 $ADDU $t2,$ta2,$v0
583 sltu $v0,$t2,$ta2
584 $ST $t2,-2*$BNSZ($a0)
585 $ADDU $v0,$t8
587 $ADDU $ta3,$t3
588 sltu $t9,$ta3,$t3
589 $ADDU $t3,$ta3,$v0
590 sltu $v0,$t3,$ta3
591 $ST $t3,-$BNSZ($a0)
592 $ADDU $v0,$t9
594 .set noreorder
595 bgtzl $at,.L_bn_add_words_loop
596 $LD $t0,0($a1)
598 beqz $a3,.L_bn_add_words_return
601 .L_bn_add_words_tail:
602 .set reorder
603 $LD $t0,0($a1)
604 $LD $ta0,0($a2)
605 $ADDU $ta0,$t0
606 subu $a3,1
607 sltu $t8,$ta0,$t0
608 $ADDU $t0,$ta0,$v0
609 sltu $v0,$t0,$ta0
610 $ST $t0,0($a0)
611 $ADDU $v0,$t8
612 beqz $a3,.L_bn_add_words_return
614 $LD $t1,$BNSZ($a1)
615 $LD $ta1,$BNSZ($a2)
616 $ADDU $ta1,$t1
617 subu $a3,1
618 sltu $t9,$ta1,$t1
619 $ADDU $t1,$ta1,$v0
620 sltu $v0,$t1,$ta1
621 $ST $t1,$BNSZ($a0)
622 $ADDU $v0,$t9
623 beqz $a3,.L_bn_add_words_return
625 $LD $t2,2*$BNSZ($a1)
626 $LD $ta2,2*$BNSZ($a2)
627 $ADDU $ta2,$t2
628 sltu $t8,$ta2,$t2
629 $ADDU $t2,$ta2,$v0
630 sltu $v0,$t2,$ta2
631 $ST $t2,2*$BNSZ($a0)
632 $ADDU $v0,$t8
634 .L_bn_add_words_return:
635 .set noreorder
# NUBI only: reload $t3..$t0 and $gp from the frame and release it.
637 $code.=<<___ if ($flavour =~ /nubi/i);
638 $REG_L $t3,4*$SZREG($sp)
639 $REG_L $t2,3*$SZREG($sp)
640 $REG_L $t1,2*$SZREG($sp)
641 $REG_L $t0,1*$SZREG($sp)
642 $REG_L $gp,0*$SZREG($sp)
643 $PTR_ADD $sp,6*$SZREG
# Emit the return of bn_add_words_internal ($v0 copied to $a0 in the jr
# delay slot), then bn_sub_words.
# Register use as seen below: the word from ($a2) and the running borrow
# in $v0 are subtracted from the word from ($a1) and the result is stored
# to ($a0); the borrow out of each word is the sum of the sltu and sgtu
# results; $a3 is the word count. The public entry returns 0 unless
# $a3 > 0 (here the delay slot clears $a0 from $zero rather than from $v0).
645 $code.=<<___;
646 jr $ra
647 move $a0,$v0
649 .end bn_add_words_internal
651 .align 5
652 .globl bn_sub_words
653 .ent bn_sub_words
654 bn_sub_words:
655 .set noreorder
656 bgtz $a3,bn_sub_words_internal
657 move $v0,$zero
658 jr $ra
659 move $a0,$zero
660 .end bn_sub_words
662 .align 5
663 .ent bn_sub_words_internal
664 bn_sub_words_internal:
# NUBI only: open a 6-slot frame and save $ra, $t3..$t0 and $gp
# ($gp is an alias of $v1 for this flavour).
666 $code.=<<___ if ($flavour =~ /nubi/i);
667 .frame $sp,6*$SZREG,$ra
668 .mask 0x8000f008,-$SZREG
669 .set noreorder
670 $PTR_SUB $sp,6*$SZREG
671 $REG_S $ra,5*$SZREG($sp)
672 $REG_S $t3,4*$SZREG($sp)
673 $REG_S $t2,3*$SZREG($sp)
674 $REG_S $t1,2*$SZREG($sp)
675 $REG_S $t0,1*$SZREG($sp)
676 $REG_S $gp,0*$SZREG($sp)
# Body: four words per loop iteration while ($a3 & -4) is positive, then
# a tail of up to three words.
678 $code.=<<___;
679 .set reorder
680 li $minus4,-4
681 and $at,$a3,$minus4
682 $LD $t0,0($a1)
683 beqz $at,.L_bn_sub_words_tail
685 .L_bn_sub_words_loop:
686 $LD $ta0,0($a2)
687 subu $a3,4
688 $LD $t1,$BNSZ($a1)
689 and $at,$a3,$minus4
690 $LD $t2,2*$BNSZ($a1)
691 $PTR_ADD $a2,4*$BNSZ
692 $LD $t3,3*$BNSZ($a1)
693 $PTR_ADD $a0,4*$BNSZ
694 $LD $ta1,-3*$BNSZ($a2)
695 $PTR_ADD $a1,4*$BNSZ
696 $LD $ta2,-2*$BNSZ($a2)
697 $LD $ta3,-$BNSZ($a2)
698 sltu $t8,$t0,$ta0
699 $SUBU $ta0,$t0,$ta0
700 $SUBU $t0,$ta0,$v0
701 sgtu $v0,$t0,$ta0
702 $ST $t0,-4*$BNSZ($a0)
703 $ADDU $v0,$t8
705 sltu $t9,$t1,$ta1
706 $SUBU $ta1,$t1,$ta1
707 $SUBU $t1,$ta1,$v0
708 sgtu $v0,$t1,$ta1
709 $ST $t1,-3*$BNSZ($a0)
710 $ADDU $v0,$t9
713 sltu $t8,$t2,$ta2
714 $SUBU $ta2,$t2,$ta2
715 $SUBU $t2,$ta2,$v0
716 sgtu $v0,$t2,$ta2
717 $ST $t2,-2*$BNSZ($a0)
718 $ADDU $v0,$t8
720 sltu $t9,$t3,$ta3
721 $SUBU $ta3,$t3,$ta3
722 $SUBU $t3,$ta3,$v0
723 sgtu $v0,$t3,$ta3
724 $ST $t3,-$BNSZ($a0)
725 $ADDU $v0,$t9
727 .set noreorder
728 bgtzl $at,.L_bn_sub_words_loop
729 $LD $t0,0($a1)
731 beqz $a3,.L_bn_sub_words_return
734 .L_bn_sub_words_tail:
735 .set reorder
736 $LD $t0,0($a1)
737 $LD $ta0,0($a2)
738 subu $a3,1
739 sltu $t8,$t0,$ta0
740 $SUBU $ta0,$t0,$ta0
741 $SUBU $t0,$ta0,$v0
742 sgtu $v0,$t0,$ta0
743 $ST $t0,0($a0)
744 $ADDU $v0,$t8
745 beqz $a3,.L_bn_sub_words_return
747 $LD $t1,$BNSZ($a1)
748 subu $a3,1
749 $LD $ta1,$BNSZ($a2)
750 sltu $t9,$t1,$ta1
751 $SUBU $ta1,$t1,$ta1
752 $SUBU $t1,$ta1,$v0
753 sgtu $v0,$t1,$ta1
754 $ST $t1,$BNSZ($a0)
755 $ADDU $v0,$t9
756 beqz $a3,.L_bn_sub_words_return
758 $LD $t2,2*$BNSZ($a1)
759 $LD $ta2,2*$BNSZ($a2)
760 sltu $t8,$t2,$ta2
761 $SUBU $ta2,$t2,$ta2
762 $SUBU $t2,$ta2,$v0
763 sgtu $v0,$t2,$ta2
764 $ST $t2,2*$BNSZ($a0)
765 $ADDU $v0,$t8
767 .L_bn_sub_words_return:
768 .set noreorder
# NUBI only: reload $t3..$t0 and $gp from the frame and release it.
770 $code.=<<___ if ($flavour =~ /nubi/i);
771 $REG_L $t3,4*$SZREG($sp)
772 $REG_L $t2,3*$SZREG($sp)
773 $REG_L $t1,2*$SZREG($sp)
774 $REG_L $t0,1*$SZREG($sp)
775 $REG_L $gp,0*$SZREG($sp)
776 $PTR_ADD $sp,6*$SZREG
# Emit the return of bn_sub_words_internal ($v0 copied to $a0 in the jr
# delay slot), then bn_div_3_words.
# As seen below, the entry keeps the incoming $a0 in $a3 and the incoming
# $a1 in $ta2, loads $a0 from ($a3) and $a1 from the word just below it,
# and returns -1 straight away when the loaded $a0 equals $a2. Otherwise
# the _internal body calls bn_div_words and then corrects its result in
# $v0 downwards inside the inner loop.
778 $code.=<<___;
779 jr $ra
780 move $a0,$v0
781 .end bn_sub_words_internal
783 .align 5
784 .globl bn_div_3_words
785 .ent bn_div_3_words
786 bn_div_3_words:
787 .set noreorder
788 move $a3,$a0 # we know that bn_div_words does not
789 # touch $a3, $ta2, $ta3 and preserves $a2
790 # so that we can save two arguments
791 # and return address in registers
792 # instead of stack:-)
794 $LD $a0,($a3)
795 move $ta2,$a1
796 bne $a0,$a2,bn_div_3_words_internal
797 $LD $a1,-$BNSZ($a3)
798 li $v0,-1
799 jr $ra
800 move $a0,$v0
801 .end bn_div_3_words
803 .align 5
804 .ent bn_div_3_words_internal
805 bn_div_3_words_internal:
# NUBI only: open a 6-slot frame and save $ra, $t3..$t0 and $gp
# ($gp is an alias of $v1 for this flavour).
807 $code.=<<___ if ($flavour =~ /nubi/i);
808 .frame $sp,6*$SZREG,$ra
809 .mask 0x8000f008,-$SZREG
810 .set noreorder
811 $PTR_SUB $sp,6*$SZREG
812 $REG_S $ra,5*$SZREG($sp)
813 $REG_S $t3,4*$SZREG($sp)
814 $REG_S $t2,3*$SZREG($sp)
815 $REG_S $t1,2*$SZREG($sp)
816 $REG_S $t0,1*$SZREG($sp)
817 $REG_S $gp,0*$SZREG($sp)
# Body: $ra is parked in $ta3 around the bal to bn_div_words; the product
# of $ta2 and the returned $v0 is then compared against $a1 and the word
# loaded from -2*$BNSZ($a3), decrementing $v0 (in the beqzl delay slot)
# each time the loop is repeated.
819 $code.=<<___;
820 .set reorder
821 move $ta3,$ra
822 bal bn_div_words
823 move $ra,$ta3
824 $MULTU $ta2,$v0
825 $LD $t2,-2*$BNSZ($a3)
826 move $ta0,$zero
827 mfhi $t1
828 mflo $t0
829 sltu $t8,$t1,$a1
830 .L_bn_div_3_words_inner_loop:
831 bnez $t8,.L_bn_div_3_words_inner_loop_done
832 sgeu $at,$t2,$t0
833 seq $t9,$t1,$a1
834 and $at,$t9
835 sltu $t3,$t0,$ta2
836 $ADDU $a1,$a2
837 $SUBU $t1,$t3
838 $SUBU $t0,$ta2
839 sltu $t8,$t1,$a1
840 sltu $ta0,$a1,$a2
841 or $t8,$ta0
842 .set noreorder
843 beqzl $at,.L_bn_div_3_words_inner_loop
844 $SUBU $v0,1
845 .set reorder
846 .L_bn_div_3_words_inner_loop_done:
847 .set noreorder
# NUBI only: reload $t3..$t0 and $gp from the frame and release it.
849 $code.=<<___ if ($flavour =~ /nubi/i);
850 $REG_L $t3,4*$SZREG($sp)
851 $REG_L $t2,3*$SZREG($sp)
852 $REG_L $t1,2*$SZREG($sp)
853 $REG_L $t0,1*$SZREG($sp)
854 $REG_L $gp,0*$SZREG($sp)
855 $PTR_ADD $sp,6*$SZREG
# Emit the return of bn_div_3_words_internal ($v0 copied to $a0 in the jr
# delay slot), then bn_div_words.
# As seen below, the entry returns -1 when $a2 is zero. The _internal
# body shifts $a2 left until its top bit is set (counting the shifts in
# $t9), shifts the $a0:$a1 pair left by the same amount, and then produces
# the result in $v0 from two half-word steps; $v1 receives the remainder
# and is copied to $a1.
857 $code.=<<___;
858 jr $ra
859 move $a0,$v0
860 .end bn_div_3_words_internal
862 .align 5
863 .globl bn_div_words
864 .ent bn_div_words
865 bn_div_words:
866 .set noreorder
867 bnez $a2,bn_div_words_internal
868 li $v0,-1 # I would rather signal div-by-zero
869 # which can be done with 'break 7'
870 jr $ra
871 move $a0,$v0
872 .end bn_div_words
874 .align 5
875 .ent bn_div_words_internal
876 bn_div_words_internal:
# NUBI only: open a 6-slot frame and save $ra, $t3..$t0 and $gp
# ($gp is an alias of $v1 for this flavour).
878 $code.=<<___ if ($flavour =~ /nubi/i);
879 .frame $sp,6*$SZREG,$ra
880 .mask 0x8000f008,-$SZREG
881 .set noreorder
882 $PTR_SUB $sp,6*$SZREG
883 $REG_S $ra,5*$SZREG($sp)
884 $REG_S $t3,4*$SZREG($sp)
885 $REG_S $t2,3*$SZREG($sp)
886 $REG_S $t1,2*$SZREG($sp)
887 $REG_S $t0,1*$SZREG($sp)
888 $REG_S $gp,0*$SZREG($sp)
# Normalisation: skipped when $a2 is already negative (top bit set).
# "break 6" is executed when any of the bits about to be shifted out of
# $a0 are set.
890 $code.=<<___;
891 move $v1,$zero
892 bltz $a2,.L_bn_div_words_body
893 move $t9,$v1
894 $SLL $a2,1
895 bgtz $a2,.-4
896 addu $t9,1
898 .set reorder
899 negu $t1,$t9
900 li $t2,-1
901 $SLL $t2,$t1
902 and $t2,$a0
903 $SRL $at,$a1,$t1
904 .set noreorder
905 bnezl $t2,.+8
906 break 6 # signal overflow
907 .set reorder
908 $SLL $a0,$t9
909 $SLL $a1,$t9
910 or $a0,$at
# Perl-side names for registers used by the division body: $QT holds the
# current quotient half-word estimate, $HH the upper half of $a0 and $DH
# the upper half of $a2 (both obtained by shifting right 4*$BNSZ bits).
912 $QT=$ta0;
913 $HH=$ta1;
914 $DH=$v1;
# Two passes of the same estimate-and-correct sequence: the first pass
# leaves its quotient half shifted into the upper half of $v0, the second
# is ORed into the lower half.
915 $code.=<<___;
916 .L_bn_div_words_body:
917 $SRL $DH,$a2,4*$BNSZ # bits
918 sgeu $at,$a0,$a2
919 .set noreorder
920 bnezl $at,.+8
921 $SUBU $a0,$a2
922 .set reorder
924 li $QT,-1
925 $SRL $HH,$a0,4*$BNSZ # bits
926 $SRL $QT,4*$BNSZ # q=0xffffffff
927 beq $DH,$HH,.L_bn_div_words_skip_div1
928 $DIVU $zero,$a0,$DH
929 mflo $QT
930 .L_bn_div_words_skip_div1:
931 $MULTU $a2,$QT
932 $SLL $t3,$a0,4*$BNSZ # bits
933 $SRL $at,$a1,4*$BNSZ # bits
934 or $t3,$at
935 mflo $t0
936 mfhi $t1
937 .L_bn_div_words_inner_loop1:
938 sltu $t2,$t3,$t0
939 seq $t8,$HH,$t1
940 sltu $at,$HH,$t1
941 and $t2,$t8
942 sltu $v0,$t0,$a2
943 or $at,$t2
944 .set noreorder
945 beqz $at,.L_bn_div_words_inner_loop1_done
946 $SUBU $t1,$v0
947 $SUBU $t0,$a2
948 b .L_bn_div_words_inner_loop1
949 $SUBU $QT,1
950 .set reorder
951 .L_bn_div_words_inner_loop1_done:
953 $SLL $a1,4*$BNSZ # bits
954 $SUBU $a0,$t3,$t0
955 $SLL $v0,$QT,4*$BNSZ # bits
957 li $QT,-1
958 $SRL $HH,$a0,4*$BNSZ # bits
959 $SRL $QT,4*$BNSZ # q=0xffffffff
960 beq $DH,$HH,.L_bn_div_words_skip_div2
961 $DIVU $zero,$a0,$DH
962 mflo $QT
963 .L_bn_div_words_skip_div2:
964 $MULTU $a2,$QT
965 $SLL $t3,$a0,4*$BNSZ # bits
966 $SRL $at,$a1,4*$BNSZ # bits
967 or $t3,$at
968 mflo $t0
969 mfhi $t1
970 .L_bn_div_words_inner_loop2:
971 sltu $t2,$t3,$t0
972 seq $t8,$HH,$t1
973 sltu $at,$HH,$t1
974 and $t2,$t8
975 sltu $v1,$t0,$a2
976 or $at,$t2
977 .set noreorder
978 beqz $at,.L_bn_div_words_inner_loop2_done
979 $SUBU $t1,$v1
980 $SUBU $t0,$a2
981 b .L_bn_div_words_inner_loop2
982 $SUBU $QT,1
983 .set reorder
984 .L_bn_div_words_inner_loop2_done:
986 $SUBU $a0,$t3,$t0
987 or $v0,$QT
988 $SRL $v1,$a0,$t9 # $v1 contains remainder if anybody wants it
989 $SRL $a2,$t9 # restore $a2
991 .set noreorder
992 move $a1,$v1
# NUBI only: reload $t3..$t0 and $gp from the frame and release it.
994 $code.=<<___ if ($flavour =~ /nubi/i);
995 $REG_L $t3,4*$SZREG($sp)
996 $REG_L $t2,3*$SZREG($sp)
997 $REG_L $t1,2*$SZREG($sp)
998 $REG_L $t0,1*$SZREG($sp)
999 $REG_L $gp,0*$SZREG($sp)
1000 $PTR_ADD $sp,6*$SZREG
# Emit the return of bn_div_words_internal ($v0 copied to $a0 in the jr
# delay slot).
1002 $code.=<<___;
1003 jr $ra
1004 move $a0,$v0
1005 .end bn_div_words_internal
# The division-only register names are no longer needed.
1007 undef $HH; undef $QT; undef $DH;
# Register names for the comba routines that follow: $a_N / $b_N hold
# the loaded a[N] / b[N] words, $c_1..$c_3 are the three accumulator
# words and $t_1/$t_2 receive the low/high halves of each product.
1009 ($a_0,$a_1,$a_2,$a_3)=($t0,$t1,$t2,$t3);
1010 ($b_0,$b_1,$b_2,$b_3)=($ta0,$ta1,$ta2,$ta3);
1012 ($a_4,$a_5,$a_6,$a_7)=($s0,$s2,$s4,$a1); # once we load a[7], no use for $a1
1013 ($b_4,$b_5,$b_6,$b_7)=($s1,$s3,$s5,$a2); # once we load b[7], no use for $a2
1015 ($t_1,$t_2,$c_1,$c_2,$c_3)=($t8,$t9,$v0,$v1,$a3);
1017 $code.=<<___;
1019 .align 5
1020 .globl bn_mul_comba8
1021 .ent bn_mul_comba8
1022 bn_mul_comba8:
1023 .set noreorder
1025 $code.=<<___ if ($flavour =~ /nubi/i);
1026 .frame $sp,12*$SZREG,$ra
1027 .mask 0x803ff008,-$SZREG
1028 $PTR_SUB $sp,12*$SZREG
1029 $REG_S $ra,11*$SZREG($sp)
1030 $REG_S $s5,10*$SZREG($sp)
1031 $REG_S $s4,9*$SZREG($sp)
1032 $REG_S $s3,8*$SZREG($sp)
1033 $REG_S $s2,7*$SZREG($sp)
1034 $REG_S $s1,6*$SZREG($sp)
1035 $REG_S $s0,5*$SZREG($sp)
1036 $REG_S $t3,4*$SZREG($sp)
1037 $REG_S $t2,3*$SZREG($sp)
1038 $REG_S $t1,2*$SZREG($sp)
1039 $REG_S $t0,1*$SZREG($sp)
1040 $REG_S $gp,0*$SZREG($sp)
1042 $code.=<<___ if ($flavour !~ /nubi/i);
1043 .frame $sp,6*$SZREG,$ra
1044 .mask 0x003f0000,-$SZREG
1045 $PTR_SUB $sp,6*$SZREG
1046 $REG_S $s5,5*$SZREG($sp)
1047 $REG_S $s4,4*$SZREG($sp)
1048 $REG_S $s3,3*$SZREG($sp)
1049 $REG_S $s2,2*$SZREG($sp)
1050 $REG_S $s1,1*$SZREG($sp)
1051 $REG_S $s0,0*$SZREG($sp)
1053 $code.=<<___;
1055 .set reorder
1056 $LD $a_0,0($a1) # If compiled with -mips3 option on
1057 # R5000 box assembler barks on this
1058 # 1ine with "should not have mult/div
1059 # as last instruction in bb (R10K
1060 # bug)" warning. If anybody out there
1061 # has a clue about how to circumvent
1062 # this do send me a note.
1063 # <appro\@fy.chalmers.se>
1065 $LD $b_0,0($a2)
1066 $LD $a_1,$BNSZ($a1)
1067 $LD $a_2,2*$BNSZ($a1)
1068 $MULTU $a_0,$b_0 # mul_add_c(a[0],b[0],c1,c2,c3);
1069 $LD $a_3,3*$BNSZ($a1)
1070 $LD $b_1,$BNSZ($a2)
1071 $LD $b_2,2*$BNSZ($a2)
1072 $LD $b_3,3*$BNSZ($a2)
1073 mflo $c_1
1074 mfhi $c_2
1076 $LD $a_4,4*$BNSZ($a1)
1077 $LD $a_5,5*$BNSZ($a1)
1078 $MULTU $a_0,$b_1 # mul_add_c(a[0],b[1],c2,c3,c1);
1079 $LD $a_6,6*$BNSZ($a1)
1080 $LD $a_7,7*$BNSZ($a1)
1081 $LD $b_4,4*$BNSZ($a2)
1082 $LD $b_5,5*$BNSZ($a2)
1083 mflo $t_1
1084 mfhi $t_2
1085 $ADDU $c_2,$t_1
1086 sltu $at,$c_2,$t_1
1087 $MULTU $a_1,$b_0 # mul_add_c(a[1],b[0],c2,c3,c1);
1088 $ADDU $c_3,$t_2,$at
1089 $LD $b_6,6*$BNSZ($a2)
1090 $LD $b_7,7*$BNSZ($a2)
1091 $ST $c_1,0($a0) # r[0]=c1;
1092 mflo $t_1
1093 mfhi $t_2
1094 $ADDU $c_2,$t_1
1095 sltu $at,$c_2,$t_1
1096 $MULTU $a_2,$b_0 # mul_add_c(a[2],b[0],c3,c1,c2);
1097 $ADDU $t_2,$at
1098 $ADDU $c_3,$t_2
1099 sltu $c_1,$c_3,$t_2
1100 $ST $c_2,$BNSZ($a0) # r[1]=c2;
1102 mflo $t_1
1103 mfhi $t_2
1104 $ADDU $c_3,$t_1
1105 sltu $at,$c_3,$t_1
1106 $MULTU $a_1,$b_1 # mul_add_c(a[1],b[1],c3,c1,c2);
1107 $ADDU $t_2,$at
1108 $ADDU $c_1,$t_2
1109 mflo $t_1
1110 mfhi $t_2
1111 $ADDU $c_3,$t_1
1112 sltu $at,$c_3,$t_1
1113 $MULTU $a_0,$b_2 # mul_add_c(a[0],b[2],c3,c1,c2);
1114 $ADDU $t_2,$at
1115 $ADDU $c_1,$t_2
1116 sltu $c_2,$c_1,$t_2
1117 mflo $t_1
1118 mfhi $t_2
1119 $ADDU $c_3,$t_1
1120 sltu $at,$c_3,$t_1
1121 $MULTU $a_0,$b_3 # mul_add_c(a[0],b[3],c1,c2,c3);
1122 $ADDU $t_2,$at
1123 $ADDU $c_1,$t_2
1124 sltu $at,$c_1,$t_2
1125 $ADDU $c_2,$at
1126 $ST $c_3,2*$BNSZ($a0) # r[2]=c3;
1128 mflo $t_1
1129 mfhi $t_2
1130 $ADDU $c_1,$t_1
1131 sltu $at,$c_1,$t_1
1132 $MULTU $a_1,$b_2 # mul_add_c(a[1],b[2],c1,c2,c3);
1133 $ADDU $t_2,$at
1134 $ADDU $c_2,$t_2
1135 sltu $c_3,$c_2,$t_2
1136 mflo $t_1
1137 mfhi $t_2
1138 $ADDU $c_1,$t_1
1139 sltu $at,$c_1,$t_1
1140 $MULTU $a_2,$b_1 # mul_add_c(a[2],b[1],c1,c2,c3);
1141 $ADDU $t_2,$at
1142 $ADDU $c_2,$t_2
1143 sltu $at,$c_2,$t_2
1144 $ADDU $c_3,$at
1145 mflo $t_1
1146 mfhi $t_2
1147 $ADDU $c_1,$t_1
1148 sltu $at,$c_1,$t_1
1149 $MULTU $a_3,$b_0 # mul_add_c(a[3],b[0],c1,c2,c3);
1150 $ADDU $t_2,$at
1151 $ADDU $c_2,$t_2
1152 sltu $at,$c_2,$t_2
1153 $ADDU $c_3,$at
1154 mflo $t_1
1155 mfhi $t_2
1156 $ADDU $c_1,$t_1
1157 sltu $at,$c_1,$t_1
1158 $MULTU $a_4,$b_0 # mul_add_c(a[4],b[0],c2,c3,c1);
1159 $ADDU $t_2,$at
1160 $ADDU $c_2,$t_2
1161 sltu $at,$c_2,$t_2
1162 $ADDU $c_3,$at
1163 $ST $c_1,3*$BNSZ($a0) # r[3]=c1;
1165 mflo $t_1
1166 mfhi $t_2
1167 $ADDU $c_2,$t_1
1168 sltu $at,$c_2,$t_1
1169 $MULTU $a_3,$b_1 # mul_add_c(a[3],b[1],c2,c3,c1);
1170 $ADDU $t_2,$at
1171 $ADDU $c_3,$t_2
1172 sltu $c_1,$c_3,$t_2
1173 mflo $t_1
1174 mfhi $t_2
1175 $ADDU $c_2,$t_1
1176 sltu $at,$c_2,$t_1
1177 $MULTU $a_2,$b_2 # mul_add_c(a[2],b[2],c2,c3,c1);
1178 $ADDU $t_2,$at
1179 $ADDU $c_3,$t_2
1180 sltu $at,$c_3,$t_2
1181 $ADDU $c_1,$at
1182 mflo $t_1
1183 mfhi $t_2
1184 $ADDU $c_2,$t_1
1185 sltu $at,$c_2,$t_1
1186 $MULTU $a_1,$b_3 # mul_add_c(a[1],b[3],c2,c3,c1);
1187 $ADDU $t_2,$at
1188 $ADDU $c_3,$t_2
1189 sltu $at,$c_3,$t_2
1190 $ADDU $c_1,$at
1191 mflo $t_1
1192 mfhi $t_2
1193 $ADDU $c_2,$t_1
1194 sltu $at,$c_2,$t_1
1195 $MULTU $a_0,$b_4 # mul_add_c(a[0],b[4],c2,c3,c1);
1196 $ADDU $t_2,$at
1197 $ADDU $c_3,$t_2
1198 sltu $at,$c_3,$t_2
1199 $ADDU $c_1,$at
1200 mflo $t_1
1201 mfhi $t_2
1202 $ADDU $c_2,$t_1
1203 sltu $at,$c_2,$t_1
1204 $MULTU $a_0,$b_5 # mul_add_c(a[0],b[5],c3,c1,c2);
1205 $ADDU $t_2,$at
1206 $ADDU $c_3,$t_2
1207 sltu $at,$c_3,$t_2
1208 $ADDU $c_1,$at
1209 $ST $c_2,4*$BNSZ($a0) # r[4]=c2;
1211 mflo $t_1
1212 mfhi $t_2
1213 $ADDU $c_3,$t_1
1214 sltu $at,$c_3,$t_1
1215 $MULTU $a_1,$b_4 # mul_add_c(a[1],b[4],c3,c1,c2);
1216 $ADDU $t_2,$at
1217 $ADDU $c_1,$t_2
1218 sltu $c_2,$c_1,$t_2
1219 mflo $t_1
1220 mfhi $t_2
1221 $ADDU $c_3,$t_1
1222 sltu $at,$c_3,$t_1
1223 $MULTU $a_2,$b_3 # mul_add_c(a[2],b[3],c3,c1,c2);
1224 $ADDU $t_2,$at
1225 $ADDU $c_1,$t_2
1226 sltu $at,$c_1,$t_2
1227 $ADDU $c_2,$at
1228 mflo $t_1
1229 mfhi $t_2
1230 $ADDU $c_3,$t_1
1231 sltu $at,$c_3,$t_1
1232 $MULTU $a_3,$b_2 # mul_add_c(a[3],b[2],c3,c1,c2);
1233 $ADDU $t_2,$at
1234 $ADDU $c_1,$t_2
1235 sltu $at,$c_1,$t_2
1236 $ADDU $c_2,$at
1237 mflo $t_1
1238 mfhi $t_2
1239 $ADDU $c_3,$t_1
1240 sltu $at,$c_3,$t_1
1241 $MULTU $a_4,$b_1 # mul_add_c(a[4],b[1],c3,c1,c2);
1242 $ADDU $t_2,$at
1243 $ADDU $c_1,$t_2
1244 sltu $at,$c_1,$t_2
1245 $ADDU $c_2,$at
1246 mflo $t_1
1247 mfhi $t_2
1248 $ADDU $c_3,$t_1
1249 sltu $at,$c_3,$t_1
1250 $MULTU $a_5,$b_0 # mul_add_c(a[5],b[0],c3,c1,c2);
1251 $ADDU $t_2,$at
1252 $ADDU $c_1,$t_2
1253 sltu $at,$c_1,$t_2
1254 $ADDU $c_2,$at
1255 mflo $t_1
1256 mfhi $t_2
1257 $ADDU $c_3,$t_1
1258 sltu $at,$c_3,$t_1
1259 $MULTU $a_6,$b_0 # mul_add_c(a[6],b[0],c1,c2,c3);
1260 $ADDU $t_2,$at
1261 $ADDU $c_1,$t_2
1262 sltu $at,$c_1,$t_2
1263 $ADDU $c_2,$at
1264 $ST $c_3,5*$BNSZ($a0) # r[5]=c3;
1266 mflo $t_1
1267 mfhi $t_2
1268 $ADDU $c_1,$t_1
1269 sltu $at,$c_1,$t_1
1270 $MULTU $a_5,$b_1 # mul_add_c(a[5],b[1],c1,c2,c3);
1271 $ADDU $t_2,$at
1272 $ADDU $c_2,$t_2
1273 sltu $c_3,$c_2,$t_2
1274 mflo $t_1
1275 mfhi $t_2
1276 $ADDU $c_1,$t_1
1277 sltu $at,$c_1,$t_1
1278 $MULTU $a_4,$b_2 # mul_add_c(a[4],b[2],c1,c2,c3);
1279 $ADDU $t_2,$at
1280 $ADDU $c_2,$t_2
1281 sltu $at,$c_2,$t_2
1282 $ADDU $c_3,$at
1283 mflo $t_1
1284 mfhi $t_2
1285 $ADDU $c_1,$t_1
1286 sltu $at,$c_1,$t_1
1287 $MULTU $a_3,$b_3 # mul_add_c(a[3],b[3],c1,c2,c3);
1288 $ADDU $t_2,$at
1289 $ADDU $c_2,$t_2
1290 sltu $at,$c_2,$t_2
1291 $ADDU $c_3,$at
1292 mflo $t_1
1293 mfhi $t_2
1294 $ADDU $c_1,$t_1
1295 sltu $at,$c_1,$t_1
1296 $MULTU $a_2,$b_4 # mul_add_c(a[2],b[4],c1,c2,c3);
1297 $ADDU $t_2,$at
1298 $ADDU $c_2,$t_2
1299 sltu $at,$c_2,$t_2
1300 $ADDU $c_3,$at
1301 mflo $t_1
1302 mfhi $t_2
1303 $ADDU $c_1,$t_1
1304 sltu $at,$c_1,$t_1
1305 $MULTU $a_1,$b_5 # mul_add_c(a[1],b[5],c1,c2,c3);
1306 $ADDU $t_2,$at
1307 $ADDU $c_2,$t_2
1308 sltu $at,$c_2,$t_2
1309 $ADDU $c_3,$at
1310 mflo $t_1
1311 mfhi $t_2
1312 $ADDU $c_1,$t_1
1313 sltu $at,$c_1,$t_1
1314 $MULTU $a_0,$b_6 # mul_add_c(a[0],b[6],c1,c2,c3);
1315 $ADDU $t_2,$at
1316 $ADDU $c_2,$t_2
1317 sltu $at,$c_2,$t_2
1318 $ADDU $c_3,$at
1319 mflo $t_1
1320 mfhi $t_2
1321 $ADDU $c_1,$t_1
1322 sltu $at,$c_1,$t_1
1323 $MULTU $a_0,$b_7 # mul_add_c(a[0],b[7],c2,c3,c1);
1324 $ADDU $t_2,$at
1325 $ADDU $c_2,$t_2
1326 sltu $at,$c_2,$t_2
1327 $ADDU $c_3,$at
1328 $ST $c_1,6*$BNSZ($a0) # r[6]=c1;
1330 mflo $t_1
1331 mfhi $t_2
1332 $ADDU $c_2,$t_1
1333 sltu $at,$c_2,$t_1
1334 $MULTU $a_1,$b_6 # mul_add_c(a[1],b[6],c2,c3,c1);
1335 $ADDU $t_2,$at
1336 $ADDU $c_3,$t_2
1337 sltu $c_1,$c_3,$t_2
1338 mflo $t_1
1339 mfhi $t_2
1340 $ADDU $c_2,$t_1
1341 sltu $at,$c_2,$t_1
1342 $MULTU $a_2,$b_5 # mul_add_c(a[2],b[5],c2,c3,c1);
1343 $ADDU $t_2,$at
1344 $ADDU $c_3,$t_2
1345 sltu $at,$c_3,$t_2
1346 $ADDU $c_1,$at
1347 mflo $t_1
1348 mfhi $t_2
1349 $ADDU $c_2,$t_1
1350 sltu $at,$c_2,$t_1
1351 $MULTU $a_3,$b_4 # mul_add_c(a[3],b[4],c2,c3,c1);
1352 $ADDU $t_2,$at
1353 $ADDU $c_3,$t_2
1354 sltu $at,$c_3,$t_2
1355 $ADDU $c_1,$at
1356 mflo $t_1
1357 mfhi $t_2
1358 $ADDU $c_2,$t_1
1359 sltu $at,$c_2,$t_1
1360 $MULTU $a_4,$b_3 # mul_add_c(a[4],b[3],c2,c3,c1);
1361 $ADDU $t_2,$at
1362 $ADDU $c_3,$t_2
1363 sltu $at,$c_3,$t_2
1364 $ADDU $c_1,$at
1365 mflo $t_1
1366 mfhi $t_2
1367 $ADDU $c_2,$t_1
1368 sltu $at,$c_2,$t_1
1369 $MULTU $a_5,$b_2 # mul_add_c(a[5],b[2],c2,c3,c1);
1370 $ADDU $t_2,$at
1371 $ADDU $c_3,$t_2
1372 sltu $at,$c_3,$t_2
1373 $ADDU $c_1,$at
1374 mflo $t_1
1375 mfhi $t_2
1376 $ADDU $c_2,$t_1
1377 sltu $at,$c_2,$t_1
1378 $MULTU $a_6,$b_1 # mul_add_c(a[6],b[1],c2,c3,c1);
1379 $ADDU $t_2,$at
1380 $ADDU $c_3,$t_2
1381 sltu $at,$c_3,$t_2
1382 $ADDU $c_1,$at
1383 mflo $t_1
1384 mfhi $t_2
1385 $ADDU $c_2,$t_1
1386 sltu $at,$c_2,$t_1
1387 $MULTU $a_7,$b_0 # mul_add_c(a[7],b[0],c2,c3,c1);
1388 $ADDU $t_2,$at
1389 $ADDU $c_3,$t_2
1390 sltu $at,$c_3,$t_2
1391 $ADDU $c_1,$at
1392 mflo $t_1
1393 mfhi $t_2
1394 $ADDU $c_2,$t_1
1395 sltu $at,$c_2,$t_1
1396 $MULTU $a_7,$b_1 # mul_add_c(a[7],b[1],c3,c1,c2);
1397 $ADDU $t_2,$at
1398 $ADDU $c_3,$t_2
1399 sltu $at,$c_3,$t_2
1400 $ADDU $c_1,$at
1401 $ST $c_2,7*$BNSZ($a0) # r[7]=c2;
1403 mflo $t_1
1404 mfhi $t_2
1405 $ADDU $c_3,$t_1
1406 sltu $at,$c_3,$t_1
1407 $MULTU $a_6,$b_2 # mul_add_c(a[6],b[2],c3,c1,c2);
1408 $ADDU $t_2,$at
1409 $ADDU $c_1,$t_2
1410 sltu $c_2,$c_1,$t_2
1411 mflo $t_1
1412 mfhi $t_2
1413 $ADDU $c_3,$t_1
1414 sltu $at,$c_3,$t_1
1415 $MULTU $a_5,$b_3 # mul_add_c(a[5],b[3],c3,c1,c2);
1416 $ADDU $t_2,$at
1417 $ADDU $c_1,$t_2
1418 sltu $at,$c_1,$t_2
1419 $ADDU $c_2,$at
1420 mflo $t_1
1421 mfhi $t_2
1422 $ADDU $c_3,$t_1
1423 sltu $at,$c_3,$t_1
1424 $MULTU $a_4,$b_4 # mul_add_c(a[4],b[4],c3,c1,c2);
1425 $ADDU $t_2,$at
1426 $ADDU $c_1,$t_2
1427 sltu $at,$c_1,$t_2
1428 $ADDU $c_2,$at
1429 mflo $t_1
1430 mfhi $t_2
1431 $ADDU $c_3,$t_1
1432 sltu $at,$c_3,$t_1
1433 $MULTU $a_3,$b_5 # mul_add_c(a[3],b[5],c3,c1,c2);
1434 $ADDU $t_2,$at
1435 $ADDU $c_1,$t_2
1436 sltu $at,$c_1,$t_2
1437 $ADDU $c_2,$at
1438 mflo $t_1
1439 mfhi $t_2
1440 $ADDU $c_3,$t_1
1441 sltu $at,$c_3,$t_1
1442 $MULTU $a_2,$b_6 # mul_add_c(a[2],b[6],c3,c1,c2);
1443 $ADDU $t_2,$at
1444 $ADDU $c_1,$t_2
1445 sltu $at,$c_1,$t_2
1446 $ADDU $c_2,$at
1447 mflo $t_1
1448 mfhi $t_2
1449 $ADDU $c_3,$t_1
1450 sltu $at,$c_3,$t_1
1451 $MULTU $a_1,$b_7 # mul_add_c(a[1],b[7],c3,c1,c2);
1452 $ADDU $t_2,$at
1453 $ADDU $c_1,$t_2
1454 sltu $at,$c_1,$t_2
1455 $ADDU $c_2,$at
1456 mflo $t_1
1457 mfhi $t_2
1458 $ADDU $c_3,$t_1
1459 sltu $at,$c_3,$t_1
1460 $MULTU $a_2,$b_7 # mul_add_c(a[2],b[7],c1,c2,c3);
1461 $ADDU $t_2,$at
1462 $ADDU $c_1,$t_2
1463 sltu $at,$c_1,$t_2
1464 $ADDU $c_2,$at
1465 $ST $c_3,8*$BNSZ($a0) # r[8]=c3;
1467 mflo $t_1
1468 mfhi $t_2
1469 $ADDU $c_1,$t_1
1470 sltu $at,$c_1,$t_1
1471 $MULTU $a_3,$b_6 # mul_add_c(a[3],b[6],c1,c2,c3);
1472 $ADDU $t_2,$at
1473 $ADDU $c_2,$t_2
1474 sltu $c_3,$c_2,$t_2
1475 mflo $t_1
1476 mfhi $t_2
1477 $ADDU $c_1,$t_1
1478 sltu $at,$c_1,$t_1
1479 $MULTU $a_4,$b_5 # mul_add_c(a[4],b[5],c1,c2,c3);
1480 $ADDU $t_2,$at
1481 $ADDU $c_2,$t_2
1482 sltu $at,$c_2,$t_2
1483 $ADDU $c_3,$at
1484 mflo $t_1
1485 mfhi $t_2
1486 $ADDU $c_1,$t_1
1487 sltu $at,$c_1,$t_1
1488 $MULTU $a_5,$b_4 # mul_add_c(a[5],b[4],c1,c2,c3);
1489 $ADDU $t_2,$at
1490 $ADDU $c_2,$t_2
1491 sltu $at,$c_2,$t_2
1492 $ADDU $c_3,$at
1493 mflo $t_1
1494 mfhi $t_2
1495 $ADDU $c_1,$t_1
1496 sltu $at,$c_1,$t_1
1497 $MULTU $a_6,$b_3 # mul_add_c(a[6],b[3],c1,c2,c3);
1498 $ADDU $t_2,$at
1499 $ADDU $c_2,$t_2
1500 sltu $at,$c_2,$t_2
1501 $ADDU $c_3,$at
1502 mflo $t_1
1503 mfhi $t_2
1504 $ADDU $c_1,$t_1
1505 sltu $at,$c_1,$t_1
1506 $MULTU $a_7,$b_2 # mul_add_c(a[7],b[2],c1,c2,c3);
1507 $ADDU $t_2,$at
1508 $ADDU $c_2,$t_2
1509 sltu $at,$c_2,$t_2
1510 $ADDU $c_3,$at
1511 mflo $t_1
1512 mfhi $t_2
1513 $ADDU $c_1,$t_1
1514 sltu $at,$c_1,$t_1
1515 $MULTU $a_7,$b_3 # mul_add_c(a[7],b[3],c2,c3,c1);
1516 $ADDU $t_2,$at
1517 $ADDU $c_2,$t_2
1518 sltu $at,$c_2,$t_2
1519 $ADDU $c_3,$at
1520 $ST $c_1,9*$BNSZ($a0) # r[9]=c1;
1522 mflo $t_1
1523 mfhi $t_2
1524 $ADDU $c_2,$t_1
1525 sltu $at,$c_2,$t_1
1526 $MULTU $a_6,$b_4 # mul_add_c(a[6],b[4],c2,c3,c1);
1527 $ADDU $t_2,$at
1528 $ADDU $c_3,$t_2
1529 sltu $c_1,$c_3,$t_2
1530 mflo $t_1
1531 mfhi $t_2
1532 $ADDU $c_2,$t_1
1533 sltu $at,$c_2,$t_1
1534 $MULTU $a_5,$b_5 # mul_add_c(a[5],b[5],c2,c3,c1);
1535 $ADDU $t_2,$at
1536 $ADDU $c_3,$t_2
1537 sltu $at,$c_3,$t_2
1538 $ADDU $c_1,$at
1539 mflo $t_1
1540 mfhi $t_2
1541 $ADDU $c_2,$t_1
1542 sltu $at,$c_2,$t_1
1543 $MULTU $a_4,$b_6 # mul_add_c(a[4],b[6],c2,c3,c1);
1544 $ADDU $t_2,$at
1545 $ADDU $c_3,$t_2
1546 sltu $at,$c_3,$t_2
1547 $ADDU $c_1,$at
1548 mflo $t_1
1549 mfhi $t_2
1550 $ADDU $c_2,$t_1
1551 sltu $at,$c_2,$t_1
1552 $MULTU $a_3,$b_7 # mul_add_c(a[3],b[7],c2,c3,c1);
1553 $ADDU $t_2,$at
1554 $ADDU $c_3,$t_2
1555 sltu $at,$c_3,$t_2
1556 $ADDU $c_1,$at
1557 mflo $t_1
1558 mfhi $t_2
1559 $ADDU $c_2,$t_1
1560 sltu $at,$c_2,$t_1
1561 $MULTU $a_4,$b_7 # mul_add_c(a[4],b[7],c3,c1,c2);
1562 $ADDU $t_2,$at
1563 $ADDU $c_3,$t_2
1564 sltu $at,$c_3,$t_2
1565 $ADDU $c_1,$at
1566 $ST $c_2,10*$BNSZ($a0) # r[10]=c2;
1568 mflo $t_1
1569 mfhi $t_2
1570 $ADDU $c_3,$t_1
1571 sltu $at,$c_3,$t_1
1572 $MULTU $a_5,$b_6 # mul_add_c(a[5],b[6],c3,c1,c2);
1573 $ADDU $t_2,$at
1574 $ADDU $c_1,$t_2
1575 sltu $c_2,$c_1,$t_2
1576 mflo $t_1
1577 mfhi $t_2
1578 $ADDU $c_3,$t_1
1579 sltu $at,$c_3,$t_1
1580 $MULTU $a_6,$b_5 # mul_add_c(a[6],b[5],c3,c1,c2);
1581 $ADDU $t_2,$at
1582 $ADDU $c_1,$t_2
1583 sltu $at,$c_1,$t_2
1584 $ADDU $c_2,$at
1585 mflo $t_1
1586 mfhi $t_2
1587 $ADDU $c_3,$t_1
1588 sltu $at,$c_3,$t_1
1589 $MULTU $a_7,$b_4 # mul_add_c(a[7],b[4],c3,c1,c2);
1590 $ADDU $t_2,$at
1591 $ADDU $c_1,$t_2
1592 sltu $at,$c_1,$t_2
1593 $ADDU $c_2,$at
1594 mflo $t_1
1595 mfhi $t_2
1596 $ADDU $c_3,$t_1
1597 sltu $at,$c_3,$t_1
1598 $MULTU $a_7,$b_5 # mul_add_c(a[7],b[5],c1,c2,c3);
1599 $ADDU $t_2,$at
1600 $ADDU $c_1,$t_2
1601 sltu $at,$c_1,$t_2
1602 $ADDU $c_2,$at
1603 $ST $c_3,11*$BNSZ($a0) # r[11]=c3;
1605 mflo $t_1
1606 mfhi $t_2
1607 $ADDU $c_1,$t_1
1608 sltu $at,$c_1,$t_1
1609 $MULTU $a_6,$b_6 # mul_add_c(a[6],b[6],c1,c2,c3);
1610 $ADDU $t_2,$at
1611 $ADDU $c_2,$t_2
1612 sltu $c_3,$c_2,$t_2
1613 mflo $t_1
1614 mfhi $t_2
1615 $ADDU $c_1,$t_1
1616 sltu $at,$c_1,$t_1
1617 $MULTU $a_5,$b_7 # mul_add_c(a[5],b[7],c1,c2,c3);
1618 $ADDU $t_2,$at
1619 $ADDU $c_2,$t_2
1620 sltu $at,$c_2,$t_2
1621 $ADDU $c_3,$at
1622 mflo $t_1
1623 mfhi $t_2
1624 $ADDU $c_1,$t_1
1625 sltu $at,$c_1,$t_1
1626 $MULTU $a_6,$b_7 # mul_add_c(a[6],b[7],c2,c3,c1);
1627 $ADDU $t_2,$at
1628 $ADDU $c_2,$t_2
1629 sltu $at,$c_2,$t_2
1630 $ADDU $c_3,$at
1631 $ST $c_1,12*$BNSZ($a0) # r[12]=c1;
1633 mflo $t_1
1634 mfhi $t_2
1635 $ADDU $c_2,$t_1
1636 sltu $at,$c_2,$t_1
1637 $MULTU $a_7,$b_6 # mul_add_c(a[7],b[6],c2,c3,c1);
1638 $ADDU $t_2,$at
1639 $ADDU $c_3,$t_2
1640 sltu $c_1,$c_3,$t_2
1641 mflo $t_1
1642 mfhi $t_2
1643 $ADDU $c_2,$t_1
1644 sltu $at,$c_2,$t_1
1645 $MULTU $a_7,$b_7 # mul_add_c(a[7],b[7],c3,c1,c2);
1646 $ADDU $t_2,$at
1647 $ADDU $c_3,$t_2
1648 sltu $at,$c_3,$t_2
1649 $ADDU $c_1,$at
1650 $ST $c_2,13*$BNSZ($a0) # r[13]=c2;
1652 mflo $t_1
1653 mfhi $t_2
1654 $ADDU $c_3,$t_1
1655 sltu $at,$c_3,$t_1
1656 $ADDU $t_2,$at
1657 $ADDU $c_1,$t_2
1658 $ST $c_3,14*$BNSZ($a0) # r[14]=c3;
1659 $ST $c_1,15*$BNSZ($a0) # r[15]=c1;
1661 .set noreorder
1663 $code.=<<___ if ($flavour =~ /nubi/i);
1664 $REG_L $s5,10*$SZREG($sp)
1665 $REG_L $s4,9*$SZREG($sp)
1666 $REG_L $s3,8*$SZREG($sp)
1667 $REG_L $s2,7*$SZREG($sp)
1668 $REG_L $s1,6*$SZREG($sp)
1669 $REG_L $s0,5*$SZREG($sp)
1670 $REG_L $t3,4*$SZREG($sp)
1671 $REG_L $t2,3*$SZREG($sp)
1672 $REG_L $t1,2*$SZREG($sp)
1673 $REG_L $t0,1*$SZREG($sp)
1674 $REG_L $gp,0*$SZREG($sp)
1675 jr $ra
1676 $PTR_ADD $sp,12*$SZREG
1678 $code.=<<___ if ($flavour !~ /nubi/i);
1679 $REG_L $s5,5*$SZREG($sp)
1680 $REG_L $s4,4*$SZREG($sp)
1681 $REG_L $s3,3*$SZREG($sp)
1682 $REG_L $s2,2*$SZREG($sp)
1683 $REG_L $s1,1*$SZREG($sp)
1684 $REG_L $s0,0*$SZREG($sp)
1685 jr $ra
1686 $PTR_ADD $sp,6*$SZREG
# Close bn_mul_comba8 and emit bn_mul_comba4.
#
# bn_mul_comba4 loads four words of a[] through $a1 and four words of b[]
# through $a2, and stores the eight-word product through $a0.  The product is
# built one result column at a time in the rotating accumulators
# $c_1/$c_2/$c_3: each mflo/mfhi pair collects the product started by the
# previous $MULTU, and the trailing mul_add_c() comment on a $MULTU names the
# product being *started* there (it is accumulated by the following step).
#
# Every heredoc is explicitly terminated, and the final "jr $ra" (emitted
# under ".set noreorder") is followed by a nop so that nothing from the next
# routine can end up in its branch delay slot.
$code.=<<___;
.end	bn_mul_comba8

.align	5
.globl	bn_mul_comba4
.ent	bn_mul_comba4
bn_mul_comba4:
___
# NUBI flavour only: save $ra, $t0-$t3 and $gp in a 6-slot frame.
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
$code.=<<___;
	.set	reorder
	$LD	$a_0,0($a1)
	$LD	$b_0,0($a2)
	$LD	$a_1,$BNSZ($a1)
	$LD	$a_2,2*$BNSZ($a1)
	$MULTU	$a_0,$b_0		# mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	$a_3,3*$BNSZ($a1)
	$LD	$b_1,$BNSZ($a2)
	$LD	$b_2,2*$BNSZ($a2)
	$LD	$b_3,3*$BNSZ($a2)
	mflo	$c_1
	mfhi	$c_2
	$ST	$c_1,0($a0)		# r[0]=c1;

	$MULTU	$a_0,$b_1		# mul_add_c(a[0],b[1],c2,c3,c1);
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_1,$b_0		# mul_add_c(a[1],b[0],c2,c3,c1);
	$ADDU	$c_3,$t_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_2,$b_0		# mul_add_c(a[2],b[0],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	$ST	$c_2,$BNSZ($a0)		# r[1]=c2;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_1,$b_1		# mul_add_c(a[1],b[1],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_0,$b_2		# mul_add_c(a[0],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_0,$b_3		# mul_add_c(a[0],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,2*$BNSZ($a0)	# r[2]=c3;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_1,$b_2		# mul_add_c(a[1],b[2],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$c_3,$c_2,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_2,$b_1		# mul_add_c(a[2],b[1],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_3,$b_0		# mul_add_c(a[3],b[0],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_3,$b_1		# mul_add_c(a[3],b[1],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,3*$BNSZ($a0)	# r[3]=c1;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_2,$b_2		# mul_add_c(a[2],b[2],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_1,$b_3		# mul_add_c(a[1],b[3],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_2,$b_3		# mul_add_c(a[2],b[3],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,4*$BNSZ($a0)	# r[4]=c2;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_3,$b_2		# mul_add_c(a[3],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_3,$b_3		# mul_add_c(a[3],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,5*$BNSZ($a0)	# r[5]=c3;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	$ST	$c_1,6*$BNSZ($a0)	# r[6]=c1;
	$ST	$c_2,7*$BNSZ($a0)	# r[7]=c2;

	.set	noreorder
___
# NUBI flavour only: restore the saved temporaries and release the frame.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
# The nop fills the delay slot of "jr $ra"; ".set noreorder" is in effect, so
# the assembler will not insert one on its own.
$code.=<<___;
	jr	$ra
	nop
.end	bn_mul_comba4
___
# The squaring routines read a single input vector through $a1, so the
# registers that carried b[0..3] in the multiplication routines are reused
# to hold a[4..7].  $a2 is free as well and serves as a scratch register.
($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3);

# _sqr_add_c2(hi, lo, c0, c1, c2, warm, an, bn)
#
# Emit one "add twice the pending product" step of a squaring column:
#   (c2:c1:c0) += 2 * (HI:LO)
# and start the multiplication an*bn whose result the *next* step consumes.
#
#   hi, lo     scratch registers that receive HI and LO (clobbered);
#   c0,c1,c2   low, middle and top accumulator registers of the column;
#   warm       false for the first step of a column, when c2 holds a stale
#              value and is (re)initialised from the carry; true when c2
#              already holds carries of this column and must be added to;
#   an, bn     operands of the forward multiplication.
#
# The product is added twice, with the carry out of every addition tracked
# separately.  The previous formulation doubled HI:LO in place and then added
# the low-word carry to the doubled high word; when that word was all ones
# the addition wrapped to zero and the carry into c2 was silently dropped,
# giving a wrong square for some inputs.
sub _sqr_add_c2 {
    my ($hi, $lo, $c0, $c1, $c2, $warm, $an, $bn) = @_;

    # HI <= 2^w-2 for a product of two w-bit words, so "at += hi" and
    # "hi += carry" below cannot wrap themselves.
    $code.=<<___;
	mflo	$lo
	mfhi	$hi
	$ADDU	$c0,$lo
	sltu	$at,$c0,$lo
	$MULTU	$an,$bn			# forward multiplication
	$ADDU	$c0,$lo
	$ADDU	$at,$hi
	sltu	$lo,$c0,$lo
	$ADDU	$c1,$at
	$ADDU	$hi,$lo
___
    $code.=<<___ if (!$warm);
	sltu	$c2,$c1,$at
	$ADDU	$c1,$hi
	sltu	$hi,$c1,$hi
	$ADDU	$c2,$hi
___
    $code.=<<___ if ($warm);
	sltu	$at,$c1,$at
	$ADDU	$c1,$hi
	$ADDU	$c2,$at
	sltu	$hi,$c1,$hi
	$ADDU	$c2,$hi
___
    return;
}

# _sqr_add_c(hi, lo, c0, c1, c2, an, bn)
#
# Emit one "add the pending product once" step (used for the a[i]*a[i]
# terms):  (c2:c1:c0) += (HI:LO), then start the forward multiplication
# an*bn.  c2 must already be valid for the current column.
sub _sqr_add_c {
    my ($hi, $lo, $c0, $c1, $c2, $an, $bn) = @_;

    $code.=<<___;
	mflo	$lo
	mfhi	$hi
	$ADDU	$c0,$lo
	sltu	$at,$c0,$lo
	$MULTU	$an,$bn			# forward multiplication
	$ADDU	$hi,$at
	$ADDU	$c1,$hi
	sltu	$at,$c1,$hi
	$ADDU	$c2,$at
___
    return;
}

# ---------------------------------------------------------------------------
# bn_sqr_comba8: stores the 16-word square of the 8 words read through $a1
# to the buffer addressed by $a0.
# ---------------------------------------------------------------------------
$code.=<<___;

.align	5
.globl	bn_sqr_comba8
.ent	bn_sqr_comba8
bn_sqr_comba8:
___
# NUBI flavour only: save $ra, $t0-$t3 and $gp in a 6-slot frame.
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# Columns 0 and 1.  Column 1 starts with empty upper accumulators, so the
# doubling is done in place; the final sltu/ADDU pair catches the case where
# adding the low-word carry wraps the doubled high word.
$code.=<<___;
	.set	reorder
	$LD	$a_0,0($a1)
	$LD	$a_1,$BNSZ($a1)
	$LD	$a_2,2*$BNSZ($a1)
	$LD	$a_3,3*$BNSZ($a1)

	$MULTU	$a_0,$a_0		# a[0]*a[0] -> c1,c2
	$LD	$a_4,4*$BNSZ($a1)
	$LD	$a_5,5*$BNSZ($a1)
	$LD	$a_6,6*$BNSZ($a1)
	$LD	$a_7,7*$BNSZ($a1)
	mflo	$c_1
	mfhi	$c_2
	$ST	$c_1,0($a0)		# r[0]=c1;

	$MULTU	$a_0,$a_1		# 2*a[0]*a[1] -> c2,c3,c1
	mflo	$t_1
	mfhi	$t_2
	slt	$c_1,$t_2,$zero		# bit shifted out of 2*hi
	$SLL	$t_2,1
	$MULTU	$a_2,$a_0		# forward multiplication
	slt	$a2,$t_1,$zero		# bit shifted out of 2*lo
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$c_3,$t_2,$at
	sltu	$at,$c_3,$t_2		# did hi+carry wrap?
	$ADDU	$c_1,$at
	$ST	$c_2,$BNSZ($a0)		# r[1]=c2;
___
# Column 2 -> r[2], accumulators (c3,c1,c2).
_sqr_add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0, $a_1,$a_1);	# += 2*a[2]*a[0]
_sqr_add_c ($t_2,$t_1,$c_3,$c_1,$c_2,   $a_0,$a_3);	# +=   a[1]*a[1]
$code.=<<___;
	$ST	$c_3,2*$BNSZ($a0)	# r[2]=c3;
___
# Column 3 -> r[3], accumulators (c1,c2,c3).
_sqr_add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0, $a_1,$a_2);	# += 2*a[0]*a[3]
_sqr_add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1, $a_4,$a_0);	# += 2*a[1]*a[2]
$code.=<<___;
	$ST	$c_1,3*$BNSZ($a0)	# r[3]=c1;
___
# Column 4 -> r[4], accumulators (c2,c3,c1).
_sqr_add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0, $a_3,$a_1);	# += 2*a[4]*a[0]
_sqr_add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1, $a_2,$a_2);	# += 2*a[3]*a[1]
_sqr_add_c ($t_2,$t_1,$c_2,$c_3,$c_1,   $a_0,$a_5);	# +=   a[2]*a[2]
$code.=<<___;
	$ST	$c_2,4*$BNSZ($a0)	# r[4]=c2;
___
# Column 5 -> r[5], accumulators (c3,c1,c2).
_sqr_add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0, $a_1,$a_4);	# += 2*a[0]*a[5]
_sqr_add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1, $a_2,$a_3);	# += 2*a[1]*a[4]
_sqr_add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1, $a_6,$a_0);	# += 2*a[2]*a[3]
$code.=<<___;
	$ST	$c_3,5*$BNSZ($a0)	# r[5]=c3;
___
# Column 6 -> r[6], accumulators (c1,c2,c3).
_sqr_add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0, $a_5,$a_1);	# += 2*a[6]*a[0]
_sqr_add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1, $a_4,$a_2);	# += 2*a[5]*a[1]
_sqr_add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1, $a_3,$a_3);	# += 2*a[4]*a[2]
_sqr_add_c ($t_2,$t_1,$c_1,$c_2,$c_3,   $a_0,$a_7);	# +=   a[3]*a[3]
$code.=<<___;
	$ST	$c_1,6*$BNSZ($a0)	# r[6]=c1;
___
# Column 7 -> r[7], accumulators (c2,c3,c1).
_sqr_add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0, $a_1,$a_6);	# += 2*a[0]*a[7]
_sqr_add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1, $a_2,$a_5);	# += 2*a[1]*a[6]
_sqr_add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1, $a_3,$a_4);	# += 2*a[2]*a[5]
_sqr_add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1, $a_7,$a_1);	# += 2*a[3]*a[4]
$code.=<<___;
	$ST	$c_2,7*$BNSZ($a0)	# r[7]=c2;
___
# Column 8 -> r[8], accumulators (c3,c1,c2).
_sqr_add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0, $a_6,$a_2);	# += 2*a[7]*a[1]
_sqr_add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1, $a_5,$a_3);	# += 2*a[6]*a[2]
_sqr_add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1, $a_4,$a_4);	# += 2*a[5]*a[3]
_sqr_add_c ($t_2,$t_1,$c_3,$c_1,$c_2,   $a_2,$a_7);	# +=   a[4]*a[4]
$code.=<<___;
	$ST	$c_3,8*$BNSZ($a0)	# r[8]=c3;
___
# Column 9 -> r[9], accumulators (c1,c2,c3).
_sqr_add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0, $a_3,$a_6);	# += 2*a[2]*a[7]
_sqr_add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1, $a_4,$a_5);	# += 2*a[3]*a[6]
_sqr_add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1, $a_7,$a_3);	# += 2*a[4]*a[5]
$code.=<<___;
	$ST	$c_1,9*$BNSZ($a0)	# r[9]=c1;
___
# Column 10 -> r[10], accumulators (c2,c3,c1).
_sqr_add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0, $a_6,$a_4);	# += 2*a[7]*a[3]
_sqr_add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1, $a_5,$a_5);	# += 2*a[6]*a[4]
_sqr_add_c ($t_2,$t_1,$c_2,$c_3,$c_1,   $a_4,$a_7);	# +=   a[5]*a[5]
$code.=<<___;
	$ST	$c_2,10*$BNSZ($a0)	# r[10]=c2;
___
# Column 11 -> r[11], accumulators (c3,c1,c2).
_sqr_add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0, $a_5,$a_6);	# += 2*a[4]*a[7]
_sqr_add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1, $a_7,$a_5);	# += 2*a[5]*a[6]
$code.=<<___;
	$ST	$c_3,11*$BNSZ($a0)	# r[11]=c3;
___
# Column 12 -> r[12], accumulators (c1,c2,c3).
_sqr_add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0, $a_6,$a_6);	# += 2*a[7]*a[5]
_sqr_add_c ($t_2,$t_1,$c_1,$c_2,$c_3,   $a_6,$a_7);	# +=   a[6]*a[6]
$code.=<<___;
	$ST	$c_1,12*$BNSZ($a0)	# r[12]=c1;
___
# Column 13 -> r[13], accumulators (c2,c3,c1).
_sqr_add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0, $a_7,$a_7);	# += 2*a[6]*a[7]
# Columns 14/15: add the pending a[7]*a[7]; nothing is left to multiply.
$code.=<<___;
	$ST	$c_2,13*$BNSZ($a0)	# r[13]=c2;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	$ST	$c_3,14*$BNSZ($a0)	# r[14]=c3;
	$ST	$c_1,15*$BNSZ($a0)	# r[15]=c1;

	.set	noreorder
___
# NUBI flavour only: restore the saved temporaries and release the frame.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
# ---------------------------------------------------------------------------
# bn_sqr_comba4: stores the 8-word square of the 4 words read through $a1
# to the buffer addressed by $a0.  The nop after each "jr $ra" fills the
# branch delay slot (".set noreorder" is in effect at that point).
# ---------------------------------------------------------------------------
$code.=<<___;
	jr	$ra
	nop
.end	bn_sqr_comba8

.align	5
.globl	bn_sqr_comba4
.ent	bn_sqr_comba4
bn_sqr_comba4:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# Columns 0 and 1, same scheme as in bn_sqr_comba8.
$code.=<<___;
	.set	reorder
	$LD	$a_0,0($a1)
	$LD	$a_1,$BNSZ($a1)
	$MULTU	$a_0,$a_0		# a[0]*a[0] -> c1,c2
	$LD	$a_2,2*$BNSZ($a1)
	$LD	$a_3,3*$BNSZ($a1)
	mflo	$c_1
	mfhi	$c_2
	$ST	$c_1,0($a0)		# r[0]=c1;

	$MULTU	$a_0,$a_1		# 2*a[0]*a[1] -> c2,c3,c1
	mflo	$t_1
	mfhi	$t_2
	slt	$c_1,$t_2,$zero		# bit shifted out of 2*hi
	$SLL	$t_2,1
	$MULTU	$a_2,$a_0		# forward multiplication
	slt	$a2,$t_1,$zero		# bit shifted out of 2*lo
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$c_3,$t_2,$at
	sltu	$at,$c_3,$t_2		# did hi+carry wrap?
	$ADDU	$c_1,$at
	$ST	$c_2,$BNSZ($a0)		# r[1]=c2;
___
# Column 2 -> r[2], accumulators (c3,c1,c2).
_sqr_add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0, $a_1,$a_1);	# += 2*a[2]*a[0]
_sqr_add_c ($t_2,$t_1,$c_3,$c_1,$c_2,   $a_0,$a_3);	# +=   a[1]*a[1]
$code.=<<___;
	$ST	$c_3,2*$BNSZ($a0)	# r[2]=c3;
___
# Column 3 -> r[3], accumulators (c1,c2,c3).
_sqr_add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0, $a_1,$a_2);	# += 2*a[0]*a[3]
_sqr_add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1, $a_3,$a_1);	# += 2*a[1]*a[2]
$code.=<<___;
	$ST	$c_1,3*$BNSZ($a0)	# r[3]=c1;
___
# Column 4 -> r[4], accumulators (c2,c3,c1).
_sqr_add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0, $a_2,$a_2);	# += 2*a[3]*a[1]
_sqr_add_c ($t_2,$t_1,$c_2,$c_3,$c_1,   $a_2,$a_3);	# +=   a[2]*a[2]
$code.=<<___;
	$ST	$c_2,4*$BNSZ($a0)	# r[4]=c2;
___
# Column 5 -> r[5], accumulators (c3,c1,c2).
_sqr_add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0, $a_3,$a_3);	# += 2*a[2]*a[3]
# Columns 6/7: add the pending a[3]*a[3]; nothing is left to multiply.
$code.=<<___;
	$ST	$c_3,5*$BNSZ($a0)	# r[5]=c3;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	$ST	$c_1,6*$BNSZ($a0)	# r[6]=c1;
	$ST	$c_2,7*$BNSZ($a0)	# r[7]=c2;

	.set	noreorder
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	nop
.end	bn_sqr_comba4
___
# Write the accumulated assembly to STDOUT.  Buffered write errors (full
# disk, closed pipe, ...) are only reported when the handle is closed, so the
# result of close is checked: a truncated assembly file must fail the build
# instead of being assembled silently.
print $code;
close STDOUT or die "error closing STDOUT: $!";