Scraped git-blob header — commit subject: "Correct PPTP server firewall rules chain."
[tomato/davidwu.git] / release / src / router / openssl / crypto / bn / asm / mips.pl
blob d2f3ef7bbf2cac96fd022003b4076096300c2f16
1 #!/usr/bin/env perl
3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project.
7 # Rights for redistribution and usage in source and binary forms are
8 # granted according to the OpenSSL license. Warranty of any kind is
9 # disclaimed.
10 # ====================================================================
13 # July 1999
15 # This is drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c.
17 # The module is designed to work with either of the "new" MIPS ABI(5),
18 # namely N32 or N64, offered by IRIX 6.x. It's not ment to work under
19 # IRIX 5.x not only because it doesn't support new ABIs but also
20 # because 5.x kernels put R4x00 CPU into 32-bit mode and all those
21 # 64-bit instructions (daddu, dmultu, etc.) found below gonna only
22 # cause illegal instruction exception:-(
24 # In addition the code depends on preprocessor flags set up by MIPSpro
25 # compiler driver (either as or cc) and therefore (probably?) can't be
26 # compiled by the GNU assembler. GNU C driver manages fine though...
27 # I mean as long as -mmips-as is specified or is the default option,
28 # because then it simply invokes /usr/bin/as which in turn takes
29 # perfect care of the preprocessor definitions. Another neat feature
30 # offered by the MIPSpro assembler is an optimization pass. This gave
31 # me the opportunity to have the code looking more regular as all those
32 # architecture dependent instruction rescheduling details were left to
33 # the assembler. Cool, huh?
35 # Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
36 # goes way over 3 times faster!
38 # <appro@fy.chalmers.se>
40 # October 2010
42 # Adapt the module even for 32-bit ABIs and other OSes. The former was
43 # achieved by mechanical replacement of 64-bit arithmetic instructions
44 # such as dmultu, daddu, etc. with their 32-bit counterparts and
45 # adjusting offsets denoting multiples of BN_ULONG. Above mentioned
46 # >3x performance improvement naturally does not apply to 32-bit code
47 # [because there is no instruction 32-bit compiler can't use], one
48 # has to content with 40-85% improvement depending on benchmark and
49 # key length, more for longer keys.
# Command line: <flavour> [args...] — $flavour selects the target ABI
# (o32 / n32 / 64 / nubi); the remaining arguments are scanned for the
# first token that looks like an output file name (word chars, one dot).
$flavour = shift;
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
# Checked three-argument open (the original used an unchecked two-arg
# open, which silently discards all generated assembly on failure and
# allows mode characters in $output to change the open mode).
open STDOUT, '>', $output or die "can't open $output: $!";
# Per-ABI macro setup: for the 64-bit ABIs (n32/n64) use doubleword
# loads/stores and 64-bit arithmetic (ld/sd, dmultu, daddu, ...) with an
# 8-byte BN word ($BNSZ) and 8-byte callee-save slots ($SZREG); otherwise
# fall back to the 32-bit counterparts with 4-byte words and request at
# least the MIPS II ISA (".set mips2") in the emitted code.
55 if ($flavour =~ /64|n32/i) {
56 $LD="ld";
57 $ST="sd";
58 $MULTU="dmultu";
59 $DIVU="ddivu";
60 $ADDU="daddu";
61 $SUBU="dsubu";
62 $SRL="dsrl";
63 $SLL="dsll";
64 $BNSZ=8;
65 $PTR_ADD="daddu";
66 $PTR_SUB="dsubu";
67 $SZREG=8;
68 $REG_S="sd";
69 $REG_L="ld";
70 } else {
71 $LD="lw";
72 $ST="sw";
73 $MULTU="multu";
74 $DIVU="divu";
75 $ADDU="addu";
76 $SUBU="subu";
77 $SRL="srl";
78 $SLL="sll";
79 $BNSZ=4;
80 $PTR_ADD="addu";
81 $PTR_SUB="subu";
82 $SZREG=4;
83 $REG_S="sw";
84 $REG_L="lw";
85 $code=".set mips2\n";
# NOTE(review): the closing '}' of this else-branch is not visible here —
# it appears to have been dropped during extraction; restore it (and the
# heredoc terminators flagged below) before attempting to run this script.
88 # Below is N32/64 register layout used in the original module.
90 ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
91 ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
92 ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
93 ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
94 ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
# $ta0..$ta3 alias the upper argument registers $a4..$a7, giving the
# 32-bit flavours four extra temporaries under familiar names.
95 ($ta0,$ta1,$ta2,$ta3)=($a4,$a5,$a6,$a7);
97 # No special adaptation is required for O32. NUBI on the other hand
98 # is treated by saving/restoring ($v1,$t0..$t3).
100 $gp=$v1 if ($flavour =~ /nubi/i);
# $minus4 shares $v1; it holds the -4 mask used to round loop counts
# down to a multiple of the 4-way unroll in the routines below.
102 $minus4=$v1;
# bn_mul_add_words(rp=$a0, ap=$a1, num=$a2, w=$a3):
#   rp[i] += ap[i]*w, accumulating the running carry in $v0.
# The public entry falls through to _internal only when num > 0; the body
# is a 4-way unrolled multiply-accumulate loop followed by a tail that
# handles the remaining 0..3 words.
# NOTE(review): result is kept in $v0 and also copied into $a0 in the
# return delay slot — presumably to satisfy either return convention;
# confirm against the pristine file.
104 $code.=<<___;
105 .rdata
106 .asciiz "mips3.s, Version 1.2"
107 .asciiz "MIPS II/III/IV ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>"
109 .text
110 .set noat
112 .align 5
113 .globl bn_mul_add_words
114 .ent bn_mul_add_words
115 bn_mul_add_words:
116 .set noreorder
117 bgtz $a2,bn_mul_add_words_internal
118 move $v0,$zero
119 jr $ra
120 move $a0,$v0
121 .end bn_mul_add_words
123 .align 5
124 .ent bn_mul_add_words_internal
125 bn_mul_add_words_internal:
# NUBI flavours must preserve $t0..$t3 and $gp: save them plus $ra.
# NOTE(review): a heredoc terminator line ('___') appears to be missing
# above this statement — lost in extraction; restore before running.
127 $code.=<<___ if ($flavour =~ /nubi/i);
128 .frame $sp,6*$SZREG,$ra
129 .mask 0x8000f008,-$SZREG
130 .set noreorder
131 $PTR_SUB $sp,6*$SZREG
132 $REG_S $ra,5*$SZREG($sp)
133 $REG_S $t3,4*$SZREG($sp)
134 $REG_S $t2,3*$SZREG($sp)
135 $REG_S $t1,2*$SZREG($sp)
136 $REG_S $t0,1*$SZREG($sp)
137 $REG_S $gp,0*$SZREG($sp)
139 $code.=<<___;
140 .set reorder
141 li $minus4,-4
142 and $ta0,$a2,$minus4
143 beqz $ta0,.L_bn_mul_add_words_tail
# Main loop: four words per iteration; each step adds the incoming carry,
# then the mflo/mfhi product halves, tracking overflow with sltu.
145 .L_bn_mul_add_words_loop:
146 $LD $t0,0($a1)
147 $MULTU $t0,$a3
148 $LD $t1,0($a0)
149 $LD $t2,$BNSZ($a1)
150 $LD $t3,$BNSZ($a0)
151 $LD $ta0,2*$BNSZ($a1)
152 $LD $ta1,2*$BNSZ($a0)
153 $ADDU $t1,$v0
154 sltu $v0,$t1,$v0 # All manuals say it "compares 32-bit
155 # values", but it seems to work fine
156 # even on 64-bit registers.
157 mflo $at
158 mfhi $t0
159 $ADDU $t1,$at
160 $ADDU $v0,$t0
161 $MULTU $t2,$a3
162 sltu $at,$t1,$at
163 $ST $t1,0($a0)
164 $ADDU $v0,$at
166 $LD $ta2,3*$BNSZ($a1)
167 $LD $ta3,3*$BNSZ($a0)
168 $ADDU $t3,$v0
169 sltu $v0,$t3,$v0
170 mflo $at
171 mfhi $t2
172 $ADDU $t3,$at
173 $ADDU $v0,$t2
174 $MULTU $ta0,$a3
175 sltu $at,$t3,$at
176 $ST $t3,$BNSZ($a0)
177 $ADDU $v0,$at
179 subu $a2,4
180 $PTR_ADD $a0,4*$BNSZ
181 $PTR_ADD $a1,4*$BNSZ
182 $ADDU $ta1,$v0
183 sltu $v0,$ta1,$v0
184 mflo $at
185 mfhi $ta0
186 $ADDU $ta1,$at
187 $ADDU $v0,$ta0
188 $MULTU $ta2,$a3
189 sltu $at,$ta1,$at
190 $ST $ta1,-2*$BNSZ($a0)
191 $ADDU $v0,$at
194 and $ta0,$a2,$minus4
195 $ADDU $ta3,$v0
196 sltu $v0,$ta3,$v0
197 mflo $at
198 mfhi $ta2
199 $ADDU $ta3,$at
200 $ADDU $v0,$ta2
201 sltu $at,$ta3,$at
202 $ST $ta3,-$BNSZ($a0)
203 .set noreorder
204 bgtz $ta0,.L_bn_mul_add_words_loop
205 $ADDU $v0,$at
207 beqz $a2,.L_bn_mul_add_words_return
# Tail: one word at a time for the remaining 0..3 words.
210 .L_bn_mul_add_words_tail:
211 .set reorder
212 $LD $t0,0($a1)
213 $MULTU $t0,$a3
214 $LD $t1,0($a0)
215 subu $a2,1
216 $ADDU $t1,$v0
217 sltu $v0,$t1,$v0
218 mflo $at
219 mfhi $t0
220 $ADDU $t1,$at
221 $ADDU $v0,$t0
222 sltu $at,$t1,$at
223 $ST $t1,0($a0)
224 $ADDU $v0,$at
225 beqz $a2,.L_bn_mul_add_words_return
227 $LD $t0,$BNSZ($a1)
228 $MULTU $t0,$a3
229 $LD $t1,$BNSZ($a0)
230 subu $a2,1
231 $ADDU $t1,$v0
232 sltu $v0,$t1,$v0
233 mflo $at
234 mfhi $t0
235 $ADDU $t1,$at
236 $ADDU $v0,$t0
237 sltu $at,$t1,$at
238 $ST $t1,$BNSZ($a0)
239 $ADDU $v0,$at
240 beqz $a2,.L_bn_mul_add_words_return
242 $LD $t0,2*$BNSZ($a1)
243 $MULTU $t0,$a3
244 $LD $t1,2*$BNSZ($a0)
245 $ADDU $t1,$v0
246 sltu $v0,$t1,$v0
247 mflo $at
248 mfhi $t0
249 $ADDU $t1,$at
250 $ADDU $v0,$t0
251 sltu $at,$t1,$at
252 $ST $t1,2*$BNSZ($a0)
253 $ADDU $v0,$at
255 .L_bn_mul_add_words_return:
256 .set noreorder
# NUBI epilogue: restore the saved temporaries and pop the frame.
258 $code.=<<___ if ($flavour =~ /nubi/i);
259 $REG_L $t3,4*$SZREG($sp)
260 $REG_L $t2,3*$SZREG($sp)
261 $REG_L $t1,2*$SZREG($sp)
262 $REG_L $t0,1*$SZREG($sp)
263 $REG_L $gp,0*$SZREG($sp)
264 $PTR_ADD $sp,6*$SZREG
266 $code.=<<___;
267 jr $ra
268 move $a0,$v0
269 .end bn_mul_add_words_internal
# bn_mul_words(rp=$a0, ap=$a1, num=$a2, w=$a3):
#   rp[i] = ap[i]*w (no addition of rp's previous contents, unlike
#   bn_mul_add_words above); running carry accumulated in $v0.
# Entry falls through to _internal only when num > 0; body is the same
# 4-way unrolled loop + 0..3-word tail shape as bn_mul_add_words.
# NOTE(review): a heredoc terminator line ('___') appears to be missing
# above this section — lost in extraction; restore before running.
271 .align 5
272 .globl bn_mul_words
273 .ent bn_mul_words
274 bn_mul_words:
275 .set noreorder
276 bgtz $a2,bn_mul_words_internal
277 move $v0,$zero
278 jr $ra
279 move $a0,$v0
280 .end bn_mul_words
282 .align 5
283 .ent bn_mul_words_internal
284 bn_mul_words_internal:
# NUBI flavours must preserve $t0..$t3 and $gp: save them plus $ra.
286 $code.=<<___ if ($flavour =~ /nubi/i);
287 .frame $sp,6*$SZREG,$ra
288 .mask 0x8000f008,-$SZREG
289 .set noreorder
290 $PTR_SUB $sp,6*$SZREG
291 $REG_S $ra,5*$SZREG($sp)
292 $REG_S $t3,4*$SZREG($sp)
293 $REG_S $t2,3*$SZREG($sp)
294 $REG_S $t1,2*$SZREG($sp)
295 $REG_S $t0,1*$SZREG($sp)
296 $REG_S $gp,0*$SZREG($sp)
298 $code.=<<___;
299 .set reorder
300 li $minus4,-4
301 and $ta0,$a2,$minus4
302 beqz $ta0,.L_bn_mul_words_tail
304 .L_bn_mul_words_loop:
305 $LD $t0,0($a1)
306 $MULTU $t0,$a3
307 $LD $t2,$BNSZ($a1)
308 $LD $ta0,2*$BNSZ($a1)
309 $LD $ta2,3*$BNSZ($a1)
310 mflo $at
311 mfhi $t0
312 $ADDU $v0,$at
313 sltu $t1,$v0,$at
314 $MULTU $t2,$a3
315 $ST $v0,0($a0)
316 $ADDU $v0,$t1,$t0
318 subu $a2,4
319 $PTR_ADD $a0,4*$BNSZ
320 $PTR_ADD $a1,4*$BNSZ
321 mflo $at
322 mfhi $t2
323 $ADDU $v0,$at
324 sltu $t3,$v0,$at
325 $MULTU $ta0,$a3
326 $ST $v0,-3*$BNSZ($a0)
327 $ADDU $v0,$t3,$t2
329 mflo $at
330 mfhi $ta0
331 $ADDU $v0,$at
332 sltu $ta1,$v0,$at
333 $MULTU $ta2,$a3
334 $ST $v0,-2*$BNSZ($a0)
335 $ADDU $v0,$ta1,$ta0
337 and $ta0,$a2,$minus4
338 mflo $at
339 mfhi $ta2
340 $ADDU $v0,$at
341 sltu $ta3,$v0,$at
342 $ST $v0,-$BNSZ($a0)
343 .set noreorder
344 bgtz $ta0,.L_bn_mul_words_loop
345 $ADDU $v0,$ta3,$ta2
347 beqz $a2,.L_bn_mul_words_return
# Tail: one word at a time for the remaining 0..3 words.
350 .L_bn_mul_words_tail:
351 .set reorder
352 $LD $t0,0($a1)
353 $MULTU $t0,$a3
354 subu $a2,1
355 mflo $at
356 mfhi $t0
357 $ADDU $v0,$at
358 sltu $t1,$v0,$at
359 $ST $v0,0($a0)
360 $ADDU $v0,$t1,$t0
361 beqz $a2,.L_bn_mul_words_return
363 $LD $t0,$BNSZ($a1)
364 $MULTU $t0,$a3
365 subu $a2,1
366 mflo $at
367 mfhi $t0
368 $ADDU $v0,$at
369 sltu $t1,$v0,$at
370 $ST $v0,$BNSZ($a0)
371 $ADDU $v0,$t1,$t0
372 beqz $a2,.L_bn_mul_words_return
374 $LD $t0,2*$BNSZ($a1)
375 $MULTU $t0,$a3
376 mflo $at
377 mfhi $t0
378 $ADDU $v0,$at
379 sltu $t1,$v0,$at
380 $ST $v0,2*$BNSZ($a0)
381 $ADDU $v0,$t1,$t0
383 .L_bn_mul_words_return:
384 .set noreorder
# NUBI epilogue: restore the saved temporaries and pop the frame.
386 $code.=<<___ if ($flavour =~ /nubi/i);
387 $REG_L $t3,4*$SZREG($sp)
388 $REG_L $t2,3*$SZREG($sp)
389 $REG_L $t1,2*$SZREG($sp)
390 $REG_L $t0,1*$SZREG($sp)
391 $REG_L $gp,0*$SZREG($sp)
392 $PTR_ADD $sp,6*$SZREG
394 $code.=<<___;
395 jr $ra
396 move $a0,$v0
397 .end bn_mul_words_internal
# bn_sqr_words(rp=$a0, ap=$a1, num=$a2):
#   rp[2i] = lo(ap[i]^2), rp[2i+1] = hi(ap[i]^2) — independent squares,
#   so there is no carry chain; rp advances twice as fast as ap
#   (note the 8*$BNSZ pointer bump vs 4*$BNSZ in the loop).
# NOTE(review): a heredoc terminator line ('___') appears to be missing
# above this section — lost in extraction; restore before running.
399 .align 5
400 .globl bn_sqr_words
401 .ent bn_sqr_words
402 bn_sqr_words:
403 .set noreorder
404 bgtz $a2,bn_sqr_words_internal
405 move $v0,$zero
406 jr $ra
407 move $a0,$v0
408 .end bn_sqr_words
410 .align 5
411 .ent bn_sqr_words_internal
412 bn_sqr_words_internal:
# NUBI flavours must preserve $t0..$t3 and $gp: save them plus $ra.
414 $code.=<<___ if ($flavour =~ /nubi/i);
415 .frame $sp,6*$SZREG,$ra
416 .mask 0x8000f008,-$SZREG
417 .set noreorder
418 $PTR_SUB $sp,6*$SZREG
419 $REG_S $ra,5*$SZREG($sp)
420 $REG_S $t3,4*$SZREG($sp)
421 $REG_S $t2,3*$SZREG($sp)
422 $REG_S $t1,2*$SZREG($sp)
423 $REG_S $t0,1*$SZREG($sp)
424 $REG_S $gp,0*$SZREG($sp)
426 $code.=<<___;
427 .set reorder
428 li $minus4,-4
429 and $ta0,$a2,$minus4
430 beqz $ta0,.L_bn_sqr_words_tail
432 .L_bn_sqr_words_loop:
433 $LD $t0,0($a1)
434 $MULTU $t0,$t0
435 $LD $t2,$BNSZ($a1)
436 $LD $ta0,2*$BNSZ($a1)
437 $LD $ta2,3*$BNSZ($a1)
438 mflo $t1
439 mfhi $t0
440 $ST $t1,0($a0)
441 $ST $t0,$BNSZ($a0)
443 $MULTU $t2,$t2
444 subu $a2,4
445 $PTR_ADD $a0,8*$BNSZ
446 $PTR_ADD $a1,4*$BNSZ
447 mflo $t3
448 mfhi $t2
449 $ST $t3,-6*$BNSZ($a0)
450 $ST $t2,-5*$BNSZ($a0)
452 $MULTU $ta0,$ta0
453 mflo $ta1
454 mfhi $ta0
455 $ST $ta1,-4*$BNSZ($a0)
456 $ST $ta0,-3*$BNSZ($a0)
459 $MULTU $ta2,$ta2
460 and $ta0,$a2,$minus4
461 mflo $ta3
462 mfhi $ta2
463 $ST $ta3,-2*$BNSZ($a0)
465 .set noreorder
466 bgtz $ta0,.L_bn_sqr_words_loop
467 $ST $ta2,-$BNSZ($a0)
469 beqz $a2,.L_bn_sqr_words_return
# Tail: remaining 0..3 words, two result words stored per input word.
472 .L_bn_sqr_words_tail:
473 .set reorder
474 $LD $t0,0($a1)
475 $MULTU $t0,$t0
476 subu $a2,1
477 mflo $t1
478 mfhi $t0
479 $ST $t1,0($a0)
480 $ST $t0,$BNSZ($a0)
481 beqz $a2,.L_bn_sqr_words_return
483 $LD $t0,$BNSZ($a1)
484 $MULTU $t0,$t0
485 subu $a2,1
486 mflo $t1
487 mfhi $t0
488 $ST $t1,2*$BNSZ($a0)
489 $ST $t0,3*$BNSZ($a0)
490 beqz $a2,.L_bn_sqr_words_return
492 $LD $t0,2*$BNSZ($a1)
493 $MULTU $t0,$t0
494 mflo $t1
495 mfhi $t0
496 $ST $t1,4*$BNSZ($a0)
497 $ST $t0,5*$BNSZ($a0)
499 .L_bn_sqr_words_return:
500 .set noreorder
# NUBI epilogue: restore the saved temporaries and pop the frame.
502 $code.=<<___ if ($flavour =~ /nubi/i);
503 $REG_L $t3,4*$SZREG($sp)
504 $REG_L $t2,3*$SZREG($sp)
505 $REG_L $t1,2*$SZREG($sp)
506 $REG_L $t0,1*$SZREG($sp)
507 $REG_L $gp,0*$SZREG($sp)
508 $PTR_ADD $sp,6*$SZREG
510 $code.=<<___;
511 jr $ra
512 move $a0,$v0
514 .end bn_sqr_words_internal
# bn_add_words(rp=$a0, ap=$a1, bp=$a2, num=$a3):
#   rp[i] = ap[i] + bp[i] + carry; final carry returned in $v0.
# Each word uses two sltu tests ($t8/$t9 for the a+b overflow, $v0 for
# the +carry overflow) folded back into $v0.
# NOTE(review): a heredoc terminator line ('___') appears to be missing
# above this section — lost in extraction; restore before running.
516 .align 5
517 .globl bn_add_words
518 .ent bn_add_words
519 bn_add_words:
520 .set noreorder
521 bgtz $a3,bn_add_words_internal
522 move $v0,$zero
523 jr $ra
524 move $a0,$v0
525 .end bn_add_words
527 .align 5
528 .ent bn_add_words_internal
529 bn_add_words_internal:
# NUBI flavours must preserve $t0..$t3 and $gp: save them plus $ra.
531 $code.=<<___ if ($flavour =~ /nubi/i);
532 .frame $sp,6*$SZREG,$ra
533 .mask 0x8000f008,-$SZREG
534 .set noreorder
535 $PTR_SUB $sp,6*$SZREG
536 $REG_S $ra,5*$SZREG($sp)
537 $REG_S $t3,4*$SZREG($sp)
538 $REG_S $t2,3*$SZREG($sp)
539 $REG_S $t1,2*$SZREG($sp)
540 $REG_S $t0,1*$SZREG($sp)
541 $REG_S $gp,0*$SZREG($sp)
543 $code.=<<___;
544 .set reorder
545 li $minus4,-4
546 and $at,$a3,$minus4
547 beqz $at,.L_bn_add_words_tail
549 .L_bn_add_words_loop:
550 $LD $t0,0($a1)
551 $LD $ta0,0($a2)
552 subu $a3,4
553 $LD $t1,$BNSZ($a1)
554 and $at,$a3,$minus4
555 $LD $t2,2*$BNSZ($a1)
556 $PTR_ADD $a2,4*$BNSZ
557 $LD $t3,3*$BNSZ($a1)
558 $PTR_ADD $a0,4*$BNSZ
559 $LD $ta1,-3*$BNSZ($a2)
560 $PTR_ADD $a1,4*$BNSZ
561 $LD $ta2,-2*$BNSZ($a2)
562 $LD $ta3,-$BNSZ($a2)
563 $ADDU $ta0,$t0
564 sltu $t8,$ta0,$t0
565 $ADDU $t0,$ta0,$v0
566 sltu $v0,$t0,$ta0
567 $ST $t0,-4*$BNSZ($a0)
568 $ADDU $v0,$t8
570 $ADDU $ta1,$t1
571 sltu $t9,$ta1,$t1
572 $ADDU $t1,$ta1,$v0
573 sltu $v0,$t1,$ta1
574 $ST $t1,-3*$BNSZ($a0)
575 $ADDU $v0,$t9
577 $ADDU $ta2,$t2
578 sltu $t8,$ta2,$t2
579 $ADDU $t2,$ta2,$v0
580 sltu $v0,$t2,$ta2
581 $ST $t2,-2*$BNSZ($a0)
582 $ADDU $v0,$t8
584 $ADDU $ta3,$t3
585 sltu $t9,$ta3,$t3
586 $ADDU $t3,$ta3,$v0
587 sltu $v0,$t3,$ta3
588 $ST $t3,-$BNSZ($a0)
590 .set noreorder
591 bgtz $at,.L_bn_add_words_loop
592 $ADDU $v0,$t9
594 beqz $a3,.L_bn_add_words_return
# Tail: remaining 0..3 words, one at a time.
597 .L_bn_add_words_tail:
598 .set reorder
599 $LD $t0,0($a1)
600 $LD $ta0,0($a2)
601 $ADDU $ta0,$t0
602 subu $a3,1
603 sltu $t8,$ta0,$t0
604 $ADDU $t0,$ta0,$v0
605 sltu $v0,$t0,$ta0
606 $ST $t0,0($a0)
607 $ADDU $v0,$t8
608 beqz $a3,.L_bn_add_words_return
610 $LD $t1,$BNSZ($a1)
611 $LD $ta1,$BNSZ($a2)
612 $ADDU $ta1,$t1
613 subu $a3,1
614 sltu $t9,$ta1,$t1
615 $ADDU $t1,$ta1,$v0
616 sltu $v0,$t1,$ta1
617 $ST $t1,$BNSZ($a0)
618 $ADDU $v0,$t9
619 beqz $a3,.L_bn_add_words_return
621 $LD $t2,2*$BNSZ($a1)
622 $LD $ta2,2*$BNSZ($a2)
623 $ADDU $ta2,$t2
624 sltu $t8,$ta2,$t2
625 $ADDU $t2,$ta2,$v0
626 sltu $v0,$t2,$ta2
627 $ST $t2,2*$BNSZ($a0)
628 $ADDU $v0,$t8
630 .L_bn_add_words_return:
631 .set noreorder
# NUBI epilogue: restore the saved temporaries and pop the frame.
633 $code.=<<___ if ($flavour =~ /nubi/i);
634 $REG_L $t3,4*$SZREG($sp)
635 $REG_L $t2,3*$SZREG($sp)
636 $REG_L $t1,2*$SZREG($sp)
637 $REG_L $t0,1*$SZREG($sp)
638 $REG_L $gp,0*$SZREG($sp)
639 $PTR_ADD $sp,6*$SZREG
641 $code.=<<___;
642 jr $ra
643 move $a0,$v0
645 .end bn_add_words_internal
# bn_sub_words(rp=$a0, ap=$a1, bp=$a2, num=$a3):
#   rp[i] = ap[i] - bp[i] - borrow; final borrow returned in $v0.
# Borrow tracking mirrors bn_add_words: sltu detects a<b underflow,
# sgtu detects the -borrow underflow, both folded into $v0.
# NOTE(review): a heredoc terminator line ('___') appears to be missing
# above this section — lost in extraction; restore before running.
647 .align 5
648 .globl bn_sub_words
649 .ent bn_sub_words
650 bn_sub_words:
651 .set noreorder
652 bgtz $a3,bn_sub_words_internal
653 move $v0,$zero
654 jr $ra
655 move $a0,$zero
656 .end bn_sub_words
658 .align 5
659 .ent bn_sub_words_internal
660 bn_sub_words_internal:
# NUBI flavours must preserve $t0..$t3 and $gp: save them plus $ra.
662 $code.=<<___ if ($flavour =~ /nubi/i);
663 .frame $sp,6*$SZREG,$ra
664 .mask 0x8000f008,-$SZREG
665 .set noreorder
666 $PTR_SUB $sp,6*$SZREG
667 $REG_S $ra,5*$SZREG($sp)
668 $REG_S $t3,4*$SZREG($sp)
669 $REG_S $t2,3*$SZREG($sp)
670 $REG_S $t1,2*$SZREG($sp)
671 $REG_S $t0,1*$SZREG($sp)
672 $REG_S $gp,0*$SZREG($sp)
674 $code.=<<___;
675 .set reorder
676 li $minus4,-4
677 and $at,$a3,$minus4
678 beqz $at,.L_bn_sub_words_tail
680 .L_bn_sub_words_loop:
681 $LD $t0,0($a1)
682 $LD $ta0,0($a2)
683 subu $a3,4
684 $LD $t1,$BNSZ($a1)
685 and $at,$a3,$minus4
686 $LD $t2,2*$BNSZ($a1)
687 $PTR_ADD $a2,4*$BNSZ
688 $LD $t3,3*$BNSZ($a1)
689 $PTR_ADD $a0,4*$BNSZ
690 $LD $ta1,-3*$BNSZ($a2)
691 $PTR_ADD $a1,4*$BNSZ
692 $LD $ta2,-2*$BNSZ($a2)
693 $LD $ta3,-$BNSZ($a2)
694 sltu $t8,$t0,$ta0
695 $SUBU $ta0,$t0,$ta0
696 $SUBU $t0,$ta0,$v0
697 sgtu $v0,$t0,$ta0
698 $ST $t0,-4*$BNSZ($a0)
699 $ADDU $v0,$t8
701 sltu $t9,$t1,$ta1
702 $SUBU $ta1,$t1,$ta1
703 $SUBU $t1,$ta1,$v0
704 sgtu $v0,$t1,$ta1
705 $ST $t1,-3*$BNSZ($a0)
706 $ADDU $v0,$t9
709 sltu $t8,$t2,$ta2
710 $SUBU $ta2,$t2,$ta2
711 $SUBU $t2,$ta2,$v0
712 sgtu $v0,$t2,$ta2
713 $ST $t2,-2*$BNSZ($a0)
714 $ADDU $v0,$t8
716 sltu $t9,$t3,$ta3
717 $SUBU $ta3,$t3,$ta3
718 $SUBU $t3,$ta3,$v0
719 sgtu $v0,$t3,$ta3
720 $ST $t3,-$BNSZ($a0)
722 .set noreorder
723 bgtz $at,.L_bn_sub_words_loop
724 $ADDU $v0,$t9
726 beqz $a3,.L_bn_sub_words_return
# Tail: remaining 0..3 words, one at a time.
729 .L_bn_sub_words_tail:
730 .set reorder
731 $LD $t0,0($a1)
732 $LD $ta0,0($a2)
733 subu $a3,1
734 sltu $t8,$t0,$ta0
735 $SUBU $ta0,$t0,$ta0
736 $SUBU $t0,$ta0,$v0
737 sgtu $v0,$t0,$ta0
738 $ST $t0,0($a0)
739 $ADDU $v0,$t8
740 beqz $a3,.L_bn_sub_words_return
742 $LD $t1,$BNSZ($a1)
743 subu $a3,1
744 $LD $ta1,$BNSZ($a2)
745 sltu $t9,$t1,$ta1
746 $SUBU $ta1,$t1,$ta1
747 $SUBU $t1,$ta1,$v0
748 sgtu $v0,$t1,$ta1
749 $ST $t1,$BNSZ($a0)
750 $ADDU $v0,$t9
751 beqz $a3,.L_bn_sub_words_return
753 $LD $t2,2*$BNSZ($a1)
754 $LD $ta2,2*$BNSZ($a2)
755 sltu $t8,$t2,$ta2
756 $SUBU $ta2,$t2,$ta2
757 $SUBU $t2,$ta2,$v0
758 sgtu $v0,$t2,$ta2
759 $ST $t2,2*$BNSZ($a0)
760 $ADDU $v0,$t8
762 .L_bn_sub_words_return:
763 .set noreorder
# NUBI epilogue: restore the saved temporaries and pop the frame.
765 $code.=<<___ if ($flavour =~ /nubi/i);
766 $REG_L $t3,4*$SZREG($sp)
767 $REG_L $t2,3*$SZREG($sp)
768 $REG_L $t1,2*$SZREG($sp)
769 $REG_L $t0,1*$SZREG($sp)
770 $REG_L $gp,0*$SZREG($sp)
771 $PTR_ADD $sp,6*$SZREG
773 $code.=<<___;
774 jr $ra
775 move $a0,$v0
776 .end bn_sub_words_internal
# bn_div_3_words: estimates a quotient word for a 3-word/2-word division
# by calling bn_div_words_internal (via bal) and then correcting the
# estimate in a small loop. Arguments are repacked so that two operands
# and the return address live in registers the callee is known to leave
# untouched ($a3, $ta2, $ta3) instead of on the stack.
# NOTE(review): a heredoc terminator line ('___') appears to be missing
# above this section — lost in extraction; restore before running.
778 .align 5
779 .globl bn_div_3_words
780 .ent bn_div_3_words
781 bn_div_3_words:
782 .set noreorder
783 move $a3,$a0 # we know that bn_div_words does not
784 # touch $a3, $ta2, $ta3 and preserves $a2
785 # so that we can save two arguments
786 # and return address in registers
787 # instead of stack:-)
789 $LD $a0,($a3)
790 move $ta2,$a1
791 bne $a0,$a2,bn_div_3_words_internal
792 $LD $a1,-$BNSZ($a3)
793 li $v0,-1
794 jr $ra
795 move $a0,$v0
796 .end bn_div_3_words
798 .align 5
799 .ent bn_div_3_words_internal
800 bn_div_3_words_internal:
# NUBI flavours must preserve $t0..$t3 and $gp: save them plus $ra.
802 $code.=<<___ if ($flavour =~ /nubi/i);
803 .frame $sp,6*$SZREG,$ra
804 .mask 0x8000f008,-$SZREG
805 .set noreorder
806 $PTR_SUB $sp,6*$SZREG
807 $REG_S $ra,5*$SZREG($sp)
808 $REG_S $t3,4*$SZREG($sp)
809 $REG_S $t2,3*$SZREG($sp)
810 $REG_S $t1,2*$SZREG($sp)
811 $REG_S $t0,1*$SZREG($sp)
812 $REG_S $gp,0*$SZREG($sp)
814 $code.=<<___;
815 .set reorder
# Call bn_div_words_internal, preserving our return address in $ta3.
816 move $ta3,$ra
817 bal bn_div_words_internal
818 move $ra,$ta3
# Correction loop: while the 2-word product q*d overshoots the 3-word
# numerator, decrement the quotient estimate q ($v0).
819 $MULTU $ta2,$v0
820 $LD $t2,-2*$BNSZ($a3)
821 move $ta0,$zero
822 mfhi $t1
823 mflo $t0
824 sltu $t8,$t1,$a1
825 .L_bn_div_3_words_inner_loop:
826 bnez $t8,.L_bn_div_3_words_inner_loop_done
827 sgeu $at,$t2,$t0
828 seq $t9,$t1,$a1
829 and $at,$t9
830 sltu $t3,$t0,$ta2
831 $ADDU $a1,$a2
832 $SUBU $t1,$t3
833 $SUBU $t0,$ta2
834 sltu $t8,$t1,$a1
835 sltu $ta0,$a1,$a2
836 or $t8,$ta0
837 .set noreorder
838 beqz $at,.L_bn_div_3_words_inner_loop
839 $SUBU $v0,1
840 $ADDU $v0,1
841 .set reorder
842 .L_bn_div_3_words_inner_loop_done:
843 .set noreorder
# NUBI epilogue: restore the saved temporaries and pop the frame.
845 $code.=<<___ if ($flavour =~ /nubi/i);
846 $REG_L $t3,4*$SZREG($sp)
847 $REG_L $t2,3*$SZREG($sp)
848 $REG_L $t1,2*$SZREG($sp)
849 $REG_L $t0,1*$SZREG($sp)
850 $REG_L $gp,0*$SZREG($sp)
851 $PTR_ADD $sp,6*$SZREG
853 $code.=<<___;
854 jr $ra
855 move $a0,$v0
856 .end bn_div_3_words_internal
# bn_div_words(h=$a0, l=$a1, d=$a2): divides the double word h:l by d.
# Quotient returned in $v0; remainder ends up in $v1 (copied to $a1).
# Entry returns -1 when d == 0. The body normalizes d (shifts left until
# its top bit is set, counting shifts in $t9), then produces the quotient
# in two half-word estimation rounds, each using $DIVU by the top half of
# d followed by a correction loop — the classic Knuth-style schoolbook
# step done twice.
# NOTE(review): a heredoc terminator line ('___') appears to be missing
# above this section — lost in extraction; restore before running.
858 .align 5
859 .globl bn_div_words
860 .ent bn_div_words
861 bn_div_words:
862 .set noreorder
863 bnez $a2,bn_div_words_internal
864 li $v0,-1 # I would rather signal div-by-zero
865 # which can be done with 'break 7'
866 jr $ra
867 move $a0,$v0
868 .end bn_div_words
870 .align 5
871 .ent bn_div_words_internal
872 bn_div_words_internal:
# NUBI flavours must preserve $t0..$t3 and $gp: save them plus $ra.
874 $code.=<<___ if ($flavour =~ /nubi/i);
875 .frame $sp,6*$SZREG,$ra
876 .mask 0x8000f008,-$SZREG
877 .set noreorder
878 $PTR_SUB $sp,6*$SZREG
879 $REG_S $ra,5*$SZREG($sp)
880 $REG_S $t3,4*$SZREG($sp)
881 $REG_S $t2,3*$SZREG($sp)
882 $REG_S $t1,2*$SZREG($sp)
883 $REG_S $t0,1*$SZREG($sp)
884 $REG_S $gp,0*$SZREG($sp)
886 $code.=<<___;
# Normalize: shift d left until its sign bit is set, counting the shift
# amount in $t9; overflow of the numerator is trapped with 'break 6'.
887 move $v1,$zero
888 bltz $a2,.L_bn_div_words_body
889 move $t9,$v1
890 $SLL $a2,1
891 bgtz $a2,.-4
892 addu $t9,1
894 .set reorder
895 negu $t1,$t9
896 li $t2,-1
897 $SLL $t2,$t1
898 and $t2,$a0
899 $SRL $at,$a1,$t1
900 .set noreorder
901 beqz $t2,.+12
903 break 6 # signal overflow
904 .set reorder
905 $SLL $a0,$t9
906 $SLL $a1,$t9
907 or $a0,$at
# Perl aliases for readability of the division body: QT = quotient
# estimate, HH = high half of running numerator, DH = high half of d.
909 $QT=$ta0;
910 $HH=$ta1;
911 $DH=$v1;
912 $code.=<<___;
913 .L_bn_div_words_body:
914 $SRL $DH,$a2,4*$BNSZ # bits
915 sgeu $at,$a0,$a2
916 .set noreorder
917 beqz $at,.+12
919 $SUBU $a0,$a2
920 .set reorder
# Round 1: estimate the high half of the quotient.
922 li $QT,-1
923 $SRL $HH,$a0,4*$BNSZ # bits
924 $SRL $QT,4*$BNSZ # q=0xffffffff
925 beq $DH,$HH,.L_bn_div_words_skip_div1
926 $DIVU $zero,$a0,$DH
927 mflo $QT
928 .L_bn_div_words_skip_div1:
929 $MULTU $a2,$QT
930 $SLL $t3,$a0,4*$BNSZ # bits
931 $SRL $at,$a1,4*$BNSZ # bits
932 or $t3,$at
933 mflo $t0
934 mfhi $t1
935 .L_bn_div_words_inner_loop1:
936 sltu $t2,$t3,$t0
937 seq $t8,$HH,$t1
938 sltu $at,$HH,$t1
939 and $t2,$t8
940 sltu $v0,$t0,$a2
941 or $at,$t2
942 .set noreorder
943 beqz $at,.L_bn_div_words_inner_loop1_done
944 $SUBU $t1,$v0
945 $SUBU $t0,$a2
946 b .L_bn_div_words_inner_loop1
947 $SUBU $QT,1
948 .set reorder
949 .L_bn_div_words_inner_loop1_done:
# Round 2: same estimation/correction for the low half of the quotient.
951 $SLL $a1,4*$BNSZ # bits
952 $SUBU $a0,$t3,$t0
953 $SLL $v0,$QT,4*$BNSZ # bits
955 li $QT,-1
956 $SRL $HH,$a0,4*$BNSZ # bits
957 $SRL $QT,4*$BNSZ # q=0xffffffff
958 beq $DH,$HH,.L_bn_div_words_skip_div2
959 $DIVU $zero,$a0,$DH
960 mflo $QT
961 .L_bn_div_words_skip_div2:
962 $MULTU $a2,$QT
963 $SLL $t3,$a0,4*$BNSZ # bits
964 $SRL $at,$a1,4*$BNSZ # bits
965 or $t3,$at
966 mflo $t0
967 mfhi $t1
968 .L_bn_div_words_inner_loop2:
969 sltu $t2,$t3,$t0
970 seq $t8,$HH,$t1
971 sltu $at,$HH,$t1
972 and $t2,$t8
973 sltu $v1,$t0,$a2
974 or $at,$t2
975 .set noreorder
976 beqz $at,.L_bn_div_words_inner_loop2_done
977 $SUBU $t1,$v1
978 $SUBU $t0,$a2
979 b .L_bn_div_words_inner_loop2
980 $SUBU $QT,1
981 .set reorder
982 .L_bn_div_words_inner_loop2_done:
# Denormalize: shift the remainder and d back by the $t9 bits applied
# during normalization.
984 $SUBU $a0,$t3,$t0
985 or $v0,$QT
986 $SRL $v1,$a0,$t9 # $v1 contains remainder if anybody wants it
987 $SRL $a2,$t9 # restore $a2
989 .set noreorder
990 move $a1,$v1
# NUBI epilogue: restore the saved temporaries and pop the frame.
992 $code.=<<___ if ($flavour =~ /nubi/i);
993 $REG_L $t3,4*$SZREG($sp)
994 $REG_L $t2,3*$SZREG($sp)
995 $REG_L $t1,2*$SZREG($sp)
996 $REG_L $t0,1*$SZREG($sp)
997 $REG_L $gp,0*$SZREG($sp)
998 $PTR_ADD $sp,6*$SZREG
1000 $code.=<<___;
1001 jr $ra
1002 move $a0,$v0
1003 .end bn_div_words_internal
# Drop the division-body aliases, then remap registers for the Comba
# multiplication/squaring routines that follow: a[0..7] and b[0..7] live
# entirely in registers ($a1/$a2 themselves are recycled as a[7]/b[7]
# once all inputs are loaded); $c_1..$c_3 are the three rotating carry
# accumulators, $t_1/$t_2 the current product halves.
1005 undef $HH; undef $QT; undef $DH;
1007 ($a_0,$a_1,$a_2,$a_3)=($t0,$t1,$t2,$t3);
1008 ($b_0,$b_1,$b_2,$b_3)=($ta0,$ta1,$ta2,$ta3);
1010 ($a_4,$a_5,$a_6,$a_7)=($s0,$s2,$s4,$a1); # once we load a[7], no use for $a1
1011 ($b_4,$b_5,$b_6,$b_7)=($s1,$s3,$s5,$a2); # once we load b[7], no use for $a2
1013 ($t_1,$t_2,$c_1,$c_2,$c_3)=($t8,$t9,$v0,$v1,$a3);
1015 $code.=<<___;
1017 .align 5
1018 .globl bn_mul_comba8
1019 .ent bn_mul_comba8
1020 bn_mul_comba8:
1021 .set noreorder
1023 $code.=<<___ if ($flavour =~ /nubi/i);
1024 .frame $sp,12*$SZREG,$ra
1025 .mask 0x803ff008,-$SZREG
1026 $PTR_SUB $sp,12*$SZREG
1027 $REG_S $ra,11*$SZREG($sp)
1028 $REG_S $s5,10*$SZREG($sp)
1029 $REG_S $s4,9*$SZREG($sp)
1030 $REG_S $s3,8*$SZREG($sp)
1031 $REG_S $s2,7*$SZREG($sp)
1032 $REG_S $s1,6*$SZREG($sp)
1033 $REG_S $s0,5*$SZREG($sp)
1034 $REG_S $t3,4*$SZREG($sp)
1035 $REG_S $t2,3*$SZREG($sp)
1036 $REG_S $t1,2*$SZREG($sp)
1037 $REG_S $t0,1*$SZREG($sp)
1038 $REG_S $gp,0*$SZREG($sp)
1040 $code.=<<___ if ($flavour !~ /nubi/i);
1041 .frame $sp,6*$SZREG,$ra
1042 .mask 0x003f0000,-$SZREG
1043 $PTR_SUB $sp,6*$SZREG
1044 $REG_S $s5,5*$SZREG($sp)
1045 $REG_S $s4,4*$SZREG($sp)
1046 $REG_S $s3,3*$SZREG($sp)
1047 $REG_S $s2,2*$SZREG($sp)
1048 $REG_S $s1,1*$SZREG($sp)
1049 $REG_S $s0,0*$SZREG($sp)
1051 $code.=<<___;
1053 .set reorder
1054 $LD $a_0,0($a1) # If compiled with -mips3 option on
1055 # R5000 box assembler barks on this
1056 # 1ine with "should not have mult/div
1057 # as last instruction in bb (R10K
1058 # bug)" warning. If anybody out there
1059 # has a clue about how to circumvent
1060 # this do send me a note.
1061 # <appro\@fy.chalmers.se>
1063 $LD $b_0,0($a2)
1064 $LD $a_1,$BNSZ($a1)
1065 $LD $a_2,2*$BNSZ($a1)
1066 $MULTU $a_0,$b_0 # mul_add_c(a[0],b[0],c1,c2,c3);
1067 $LD $a_3,3*$BNSZ($a1)
1068 $LD $b_1,$BNSZ($a2)
1069 $LD $b_2,2*$BNSZ($a2)
1070 $LD $b_3,3*$BNSZ($a2)
1071 mflo $c_1
1072 mfhi $c_2
1074 $LD $a_4,4*$BNSZ($a1)
1075 $LD $a_5,5*$BNSZ($a1)
1076 $MULTU $a_0,$b_1 # mul_add_c(a[0],b[1],c2,c3,c1);
1077 $LD $a_6,6*$BNSZ($a1)
1078 $LD $a_7,7*$BNSZ($a1)
1079 $LD $b_4,4*$BNSZ($a2)
1080 $LD $b_5,5*$BNSZ($a2)
1081 mflo $t_1
1082 mfhi $t_2
1083 $ADDU $c_2,$t_1
1084 sltu $at,$c_2,$t_1
1085 $MULTU $a_1,$b_0 # mul_add_c(a[1],b[0],c2,c3,c1);
1086 $ADDU $c_3,$t_2,$at
1087 $LD $b_6,6*$BNSZ($a2)
1088 $LD $b_7,7*$BNSZ($a2)
1089 $ST $c_1,0($a0) # r[0]=c1;
1090 mflo $t_1
1091 mfhi $t_2
1092 $ADDU $c_2,$t_1
1093 sltu $at,$c_2,$t_1
1094 $MULTU $a_2,$b_0 # mul_add_c(a[2],b[0],c3,c1,c2);
1095 $ADDU $t_2,$at
1096 $ADDU $c_3,$t_2
1097 sltu $c_1,$c_3,$t_2
1098 $ST $c_2,$BNSZ($a0) # r[1]=c2;
1100 mflo $t_1
1101 mfhi $t_2
1102 $ADDU $c_3,$t_1
1103 sltu $at,$c_3,$t_1
1104 $MULTU $a_1,$b_1 # mul_add_c(a[1],b[1],c3,c1,c2);
1105 $ADDU $t_2,$at
1106 $ADDU $c_1,$t_2
1107 mflo $t_1
1108 mfhi $t_2
1109 $ADDU $c_3,$t_1
1110 sltu $at,$c_3,$t_1
1111 $MULTU $a_0,$b_2 # mul_add_c(a[0],b[2],c3,c1,c2);
1112 $ADDU $t_2,$at
1113 $ADDU $c_1,$t_2
1114 sltu $c_2,$c_1,$t_2
1115 mflo $t_1
1116 mfhi $t_2
1117 $ADDU $c_3,$t_1
1118 sltu $at,$c_3,$t_1
1119 $MULTU $a_0,$b_3 # mul_add_c(a[0],b[3],c1,c2,c3);
1120 $ADDU $t_2,$at
1121 $ADDU $c_1,$t_2
1122 sltu $at,$c_1,$t_2
1123 $ADDU $c_2,$at
1124 $ST $c_3,2*$BNSZ($a0) # r[2]=c3;
1126 mflo $t_1
1127 mfhi $t_2
1128 $ADDU $c_1,$t_1
1129 sltu $at,$c_1,$t_1
1130 $MULTU $a_1,$b_2 # mul_add_c(a[1],b[2],c1,c2,c3);
1131 $ADDU $t_2,$at
1132 $ADDU $c_2,$t_2
1133 sltu $c_3,$c_2,$t_2
1134 mflo $t_1
1135 mfhi $t_2
1136 $ADDU $c_1,$t_1
1137 sltu $at,$c_1,$t_1
1138 $MULTU $a_2,$b_1 # mul_add_c(a[2],b[1],c1,c2,c3);
1139 $ADDU $t_2,$at
1140 $ADDU $c_2,$t_2
1141 sltu $at,$c_2,$t_2
1142 $ADDU $c_3,$at
1143 mflo $t_1
1144 mfhi $t_2
1145 $ADDU $c_1,$t_1
1146 sltu $at,$c_1,$t_1
1147 $MULTU $a_3,$b_0 # mul_add_c(a[3],b[0],c1,c2,c3);
1148 $ADDU $t_2,$at
1149 $ADDU $c_2,$t_2
1150 sltu $at,$c_2,$t_2
1151 $ADDU $c_3,$at
1152 mflo $t_1
1153 mfhi $t_2
1154 $ADDU $c_1,$t_1
1155 sltu $at,$c_1,$t_1
1156 $MULTU $a_4,$b_0 # mul_add_c(a[4],b[0],c2,c3,c1);
1157 $ADDU $t_2,$at
1158 $ADDU $c_2,$t_2
1159 sltu $at,$c_2,$t_2
1160 $ADDU $c_3,$at
1161 $ST $c_1,3*$BNSZ($a0) # r[3]=c1;
1163 mflo $t_1
1164 mfhi $t_2
1165 $ADDU $c_2,$t_1
1166 sltu $at,$c_2,$t_1
1167 $MULTU $a_3,$b_1 # mul_add_c(a[3],b[1],c2,c3,c1);
1168 $ADDU $t_2,$at
1169 $ADDU $c_3,$t_2
1170 sltu $c_1,$c_3,$t_2
1171 mflo $t_1
1172 mfhi $t_2
1173 $ADDU $c_2,$t_1
1174 sltu $at,$c_2,$t_1
1175 $MULTU $a_2,$b_2 # mul_add_c(a[2],b[2],c2,c3,c1);
1176 $ADDU $t_2,$at
1177 $ADDU $c_3,$t_2
1178 sltu $at,$c_3,$t_2
1179 $ADDU $c_1,$at
1180 mflo $t_1
1181 mfhi $t_2
1182 $ADDU $c_2,$t_1
1183 sltu $at,$c_2,$t_1
1184 $MULTU $a_1,$b_3 # mul_add_c(a[1],b[3],c2,c3,c1);
1185 $ADDU $t_2,$at
1186 $ADDU $c_3,$t_2
1187 sltu $at,$c_3,$t_2
1188 $ADDU $c_1,$at
1189 mflo $t_1
1190 mfhi $t_2
1191 $ADDU $c_2,$t_1
1192 sltu $at,$c_2,$t_1
1193 $MULTU $a_0,$b_4 # mul_add_c(a[0],b[4],c2,c3,c1);
1194 $ADDU $t_2,$at
1195 $ADDU $c_3,$t_2
1196 sltu $at,$c_3,$t_2
1197 $ADDU $c_1,$at
1198 mflo $t_1
1199 mfhi $t_2
1200 $ADDU $c_2,$t_1
1201 sltu $at,$c_2,$t_1
1202 $MULTU $a_0,$b_5 # mul_add_c(a[0],b[5],c3,c1,c2);
1203 $ADDU $t_2,$at
1204 $ADDU $c_3,$t_2
1205 sltu $at,$c_3,$t_2
1206 $ADDU $c_1,$at
1207 $ST $c_2,4*$BNSZ($a0) # r[4]=c2;
1209 mflo $t_1
1210 mfhi $t_2
1211 $ADDU $c_3,$t_1
1212 sltu $at,$c_3,$t_1
1213 $MULTU $a_1,$b_4 # mul_add_c(a[1],b[4],c3,c1,c2);
1214 $ADDU $t_2,$at
1215 $ADDU $c_1,$t_2
1216 sltu $c_2,$c_1,$t_2
1217 mflo $t_1
1218 mfhi $t_2
1219 $ADDU $c_3,$t_1
1220 sltu $at,$c_3,$t_1
1221 $MULTU $a_2,$b_3 # mul_add_c(a[2],b[3],c3,c1,c2);
1222 $ADDU $t_2,$at
1223 $ADDU $c_1,$t_2
1224 sltu $at,$c_1,$t_2
1225 $ADDU $c_2,$at
1226 mflo $t_1
1227 mfhi $t_2
1228 $ADDU $c_3,$t_1
1229 sltu $at,$c_3,$t_1
1230 $MULTU $a_3,$b_2 # mul_add_c(a[3],b[2],c3,c1,c2);
1231 $ADDU $t_2,$at
1232 $ADDU $c_1,$t_2
1233 sltu $at,$c_1,$t_2
1234 $ADDU $c_2,$at
1235 mflo $t_1
1236 mfhi $t_2
1237 $ADDU $c_3,$t_1
1238 sltu $at,$c_3,$t_1
1239 $MULTU $a_4,$b_1 # mul_add_c(a[4],b[1],c3,c1,c2);
1240 $ADDU $t_2,$at
1241 $ADDU $c_1,$t_2
1242 sltu $at,$c_1,$t_2
1243 $ADDU $c_2,$at
1244 mflo $t_1
1245 mfhi $t_2
1246 $ADDU $c_3,$t_1
1247 sltu $at,$c_3,$t_1
1248 $MULTU $a_5,$b_0 # mul_add_c(a[5],b[0],c3,c1,c2);
1249 $ADDU $t_2,$at
1250 $ADDU $c_1,$t_2
1251 sltu $at,$c_1,$t_2
1252 $ADDU $c_2,$at
1253 mflo $t_1
1254 mfhi $t_2
1255 $ADDU $c_3,$t_1
1256 sltu $at,$c_3,$t_1
1257 $MULTU $a_6,$b_0 # mul_add_c(a[6],b[0],c1,c2,c3);
1258 $ADDU $t_2,$at
1259 $ADDU $c_1,$t_2
1260 sltu $at,$c_1,$t_2
1261 $ADDU $c_2,$at
1262 $ST $c_3,5*$BNSZ($a0) # r[5]=c3;
1264 mflo $t_1
1265 mfhi $t_2
1266 $ADDU $c_1,$t_1
1267 sltu $at,$c_1,$t_1
1268 $MULTU $a_5,$b_1 # mul_add_c(a[5],b[1],c1,c2,c3);
1269 $ADDU $t_2,$at
1270 $ADDU $c_2,$t_2
1271 sltu $c_3,$c_2,$t_2
1272 mflo $t_1
1273 mfhi $t_2
1274 $ADDU $c_1,$t_1
1275 sltu $at,$c_1,$t_1
1276 $MULTU $a_4,$b_2 # mul_add_c(a[4],b[2],c1,c2,c3);
1277 $ADDU $t_2,$at
1278 $ADDU $c_2,$t_2
1279 sltu $at,$c_2,$t_2
1280 $ADDU $c_3,$at
1281 mflo $t_1
1282 mfhi $t_2
1283 $ADDU $c_1,$t_1
1284 sltu $at,$c_1,$t_1
1285 $MULTU $a_3,$b_3 # mul_add_c(a[3],b[3],c1,c2,c3);
1286 $ADDU $t_2,$at
1287 $ADDU $c_2,$t_2
1288 sltu $at,$c_2,$t_2
1289 $ADDU $c_3,$at
1290 mflo $t_1
1291 mfhi $t_2
1292 $ADDU $c_1,$t_1
1293 sltu $at,$c_1,$t_1
1294 $MULTU $a_2,$b_4 # mul_add_c(a[2],b[4],c1,c2,c3);
1295 $ADDU $t_2,$at
1296 $ADDU $c_2,$t_2
1297 sltu $at,$c_2,$t_2
1298 $ADDU $c_3,$at
1299 mflo $t_1
1300 mfhi $t_2
1301 $ADDU $c_1,$t_1
1302 sltu $at,$c_1,$t_1
1303 $MULTU $a_1,$b_5 # mul_add_c(a[1],b[5],c1,c2,c3);
1304 $ADDU $t_2,$at
1305 $ADDU $c_2,$t_2
1306 sltu $at,$c_2,$t_2
1307 $ADDU $c_3,$at
1308 mflo $t_1
1309 mfhi $t_2
1310 $ADDU $c_1,$t_1
1311 sltu $at,$c_1,$t_1
1312 $MULTU $a_0,$b_6 # mul_add_c(a[0],b[6],c1,c2,c3);
1313 $ADDU $t_2,$at
1314 $ADDU $c_2,$t_2
1315 sltu $at,$c_2,$t_2
1316 $ADDU $c_3,$at
1317 mflo $t_1
1318 mfhi $t_2
1319 $ADDU $c_1,$t_1
1320 sltu $at,$c_1,$t_1
1321 $MULTU $a_0,$b_7 # mul_add_c(a[0],b[7],c2,c3,c1);
1322 $ADDU $t_2,$at
1323 $ADDU $c_2,$t_2
1324 sltu $at,$c_2,$t_2
1325 $ADDU $c_3,$at
1326 $ST $c_1,6*$BNSZ($a0) # r[6]=c1;
1328 mflo $t_1
1329 mfhi $t_2
1330 $ADDU $c_2,$t_1
1331 sltu $at,$c_2,$t_1
1332 $MULTU $a_1,$b_6 # mul_add_c(a[1],b[6],c2,c3,c1);
1333 $ADDU $t_2,$at
1334 $ADDU $c_3,$t_2
1335 sltu $c_1,$c_3,$t_2
1336 mflo $t_1
1337 mfhi $t_2
1338 $ADDU $c_2,$t_1
1339 sltu $at,$c_2,$t_1
1340 $MULTU $a_2,$b_5 # mul_add_c(a[2],b[5],c2,c3,c1);
1341 $ADDU $t_2,$at
1342 $ADDU $c_3,$t_2
1343 sltu $at,$c_3,$t_2
1344 $ADDU $c_1,$at
1345 mflo $t_1
1346 mfhi $t_2
1347 $ADDU $c_2,$t_1
1348 sltu $at,$c_2,$t_1
1349 $MULTU $a_3,$b_4 # mul_add_c(a[3],b[4],c2,c3,c1);
1350 $ADDU $t_2,$at
1351 $ADDU $c_3,$t_2
1352 sltu $at,$c_3,$t_2
1353 $ADDU $c_1,$at
1354 mflo $t_1
1355 mfhi $t_2
1356 $ADDU $c_2,$t_1
1357 sltu $at,$c_2,$t_1
1358 $MULTU $a_4,$b_3 # mul_add_c(a[4],b[3],c2,c3,c1);
1359 $ADDU $t_2,$at
1360 $ADDU $c_3,$t_2
1361 sltu $at,$c_3,$t_2
1362 $ADDU $c_1,$at
1363 mflo $t_1
1364 mfhi $t_2
1365 $ADDU $c_2,$t_1
1366 sltu $at,$c_2,$t_1
1367 $MULTU $a_5,$b_2 # mul_add_c(a[5],b[2],c2,c3,c1);
1368 $ADDU $t_2,$at
1369 $ADDU $c_3,$t_2
1370 sltu $at,$c_3,$t_2
1371 $ADDU $c_1,$at
1372 mflo $t_1
1373 mfhi $t_2
1374 $ADDU $c_2,$t_1
1375 sltu $at,$c_2,$t_1
1376 $MULTU $a_6,$b_1 # mul_add_c(a[6],b[1],c2,c3,c1);
1377 $ADDU $t_2,$at
1378 $ADDU $c_3,$t_2
1379 sltu $at,$c_3,$t_2
1380 $ADDU $c_1,$at
1381 mflo $t_1
1382 mfhi $t_2
1383 $ADDU $c_2,$t_1
1384 sltu $at,$c_2,$t_1
1385 $MULTU $a_7,$b_0 # mul_add_c(a[7],b[0],c2,c3,c1);
1386 $ADDU $t_2,$at
1387 $ADDU $c_3,$t_2
1388 sltu $at,$c_3,$t_2
1389 $ADDU $c_1,$at
1390 mflo $t_1
1391 mfhi $t_2
1392 $ADDU $c_2,$t_1
1393 sltu $at,$c_2,$t_1
1394 $MULTU $a_7,$b_1 # mul_add_c(a[7],b[1],c3,c1,c2);
1395 $ADDU $t_2,$at
1396 $ADDU $c_3,$t_2
1397 sltu $at,$c_3,$t_2
1398 $ADDU $c_1,$at
1399 $ST $c_2,7*$BNSZ($a0) # r[7]=c2;
1401 mflo $t_1
1402 mfhi $t_2
1403 $ADDU $c_3,$t_1
1404 sltu $at,$c_3,$t_1
1405 $MULTU $a_6,$b_2 # mul_add_c(a[6],b[2],c3,c1,c2);
1406 $ADDU $t_2,$at
1407 $ADDU $c_1,$t_2
1408 sltu $c_2,$c_1,$t_2
1409 mflo $t_1
1410 mfhi $t_2
1411 $ADDU $c_3,$t_1
1412 sltu $at,$c_3,$t_1
1413 $MULTU $a_5,$b_3 # mul_add_c(a[5],b[3],c3,c1,c2);
1414 $ADDU $t_2,$at
1415 $ADDU $c_1,$t_2
1416 sltu $at,$c_1,$t_2
1417 $ADDU $c_2,$at
1418 mflo $t_1
1419 mfhi $t_2
1420 $ADDU $c_3,$t_1
1421 sltu $at,$c_3,$t_1
1422 $MULTU $a_4,$b_4 # mul_add_c(a[4],b[4],c3,c1,c2);
1423 $ADDU $t_2,$at
1424 $ADDU $c_1,$t_2
1425 sltu $at,$c_1,$t_2
1426 $ADDU $c_2,$at
1427 mflo $t_1
1428 mfhi $t_2
1429 $ADDU $c_3,$t_1
1430 sltu $at,$c_3,$t_1
1431 $MULTU $a_3,$b_5 # mul_add_c(a[3],b[5],c3,c1,c2);
1432 $ADDU $t_2,$at
1433 $ADDU $c_1,$t_2
1434 sltu $at,$c_1,$t_2
1435 $ADDU $c_2,$at
1436 mflo $t_1
1437 mfhi $t_2
1438 $ADDU $c_3,$t_1
1439 sltu $at,$c_3,$t_1
1440 $MULTU $a_2,$b_6 # mul_add_c(a[2],b[6],c3,c1,c2);
1441 $ADDU $t_2,$at
1442 $ADDU $c_1,$t_2
1443 sltu $at,$c_1,$t_2
1444 $ADDU $c_2,$at
1445 mflo $t_1
1446 mfhi $t_2
1447 $ADDU $c_3,$t_1
1448 sltu $at,$c_3,$t_1
1449 $MULTU $a_1,$b_7 # mul_add_c(a[1],b[7],c3,c1,c2);
1450 $ADDU $t_2,$at
1451 $ADDU $c_1,$t_2
1452 sltu $at,$c_1,$t_2
1453 $ADDU $c_2,$at
1454 mflo $t_1
1455 mfhi $t_2
1456 $ADDU $c_3,$t_1
1457 sltu $at,$c_3,$t_1
1458 $MULTU $a_2,$b_7 # mul_add_c(a[2],b[7],c1,c2,c3);
1459 $ADDU $t_2,$at
1460 $ADDU $c_1,$t_2
1461 sltu $at,$c_1,$t_2
1462 $ADDU $c_2,$at
1463 $ST $c_3,8*$BNSZ($a0) # r[8]=c3;
1465 mflo $t_1
1466 mfhi $t_2
1467 $ADDU $c_1,$t_1
1468 sltu $at,$c_1,$t_1
1469 $MULTU $a_3,$b_6 # mul_add_c(a[3],b[6],c1,c2,c3);
1470 $ADDU $t_2,$at
1471 $ADDU $c_2,$t_2
1472 sltu $c_3,$c_2,$t_2
1473 mflo $t_1
1474 mfhi $t_2
1475 $ADDU $c_1,$t_1
1476 sltu $at,$c_1,$t_1
1477 $MULTU $a_4,$b_5 # mul_add_c(a[4],b[5],c1,c2,c3);
1478 $ADDU $t_2,$at
1479 $ADDU $c_2,$t_2
1480 sltu $at,$c_2,$t_2
1481 $ADDU $c_3,$at
1482 mflo $t_1
1483 mfhi $t_2
1484 $ADDU $c_1,$t_1
1485 sltu $at,$c_1,$t_1
1486 $MULTU $a_5,$b_4 # mul_add_c(a[5],b[4],c1,c2,c3);
1487 $ADDU $t_2,$at
1488 $ADDU $c_2,$t_2
1489 sltu $at,$c_2,$t_2
1490 $ADDU $c_3,$at
1491 mflo $t_1
1492 mfhi $t_2
1493 $ADDU $c_1,$t_1
1494 sltu $at,$c_1,$t_1
1495 $MULTU $a_6,$b_3 # mul_add_c(a[6],b[3],c1,c2,c3);
1496 $ADDU $t_2,$at
1497 $ADDU $c_2,$t_2
1498 sltu $at,$c_2,$t_2
1499 $ADDU $c_3,$at
1500 mflo $t_1
1501 mfhi $t_2
1502 $ADDU $c_1,$t_1
1503 sltu $at,$c_1,$t_1
1504 $MULTU $a_7,$b_2 # mul_add_c(a[7],b[2],c1,c2,c3);
1505 $ADDU $t_2,$at
1506 $ADDU $c_2,$t_2
1507 sltu $at,$c_2,$t_2
1508 $ADDU $c_3,$at
1509 mflo $t_1
1510 mfhi $t_2
1511 $ADDU $c_1,$t_1
1512 sltu $at,$c_1,$t_1
1513 $MULTU $a_7,$b_3 # mul_add_c(a[7],b[3],c2,c3,c1);
1514 $ADDU $t_2,$at
1515 $ADDU $c_2,$t_2
1516 sltu $at,$c_2,$t_2
1517 $ADDU $c_3,$at
1518 $ST $c_1,9*$BNSZ($a0) # r[9]=c1;
1520 mflo $t_1
1521 mfhi $t_2
1522 $ADDU $c_2,$t_1
1523 sltu $at,$c_2,$t_1
1524 $MULTU $a_6,$b_4 # mul_add_c(a[6],b[4],c2,c3,c1);
1525 $ADDU $t_2,$at
1526 $ADDU $c_3,$t_2
1527 sltu $c_1,$c_3,$t_2
1528 mflo $t_1
1529 mfhi $t_2
1530 $ADDU $c_2,$t_1
1531 sltu $at,$c_2,$t_1
1532 $MULTU $a_5,$b_5 # mul_add_c(a[5],b[5],c2,c3,c1);
1533 $ADDU $t_2,$at
1534 $ADDU $c_3,$t_2
1535 sltu $at,$c_3,$t_2
1536 $ADDU $c_1,$at
1537 mflo $t_1
1538 mfhi $t_2
1539 $ADDU $c_2,$t_1
1540 sltu $at,$c_2,$t_1
1541 $MULTU $a_4,$b_6 # mul_add_c(a[4],b[6],c2,c3,c1);
1542 $ADDU $t_2,$at
1543 $ADDU $c_3,$t_2
1544 sltu $at,$c_3,$t_2
1545 $ADDU $c_1,$at
1546 mflo $t_1
1547 mfhi $t_2
1548 $ADDU $c_2,$t_1
1549 sltu $at,$c_2,$t_1
1550 $MULTU $a_3,$b_7 # mul_add_c(a[3],b[7],c2,c3,c1);
1551 $ADDU $t_2,$at
1552 $ADDU $c_3,$t_2
1553 sltu $at,$c_3,$t_2
1554 $ADDU $c_1,$at
1555 mflo $t_1
1556 mfhi $t_2
1557 $ADDU $c_2,$t_1
1558 sltu $at,$c_2,$t_1
1559 $MULTU $a_4,$b_7 # mul_add_c(a[4],b[7],c3,c1,c2);
1560 $ADDU $t_2,$at
1561 $ADDU $c_3,$t_2
1562 sltu $at,$c_3,$t_2
1563 $ADDU $c_1,$at
1564 $ST $c_2,10*$BNSZ($a0) # r[10]=c2;
1566 mflo $t_1
1567 mfhi $t_2
1568 $ADDU $c_3,$t_1
1569 sltu $at,$c_3,$t_1
1570 $MULTU $a_5,$b_6 # mul_add_c(a[5],b[6],c3,c1,c2);
1571 $ADDU $t_2,$at
1572 $ADDU $c_1,$t_2
1573 sltu $c_2,$c_1,$t_2
1574 mflo $t_1
1575 mfhi $t_2
1576 $ADDU $c_3,$t_1
1577 sltu $at,$c_3,$t_1
1578 $MULTU $a_6,$b_5 # mul_add_c(a[6],b[5],c3,c1,c2);
1579 $ADDU $t_2,$at
1580 $ADDU $c_1,$t_2
1581 sltu $at,$c_1,$t_2
1582 $ADDU $c_2,$at
1583 mflo $t_1
1584 mfhi $t_2
1585 $ADDU $c_3,$t_1
1586 sltu $at,$c_3,$t_1
1587 $MULTU $a_7,$b_4 # mul_add_c(a[7],b[4],c3,c1,c2);
1588 $ADDU $t_2,$at
1589 $ADDU $c_1,$t_2
1590 sltu $at,$c_1,$t_2
1591 $ADDU $c_2,$at
1592 mflo $t_1
1593 mfhi $t_2
1594 $ADDU $c_3,$t_1
1595 sltu $at,$c_3,$t_1
1596 $MULTU $a_7,$b_5 # mul_add_c(a[7],b[5],c1,c2,c3);
1597 $ADDU $t_2,$at
1598 $ADDU $c_1,$t_2
1599 sltu $at,$c_1,$t_2
1600 $ADDU $c_2,$at
1601 $ST $c_3,11*$BNSZ($a0) # r[11]=c3;
1603 mflo $t_1
1604 mfhi $t_2
1605 $ADDU $c_1,$t_1
1606 sltu $at,$c_1,$t_1
1607 $MULTU $a_6,$b_6 # mul_add_c(a[6],b[6],c1,c2,c3);
1608 $ADDU $t_2,$at
1609 $ADDU $c_2,$t_2
1610 sltu $c_3,$c_2,$t_2
1611 mflo $t_1
1612 mfhi $t_2
1613 $ADDU $c_1,$t_1
1614 sltu $at,$c_1,$t_1
1615 $MULTU $a_5,$b_7 # mul_add_c(a[5],b[7],c1,c2,c3);
1616 $ADDU $t_2,$at
1617 $ADDU $c_2,$t_2
1618 sltu $at,$c_2,$t_2
1619 $ADDU $c_3,$at
1620 mflo $t_1
1621 mfhi $t_2
1622 $ADDU $c_1,$t_1
1623 sltu $at,$c_1,$t_1
1624 $MULTU $a_6,$b_7 # mul_add_c(a[6],b[7],c2,c3,c1);
1625 $ADDU $t_2,$at
1626 $ADDU $c_2,$t_2
1627 sltu $at,$c_2,$t_2
1628 $ADDU $c_3,$at
1629 $ST $c_1,12*$BNSZ($a0) # r[12]=c1;
1631 mflo $t_1
1632 mfhi $t_2
1633 $ADDU $c_2,$t_1
1634 sltu $at,$c_2,$t_1
1635 $MULTU $a_7,$b_6 # mul_add_c(a[7],b[6],c2,c3,c1);
1636 $ADDU $t_2,$at
1637 $ADDU $c_3,$t_2
1638 sltu $c_1,$c_3,$t_2
1639 mflo $t_1
1640 mfhi $t_2
1641 $ADDU $c_2,$t_1
1642 sltu $at,$c_2,$t_1
1643 $MULTU $a_7,$b_7 # mul_add_c(a[7],b[7],c3,c1,c2);
1644 $ADDU $t_2,$at
1645 $ADDU $c_3,$t_2
1646 sltu $at,$c_3,$t_2
1647 $ADDU $c_1,$at
1648 $ST $c_2,13*$BNSZ($a0) # r[13]=c2;
1650 mflo $t_1
1651 mfhi $t_2
1652 $ADDU $c_3,$t_1
1653 sltu $at,$c_3,$t_1
1654 $ADDU $t_2,$at
1655 $ADDU $c_1,$t_2
1656 $ST $c_3,14*$BNSZ($a0) # r[14]=c3;
1657 $ST $c_1,15*$BNSZ($a0) # r[15]=c1;
1659 .set noreorder
1661 $code.=<<___ if ($flavour =~ /nubi/i);
1662 $REG_L $s5,10*$SZREG($sp)
1663 $REG_L $s4,9*$SZREG($sp)
1664 $REG_L $s3,8*$SZREG($sp)
1665 $REG_L $s2,7*$SZREG($sp)
1666 $REG_L $s1,6*$SZREG($sp)
1667 $REG_L $s0,5*$SZREG($sp)
1668 $REG_L $t3,4*$SZREG($sp)
1669 $REG_L $t2,3*$SZREG($sp)
1670 $REG_L $t1,2*$SZREG($sp)
1671 $REG_L $t0,1*$SZREG($sp)
1672 $REG_L $gp,0*$SZREG($sp)
1673 jr $ra
1674 $PTR_ADD $sp,12*$SZREG
1676 $code.=<<___ if ($flavour !~ /nubi/i);
1677 $REG_L $s5,5*$SZREG($sp)
1678 $REG_L $s4,4*$SZREG($sp)
1679 $REG_L $s3,3*$SZREG($sp)
1680 $REG_L $s2,2*$SZREG($sp)
1681 $REG_L $s1,1*$SZREG($sp)
1682 $REG_L $s0,0*$SZREG($sp)
1683 jr $ra
1684 $PTR_ADD $sp,6*$SZREG
1686 $code.=<<___;
1687 .end bn_mul_comba8
# bn_mul_comba4: r[0..7] = a[0..3] * b[0..3] -- 4-word Comba (column-wise)
# multiplication.  Entry registers: a0 = r, a1 = a, a2 = b.  Each result
# word is a column sum of partial products held in a rotating 3-register
# accumulator (c_1,c_2,c_3); after every ADDU an sltu materializes the
# carry-out into the "at" register, which is folded into the next higher
# accumulator word.  MULTU for the next product is issued early so the
# multiply latency overlaps the carry bookkeeping of the previous one
# (mflo/mfhi later collect its low/high halves into t_1/t_2).
1689 .align 5
1690 .globl bn_mul_comba4
1691 .ent bn_mul_comba4
1692 bn_mul_comba4:
# nubi-ABI prologue: that convention treats t0-t3 and gp as callee-saved,
# so they are spilled alongside ra.
1694 $code.=<<___ if ($flavour =~ /nubi/i);
1695 .frame $sp,6*$SZREG,$ra
1696 .mask 0x8000f008,-$SZREG
1697 .set noreorder
1698 $PTR_SUB $sp,6*$SZREG
1699 $REG_S $ra,5*$SZREG($sp)
1700 $REG_S $t3,4*$SZREG($sp)
1701 $REG_S $t2,3*$SZREG($sp)
1702 $REG_S $t1,2*$SZREG($sp)
1703 $REG_S $t0,1*$SZREG($sp)
1704 $REG_S $gp,0*$SZREG($sp)
1706 $code.=<<___;
1707 .set reorder
# Load all eight operand words up front; the first multiply is started
# between the loads so the loads hide its latency.
1708 $LD $a_0,0($a1)
1709 $LD $b_0,0($a2)
1710 $LD $a_1,$BNSZ($a1)
1711 $LD $a_2,2*$BNSZ($a1)
1712 $MULTU $a_0,$b_0 # mul_add_c(a[0],b[0],c1,c2,c3);
1713 $LD $a_3,3*$BNSZ($a1)
1714 $LD $b_1,$BNSZ($a2)
1715 $LD $b_2,2*$BNSZ($a2)
1716 $LD $b_3,3*$BNSZ($a2)
1717 mflo $c_1
1718 mfhi $c_2
# column 0 complete -> r[0]
1719 $ST $c_1,0($a0)
# column 1: a[0]b[1] + a[1]b[0]
1721 $MULTU $a_0,$b_1 # mul_add_c(a[0],b[1],c2,c3,c1);
1722 mflo $t_1
1723 mfhi $t_2
1724 $ADDU $c_2,$t_1
1725 sltu $at,$c_2,$t_1
1726 $MULTU $a_1,$b_0 # mul_add_c(a[1],b[0],c2,c3,c1);
1727 $ADDU $c_3,$t_2,$at
1728 mflo $t_1
1729 mfhi $t_2
1730 $ADDU $c_2,$t_1
1731 sltu $at,$c_2,$t_1
1732 $MULTU $a_2,$b_0 # mul_add_c(a[2],b[0],c3,c1,c2);
1733 $ADDU $t_2,$at
1734 $ADDU $c_3,$t_2
1735 sltu $c_1,$c_3,$t_2
# column 1 complete -> r[1]
1736 $ST $c_2,$BNSZ($a0)
# column 2: a[2]b[0] + a[1]b[1] + a[0]b[2]
1738 mflo $t_1
1739 mfhi $t_2
1740 $ADDU $c_3,$t_1
1741 sltu $at,$c_3,$t_1
1742 $MULTU $a_1,$b_1 # mul_add_c(a[1],b[1],c3,c1,c2);
1743 $ADDU $t_2,$at
1744 $ADDU $c_1,$t_2
1745 mflo $t_1
1746 mfhi $t_2
1747 $ADDU $c_3,$t_1
1748 sltu $at,$c_3,$t_1
1749 $MULTU $a_0,$b_2 # mul_add_c(a[0],b[2],c3,c1,c2);
1750 $ADDU $t_2,$at
1751 $ADDU $c_1,$t_2
1752 sltu $c_2,$c_1,$t_2
1753 mflo $t_1
1754 mfhi $t_2
1755 $ADDU $c_3,$t_1
1756 sltu $at,$c_3,$t_1
1757 $MULTU $a_0,$b_3 # mul_add_c(a[0],b[3],c1,c2,c3);
1758 $ADDU $t_2,$at
1759 $ADDU $c_1,$t_2
1760 sltu $at,$c_1,$t_2
1761 $ADDU $c_2,$at
# column 2 complete -> r[2]
1762 $ST $c_3,2*$BNSZ($a0)
# column 3: a[0]b[3] + a[1]b[2] + a[2]b[1] + a[3]b[0]
1764 mflo $t_1
1765 mfhi $t_2
1766 $ADDU $c_1,$t_1
1767 sltu $at,$c_1,$t_1
1768 $MULTU $a_1,$b_2 # mul_add_c(a[1],b[2],c1,c2,c3);
1769 $ADDU $t_2,$at
1770 $ADDU $c_2,$t_2
1771 sltu $c_3,$c_2,$t_2
1772 mflo $t_1
1773 mfhi $t_2
1774 $ADDU $c_1,$t_1
1775 sltu $at,$c_1,$t_1
1776 $MULTU $a_2,$b_1 # mul_add_c(a[2],b[1],c1,c2,c3);
1777 $ADDU $t_2,$at
1778 $ADDU $c_2,$t_2
1779 sltu $at,$c_2,$t_2
1780 $ADDU $c_3,$at
1781 mflo $t_1
1782 mfhi $t_2
1783 $ADDU $c_1,$t_1
1784 sltu $at,$c_1,$t_1
1785 $MULTU $a_3,$b_0 # mul_add_c(a[3],b[0],c1,c2,c3);
1786 $ADDU $t_2,$at
1787 $ADDU $c_2,$t_2
1788 sltu $at,$c_2,$t_2
1789 $ADDU $c_3,$at
1790 mflo $t_1
1791 mfhi $t_2
1792 $ADDU $c_1,$t_1
1793 sltu $at,$c_1,$t_1
1794 $MULTU $a_3,$b_1 # mul_add_c(a[3],b[1],c2,c3,c1);
1795 $ADDU $t_2,$at
1796 $ADDU $c_2,$t_2
1797 sltu $at,$c_2,$t_2
1798 $ADDU $c_3,$at
# column 3 complete -> r[3]
1799 $ST $c_1,3*$BNSZ($a0)
# column 4: a[3]b[1] + a[2]b[2] + a[1]b[3]
1801 mflo $t_1
1802 mfhi $t_2
1803 $ADDU $c_2,$t_1
1804 sltu $at,$c_2,$t_1
1805 $MULTU $a_2,$b_2 # mul_add_c(a[2],b[2],c2,c3,c1);
1806 $ADDU $t_2,$at
1807 $ADDU $c_3,$t_2
1808 sltu $c_1,$c_3,$t_2
1809 mflo $t_1
1810 mfhi $t_2
1811 $ADDU $c_2,$t_1
1812 sltu $at,$c_2,$t_1
1813 $MULTU $a_1,$b_3 # mul_add_c(a[1],b[3],c2,c3,c1);
1814 $ADDU $t_2,$at
1815 $ADDU $c_3,$t_2
1816 sltu $at,$c_3,$t_2
1817 $ADDU $c_1,$at
1818 mflo $t_1
1819 mfhi $t_2
1820 $ADDU $c_2,$t_1
1821 sltu $at,$c_2,$t_1
1822 $MULTU $a_2,$b_3 # mul_add_c(a[2],b[3],c3,c1,c2);
1823 $ADDU $t_2,$at
1824 $ADDU $c_3,$t_2
1825 sltu $at,$c_3,$t_2
1826 $ADDU $c_1,$at
# column 4 complete -> r[4]
1827 $ST $c_2,4*$BNSZ($a0)
# column 5: a[2]b[3] + a[3]b[2]
1829 mflo $t_1
1830 mfhi $t_2
1831 $ADDU $c_3,$t_1
1832 sltu $at,$c_3,$t_1
1833 $MULTU $a_3,$b_2 # mul_add_c(a[3],b[2],c3,c1,c2);
1834 $ADDU $t_2,$at
1835 $ADDU $c_1,$t_2
1836 sltu $c_2,$c_1,$t_2
1837 mflo $t_1
1838 mfhi $t_2
1839 $ADDU $c_3,$t_1
1840 sltu $at,$c_3,$t_1
1841 $MULTU $a_3,$b_3 # mul_add_c(a[3],b[3],c1,c2,c3);
1842 $ADDU $t_2,$at
1843 $ADDU $c_1,$t_2
1844 sltu $at,$c_1,$t_2
1845 $ADDU $c_2,$at
# column 5 complete -> r[5]
1846 $ST $c_3,5*$BNSZ($a0)
# column 6 has the single product a[3]b[3]; its high half carries to r[7].
1848 mflo $t_1
1849 mfhi $t_2
1850 $ADDU $c_1,$t_1
1851 sltu $at,$c_1,$t_1
1852 $ADDU $t_2,$at
1853 $ADDU $c_2,$t_2
1854 $ST $c_1,6*$BNSZ($a0)
1855 $ST $c_2,7*$BNSZ($a0)
1857 .set noreorder
# nubi-ABI epilogue: restore the extra callee-saved registers.
1859 $code.=<<___ if ($flavour =~ /nubi/i);
1860 $REG_L $t3,4*$SZREG($sp)
1861 $REG_L $t2,3*$SZREG($sp)
1862 $REG_L $t1,2*$SZREG($sp)
1863 $REG_L $t0,1*$SZREG($sp)
1864 $REG_L $gp,0*$SZREG($sp)
1865 $PTR_ADD $sp,6*$SZREG
1867 $code.=<<___;
1868 jr $ra
1870 .end bn_mul_comba4
# bn_sqr_comba8: r[0..15] = a[0..7]^2 -- 8-word Comba (column-wise) squaring.
# Entry registers: a0 = r, a1 = a.  Squaring needs no second operand, so
# the b_* registers are re-aliased below to hold a[4..7].
# Off-diagonal products a[i]*a[j] (i != j) occur twice in the square and
# are doubled in place: the slt-against-zero saves the sign bit of each
# product half before an SLL-by-1, re-inserting the shifted-out bit into
# the next higher word ("mul_add_c2" pattern in the inline comments).
# Diagonal squares a[i]*a[i] are added once ("mul_add_c" pattern).
1873 ($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3);
1875 $code.=<<___;
1877 .align 5
1878 .globl bn_sqr_comba8
1879 .ent bn_sqr_comba8
1880 bn_sqr_comba8:
# nubi-ABI prologue: t0-t3 and gp are callee-saved under nubi.
1882 $code.=<<___ if ($flavour =~ /nubi/i);
1883 .frame $sp,6*$SZREG,$ra
1884 .mask 0x8000f008,-$SZREG
1885 .set noreorder
1886 $PTR_SUB $sp,6*$SZREG
1887 $REG_S $ra,5*$SZREG($sp)
1888 $REG_S $t3,4*$SZREG($sp)
1889 $REG_S $t2,3*$SZREG($sp)
1890 $REG_S $t1,2*$SZREG($sp)
1891 $REG_S $t0,1*$SZREG($sp)
1892 $REG_S $gp,0*$SZREG($sp)
1894 $code.=<<___;
1895 .set reorder
1896 $LD $a_0,0($a1)
1897 $LD $a_1,$BNSZ($a1)
1898 $LD $a_2,2*$BNSZ($a1)
1899 $LD $a_3,3*$BNSZ($a1)
# First square is started mid-load so the remaining loads hide its latency.
1901 $MULTU $a_0,$a_0 # mul_add_c(a[0],b[0],c1,c2,c3);
1902 $LD $a_4,4*$BNSZ($a1)
1903 $LD $a_5,5*$BNSZ($a1)
1904 $LD $a_6,6*$BNSZ($a1)
1905 $LD $a_7,7*$BNSZ($a1)
1906 mflo $c_1
1907 mfhi $c_2
# r[0] = low word of a[0]^2
1908 $ST $c_1,0($a0)
# Doubling pattern: slt grabs the top bit of each product half, SLL
# doubles it, and the saved bit is added into the next higher word.
1910 $MULTU $a_0,$a_1 # mul_add_c2(a[0],b[1],c2,c3,c1);
1911 mflo $t_1
1912 mfhi $t_2
1913 slt $c_1,$t_2,$zero
1914 $SLL $t_2,1
1915 $MULTU $a_2,$a_0 # mul_add_c2(a[2],b[0],c3,c1,c2);
1916 slt $a2,$t_1,$zero
1917 $ADDU $t_2,$a2
1918 $SLL $t_1,1
1919 $ADDU $c_2,$t_1
1920 sltu $at,$c_2,$t_1
1921 $ADDU $c_3,$t_2,$at
1922 $ST $c_2,$BNSZ($a0)
1924 mflo $t_1
1925 mfhi $t_2
1926 slt $c_2,$t_2,$zero
1927 $SLL $t_2,1
1928 $MULTU $a_1,$a_1 # mul_add_c(a[1],b[1],c3,c1,c2);
1929 slt $a2,$t_1,$zero
1930 $ADDU $t_2,$a2
1931 $SLL $t_1,1
1932 $ADDU $c_3,$t_1
1933 sltu $at,$c_3,$t_1
1934 $ADDU $t_2,$at
1935 $ADDU $c_1,$t_2
1936 sltu $at,$c_1,$t_2
1937 $ADDU $c_2,$at
1938 mflo $t_1
1939 mfhi $t_2
1940 $ADDU $c_3,$t_1
1941 sltu $at,$c_3,$t_1
1942 $MULTU $a_0,$a_3 # mul_add_c2(a[0],b[3],c1,c2,c3);
1943 $ADDU $t_2,$at
1944 $ADDU $c_1,$t_2
1945 sltu $at,$c_1,$t_2
1946 $ADDU $c_2,$at
1947 $ST $c_3,2*$BNSZ($a0)
1949 mflo $t_1
1950 mfhi $t_2
1951 slt $c_3,$t_2,$zero
1952 $SLL $t_2,1
1953 $MULTU $a_1,$a_2 # mul_add_c2(a[1],b[2],c1,c2,c3);
1954 slt $a2,$t_1,$zero
1955 $ADDU $t_2,$a2
1956 $SLL $t_1,1
1957 $ADDU $c_1,$t_1
1958 sltu $at,$c_1,$t_1
1959 $ADDU $t_2,$at
1960 $ADDU $c_2,$t_2
1961 sltu $at,$c_2,$t_2
1962 $ADDU $c_3,$at
1963 mflo $t_1
1964 mfhi $t_2
1965 slt $at,$t_2,$zero
1966 $ADDU $c_3,$at
1967 $MULTU $a_4,$a_0 # mul_add_c2(a[4],b[0],c2,c3,c1);
1968 $SLL $t_2,1
1969 slt $a2,$t_1,$zero
1970 $ADDU $t_2,$a2
1971 $SLL $t_1,1
1972 $ADDU $c_1,$t_1
1973 sltu $at,$c_1,$t_1
1974 $ADDU $t_2,$at
1975 $ADDU $c_2,$t_2
1976 sltu $at,$c_2,$t_2
1977 $ADDU $c_3,$at
1978 $ST $c_1,3*$BNSZ($a0)
1980 mflo $t_1
1981 mfhi $t_2
1982 slt $c_1,$t_2,$zero
1983 $SLL $t_2,1
1984 $MULTU $a_3,$a_1 # mul_add_c2(a[3],b[1],c2,c3,c1);
1985 slt $a2,$t_1,$zero
1986 $ADDU $t_2,$a2
1987 $SLL $t_1,1
1988 $ADDU $c_2,$t_1
1989 sltu $at,$c_2,$t_1
1990 $ADDU $t_2,$at
1991 $ADDU $c_3,$t_2
1992 sltu $at,$c_3,$t_2
1993 $ADDU $c_1,$at
1994 mflo $t_1
1995 mfhi $t_2
1996 slt $at,$t_2,$zero
1997 $ADDU $c_1,$at
1998 $MULTU $a_2,$a_2 # mul_add_c(a[2],b[2],c2,c3,c1);
1999 $SLL $t_2,1
2000 slt $a2,$t_1,$zero
2001 $ADDU $t_2,$a2
2002 $SLL $t_1,1
2003 $ADDU $c_2,$t_1
2004 sltu $at,$c_2,$t_1
2005 $ADDU $t_2,$at
2006 $ADDU $c_3,$t_2
2007 sltu $at,$c_3,$t_2
2008 $ADDU $c_1,$at
2009 mflo $t_1
2010 mfhi $t_2
2011 $ADDU $c_2,$t_1
2012 sltu $at,$c_2,$t_1
2013 $MULTU $a_0,$a_5 # mul_add_c2(a[0],b[5],c3,c1,c2);
2014 $ADDU $t_2,$at
2015 $ADDU $c_3,$t_2
2016 sltu $at,$c_3,$t_2
2017 $ADDU $c_1,$at
2018 $ST $c_2,4*$BNSZ($a0)
2020 mflo $t_1
2021 mfhi $t_2
2022 slt $c_2,$t_2,$zero
2023 $SLL $t_2,1
2024 $MULTU $a_1,$a_4 # mul_add_c2(a[1],b[4],c3,c1,c2);
2025 slt $a2,$t_1,$zero
2026 $ADDU $t_2,$a2
2027 $SLL $t_1,1
2028 $ADDU $c_3,$t_1
2029 sltu $at,$c_3,$t_1
2030 $ADDU $t_2,$at
2031 $ADDU $c_1,$t_2
2032 sltu $at,$c_1,$t_2
2033 $ADDU $c_2,$at
2034 mflo $t_1
2035 mfhi $t_2
2036 slt $at,$t_2,$zero
2037 $ADDU $c_2,$at
2038 $MULTU $a_2,$a_3 # mul_add_c2(a[2],b[3],c3,c1,c2);
2039 $SLL $t_2,1
2040 slt $a2,$t_1,$zero
2041 $ADDU $t_2,$a2
2042 $SLL $t_1,1
2043 $ADDU $c_3,$t_1
2044 sltu $at,$c_3,$t_1
2045 $ADDU $t_2,$at
2046 $ADDU $c_1,$t_2
2047 sltu $at,$c_1,$t_2
2048 $ADDU $c_2,$at
2049 mflo $t_1
2050 mfhi $t_2
2051 slt $at,$t_2,$zero
2052 $MULTU $a_6,$a_0 # mul_add_c2(a[6],b[0],c1,c2,c3);
2053 $ADDU $c_2,$at
2054 $SLL $t_2,1
2055 slt $a2,$t_1,$zero
2056 $ADDU $t_2,$a2
2057 $SLL $t_1,1
2058 $ADDU $c_3,$t_1
2059 sltu $at,$c_3,$t_1
2060 $ADDU $t_2,$at
2061 $ADDU $c_1,$t_2
2062 sltu $at,$c_1,$t_2
2063 $ADDU $c_2,$at
2064 $ST $c_3,5*$BNSZ($a0)
2066 mflo $t_1
2067 mfhi $t_2
2068 slt $c_3,$t_2,$zero
2069 $SLL $t_2,1
2070 $MULTU $a_5,$a_1 # mul_add_c2(a[5],b[1],c1,c2,c3);
2071 slt $a2,$t_1,$zero
2072 $ADDU $t_2,$a2
2073 $SLL $t_1,1
2074 $ADDU $c_1,$t_1
2075 sltu $at,$c_1,$t_1
2076 $ADDU $t_2,$at
2077 $ADDU $c_2,$t_2
2078 sltu $at,$c_2,$t_2
2079 $ADDU $c_3,$at
2080 mflo $t_1
2081 mfhi $t_2
2082 slt $at,$t_2,$zero
2083 $ADDU $c_3,$at
2084 $MULTU $a_4,$a_2 # mul_add_c2(a[4],b[2],c1,c2,c3);
2085 $SLL $t_2,1
2086 slt $a2,$t_1,$zero
2087 $ADDU $t_2,$a2
2088 $SLL $t_1,1
2089 $ADDU $c_1,$t_1
2090 sltu $at,$c_1,$t_1
2091 $ADDU $t_2,$at
2092 $ADDU $c_2,$t_2
2093 sltu $at,$c_2,$t_2
2094 $ADDU $c_3,$at
2095 mflo $t_1
2096 mfhi $t_2
2097 slt $at,$t_2,$zero
2098 $ADDU $c_3,$at
2099 $MULTU $a_3,$a_3 # mul_add_c(a[3],b[3],c1,c2,c3);
2100 $SLL $t_2,1
2101 slt $a2,$t_1,$zero
2102 $ADDU $t_2,$a2
2103 $SLL $t_1,1
2104 $ADDU $c_1,$t_1
2105 sltu $at,$c_1,$t_1
2106 $ADDU $t_2,$at
2107 $ADDU $c_2,$t_2
2108 sltu $at,$c_2,$t_2
2109 $ADDU $c_3,$at
2110 mflo $t_1
2111 mfhi $t_2
2112 $ADDU $c_1,$t_1
2113 sltu $at,$c_1,$t_1
2114 $MULTU $a_0,$a_7 # mul_add_c2(a[0],b[7],c2,c3,c1);
2115 $ADDU $t_2,$at
2116 $ADDU $c_2,$t_2
2117 sltu $at,$c_2,$t_2
2118 $ADDU $c_3,$at
2119 $ST $c_1,6*$BNSZ($a0)
2121 mflo $t_1
2122 mfhi $t_2
2123 slt $c_1,$t_2,$zero
2124 $SLL $t_2,1
2125 $MULTU $a_1,$a_6 # mul_add_c2(a[1],b[6],c2,c3,c1);
2126 slt $a2,$t_1,$zero
2127 $ADDU $t_2,$a2
2128 $SLL $t_1,1
2129 $ADDU $c_2,$t_1
2130 sltu $at,$c_2,$t_1
2131 $ADDU $t_2,$at
2132 $ADDU $c_3,$t_2
2133 sltu $at,$c_3,$t_2
2134 $ADDU $c_1,$at
2135 mflo $t_1
2136 mfhi $t_2
2137 slt $at,$t_2,$zero
2138 $ADDU $c_1,$at
2139 $MULTU $a_2,$a_5 # mul_add_c2(a[2],b[5],c2,c3,c1);
2140 $SLL $t_2,1
2141 slt $a2,$t_1,$zero
2142 $ADDU $t_2,$a2
2143 $SLL $t_1,1
2144 $ADDU $c_2,$t_1
2145 sltu $at,$c_2,$t_1
2146 $ADDU $t_2,$at
2147 $ADDU $c_3,$t_2
2148 sltu $at,$c_3,$t_2
2149 $ADDU $c_1,$at
2150 mflo $t_1
2151 mfhi $t_2
2152 slt $at,$t_2,$zero
2153 $ADDU $c_1,$at
2154 $MULTU $a_3,$a_4 # mul_add_c2(a[3],b[4],c2,c3,c1);
2155 $SLL $t_2,1
2156 slt $a2,$t_1,$zero
2157 $ADDU $t_2,$a2
2158 $SLL $t_1,1
2159 $ADDU $c_2,$t_1
2160 sltu $at,$c_2,$t_1
2161 $ADDU $t_2,$at
2162 $ADDU $c_3,$t_2
2163 sltu $at,$c_3,$t_2
2164 $ADDU $c_1,$at
2165 mflo $t_1
2166 mfhi $t_2
2167 slt $at,$t_2,$zero
2168 $ADDU $c_1,$at
2169 $MULTU $a_7,$a_1 # mul_add_c2(a[7],b[1],c3,c1,c2);
2170 $SLL $t_2,1
2171 slt $a2,$t_1,$zero
2172 $ADDU $t_2,$a2
2173 $SLL $t_1,1
2174 $ADDU $c_2,$t_1
2175 sltu $at,$c_2,$t_1
2176 $ADDU $t_2,$at
2177 $ADDU $c_3,$t_2
2178 sltu $at,$c_3,$t_2
2179 $ADDU $c_1,$at
2180 $ST $c_2,7*$BNSZ($a0)
2182 mflo $t_1
2183 mfhi $t_2
2184 slt $c_2,$t_2,$zero
2185 $SLL $t_2,1
2186 $MULTU $a_6,$a_2 # mul_add_c2(a[6],b[2],c3,c1,c2);
2187 slt $a2,$t_1,$zero
2188 $ADDU $t_2,$a2
2189 $SLL $t_1,1
2190 $ADDU $c_3,$t_1
2191 sltu $at,$c_3,$t_1
2192 $ADDU $t_2,$at
2193 $ADDU $c_1,$t_2
2194 sltu $at,$c_1,$t_2
2195 $ADDU $c_2,$at
2196 mflo $t_1
2197 mfhi $t_2
2198 slt $at,$t_2,$zero
2199 $ADDU $c_2,$at
2200 $MULTU $a_5,$a_3 # mul_add_c2(a[5],b[3],c3,c1,c2);
2201 $SLL $t_2,1
2202 slt $a2,$t_1,$zero
2203 $ADDU $t_2,$a2
2204 $SLL $t_1,1
2205 $ADDU $c_3,$t_1
2206 sltu $at,$c_3,$t_1
2207 $ADDU $t_2,$at
2208 $ADDU $c_1,$t_2
2209 sltu $at,$c_1,$t_2
2210 $ADDU $c_2,$at
2211 mflo $t_1
2212 mfhi $t_2
2213 slt $at,$t_2,$zero
2214 $ADDU $c_2,$at
2215 $MULTU $a_4,$a_4 # mul_add_c(a[4],b[4],c3,c1,c2);
2216 $SLL $t_2,1
2217 slt $a2,$t_1,$zero
2218 $ADDU $t_2,$a2
2219 $SLL $t_1,1
2220 $ADDU $c_3,$t_1
2221 sltu $at,$c_3,$t_1
2222 $ADDU $t_2,$at
2223 $ADDU $c_1,$t_2
2224 sltu $at,$c_1,$t_2
2225 $ADDU $c_2,$at
2226 mflo $t_1
2227 mfhi $t_2
2228 $ADDU $c_3,$t_1
2229 sltu $at,$c_3,$t_1
2230 $MULTU $a_2,$a_7 # mul_add_c2(a[2],b[7],c1,c2,c3);
2231 $ADDU $t_2,$at
2232 $ADDU $c_1,$t_2
2233 sltu $at,$c_1,$t_2
2234 $ADDU $c_2,$at
2235 $ST $c_3,8*$BNSZ($a0)
2237 mflo $t_1
2238 mfhi $t_2
2239 slt $c_3,$t_2,$zero
2240 $SLL $t_2,1
2241 $MULTU $a_3,$a_6 # mul_add_c2(a[3],b[6],c1,c2,c3);
2242 slt $a2,$t_1,$zero
2243 $ADDU $t_2,$a2
2244 $SLL $t_1,1
2245 $ADDU $c_1,$t_1
2246 sltu $at,$c_1,$t_1
2247 $ADDU $t_2,$at
2248 $ADDU $c_2,$t_2
2249 sltu $at,$c_2,$t_2
2250 $ADDU $c_3,$at
2251 mflo $t_1
2252 mfhi $t_2
2253 slt $at,$t_2,$zero
2254 $ADDU $c_3,$at
2255 $MULTU $a_4,$a_5 # mul_add_c2(a[4],b[5],c1,c2,c3);
2256 $SLL $t_2,1
2257 slt $a2,$t_1,$zero
2258 $ADDU $t_2,$a2
2259 $SLL $t_1,1
2260 $ADDU $c_1,$t_1
2261 sltu $at,$c_1,$t_1
2262 $ADDU $t_2,$at
2263 $ADDU $c_2,$t_2
2264 sltu $at,$c_2,$t_2
2265 $ADDU $c_3,$at
2266 mflo $t_1
2267 mfhi $t_2
2268 slt $at,$t_2,$zero
2269 $ADDU $c_3,$at
2270 $MULTU $a_7,$a_3 # mul_add_c2(a[7],b[3],c2,c3,c1);
2271 $SLL $t_2,1
2272 slt $a2,$t_1,$zero
2273 $ADDU $t_2,$a2
2274 $SLL $t_1,1
2275 $ADDU $c_1,$t_1
2276 sltu $at,$c_1,$t_1
2277 $ADDU $t_2,$at
2278 $ADDU $c_2,$t_2
2279 sltu $at,$c_2,$t_2
2280 $ADDU $c_3,$at
2281 $ST $c_1,9*$BNSZ($a0)
2283 mflo $t_1
2284 mfhi $t_2
2285 slt $c_1,$t_2,$zero
2286 $SLL $t_2,1
2287 $MULTU $a_6,$a_4 # mul_add_c2(a[6],b[4],c2,c3,c1);
2288 slt $a2,$t_1,$zero
2289 $ADDU $t_2,$a2
2290 $SLL $t_1,1
2291 $ADDU $c_2,$t_1
2292 sltu $at,$c_2,$t_1
2293 $ADDU $t_2,$at
2294 $ADDU $c_3,$t_2
2295 sltu $at,$c_3,$t_2
2296 $ADDU $c_1,$at
2297 mflo $t_1
2298 mfhi $t_2
2299 slt $at,$t_2,$zero
2300 $ADDU $c_1,$at
2301 $MULTU $a_5,$a_5 # mul_add_c(a[5],b[5],c2,c3,c1);
2302 $SLL $t_2,1
2303 slt $a2,$t_1,$zero
2304 $ADDU $t_2,$a2
2305 $SLL $t_1,1
2306 $ADDU $c_2,$t_1
2307 sltu $at,$c_2,$t_1
2308 $ADDU $t_2,$at
2309 $ADDU $c_3,$t_2
2310 sltu $at,$c_3,$t_2
2311 $ADDU $c_1,$at
2312 mflo $t_1
2313 mfhi $t_2
2314 $ADDU $c_2,$t_1
2315 sltu $at,$c_2,$t_1
2316 $MULTU $a_4,$a_7 # mul_add_c2(a[4],b[7],c3,c1,c2);
2317 $ADDU $t_2,$at
2318 $ADDU $c_3,$t_2
2319 sltu $at,$c_3,$t_2
2320 $ADDU $c_1,$at
2321 $ST $c_2,10*$BNSZ($a0)
2323 mflo $t_1
2324 mfhi $t_2
2325 slt $c_2,$t_2,$zero
2326 $SLL $t_2,1
2327 $MULTU $a_5,$a_6 # mul_add_c2(a[5],b[6],c3,c1,c2);
2328 slt $a2,$t_1,$zero
2329 $ADDU $t_2,$a2
2330 $SLL $t_1,1
2331 $ADDU $c_3,$t_1
2332 sltu $at,$c_3,$t_1
2333 $ADDU $t_2,$at
2334 $ADDU $c_1,$t_2
2335 sltu $at,$c_1,$t_2
2336 $ADDU $c_2,$at
2337 mflo $t_1
2338 mfhi $t_2
2339 slt $at,$t_2,$zero
2340 $ADDU $c_2,$at
2341 $MULTU $a_7,$a_5 # mul_add_c2(a[7],b[5],c1,c2,c3);
2342 $SLL $t_2,1
2343 slt $a2,$t_1,$zero
2344 $ADDU $t_2,$a2
2345 $SLL $t_1,1
2346 $ADDU $c_3,$t_1
2347 sltu $at,$c_3,$t_1
2348 $ADDU $t_2,$at
2349 $ADDU $c_1,$t_2
2350 sltu $at,$c_1,$t_2
2351 $ADDU $c_2,$at
2352 $ST $c_3,11*$BNSZ($a0)
2354 mflo $t_1
2355 mfhi $t_2
2356 slt $c_3,$t_2,$zero
2357 $SLL $t_2,1
2358 $MULTU $a_6,$a_6 # mul_add_c(a[6],b[6],c1,c2,c3);
2359 slt $a2,$t_1,$zero
2360 $ADDU $t_2,$a2
2361 $SLL $t_1,1
2362 $ADDU $c_1,$t_1
2363 sltu $at,$c_1,$t_1
2364 $ADDU $t_2,$at
2365 $ADDU $c_2,$t_2
2366 sltu $at,$c_2,$t_2
2367 $ADDU $c_3,$at
2368 mflo $t_1
2369 mfhi $t_2
2370 $ADDU $c_1,$t_1
2371 sltu $at,$c_1,$t_1
2372 $MULTU $a_6,$a_7 # mul_add_c2(a[6],b[7],c2,c3,c1);
2373 $ADDU $t_2,$at
2374 $ADDU $c_2,$t_2
2375 sltu $at,$c_2,$t_2
2376 $ADDU $c_3,$at
2377 $ST $c_1,12*$BNSZ($a0)
2379 mflo $t_1
2380 mfhi $t_2
2381 slt $c_1,$t_2,$zero
2382 $SLL $t_2,1
2383 $MULTU $a_7,$a_7 # mul_add_c(a[7],b[7],c3,c1,c2);
2384 slt $a2,$t_1,$zero
2385 $ADDU $t_2,$a2
2386 $SLL $t_1,1
2387 $ADDU $c_2,$t_1
2388 sltu $at,$c_2,$t_1
2389 $ADDU $t_2,$at
2390 $ADDU $c_3,$t_2
2391 sltu $at,$c_3,$t_2
2392 $ADDU $c_1,$at
2393 $ST $c_2,13*$BNSZ($a0)
# Final column has the single square a[7]^2; its high half tops off r[15].
2395 mflo $t_1
2396 mfhi $t_2
2397 $ADDU $c_3,$t_1
2398 sltu $at,$c_3,$t_1
2399 $ADDU $t_2,$at
2400 $ADDU $c_1,$t_2
2401 $ST $c_3,14*$BNSZ($a0)
2402 $ST $c_1,15*$BNSZ($a0)
2404 .set noreorder
# nubi-ABI epilogue: restore the extra callee-saved registers.
2406 $code.=<<___ if ($flavour =~ /nubi/i);
2407 $REG_L $t3,4*$SZREG($sp)
2408 $REG_L $t2,3*$SZREG($sp)
2409 $REG_L $t1,2*$SZREG($sp)
2410 $REG_L $t0,1*$SZREG($sp)
2411 $REG_L $gp,0*$SZREG($sp)
2412 $PTR_ADD $sp,6*$SZREG
2414 $code.=<<___;
2415 jr $ra
2417 .end bn_sqr_comba8
# bn_sqr_comba4: r[0..7] = a[0..3]^2 -- 4-word Comba (column-wise) squaring.
# Entry registers: a0 = r, a1 = a.  Same scheme as bn_sqr_comba8:
# off-diagonal products are doubled via the slt/SLL pattern (mul_add_c2),
# diagonal squares are added once (mul_add_c); carries flow through the
# rotating (c_1,c_2,c_3) accumulator with sltu producing each carry-out.
2419 .align 5
2420 .globl bn_sqr_comba4
2421 .ent bn_sqr_comba4
2422 bn_sqr_comba4:
# nubi-ABI prologue: t0-t3 and gp are callee-saved under nubi.
2424 $code.=<<___ if ($flavour =~ /nubi/i);
2425 .frame $sp,6*$SZREG,$ra
2426 .mask 0x8000f008,-$SZREG
2427 .set noreorder
2428 $PTR_SUB $sp,6*$SZREG
2429 $REG_S $ra,5*$SZREG($sp)
2430 $REG_S $t3,4*$SZREG($sp)
2431 $REG_S $t2,3*$SZREG($sp)
2432 $REG_S $t1,2*$SZREG($sp)
2433 $REG_S $t0,1*$SZREG($sp)
2434 $REG_S $gp,0*$SZREG($sp)
2436 $code.=<<___;
2437 .set reorder
2438 $LD $a_0,0($a1)
2439 $LD $a_1,$BNSZ($a1)
2440 $MULTU $a_0,$a_0 # mul_add_c(a[0],b[0],c1,c2,c3);
2441 $LD $a_2,2*$BNSZ($a1)
2442 $LD $a_3,3*$BNSZ($a1)
2443 mflo $c_1
2444 mfhi $c_2
# r[0] = low word of a[0]^2
2445 $ST $c_1,0($a0)
2447 $MULTU $a_0,$a_1 # mul_add_c2(a[0],b[1],c2,c3,c1);
2448 mflo $t_1
2449 mfhi $t_2
2450 slt $c_1,$t_2,$zero
2451 $SLL $t_2,1
2452 $MULTU $a_2,$a_0 # mul_add_c2(a[2],b[0],c3,c1,c2);
2453 slt $a2,$t_1,$zero
2454 $ADDU $t_2,$a2
2455 $SLL $t_1,1
2456 $ADDU $c_2,$t_1
2457 sltu $at,$c_2,$t_1
2458 $ADDU $c_3,$t_2,$at
2459 $ST $c_2,$BNSZ($a0)
2461 mflo $t_1
2462 mfhi $t_2
2463 slt $c_2,$t_2,$zero
2464 $SLL $t_2,1
2465 $MULTU $a_1,$a_1 # mul_add_c(a[1],b[1],c3,c1,c2);
2466 slt $a2,$t_1,$zero
2467 $ADDU $t_2,$a2
2468 $SLL $t_1,1
2469 $ADDU $c_3,$t_1
2470 sltu $at,$c_3,$t_1
2471 $ADDU $t_2,$at
2472 $ADDU $c_1,$t_2
2473 sltu $at,$c_1,$t_2
2474 $ADDU $c_2,$at
2475 mflo $t_1
2476 mfhi $t_2
2477 $ADDU $c_3,$t_1
2478 sltu $at,$c_3,$t_1
2479 $MULTU $a_0,$a_3 # mul_add_c2(a[0],b[3],c1,c2,c3);
2480 $ADDU $t_2,$at
2481 $ADDU $c_1,$t_2
2482 sltu $at,$c_1,$t_2
2483 $ADDU $c_2,$at
2484 $ST $c_3,2*$BNSZ($a0)
2486 mflo $t_1
2487 mfhi $t_2
2488 slt $c_3,$t_2,$zero
2489 $SLL $t_2,1
2490 $MULTU $a_1,$a_2 # mul_add_c2(a[1],b[2],c1,c2,c3);
2491 slt $a2,$t_1,$zero
2492 $ADDU $t_2,$a2
2493 $SLL $t_1,1
2494 $ADDU $c_1,$t_1
2495 sltu $at,$c_1,$t_1
2496 $ADDU $t_2,$at
2497 $ADDU $c_2,$t_2
2498 sltu $at,$c_2,$t_2
2499 $ADDU $c_3,$at
2500 mflo $t_1
2501 mfhi $t_2
2502 slt $at,$t_2,$zero
2503 $ADDU $c_3,$at
2504 $MULTU $a_3,$a_1 # mul_add_c2(a[3],b[1],c2,c3,c1);
2505 $SLL $t_2,1
2506 slt $a2,$t_1,$zero
2507 $ADDU $t_2,$a2
2508 $SLL $t_1,1
2509 $ADDU $c_1,$t_1
2510 sltu $at,$c_1,$t_1
2511 $ADDU $t_2,$at
2512 $ADDU $c_2,$t_2
2513 sltu $at,$c_2,$t_2
2514 $ADDU $c_3,$at
2515 $ST $c_1,3*$BNSZ($a0)
2517 mflo $t_1
2518 mfhi $t_2
2519 slt $c_1,$t_2,$zero
2520 $SLL $t_2,1
2521 $MULTU $a_2,$a_2 # mul_add_c(a[2],b[2],c2,c3,c1);
2522 slt $a2,$t_1,$zero
2523 $ADDU $t_2,$a2
2524 $SLL $t_1,1
2525 $ADDU $c_2,$t_1
2526 sltu $at,$c_2,$t_1
2527 $ADDU $t_2,$at
2528 $ADDU $c_3,$t_2
2529 sltu $at,$c_3,$t_2
2530 $ADDU $c_1,$at
2531 mflo $t_1
2532 mfhi $t_2
2533 $ADDU $c_2,$t_1
2534 sltu $at,$c_2,$t_1
2535 $MULTU $a_2,$a_3 # mul_add_c2(a[2],b[3],c3,c1,c2);
2536 $ADDU $t_2,$at
2537 $ADDU $c_3,$t_2
2538 sltu $at,$c_3,$t_2
2539 $ADDU $c_1,$at
2540 $ST $c_2,4*$BNSZ($a0)
2542 mflo $t_1
2543 mfhi $t_2
2544 slt $c_2,$t_2,$zero
2545 $SLL $t_2,1
2546 $MULTU $a_3,$a_3 # mul_add_c(a[3],b[3],c1,c2,c3);
2547 slt $a2,$t_1,$zero
2548 $ADDU $t_2,$a2
2549 $SLL $t_1,1
2550 $ADDU $c_3,$t_1
2551 sltu $at,$c_3,$t_1
2552 $ADDU $t_2,$at
2553 $ADDU $c_1,$t_2
2554 sltu $at,$c_1,$t_2
2555 $ADDU $c_2,$at
2556 $ST $c_3,5*$BNSZ($a0)
# Final column has the single square a[3]^2; its high half tops off r[7].
2558 mflo $t_1
2559 mfhi $t_2
2560 $ADDU $c_1,$t_1
2561 sltu $at,$c_1,$t_1
2562 $ADDU $t_2,$at
2563 $ADDU $c_2,$t_2
2564 $ST $c_1,6*$BNSZ($a0)
2565 $ST $c_2,7*$BNSZ($a0)
2567 .set noreorder
# nubi-ABI epilogue: restore the extra callee-saved registers.
2569 $code.=<<___ if ($flavour =~ /nubi/i);
2570 $REG_L $t3,4*$SZREG($sp)
2571 $REG_L $t2,3*$SZREG($sp)
2572 $REG_L $t1,2*$SZREG($sp)
2573 $REG_L $t0,1*$SZREG($sp)
2574 $REG_L $gp,0*$SZREG($sp)
2575 $PTR_ADD $sp,6*$SZREG
2577 $code.=<<___;
2578 jr $ra
2580 .end bn_sqr_comba4
# Emit the accumulated assembly to STDOUT and verify it was fully written:
# close() flushes buffered output, so an unchecked close can silently drop
# the tail of the generated .s file (e.g. on a full disk or broken pipe)
# while still letting the build proceed with a truncated module.
print $code;
close STDOUT or die "error closing STDOUT: $!";