1 /* This is an assembly language implementation of mulsi3, divsi3, and modsi3
2 for the sparc processor.
4 These routines are derived from the SPARC Architecture Manual, version 8,
5 slightly edited to match the desired calling convention, and also to
6 optimize them for our purposes. */
8 /* An executable stack is *not* required for these functions. */
9 #if defined(__ELF__) && defined(__linux__)
10 .section .note.GNU-stack,"",%progbits
20 or %o0, %o1, %o4 ! logical or of multiplier and multiplicand
21 mov %o0, %y ! multiplier to Y register
22 andncc %o4, 0xfff, %o5 ! mask out lower 12 bits
23 be mul_shortway ! can do it the short way
24 andcc %g0, %g0, %o4 ! zero the partial product and clear NV cc
28 mulscc %o4, %o1, %o4 ! first iteration of 33
59 mulscc %o4, %o1, %o4 ! 32nd iteration
60 mulscc %o4, %g0, %o4 ! last iteration only shifts
61 ! the upper 32 bits of product are wrong, but we do not care
68 mulscc %o4, %o1, %o4 ! first iteration of 13
79 mulscc %o4, %o1, %o4 ! 12th iteration
80 mulscc %o4, %g0, %o4 ! last iteration only shifts
82 sll %o4, 12, %o4 ! left shift partial product by 12 bits
83 srl %o5, 20, %o5 ! right shift partial product by 20 bits
85 or %o5, %o4, %o0 ! merge for true product
90 * Division and remainder, from Appendix E of the SPARC Version 8
91 * Architecture Manual, with fixes from Gordon Irlam.
95 * Input: dividend and divisor in %o0 and %o1 respectively.
98 * NAME name of function to generate (.div in this expansion)
99 * OP OP=div => %o0 / %o1; OP=rem => %o0 % %o1 (div in this expansion)
100 * S S=true => signed; S=false => unsigned (true in this expansion)
102 * Algorithm parameters:
103 * N how many bits per iteration we try to get (4)
104 * WORDSIZE total number of bits (32)
107 * TOPBITS number of bits in the top decade of a number
109 * Important variables:
110 * Q the partial quotient under development (initially 0)
111 * R the remainder so far, initially the dividend
112 * ITER number of main division loop iterations required;
113 * equal to ceil(log2(quotient) / N). Note that this
114 * is the log base (2^N) of the quotient.
115 * V the current comparand, initially divisor*2^(ITER*N-1)
118 * Current estimate for non-large dividend is
119 * ceil(log2(quotient) / N) * (10 + 7N/2) + C
120 * A large dividend is one greater than 2^(31-TOPBITS) and takes a
121 * different path, as the upper bits of the quotient must be developed
130 mov 0, %g3 ! result is always positive
137 ! compute sign of result; if neither is negative, no problem
138 orcc %o1, %o0, %g0 ! either negative?
139 bge ready_to_divide ! no, go do the divide
140 xor %o1, %o0, %g3 ! compute sign in any case
144 ! %o1 is definitely negative; %o0 might also be negative
145 bge ready_to_divide ! if %o0 not negative...
146 sub %g0, %o1, %o1 ! in any case, make %o1 nonneg
147 1: ! %o0 is negative, %o1 is nonnegative
148 sub %g0, %o0, %o0 ! make %o0 nonnegative
153 ! Ready to divide. Compute size of quotient; scale comparand.
158 ! Divide by zero trap. If it returns, return 0 (about as
159 ! wrong as possible, but that is what SunOS does...).
165 cmp %o3, %o5 ! if %o1 exceeds %o0, done
166 blu got_result ! (and algorithm fails otherwise)
168 sethi %hi(1 << (32 - 4 - 1)), %g1
173 ! Here the dividend is >= 2**(31-N) or so. We must be careful here,
174 ! as our usual N-at-a-shot divide step will cause overflow and havoc.
175 ! The number of bits in the result here is N*ITER+SC, where SC <= N.
176 ! Compute ITER in an unorthodox manner: know we need to shift V into
177 ! the top decade: so do not even bother to compare to R.
187 2: addcc %o5, %o5, %o5
191 ! We get here if the divisor (its scaled copy in %o5) overflowed while shifting.
192 ! This means that %o3 has the high-order bit set.
193 ! Restore %o5 and subtract it from %o3.
194 sll %g1, 4, %g1 ! high order bit
195 srl %o5, 1, %o5 ! rest of %o5
206 /* NB: these are commented out in the V8-SPARC manual as well */
207 /* (I do not understand this) */
208 ! %o5 > %o3: went too far: back up 1 step
211 ! do single-bit divide steps
213 ! We have to be careful here. We know that %o3 >= %o5, so we can do the
214 ! first divide step without thinking. BUT, the others are conditional,
215 ! and are only done if %o3 >= 0. Because both %o3 and %o5 may have the high-
216 ! order bit set in the first step, just falling into the regular
217 ! division loop will mess up the first time around.
218 ! So we unroll slightly...
221 bl end_regular_divide
243 b,a end_regular_divide
254 tst %o3 ! set up for initial iteration
257 ! depth 1, accumulated bits 0
260 ! remainder is positive
262 ! depth 2, accumulated bits 1
265 ! remainder is positive
267 ! depth 3, accumulated bits 3
270 ! remainder is positive
272 ! depth 4, accumulated bits 7
275 ! remainder is positive
278 add %o2, (7*2+1), %o2
281 ! remainder is negative
284 add %o2, (7*2-1), %o2
288 ! remainder is negative
290 ! depth 4, accumulated bits 5
293 ! remainder is positive
296 add %o2, (5*2+1), %o2
299 ! remainder is negative
302 add %o2, (5*2-1), %o2
305 ! remainder is negative
307 ! depth 3, accumulated bits 1
310 ! remainder is positive
312 ! depth 4, accumulated bits 3
315 ! remainder is positive
318 add %o2, (3*2+1), %o2
321 ! remainder is negative
324 add %o2, (3*2-1), %o2
327 ! remainder is negative
329 ! depth 4, accumulated bits 1
332 ! remainder is positive
335 add %o2, (1*2+1), %o2
338 ! remainder is negative
341 add %o2, (1*2-1), %o2
344 ! remainder is negative
346 ! depth 2, accumulated bits -1
349 ! remainder is positive
351 ! depth 3, accumulated bits -1
354 ! remainder is positive
356 ! depth 4, accumulated bits -1
359 ! remainder is positive
362 add %o2, (-1*2+1), %o2
365 ! remainder is negative
368 add %o2, (-1*2-1), %o2
371 ! remainder is negative
373 ! depth 4, accumulated bits -3
376 ! remainder is positive
379 add %o2, (-3*2+1), %o2
382 ! remainder is negative
385 add %o2, (-3*2-1), %o2
388 ! remainder is negative
390 ! depth 3, accumulated bits -3
393 ! remainder is positive
395 ! depth 4, accumulated bits -5
398 ! remainder is positive
401 add %o2, (-5*2+1), %o2
404 ! remainder is negative
407 add %o2, (-5*2-1), %o2
410 ! remainder is negative
412 ! depth 4, accumulated bits -7
415 ! remainder is positive
418 add %o2, (-7*2+1), %o2
421 ! remainder is negative
424 add %o2, (-7*2-1), %o2
432 ! non-restoring fixup here (one instruction only!)
437 ! check to see if answer should be < 0
447 /* This implementation was taken from glibc:
449 * Input: dividend and divisor in %o0 and %o1 respectively.
451 * Algorithm parameters:
452 * N how many bits per iteration we try to get (4)
453 * WORDSIZE total number of bits (32)
456 * TOPBITS number of bits in the top decade of a number
458 * Important variables:
459 * Q the partial quotient under development (initially 0)
460 * R the remainder so far, initially the dividend
461 * ITER number of main division loop iterations required;
462 * equal to ceil(log2(quotient) / N). Note that this
463 * is the log base (2^N) of the quotient.
464 * V the current comparand, initially divisor*2^(ITER*N-1)
467 * Current estimate for non-large dividend is
468 * ceil(log2(quotient) / N) * (10 + 7N/2) + C
469 * A large dividend is one greater than 2^(31-TOPBITS) and takes a
470 * different path, as the upper bits of the quotient must be developed
479 mov 0, %g3 ! result always positive
485 ! compute sign of result; if neither is negative, no problem
486 orcc %o1, %o0, %g0 ! either negative?
487 bge 2f ! no, go do the divide
488 mov %o0, %g3 ! sign of remainder matches %o0
492 ! %o1 is definitely negative; %o0 might also be negative
493 bge 2f ! if %o0 not negative...
494 sub %g0, %o1, %o1 ! in any case, make %o1 nonneg
495 1: ! %o0 is negative, %o1 is nonnegative
496 sub %g0, %o0, %o0 ! make %o0 nonnegative
499 ! Ready to divide. Compute size of quotient; scale comparand.
505 ! Divide by zero trap. If it returns, return 0 (about as
506 ! wrong as possible, but that is what SunOS does...).
512 cmp %o3, %o5 ! if %o1 exceeds %o0, done
513 blu got_result ! (and algorithm fails otherwise)
515 sethi %hi(1 << (32 - 4 - 1)), %g1
520 ! Here the dividend is >= 2**(31-N) or so. We must be careful here,
521 ! as our usual N-at-a-shot divide step will cause overflow and havoc.
522 ! The number of bits in the result here is N*ITER+SC, where SC <= N.
523 ! Compute ITER in an unorthodox manner: know we need to shift V into
524 ! the top decade: so do not even bother to compare to R.
534 2: addcc %o5, %o5, %o5
538 ! We get here if the divisor (its scaled copy in %o5) overflowed while shifting.
539 ! This means that %o3 has the high-order bit set.
540 ! Restore %o5 and subtract it from %o3.
541 sll %g1, 4, %g1 ! high order bit
542 srl %o5, 1, %o5 ! rest of %o5
553 /* NB: these are commented out in the V8-SPARC manual as well */
554 /* (I do not understand this) */
555 ! %o5 > %o3: went too far: back up 1 step
558 ! do single-bit divide steps
560 ! We have to be careful here. We know that %o3 >= %o5, so we can do the
561 ! first divide step without thinking. BUT, the others are conditional,
562 ! and are only done if %o3 >= 0. Because both %o3 and %o5 may have the high-
563 ! order bit set in the first step, just falling into the regular
564 ! division loop will mess up the first time around.
565 ! So we unroll slightly...
568 bl end_regular_divide
590 b,a end_regular_divide
601 tst %o3 ! set up for initial iteration
604 ! depth 1, accumulated bits 0
607 ! remainder is positive
609 ! depth 2, accumulated bits 1
612 ! remainder is positive
614 ! depth 3, accumulated bits 3
617 ! remainder is positive
619 ! depth 4, accumulated bits 7
622 ! remainder is positive
625 add %o2, (7*2+1), %o2
627 ! remainder is negative
630 add %o2, (7*2-1), %o2
633 ! remainder is negative
635 ! depth 4, accumulated bits 5
638 ! remainder is positive
641 add %o2, (5*2+1), %o2
644 ! remainder is negative
647 add %o2, (5*2-1), %o2
650 ! remainder is negative
652 ! depth 3, accumulated bits 1
655 ! remainder is positive
657 ! depth 4, accumulated bits 3
660 ! remainder is positive
663 add %o2, (3*2+1), %o2
666 ! remainder is negative
669 add %o2, (3*2-1), %o2
672 ! remainder is negative
674 ! depth 4, accumulated bits 1
677 ! remainder is positive
680 add %o2, (1*2+1), %o2
683 ! remainder is negative
686 add %o2, (1*2-1), %o2
689 ! remainder is negative
691 ! depth 2, accumulated bits -1
694 ! remainder is positive
696 ! depth 3, accumulated bits -1
699 ! remainder is positive
701 ! depth 4, accumulated bits -1
704 ! remainder is positive
707 add %o2, (-1*2+1), %o2
710 ! remainder is negative
713 add %o2, (-1*2-1), %o2
716 ! remainder is negative
718 ! depth 4, accumulated bits -3
721 ! remainder is positive
724 add %o2, (-3*2+1), %o2
727 ! remainder is negative
730 add %o2, (-3*2-1), %o2
733 ! remainder is negative
735 ! depth 3, accumulated bits -3
738 ! remainder is positive
740 ! depth 4, accumulated bits -5
743 ! remainder is positive
746 add %o2, (-5*2+1), %o2
749 ! remainder is negative
752 add %o2, (-5*2-1), %o2
755 ! remainder is negative
757 ! depth 4, accumulated bits -7
760 ! remainder is positive
763 add %o2, (-7*2+1), %o2
766 ! remainder is negative
769 add %o2, (-7*2-1), %o2
777 ! non-restoring fixup here (one instruction only!)
781 ! check to see if answer should be < 0