sysdeps/alpha/remq.S

   1 /* Copyright (C) 2004 Free Software Foundation, Inc.
   2    This file is part of the GNU C Library.
   3
   4    The GNU C Library is free software; you can redistribute it and/or
   5    modify it under the terms of the GNU Lesser General Public
   6    License as published by the Free Software Foundation; either
   7    version 2.1 of the License, or (at your option) any later version.
   8
   9    The GNU C Library is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12    Lesser General Public License for more details.
  13
  14    You should have received a copy of the GNU Lesser General Public
  15    License along with the GNU C Library; if not, write to the Free
  16    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  17    02111-1307 USA.  */
  18
  19 #include "div_libc.h"
  20
  21
  22 /* 64-bit signed long remainder.  These are not normal C functions.  Argument
  23    registers are t10 and t11, the result goes in t12.  Only t12 and AT may
  24    be clobbered.
  25
  26    Theory of operation here is that we can use the FPU divider for virtually
  27    all operands that we see: all dividend values between -2**53 and 2**53-1
  28    can be computed directly.  Note that divisor values need not be checked
  29    against that range because the rounded fp value will be close enough such
  30    that the quotient is < 1, which will properly be truncated to zero when we
  31    convert back to integer.
  32
  33    When the dividend is outside the range for which we can compute exact
  34    results, we use the fp quotient as an estimate from which we begin refining
  35    an exact integral value.  This reduces the number of iterations in the
  36    shift-and-subtract loop significantly.
  37
  38    The FPCR save/restore is due to the fact that the EV6 _will_ set FPCR_INE
  39    for cvttq/c even without /sui being set.  It will not, however, properly
  40    raise the exception, so we don't have to worry about FPCR_INED being clear
  41    and so dying by SIGFPE.  */
  42
  43         .text
  44         .align  4
  45         .globl  __remq
  46         .type   __remq, @funcnoplt
  47         .usepv  __remq, no
  48
  49         cfi_startproc
  50         cfi_return_column (RA)          /* we return through RA, see the ret insns */
  51 __remq:                                 /* RV = X - trunc(X/Y) * Y */
  52         lda     sp, -FRAME(sp)          /* allocate the register save area */
  53         cfi_def_cfa_offset (FRAME)
  54         CALL_MCOUNT                     /* macro defined outside this file */
  55
  56         /* Get the fp divide insn issued as quickly as possible.  After
  57            that's done, we have at least 22 cycles until its results are
  58            ready -- all the time in the world to figure out how we're
  59            going to use the results.  */
  60         stt     $f0, 0(sp)              /* $f0 will receive the fp quotient */
  61         excb
  62         beq     Y, DIVBYZERO            /* Y == 0; target presumably comes from DO_DIVBYZERO */
  63
  64         stt     $f1, 8(sp)
  65         stt     $f3, 48(sp)
  66         cfi_rel_offset ($f0, 0)
  67         cfi_rel_offset ($f1, 8)
  68         cfi_rel_offset ($f3, 48)
  69         mf_fpcr $f3                     /* keep the entry FPCR in $f3; mt_fpcr writes it back */
  70
  71         _ITOFT2 X, $f0, 16, Y, $f1, 24  /* macro; presumably X -> $f0, Y -> $f1 -- confirm */
  72         cvtqt   $f0, $f0
  73         cvtqt   $f1, $f1
  74         divt/c  $f0, $f1, $f0           /* $f0 = fp quotient X/Y */
  75
  76         /* Check to see if X fit in the double as an exact value.  */
  77         sll     X, (64-53), AT          /* shift out the top 11 bits ... */
  78         ldt     $f1, 8(sp)              /* $f1 is not used after divt; reload the saved value */
  79         sra     AT, (64-53), AT         /* ... and sign-extend the low 53 bits back */
  80         cmpeq   X, AT, AT               /* AT = 1 iff X survived the round trip */
  81         beq     AT, $x_big
  82
  83         /* If we get here, we're expecting exact results from the division.
  84            Do nothing else besides convert, compute remainder, clean up.  */
  85         cvttq/c $f0, $f0
  86         excb
  87         mt_fpcr $f3                     /* put back the FPCR value read at entry */
  88         _FTOIT  $f0, AT, 16             /* macro; presumably AT = integer quotient -- confirm */
  89         mulq    AT, Y, AT               /* AT = Q * Y */
  90         ldt     $f0, 0(sp)
  91         ldt     $f3, 48(sp)
  92         cfi_restore ($f1)               /* $f1 was reloaded before the branch to $x_big */
  93         cfi_remember_state              /* state that $x_big resumes from */
  94         cfi_restore ($f0)
  95         cfi_restore ($f3)
  96         cfi_def_cfa_offset (0)
  97         lda     sp, FRAME(sp)
  98         subq    X, AT, RV               /* remainder = X - Q*Y */
  99         ret     $31, (RA), 1
 100
 101         .align  4
 102         cfi_restore_state
 103 $x_big:
 104         /* If we get here, X is large enough that we don't expect exact
 105            results, and neither X nor Y got mis-translated for the fp
 106            division.  Our task is to take the fp result, figure out how
 107            far it's off from the correct result and compute a fixup.  */
 108         stq     t0, 16(sp)
 109         stq     t1, 24(sp)
 110         stq     t2, 32(sp)
 111         stq     t5, 40(sp)
 112         cfi_rel_offset (t0, 16)
 113         cfi_rel_offset (t1, 24)
 114         cfi_rel_offset (t2, 32)
 115         cfi_rel_offset (t5, 40)
 116
 117 #define Q       t0              /* quotient */
 118 #define R       RV              /* remainder */
 119 #define SY      t1              /* scaled Y */
 120 #define S       t2              /* scalar */
 121 #define QY      t3              /* Q*Y */
 122
 123         /* The fixup code below can only handle unsigned values.  */
 124         or      X, Y, AT                /* AT < 0 iff X or Y is negative */
 125         mov     $31, t5                 /* t5 = 0: no sign changes recorded */
 126         blt     AT, $fix_sign_in
 127 $fix_sign_in_ret1:
 128         cvttq/c $f0, $f0
 129
 130         _FTOIT  $f0, Q, 8
 131         .align  3
 132 $fix_sign_in_ret2:
 133         ldt     $f0, 0(sp)              /* done with the fp quotient; reload saved $f0 */
 134         stq     t3, 0(sp)               /* reuse the $f0 slot to save t3 */
 135         cfi_restore ($f0)
 136         cfi_rel_offset (t3, 0)
 137
 138         mulq    Q, Y, QY
 139         excb
 140         stq     t4, 8(sp)               /* reuse the $f1 slot to save t4 */
 141         mt_fpcr $f3                     /* put back the FPCR value read at entry */
 142         cfi_rel_offset (t4, 8)
 143
 144         subq    QY, X, R                /* R = Q*Y - X; positive means Q is too large */
 145         mov     Y, SY
 146         mov     1, S
 147         bgt     R, $q_high
 148
 149 $q_high_ret:
 150         subq    X, QY, R                /* R = X - Q*Y; positive means Q is too small */
 151         mov     Y, SY
 152         mov     1, S
 153         bgt     R, $q_low
 154
 155 $q_low_ret:
 156         ldq     t0, 16(sp)
 157         ldq     t1, 24(sp)
 158         ldq     t2, 32(sp)
 159         bne     t5, $fix_sign_out       /* nonzero t5: X and/or Y were negated on entry */
 160
 161 $fix_sign_out_ret:
 162         ldq     t3, 0(sp)
 163         ldq     t4, 8(sp)
 164         ldq     t5, 40(sp)
 165         ldt     $f3, 48(sp)
 166         lda     sp, FRAME(sp)
 167         cfi_remember_state              /* state that the out-of-line code below resumes from */
 168         cfi_restore (t0)
 169         cfi_restore (t1)
 170         cfi_restore (t2)
 171         cfi_restore (t3)
 172         cfi_restore (t4)
 173         cfi_restore (t5)
 174         cfi_restore ($f3)
 175         cfi_def_cfa_offset (0)
 176         ret     $31, (RA), 1
 177
 178         .align  4
 179         cfi_restore_state
 180         /* The quotient that we computed was too large.  We need to reduce
 181            it by S such that Y*S >= R.  Obviously the closer we get to the
 182            correct value the better, but overshooting high is ok, as we'll
 183            fix that up later.  */
 184 0:
 185         addq    SY, SY, SY              /* SY *= 2 */
 186         addq    S, S, S                 /* S *= 2, so SY stays equal to Y*S */
 187 $q_high:
 188         cmpult  SY, R, AT               /* unsigned: keep doubling while SY < R */
 189         bne     AT, 0b
 190
 191         subq    Q, S, Q
 192         unop
 193         subq    QY, SY, QY              /* keep QY equal to Q*Y */
 194         br      $q_high_ret
 195
 196         .align  4
 197         /* The quotient that we computed was too small.  Divide the
 198            current remainder (R) by Y and add that to the existing quotient
 199            (Q).  The expectation, of course, is that R is much smaller than X.  */
 200         /* Begin with a shift-up loop.  Compute S such that Y*S >= R.  We
 201            already have a copy of Y in SY and the value 1 in S.  */
 202 0:
 203         addq    SY, SY, SY
 204         addq    S, S, S
 205 $q_low:
 206         cmpult  SY, R, AT               /* unsigned: keep doubling while SY < R */
 207         bne     AT, 0b
 208
 209         /* Shift-down and subtract loop.  Each iteration compares our scaled
 210            Y (SY) with the remainder (R); if SY <= R then subtract SY from R
 211            and add Y's scalar (S) to the quotient (Q).  */
 212 2:      addq    Q, S, t3                /* t3 = candidate Q + S (t3 is also QY, not read again) */
 213         srl     S, 1, S
 214         cmpule  SY, R, AT
 215         subq    R, SY, t4               /* t4 = candidate R - SY */
 216
 217         cmovne  AT, t3, Q               /* commit both candidates only when SY <= R */
 218         cmovne  AT, t4, R
 219         srl     SY, 1, SY
 220         bne     S, 2b                   /* S becomes 0 once the S == 1 step is done */
 221
 222         br      $q_low_ret
 223
 224         .align  4
 225 $fix_sign_in:
 226         /* If we got here, then X|Y is negative.  Need to adjust everything
 227            such that we're doing unsigned division in the fixup loop.  */
 228         /* T5 records the changes we had to make:
 229                 bit 0:  set if X was negated.  Note that the sign of the
 230                         remainder follows the sign of the dividend (X).
 231                 bit 2:  set if Y was negated.
 232         */
 233         xor     X, Y, t1                /* t1 < 0 iff X and Y have different signs */
 234         cmplt   X, 0, t5
 235         negq    X, t0
 236         cmovne  t5, t0, X               /* X = -X when X was negative */
 237
 238         cmplt   Y, 0, AT
 239         negq    Y, t0
 240         s4addq  AT, t5, t5              /* t5 += 4*AT: record Y's negation in bit 2 */
 241         cmovne  AT, t0, Y               /* Y = -Y when Y was negative */
 242
 243         bge     t1, $fix_sign_in_ret1   /* same signs: the fp quotient needs no negation */
 244         cvttq/c $f0, $f0
 245         _FTOIT  $f0, Q, 8
 246         .align  3
 247         negq    Q, Q                    /* signs differed: negate Q to match the negated operand */
 248         br      $fix_sign_in_ret2
 249
 250         .align  4
 251 $fix_sign_out:
 252         /* Now we get to undo what we did above.  */
 253         /* ??? Is this really faster than just increasing the size of
 254            the stack frame and storing X and Y in memory?  */
 255         and     t5, 4, AT               /* bit 2: Y was negated */
 256         negq    Y, t4                   /* t4 is scratch; it is reloaded at $fix_sign_out_ret */
 257         cmovne  AT, t4, Y
 258
 259         negq    X, t4
 260         cmovlbs t5, t4, X               /* bit 0: X was negated */
 261         negq    RV, t4
 262         cmovlbs t5, t4, RV              /* negate the remainder when X was negative */
 263
 264         br      $fix_sign_out_ret
 265
 266         cfi_endproc
 267         .size   __remq, .-__remq
 268
 269         DO_DIVBYZERO                    /* macro defined outside this file; presumably emits DIVBYZERO */