compiler-rt/lib/builtins/hexagon/dfmul.S

   1 //===----------------------Hexagon builtin routine ------------------------===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8
   9 // Double Precision Multiply
  10 #define A r1:0
  11 #define AH r1
  12 #define AL r0
  13 #define B r3:2
  14 #define BH r3
  15 #define BL r2
  16
  17 #define BTMP r5:4
  18 #define BTMPH r5
  19 #define BTMPL r4
  20
  21 #define PP_ODD r7:6
  22 #define PP_ODD_H r7
  23 #define PP_ODD_L r6
  24
  25 #define ONE r9:8
  26 #define S_ONE r8
  27 #define S_ZERO r9
  28
  29 #define PP_HH r11:10
  30 #define PP_HH_H r11
  31 #define PP_HH_L r10
  32
  33 #define ATMP r13:12
  34 #define ATMPH r13
  35 #define ATMPL r12
  36
  37 #define PP_LL r15:14
  38 #define PP_LL_H r15
  39 #define PP_LL_L r14
  40
  41 #define TMP r28
  42
  43 #define MANTBITS 52
  44 #define HI_MANTBITS 20
  45 #define EXPBITS 11
  46 #define BIAS 1024
  47 #define MANTISSA_TO_INT_BIAS 52
  48
  49 // Some constant to adjust normalization amount in error code
  50 // Amount to right shift the partial product to get to a denorm
  51 #define FUDGE 5
  52
  53 #define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG
  54 #define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG
  55 #define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG
  56 #define END(TAG) .size TAG,.-TAG
  57
  58 #define SR_ROUND_OFF 22
  59         .text
  60         .global __hexagon_muldf3
  61         .type __hexagon_muldf3,@function
  62         Q6_ALIAS(muldf3)
  63   FAST_ALIAS(muldf3)
  64   FAST2_ALIAS(muldf3)
  65         .p2align 5
  66 __hexagon_muldf3:
  67         {
  68                 p0 = dfclass(A,#2)
  69                 p0 = dfclass(B,#2)
  70                 ATMP = combine(##0x40000000,#0)
  71         }
  72         {
  73                 ATMP = insert(A,#MANTBITS,#EXPBITS-1)
  74                 BTMP = asl(B,#EXPBITS-1)
  75                 TMP = #-BIAS
  76                 ONE = #1
  77         }
  78         {
  79                 PP_ODD = mpyu(BTMPL,ATMPH)
  80                 BTMP = insert(ONE,#2,#62)
  81         }
  82         // since we know that the MSB of the H registers is zero, we should never carry
  83         // H <= 2^31-1.  L <= 2^32-1.  Therefore, HL <= 2^63-2^32-2^31+1
  84         // Adding 2 HLs, we get 2^64-3*2^32+2 maximum.
  85         // Therefore, we can add 3 2^32-1 values safely without carry.  We only need one.
  86         {
  87                 PP_LL = mpyu(ATMPL,BTMPL)
  88                 PP_ODD += mpyu(ATMPL,BTMPH)
  89         }
  90         {
  91                 PP_ODD += lsr(PP_LL,#32)
  92                 PP_HH = mpyu(ATMPH,BTMPH)
  93                 BTMP = combine(##BIAS+BIAS-4,#0)
  94         }
  95         {
  96                 PP_HH += lsr(PP_ODD,#32)
  97                 if (!p0) jump .Lmul_abnormal
  98                 p1 = cmp.eq(PP_LL_L,#0)         // 64 lsb's 0?
  99                 p1 = cmp.eq(PP_ODD_L,#0)        // 64 lsb's 0?
 100         }
 101
 102         // PP_HH can have a maximum of 0x3FFF_FFFF_FFFF_FFFF or thereabouts
 103         // PP_HH can have a minimum of 0x1000_0000_0000_0000 or so
 104
 105 #undef PP_ODD
 106 #undef PP_ODD_H
 107 #undef PP_ODD_L
 108 #define EXP10 r7:6
 109 #define EXP1 r7
 110 #define EXP0 r6
 111         {
 112                 if (!p1) PP_HH_L = or(PP_HH_L,S_ONE)
 113                 EXP0 = extractu(AH,#EXPBITS,#HI_MANTBITS)
 114                 EXP1 = extractu(BH,#EXPBITS,#HI_MANTBITS)
 115         }
 116         {
 117                 PP_LL = neg(PP_HH)
 118                 EXP0 += add(TMP,EXP1)
 119                 TMP = xor(AH,BH)
 120         }
 121         {
 122                 if (!p2.new) PP_HH = PP_LL
 123                 p2 = cmp.gt(TMP,#-1)
 124                 p0 = !cmp.gt(EXP0,BTMPH)
 125                 p0 = cmp.gt(EXP0,BTMPL)
 126                 if (!p0.new) jump:nt .Lmul_ovf_unf
 127         }
 128         {
 129                 A = convert_d2df(PP_HH)
 130                 EXP0 = add(EXP0,#-BIAS-58)
 131         }
 132         {
 133                 AH += asl(EXP0,#HI_MANTBITS)
 134                 jumpr r31
 135         }
 136
 137         .falign
 138 .Lpossible_unf:
 139         // We end up with a positive exponent
 140         // But we may have rounded up to an exponent of 1.
 141         // If the exponent is 1, if we rounded up to it
 142         // we need to also raise underflow
 143         // Fortunately, this is pretty easy to detect, we must have +/- 0x0010_0000_0000_0000
 144         // And the PP should also have more than one bit set
 145         //
 146         // Note: ATMP should have abs(PP_HH)
 147         // Note: BTMPL should have 0x7FEFFFFF
 148         {
 149                 p0 = cmp.eq(AL,#0)
 150                 p0 = bitsclr(AH,BTMPL)
 151                 if (!p0.new) jumpr:t r31
 152                 BTMPH = #0x7fff
 153         }
 154         {
 155                 p0 = bitsset(ATMPH,BTMPH)
 156                 BTMPL = USR
 157                 BTMPH = #0x030
 158         }
 159         {
 160                 if (p0) BTMPL = or(BTMPL,BTMPH)
 161         }
 162         {
 163                 USR = BTMPL
 164         }
 165         {
 166                 p0 = dfcmp.eq(A,A)
 167                 jumpr r31
 168         }
 169         .falign
 170 .Lmul_ovf_unf:
 171         {
 172                 A = convert_d2df(PP_HH)
 173                 ATMP = abs(PP_HH)                       // take absolute value
 174                 EXP1 = add(EXP0,#-BIAS-58)
 175         }
 176         {
 177                 AH += asl(EXP1,#HI_MANTBITS)
 178                 EXP1 = extractu(AH,#EXPBITS,#HI_MANTBITS)
 179                 BTMPL = ##0x7FEFFFFF
 180         }
 181         {
 182                 EXP1 += add(EXP0,##-BIAS-58)
 183                 //BTMPH = add(clb(ATMP),#-2)
 184                 BTMPH = #0
 185         }
 186         {
 187                 p0 = cmp.gt(EXP1,##BIAS+BIAS-2) // overflow
 188                 if (p0.new) jump:nt .Lmul_ovf
 189         }
 190         {
 191                 p0 = cmp.gt(EXP1,#0)
 192                 if (p0.new) jump:nt .Lpossible_unf
 193                 BTMPH = sub(EXP0,BTMPH)
 194                 TMP = #63                               // max amount to shift
 195         }
 196         // Underflow
 197         //
 198         // PP_HH has the partial product with sticky LSB.
 199         // PP_HH can have a maximum of 0x3FFF_FFFF_FFFF_FFFF or thereabouts
 200         // PP_HH can have a minimum of 0x1000_0000_0000_0000 or so
 201         // The exponent of PP_HH is in  EXP1, which is non-positive (0 or negative)
 202         // That's the exponent that happens after the normalization
 203         //
 204         // EXP0 has the exponent that, when added to the normalized value, is out of range.
 205         //
 206         // Strategy:
 207         //
 208         // * Shift down bits, with sticky bit, such that the bits are aligned according
 209         //   to the LZ count and appropriate exponent, but not all the way to mantissa
 210         //   field, keep around the last few bits.
 211         // * Put a 1 near the MSB
 212         // * Check the LSBs for inexact; if inexact also set underflow
 213         // * Convert [u]d2df -- will correctly round according to rounding mode
 214         // * Replace exponent field with zero
 215
 216         {
 217                 BTMPL = #0                              // offset for extract
 218                 BTMPH = sub(#FUDGE,BTMPH)               // amount to right shift
 219         }
 220         {
 221                 p3 = cmp.gt(PP_HH_H,#-1)                // is it positive?
 222                 BTMPH = min(BTMPH,TMP)                  // Don't shift more than 63
 223                 PP_HH = ATMP
 224         }
 225         {
 226                 TMP = USR
 227                 PP_LL = extractu(PP_HH,BTMP)
 228         }
 229         {
 230                 PP_HH = asr(PP_HH,BTMPH)
 231                 BTMPL = #0x0030                                 // underflow flag
 232                 AH = insert(S_ZERO,#EXPBITS,#HI_MANTBITS)
 233         }
 234         {
 235                 p0 = cmp.gtu(ONE,PP_LL)                         // Did we extract all zeros?
 236                 if (!p0.new) PP_HH_L = or(PP_HH_L,S_ONE)        // add sticky bit
 237                 PP_HH_H = setbit(PP_HH_H,#HI_MANTBITS+3)        // Add back in a bit so we can use convert instruction
 238         }
 239         {
 240                 PP_LL = neg(PP_HH)
 241                 p1 = bitsclr(PP_HH_L,#0x7)              // Are the LSB's clear?
 242                 if (!p1.new) TMP = or(BTMPL,TMP)        // If not, Inexact+Underflow
 243         }
 244         {
 245                 if (!p3) PP_HH = PP_LL
 246                 USR = TMP
 247         }
 248         {
 249                 A = convert_d2df(PP_HH)                 // Do rounding
 250                 p0 = dfcmp.eq(A,A)                      // realize exception
 251         }
 252         {
 253                 AH = insert(S_ZERO,#EXPBITS-1,#HI_MANTBITS+1)           // Insert correct exponent
 254                 jumpr r31
 255         }
 256         .falign
 257 .Lmul_ovf:
 258         // We get either max finite value or infinity.  Either way, overflow+inexact
 259         {
 260                 TMP = USR
 261                 ATMP = combine(##0x7fefffff,#-1)        // positive max finite
 262                 A = PP_HH
 263         }
 264         {
 265                 PP_LL_L = extractu(TMP,#2,#SR_ROUND_OFF)        // rounding bits
 266                 TMP = or(TMP,#0x28)                     // inexact + overflow
 267                 BTMP = combine(##0x7ff00000,#0)         // positive infinity
 268         }
 269         {
 270                 USR = TMP
 271                 PP_LL_L ^= lsr(AH,#31)                  // Does sign match rounding?
 272                 TMP = PP_LL_L                           // unmodified rounding mode
 273         }
 274         {
 275                 p0 = !cmp.eq(TMP,#1)                    // If not round-to-zero and
 276                 p0 = !cmp.eq(PP_LL_L,#2)                // Not rounding the other way,
 277                 if (p0.new) ATMP = BTMP                 // we should get infinity
 278                 p0 = dfcmp.eq(A,A)                      // Realize FP exception if enabled
 279         }
 280         {
 281                 A = insert(ATMP,#63,#0)                 // insert inf/maxfinite, leave sign
 282                 jumpr r31
 283         }
 284
 285 .Lmul_abnormal:
 286         {
 287                 ATMP = extractu(A,#63,#0)               // strip off sign
 288                 BTMP = extractu(B,#63,#0)               // strip off sign
 289         }
 290         {
 291                 p3 = cmp.gtu(ATMP,BTMP)
 292                 if (!p3.new) A = B                      // sort values
 293                 if (!p3.new) B = A                      // sort values
 294         }
 295         {
 296                 // Any NaN --> NaN, possibly raise invalid if sNaN
 297                 p0 = dfclass(A,#0x0f)           // A not NaN?
 298                 if (!p0.new) jump:nt .Linvalid_nan
 299                 if (!p3) ATMP = BTMP
 300                 if (!p3) BTMP = ATMP
 301         }
 302         {
 303                 // Infinity * nonzero number is infinity
 304                 p1 = dfclass(A,#0x08)           // A is infinity
 305                 p1 = dfclass(B,#0x0e)           // B is nonzero
 306         }
 307         {
 308                 // Infinity * zero --> NaN, raise invalid
 309                 // Other zeros return zero
 310                 p0 = dfclass(A,#0x08)           // A is infinity
 311                 p0 = dfclass(B,#0x01)           // B is zero
 312         }
 313         {
 314                 if (p1) jump .Ltrue_inf
 315                 p2 = dfclass(B,#0x01)
 316         }
 317         {
 318                 if (p0) jump .Linvalid_zeroinf
 319                 if (p2) jump .Ltrue_zero                // so return zero
 320                 TMP = ##0x7c000000
 321         }
 322         // We are left with a normal or subnormal times a subnormal. A > B
 323         // If A and B are both very small (exp(a) < BIAS-MANTBITS),
 324         // we go to a single sticky bit, which we can round easily.
 325         // If A and B might multiply to something bigger, decrease A exponent and increase
 326         // B exponent and try again
 327         {
 328                 p0 = bitsclr(AH,TMP)
 329                 if (p0.new) jump:nt .Lmul_tiny
 330         }
 331         {
 332                 TMP = cl0(BTMP)
 333         }
 334         {
 335                 TMP = add(TMP,#-EXPBITS)
 336         }
 337         {
 338                 BTMP = asl(BTMP,TMP)
 339         }
 340         {
 341                 B = insert(BTMP,#63,#0)
 342                 AH -= asl(TMP,#HI_MANTBITS)
 343         }
 344         jump __hexagon_muldf3
 345 .Lmul_tiny:
 346         {
 347                 TMP = USR
 348                 A = xor(A,B)                            // get sign bit
 349         }
 350         {
 351                 TMP = or(TMP,#0x30)                     // Inexact + Underflow
 352                 A = insert(ONE,#63,#0)                  // put in rounded up value
 353                 BTMPH = extractu(TMP,#2,#SR_ROUND_OFF)  // get rounding mode
 354         }
 355         {
 356                 USR = TMP
 357                 p0 = cmp.gt(BTMPH,#1)                   // Round towards pos/neg inf?
 358                 if (!p0.new) AL = #0                    // If not, zero
 359                 BTMPH ^= lsr(AH,#31)                    // rounding my way --> set LSB
 360         }
 361         {
 362                 p0 = cmp.eq(BTMPH,#3)                   // if rounding towards right inf
 363                 if (!p0.new) AL = #0                    // don't go to zero
 364                 jumpr r31
 365         }
 366 .Linvalid_zeroinf:
 367         {
 368                 TMP = USR
 369         }
 370         {
 371                 A = #-1
 372                 TMP = or(TMP,#2)
 373         }
 374         {
 375                 USR = TMP
 376         }
 377         {
 378                 p0 = dfcmp.uo(A,A)                      // force exception if enabled
 379                 jumpr r31
 380         }
 381 .Linvalid_nan:
 382         {
 383                 p0 = dfclass(B,#0x0f)                   // if B is not NaN
 384                 TMP = convert_df2sf(A)                  // will generate invalid if sNaN
 385                 if (p0.new) B = A                       // make it whatever A is
 386         }
 387         {
 388                 BL = convert_df2sf(B)                   // will generate invalid if sNaN
 389                 A = #-1
 390                 jumpr r31
 391         }
 392         .falign
 393 .Ltrue_zero:
 394         {
 395                 A = B
 396                 B = A
 397         }
 398 .Ltrue_inf:
 399         {
 400                 BH = extract(BH,#1,#31)
 401         }
 402         {
 403                 AH ^= asl(BH,#31)
 404                 jumpr r31
 405         }
 406 END(__hexagon_muldf3)
 407
 408 #undef ATMP
 409 #undef ATMPL
 410 #undef ATMPH
 411 #undef BTMP
 412 #undef BTMPL
 413 #undef BTMPH