1 //===----------------------Hexagon builtin routine ------------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // Double Precision Multiply
44 #define HI_MANTBITS 20
47 #define MANTISSA_TO_INT_BIAS 52
49 // Some constant to adjust normalization amount in error code
50 // Amount to right shift the partial product to get to a denorm
53 #define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG
54 #define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG
55 #define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG
56 #define END(TAG) .size TAG,.-TAG
58 #define SR_ROUND_OFF 22
60 .global __hexagon_muldf3
61 .type __hexagon_muldf3,@function
70 ATMP = combine(##0x40000000,#0)
73 ATMP = insert(A,#MANTBITS,#EXPBITS-1)
74 BTMP = asl(B,#EXPBITS-1)
79 PP_ODD = mpyu(BTMPL,ATMPH)
80 BTMP = insert(ONE,#2,#62)
82 // since we know that the MSB of the H registers is zero, we should never carry
83 // H <= 2^31-1. L <= 2^32-1. Therefore, HL <= 2^63-2^32-2^31+1
84 // Adding 2 HLs, we get 2^64-3*2^32+2 maximum.
85 // Therefore, we can add 3 2^32-1 values safely without carry. We only need one.
87 PP_LL = mpyu(ATMPL,BTMPL)
88 PP_ODD += mpyu(ATMPL,BTMPH)
91 PP_ODD += lsr(PP_LL,#32)
92 PP_HH = mpyu(ATMPH,BTMPH)
93 BTMP = combine(##BIAS+BIAS-4,#0)
96 PP_HH += lsr(PP_ODD,#32)
97 if (!p0) jump .Lmul_abnormal
98 p1 = cmp.eq(PP_LL_L,#0) // 64 lsb's 0?
99 p1 = cmp.eq(PP_ODD_L,#0) // 64 lsb's 0?
102 // PP_HH can have a maximum of 0x3FFF_FFFF_FFFF_FFFF or thereabouts
103 // PP_HH can have a minimum of 0x1000_0000_0000_0000 or so
112 if (!p1) PP_HH_L = or(PP_HH_L,S_ONE)
113 EXP0 = extractu(AH,#EXPBITS,#HI_MANTBITS)
114 EXP1 = extractu(BH,#EXPBITS,#HI_MANTBITS)
118 EXP0 += add(TMP,EXP1)
122 if (!p2.new) PP_HH = PP_LL
124 p0 = !cmp.gt(EXP0,BTMPH)
125 p0 = cmp.gt(EXP0,BTMPL)
126 if (!p0.new) jump:nt .Lmul_ovf_unf
129 A = convert_d2df(PP_HH)
130 EXP0 = add(EXP0,#-BIAS-58)
133 AH += asl(EXP0,#HI_MANTBITS)
139 // We end up with a positive exponent
140 // But we may have rounded up to an exponent of 1.
141 // If the exponent is 1, if we rounded up to it
142 // we need to also raise underflow
143 // Fortunately, this is pretty easy to detect, we must have +/- 0x0010_0000_0000_0000
144 // And the PP should also have more than one bit set
146 // Note: ATMP should have abs(PP_HH)
147 // Note: BTMPL should have 0x7FEFFFFF
150 p0 = bitsclr(AH,BTMPL)
151 if (!p0.new) jumpr:t r31
155 p0 = bitsset(ATMPH,BTMPH)
160 if (p0) BTMPL = or(BTMPL,BTMPH)
172 A = convert_d2df(PP_HH)
173 ATMP = abs(PP_HH) // take absolute value
174 EXP1 = add(EXP0,#-BIAS-58)
177 AH += asl(EXP1,#HI_MANTBITS)
178 EXP1 = extractu(AH,#EXPBITS,#HI_MANTBITS)
182 EXP1 += add(EXP0,##-BIAS-58)
183 //BTMPH = add(clb(ATMP),#-2)
187 p0 = cmp.gt(EXP1,##BIAS+BIAS-2) // overflow
188 if (p0.new) jump:nt .Lmul_ovf
192 if (p0.new) jump:nt .Lpossible_unf
193 BTMPH = sub(EXP0,BTMPH)
194 TMP = #63 // max amount to shift
198 // PP_HH has the partial product with sticky LSB.
199 // PP_HH can have a maximum of 0x3FFF_FFFF_FFFF_FFFF or thereabouts
200 // PP_HH can have a minimum of 0x1000_0000_0000_0000 or so
201 // The exponent of PP_HH is in EXP1, which is non-positive (0 or negative)
202 // That's the exponent that happens after the normalization
204 // EXP0 has the exponent that, when added to the normalized value, is out of range.
208 // * Shift down bits, with sticky bit, such that the bits are aligned according
209 // to the LZ count and appropriate exponent, but not all the way to mantissa
210 // field, keep around the last few bits.
211 // * Put a 1 near the MSB
212 // * Check the LSBs for inexact; if inexact also set underflow
213 // * Convert [u]d2df -- will correctly round according to rounding mode
214 // * Replace exponent field with zero
217 BTMPL = #0 // offset for extract
218 BTMPH = sub(#FUDGE,BTMPH) // amount to right shift
221 p3 = cmp.gt(PP_HH_H,#-1) // is it positive?
222 BTMPH = min(BTMPH,TMP) // Don't shift more than 63
227 PP_LL = extractu(PP_HH,BTMP)
230 PP_HH = asr(PP_HH,BTMPH)
231 BTMPL = #0x0030 // underflow flag
232 AH = insert(S_ZERO,#EXPBITS,#HI_MANTBITS)
235 p0 = cmp.gtu(ONE,PP_LL) // Did we extract all zeros?
236 if (!p0.new) PP_HH_L = or(PP_HH_L,S_ONE) // add sticky bit
237 PP_HH_H = setbit(PP_HH_H,#HI_MANTBITS+3) // Add back in a bit so we can use convert instruction
241 p1 = bitsclr(PP_HH_L,#0x7) // Are the LSB's clear?
242 if (!p1.new) TMP = or(BTMPL,TMP) // If not, Inexact+Underflow
245 if (!p3) PP_HH = PP_LL
249 A = convert_d2df(PP_HH) // Do rounding
250 p0 = dfcmp.eq(A,A) // realize exception
253 AH = insert(S_ZERO,#EXPBITS-1,#HI_MANTBITS+1) // Insert correct exponent
258 // We get either max finite value or infinity. Either way, overflow+inexact
261 ATMP = combine(##0x7fefffff,#-1) // positive max finite
265 PP_LL_L = extractu(TMP,#2,#SR_ROUND_OFF) // rounding bits
266 TMP = or(TMP,#0x28) // inexact + overflow
267 BTMP = combine(##0x7ff00000,#0) // positive infinity
271 PP_LL_L ^= lsr(AH,#31) // Does sign match rounding?
272 TMP = PP_LL_L // unmodified rounding mode
275 p0 = !cmp.eq(TMP,#1) // If not round-to-zero and
276 p0 = !cmp.eq(PP_LL_L,#2) // Not rounding the other way,
277 if (p0.new) ATMP = BTMP // we should get infinity
278 p0 = dfcmp.eq(A,A) // Realize FP exception if enabled
281 A = insert(ATMP,#63,#0) // insert inf/maxfinite, leave sign
287 ATMP = extractu(A,#63,#0) // strip off sign
288 BTMP = extractu(B,#63,#0) // strip off sign
291 p3 = cmp.gtu(ATMP,BTMP)
292 if (!p3.new) A = B // sort values
293 if (!p3.new) B = A // sort values
296 // Any NaN --> NaN, possibly raise invalid if sNaN
297 p0 = dfclass(A,#0x0f) // A not NaN?
298 if (!p0.new) jump:nt .Linvalid_nan
303 // Infinity * nonzero number is infinity
304 p1 = dfclass(A,#0x08) // A is infinity
305 p1 = dfclass(B,#0x0e) // B is nonzero
308 // Infinity * zero --> NaN, raise invalid
309 // Other zeros return zero
310 p0 = dfclass(A,#0x08) // A is infinity
311 p0 = dfclass(B,#0x01) // B is zero
314 if (p1) jump .Ltrue_inf
315 p2 = dfclass(B,#0x01)
318 if (p0) jump .Linvalid_zeroinf
319 if (p2) jump .Ltrue_zero // so return zero
322 // We are left with a normal or subnormal times a subnormal. A > B
323 // If A and B are both very small (exp(a) < BIAS-MANTBITS),
324 // we go to a single sticky bit, which we can round easily.
325 // If A and B might multiply to something bigger, decrease A exponent and increase
326 // B exponent and try again
329 if (p0.new) jump:nt .Lmul_tiny
335 TMP = add(TMP,#-EXPBITS)
341 B = insert(BTMP,#63,#0)
342 AH -= asl(TMP,#HI_MANTBITS)
344 jump __hexagon_muldf3
348 A = xor(A,B) // get sign bit
351 TMP = or(TMP,#0x30) // Inexact + Underflow
352 A = insert(ONE,#63,#0) // put in rounded up value
353 BTMPH = extractu(TMP,#2,#SR_ROUND_OFF) // get rounding mode
357 p0 = cmp.gt(BTMPH,#1) // Round towards pos/neg inf?
358 if (!p0.new) AL = #0 // If not, zero
359 BTMPH ^= lsr(AH,#31) // rounding my way --> set LSB
362 p0 = cmp.eq(BTMPH,#3) // if rounding towards right inf
363 if (!p0.new) AL = #0 // don't go to zero
378 p0 = dfcmp.uo(A,A) // force exception if enabled
383 p0 = dfclass(B,#0x0f) // if B is not NaN
384 TMP = convert_df2sf(A) // will generate invalid if sNaN
385 if (p0.new) B = A // make it whatever A is
388 BL = convert_df2sf(B) // will generate invalid if sNaN
400 BH = extract(BH,#1,#31)
406 END(__hexagon_muldf3)