Bug 497723 - forgot to restore callgrind output cleanup
[valgrind.git] / VEX / priv / host_generic_maddf.c
blob7855068c64901c9405afc04388f93ec7c1ccb459
/*---------------------------------------------------------------*/
/*--- begin                               host_generic_maddf.c ---*/
/*---------------------------------------------------------------*/
/*
   Compute x * y + z as ternary operation.
   Copyright (C) 2010-2017 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Jakub Jelinek <jakub@redhat.com>, 2010.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.
*/
/* Generic helper functions for doing FMA, i.e. compute x * y + z
   as ternary operation.
   These are purely back-end entities and cannot be seen/referenced
   from IR. */
32 #include "libvex_basictypes.h"
33 #include "host_generic_maddf.h"
34 #include "main_util.h"
/* This implementation relies on Double being more than twice as
   precise as Float and uses rounding to odd in order to avoid problems
   with double rounding.
   See a paper by Boldo and Melquiond:
   http://www.lri.fr/~melquion/doc/08-tc.pdf  */
/* Compiler barrier for a floating-point value: the empty asm takes X as
   a memory input, so the compiler must have computed and stored X before
   this point and cannot move the computation past later volatile asm
   statements (such as the MXCSR reads below). Emits no instructions. */
#define FORCE_EVAL(X) __asm __volatile__ ("" : : "m" (X))
#if defined(__x86_64__) && defined(__SSE2_MATH__)
/* Type used to hold a saved copy of the SSE control/status register
   (MXCSR).  Its presence also selects the exact FMA emulation below;
   when it is not defined the functions use a plain x * y + z. */
# define ENV_TYPE unsigned int

/* Save current rounding mode into ENV, hold exceptions, set rounding
   mode to rounding toward zero.
   OR-ing in 0x7f80 sets MXCSR bits 7..14 (the exception mask bits and
   both rounding-control bits); clearing 0x3f drops the six sticky
   exception flags so a later TESTINEXACT only sees new events. */
# define ROUNDTOZERO(env)                                   \
   do {                                                     \
      unsigned int mxcsr;                                   \
      __asm __volatile__ ("stmxcsr %0" : "=m" (mxcsr));     \
      (env) = mxcsr;                                        \
      mxcsr = (mxcsr | 0x7f80) & ~0x3f;                     \
      __asm __volatile__ ("ldmxcsr %0" : : "m" (mxcsr));    \
   } while (0)

/* Restore exceptions from ENV, return if inexact exception has been raised
   since ROUNDTOZERO.
   The value of the expression is bit 5 of the current MXCSR (0 or 1).
   The register is then reloaded with ENV, OR-ed with the flag bits
   selected by 0x3d that were raised in the meantime. */
# define RESET_TESTINEXACT(env)                             \
   ({                                                       \
      unsigned int mxcsr, ret;                              \
      __asm __volatile__ ("stmxcsr %0" : "=m" (mxcsr));     \
      ret = (mxcsr >> 5) & 1;                               \
      mxcsr = (mxcsr & 0x3d) | (env);                       \
      __asm __volatile__ ("ldmxcsr %0" : : "m" (mxcsr));    \
      ret;                                                  \
   })

/* Return if inexact exception has been raised since ROUNDTOZERO.
   Reads MXCSR without modifying it; the value is bit 5 (0 or 1). */
# define TESTINEXACT()                                      \
   ({                                                       \
      unsigned int mxcsr;                                   \
      __asm __volatile__ ("stmxcsr %0" : "=m" (mxcsr));     \
      (mxcsr >> 5) & 1;                                     \
   })
#endif
/* Number of significand bits of an IEEE 754 double, counting the
   implicit leading bit.  Used below as the scaling step for exponents. */
#define DBL_MANT_DIG 53
/* Bias added to the exponent in the IEEE 754 double encoding. */
#define IEEE754_DOUBLE_BIAS 0x3ff
79 union vg_ieee754_double {
80 Double d;
82 /* This is the IEEE 754 double-precision format. */
83 struct {
84 #ifdef VKI_BIG_ENDIAN
85 unsigned int negative:1;
86 unsigned int exponent:11;
87 unsigned int mantissa0:20;
88 unsigned int mantissa1:32;
89 #else
90 unsigned int mantissa1:32;
91 unsigned int mantissa0:20;
92 unsigned int exponent:11;
93 unsigned int negative:1;
94 #endif
95 } ieee;
98 void VEX_REGPARM(3)
99 h_generic_calc_MAddF32 ( /*OUT*/Float* res,
100 Float* argX, Float* argY, Float* argZ )
102 #ifndef ENV_TYPE
103 /* Lame fallback implementation. */
104 *res = *argX * *argY + *argZ;
105 #else
106 ENV_TYPE env;
107 /* Multiplication is always exact. */
108 Double temp = (Double) *argX * (Double) *argY;
109 union vg_ieee754_double u;
111 ROUNDTOZERO (env);
113 /* Perform addition with round to odd. */
114 u.d = temp + (Double) *argZ;
115 /* Ensure the addition is not scheduled after fetestexcept call. */
116 FORCE_EVAL (u.d);
118 /* Reset rounding mode and test for inexact simultaneously. */
119 int j = RESET_TESTINEXACT (env);
121 if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7ff)
122 u.ieee.mantissa1 |= j;
124 /* And finally truncation with round to nearest. */
125 *res = (Float) u.d;
126 #endif
130 void VEX_REGPARM(3)
131 h_generic_calc_MAddF64 ( /*OUT*/Double* res,
132 Double* argX, Double* argY, Double* argZ )
134 #ifndef ENV_TYPE
135 /* Lame fallback implementation. */
136 *res = *argX * *argY + *argZ;
137 #else
138 Double x = *argX, y = *argY, z = *argZ;
139 union vg_ieee754_double u, v, w;
140 int adjust = 0;
141 u.d = x;
142 v.d = y;
143 w.d = z;
144 if (UNLIKELY (u.ieee.exponent + v.ieee.exponent
145 >= 0x7ff + IEEE754_DOUBLE_BIAS - DBL_MANT_DIG)
146 || UNLIKELY (u.ieee.exponent >= 0x7ff - DBL_MANT_DIG)
147 || UNLIKELY (v.ieee.exponent >= 0x7ff - DBL_MANT_DIG)
148 || UNLIKELY (w.ieee.exponent >= 0x7ff - DBL_MANT_DIG)
149 || UNLIKELY (u.ieee.exponent + v.ieee.exponent
150 <= IEEE754_DOUBLE_BIAS + DBL_MANT_DIG)) {
151 /* If z is Inf, but x and y are finite, the result should be
152 z rather than NaN. */
153 if (w.ieee.exponent == 0x7ff
154 && u.ieee.exponent != 0x7ff
155 && v.ieee.exponent != 0x7ff) {
156 *res = (z + x) + y;
157 return;
159 /* If x or y or z is Inf/NaN, or if fma will certainly overflow,
160 or if x * y is less than half of DBL_DENORM_MIN,
161 compute as x * y + z. */
162 if (u.ieee.exponent == 0x7ff
163 || v.ieee.exponent == 0x7ff
164 || w.ieee.exponent == 0x7ff
165 || u.ieee.exponent + v.ieee.exponent > 0x7ff + IEEE754_DOUBLE_BIAS
166 || u.ieee.exponent + v.ieee.exponent
167 < IEEE754_DOUBLE_BIAS - DBL_MANT_DIG - 2) {
168 *res = x * y + z;
169 return;
171 if (u.ieee.exponent + v.ieee.exponent
172 >= 0x7ff + IEEE754_DOUBLE_BIAS - DBL_MANT_DIG) {
173 /* Compute 1p-53 times smaller result and multiply
174 at the end. */
175 if (u.ieee.exponent > v.ieee.exponent)
176 u.ieee.exponent -= DBL_MANT_DIG;
177 else
178 v.ieee.exponent -= DBL_MANT_DIG;
179 /* If x + y exponent is very large and z exponent is very small,
180 it doesn't matter if we don't adjust it. */
181 if (w.ieee.exponent > DBL_MANT_DIG)
182 w.ieee.exponent -= DBL_MANT_DIG;
183 adjust = 1;
184 } else if (w.ieee.exponent >= 0x7ff - DBL_MANT_DIG) {
185 /* Similarly.
186 If z exponent is very large and x and y exponents are
187 very small, it doesn't matter if we don't adjust it. */
188 if (u.ieee.exponent > v.ieee.exponent) {
189 if (u.ieee.exponent > DBL_MANT_DIG)
190 u.ieee.exponent -= DBL_MANT_DIG;
191 } else if (v.ieee.exponent > DBL_MANT_DIG)
192 v.ieee.exponent -= DBL_MANT_DIG;
193 w.ieee.exponent -= DBL_MANT_DIG;
194 adjust = 1;
195 } else if (u.ieee.exponent >= 0x7ff - DBL_MANT_DIG) {
196 u.ieee.exponent -= DBL_MANT_DIG;
197 if (v.ieee.exponent)
198 v.ieee.exponent += DBL_MANT_DIG;
199 else
200 v.d *= 0x1p53;
201 } else if (v.ieee.exponent >= 0x7ff - DBL_MANT_DIG) {
202 v.ieee.exponent -= DBL_MANT_DIG;
203 if (u.ieee.exponent)
204 u.ieee.exponent += DBL_MANT_DIG;
205 else
206 u.d *= 0x1p53;
207 } else /* if (u.ieee.exponent + v.ieee.exponent
208 <= IEEE754_DOUBLE_BIAS + DBL_MANT_DIG) */ {
209 if (u.ieee.exponent > v.ieee.exponent)
210 u.ieee.exponent += 2 * DBL_MANT_DIG;
211 else
212 v.ieee.exponent += 2 * DBL_MANT_DIG;
213 if (w.ieee.exponent <= 4 * DBL_MANT_DIG + 4) {
214 if (w.ieee.exponent)
215 w.ieee.exponent += 2 * DBL_MANT_DIG;
216 else
217 w.d *= 0x1p106;
218 adjust = -1;
220 /* Otherwise x * y should just affect inexact
221 and nothing else. */
223 x = u.d;
224 y = v.d;
225 z = w.d;
227 /* Multiplication m1 + m2 = x * y using Dekker's algorithm. */
228 # define C ((1 << (DBL_MANT_DIG + 1) / 2) + 1)
229 Double x1 = x * C;
230 Double y1 = y * C;
231 Double m1 = x * y;
232 x1 = (x - x1) + x1;
233 y1 = (y - y1) + y1;
234 Double x2 = x - x1;
235 Double y2 = y - y1;
236 Double m2 = (((x1 * y1 - m1) + x1 * y2) + x2 * y1) + x2 * y2;
237 # undef C
239 /* Addition a1 + a2 = z + m1 using Knuth's algorithm. */
240 Double a1 = z + m1;
241 Double t1 = a1 - z;
242 Double t2 = a1 - t1;
243 t1 = m1 - t1;
244 t2 = z - t2;
245 Double a2 = t1 + t2;
247 ENV_TYPE env;
248 ROUNDTOZERO (env);
250 /* Perform m2 + a2 addition with round to odd. */
251 u.d = a2 + m2;
253 if (UNLIKELY (adjust < 0)) {
254 if ((u.ieee.mantissa1 & 1) == 0)
255 u.ieee.mantissa1 |= TESTINEXACT ();
256 v.d = a1 + u.d;
257 /* Ensure the addition is not scheduled after fetestexcept call. */
258 FORCE_EVAL (v.d);
261 /* Reset rounding mode and test for inexact simultaneously. */
262 int j = RESET_TESTINEXACT (env) != 0;
264 if (LIKELY (adjust == 0)) {
265 if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7ff)
266 u.ieee.mantissa1 |= j;
267 /* Result is a1 + u.d. */
268 *res = a1 + u.d;
269 } else if (LIKELY (adjust > 0)) {
270 if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7ff)
271 u.ieee.mantissa1 |= j;
272 /* Result is a1 + u.d, scaled up. */
273 *res = (a1 + u.d) * 0x1p53;
274 } else {
275 /* If a1 + u.d is exact, the only rounding happens during
276 scaling down. */
277 if (j == 0) {
278 *res = v.d * 0x1p-106;
279 return;
281 /* If result rounded to zero is not subnormal, no double
282 rounding will occur. */
283 if (v.ieee.exponent > 106) {
284 *res = (a1 + u.d) * 0x1p-106;
285 return;
287 /* If v.d * 0x1p-106 with round to zero is a subnormal above
288 or equal to DBL_MIN / 2, then v.d * 0x1p-106 shifts mantissa
289 down just by 1 bit, which means v.ieee.mantissa1 |= j would
290 change the round bit, not sticky or guard bit.
291 v.d * 0x1p-106 never normalizes by shifting up,
292 so round bit plus sticky bit should be already enough
293 for proper rounding. */
294 if (v.ieee.exponent == 106) {
295 /* v.ieee.mantissa1 & 2 is LSB bit of the result before rounding,
296 v.ieee.mantissa1 & 1 is the round bit and j is our sticky
297 bit. In round-to-nearest 001 rounds down like 00,
298 011 rounds up, even though 01 rounds down (thus we need
299 to adjust), 101 rounds down like 10 and 111 rounds up
300 like 11. */
301 if ((v.ieee.mantissa1 & 3) == 1) {
302 v.d *= 0x1p-106;
303 if (v.ieee.negative)
304 *res = v.d - 0x1p-1074;
305 else
306 *res = v.d + 0x1p-1074;
307 } else
308 *res = v.d * 0x1p-106;
309 return;
311 v.ieee.mantissa1 |= j;
312 *res = v.d * 0x1p-106;
313 return;
315 #endif
/*---------------------------------------------------------------*/
/*--- end                                 host_generic_maddf.c --*/
/*---------------------------------------------------------------*/