Make sure frexp() returns correct for argument 0.0
[gromacs.git] / src / gromacs / simd / simd_math.h
blobabd001c5f55878197167250cc3d429097b9352f4
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017 by the GROMACS development team.
5 * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
6 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
7 * and including many others, as listed in the AUTHORS file in the
8 * top-level source directory and at http://www.gromacs.org.
10 * GROMACS is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public License
12 * as published by the Free Software Foundation; either version 2.1
13 * of the License, or (at your option) any later version.
15 * GROMACS is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with GROMACS; if not, see
22 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
23 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 * If you want to redistribute modifications to GROMACS, please
26 * consider that scientific software is very special. Version
27 * control is crucial - bugs must be traceable. We will be happy to
28 * consider code for inclusion in the official distribution, but
29 * derived work must not be called official GROMACS. Details are found
30 * in the README & COPYING files - if they are missing, get the
31 * official version at http://www.gromacs.org.
33 * To help us fund GROMACS development, we humbly ask that you cite
34 * the research papers on the package. Check out http://www.gromacs.org.
36 #ifndef GMX_SIMD_SIMD_MATH_H
37 #define GMX_SIMD_SIMD_MATH_H
39 /*! \libinternal \file
41 * \brief Math functions for SIMD datatypes.
43 * \attention This file is generic for all SIMD architectures, so you cannot
44 * assume that any of the optional SIMD features (as defined in simd.h) are
45 * present. In particular, this means you cannot assume support for integers,
46 * logical operations (neither on floating-point nor integer values), shifts,
47 * and the architecture might only have SIMD for either float or double.
48 * Second, to keep this file clean and general, any additions to this file
49 * must work for all possible SIMD architectures in both single and double
50 * precision (if they support it), and you cannot make any assumptions about
51 * SIMD width.
53 * \author Erik Lindahl <erik.lindahl@scilifelab.se>
55 * \inlibraryapi
56 * \ingroup module_simd
59 #include "config.h"
61 #include <cmath>
63 #include <limits>
65 #include "gromacs/math/utilities.h"
66 #include "gromacs/simd/simd.h"
67 #include "gromacs/utility/basedefinitions.h"
68 #include "gromacs/utility/real.h"
70 namespace gmx
73 #if GMX_SIMD
75 /*! \cond libapi */
76 /*! \addtogroup module_simd */
77 /*! \{ */
79 /*! \name Implementation accuracy settings
80 * \{
83 /*! \} */
85 # if GMX_SIMD_HAVE_FLOAT
87 /*! \name Single precision SIMD math functions
89 * \note In most cases you should use the real-precision functions instead.
90 * \{
93 /****************************************
94 * SINGLE PRECISION SIMD MATH FUNCTIONS *
95 ****************************************/
97 # if !GMX_SIMD_HAVE_NATIVE_COPYSIGN_FLOAT
98 /*! \brief Composes floating point value with the magnitude of x and the sign of y.
100 * \param x Values to set sign for
101 * \param y Values used to set sign
102 * \return Magnitude of x, sign of y
104 static inline SimdFloat gmx_simdcall copysign(SimdFloat x, SimdFloat y)
106 # if GMX_SIMD_HAVE_LOGICAL
107 return abs(x) | (SimdFloat(GMX_FLOAT_NEGZERO) & y);
108 # else
109 return blend(abs(x), -abs(x), y < setZero());
110 # endif
112 # endif
114 # if !GMX_SIMD_HAVE_NATIVE_RSQRT_ITER_FLOAT
115 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD float.
117 * This is a low-level routine that should only be used by SIMD math routine
118 * that evaluates the inverse square root.
120 * \param lu Approximation of 1/sqrt(x), typically obtained from lookup.
121 * \param x The reference (starting) value x for which we want 1/sqrt(x).
122 * \return An improved approximation with roughly twice as many bits of accuracy.
124 static inline SimdFloat gmx_simdcall rsqrtIter(SimdFloat lu, SimdFloat x)
126 SimdFloat tmp1 = x * lu;
127 SimdFloat tmp2 = SimdFloat(-0.5F) * lu;
128 tmp1 = fma(tmp1, lu, SimdFloat(-3.0F));
129 return tmp1 * tmp2;
131 # endif
133 /*! \brief Calculate 1/sqrt(x) for SIMD float.
135 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
136 * GMX_FLOAT_MAX, i.e. within the range of single precision.
137 * For the single precision implementation this is obviously always
138 * true for positive values, but for double precision it adds an
139 * extra restriction since the first lookup step might have to be
140 * performed in single precision on some architectures. Note that the
141 * responsibility for checking falls on you - this routine does not
142 * check arguments.
144 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
146 static inline SimdFloat gmx_simdcall invsqrt(SimdFloat x)
148 SimdFloat lu = rsqrt(x);
149 # if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
150 lu = rsqrtIter(lu, x);
151 # endif
152 # if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
153 lu = rsqrtIter(lu, x);
154 # endif
155 # if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
156 lu = rsqrtIter(lu, x);
157 # endif
158 return lu;
161 /*! \brief Calculate 1/sqrt(x) for two SIMD floats.
163 * \param x0 First set of arguments, x0 must be in single range (see below).
164 * \param x1 Second set of arguments, x1 must be in single range (see below).
165 * \param[out] out0 Result 1/sqrt(x0)
166 * \param[out] out1 Result 1/sqrt(x1)
168 * In particular for double precision we can sometimes calculate square root
169 * pairs slightly faster by using single precision until the very last step.
171 * \note Both arguments must be larger than GMX_FLOAT_MIN and smaller than
172 * GMX_FLOAT_MAX, i.e. within the range of single precision.
173 * For the single precision implementation this is obviously always
174 * true for positive values, but for double precision it adds an
175 * extra restriction since the first lookup step might have to be
176 * performed in single precision on some architectures. Note that the
177 * responsibility for checking falls on you - this routine does not
178 * check arguments.
180 static inline void gmx_simdcall invsqrtPair(SimdFloat x0, SimdFloat x1, SimdFloat* out0, SimdFloat* out1)
182 *out0 = invsqrt(x0);
183 *out1 = invsqrt(x1);
186 # if !GMX_SIMD_HAVE_NATIVE_RCP_ITER_FLOAT
187 /*! \brief Perform one Newton-Raphson iteration to improve 1/x for SIMD float.
189 * This is a low-level routine that should only be used by SIMD math routine
190 * that evaluates the reciprocal.
192 * \param lu Approximation of 1/x, typically obtained from lookup.
193 * \param x The reference (starting) value x for which we want 1/x.
194 * \return An improved approximation with roughly twice as many bits of accuracy.
196 static inline SimdFloat gmx_simdcall rcpIter(SimdFloat lu, SimdFloat x)
198 return lu * fnma(lu, x, SimdFloat(2.0F));
200 # endif
202 /*! \brief Calculate 1/x for SIMD float.
204 * \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
205 * GMX_FLOAT_MAX, i.e. within the range of single precision.
206 * For the single precision implementation this is obviously always
207 * true for positive values, but for double precision it adds an
208 * extra restriction since the first lookup step might have to be
209 * performed in single precision on some architectures. Note that the
210 * responsibility for checking falls on you - this routine does not
211 * check arguments.
213 * \return 1/x. Result is undefined if your argument was invalid.
215 static inline SimdFloat gmx_simdcall inv(SimdFloat x)
217 SimdFloat lu = rcp(x);
218 # if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
219 lu = rcpIter(lu, x);
220 # endif
221 # if (GMX_SIMD_RCP_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
222 lu = rcpIter(lu, x);
223 # endif
224 # if (GMX_SIMD_RCP_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
225 lu = rcpIter(lu, x);
226 # endif
227 return lu;
230 /*! \brief Division for SIMD floats
232 * \param nom Nominator
233 * \param denom Denominator, with magnitude in range (GMX_FLOAT_MIN,GMX_FLOAT_MAX).
234 * For single precision this is equivalent to a nonzero argument,
235 * but in double precision it adds an extra restriction since
236 * the first lookup step might have to be performed in single
237 * precision on some architectures. Note that the responsibility
238 * for checking falls on you - this routine does not check arguments.
240 * \return nom/denom
242 * \note This function does not use any masking to avoid problems with
243 * zero values in the denominator.
245 static inline SimdFloat gmx_simdcall operator/(SimdFloat nom, SimdFloat denom)
247 return nom * inv(denom);
250 /*! \brief Calculate 1/sqrt(x) for masked entries of SIMD float.
252 * This routine only evaluates 1/sqrt(x) for elements for which mask is true.
253 * Illegal values in the masked-out elements will not lead to
254 * floating-point exceptions.
256 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
257 * GMX_FLOAT_MAX for masked-in entries.
258 * See \ref invsqrt for the discussion about argument restrictions.
259 * \param m Mask
260 * \return 1/sqrt(x). Result is undefined if your argument was invalid or
261 * entry was not masked, and 0.0 for masked-out entries.
263 static inline SimdFloat maskzInvsqrt(SimdFloat x, SimdFBool m)
265 SimdFloat lu = maskzRsqrt(x, m);
266 # if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
267 lu = rsqrtIter(lu, x);
268 # endif
269 # if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
270 lu = rsqrtIter(lu, x);
271 # endif
272 # if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
273 lu = rsqrtIter(lu, x);
274 # endif
275 return lu;
278 /*! \brief Calculate 1/x for SIMD float, masked version.
280 * \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
281 * GMX_FLOAT_MAX for masked-in entries.
282 * See \ref invsqrt for the discussion about argument restrictions.
283 * \param m Mask
284 * \return 1/x for elements where m is true, or 0.0 for masked-out entries.
286 static inline SimdFloat gmx_simdcall maskzInv(SimdFloat x, SimdFBool m)
288 SimdFloat lu = maskzRcp(x, m);
289 # if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
290 lu = rcpIter(lu, x);
291 # endif
292 # if (GMX_SIMD_RCP_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
293 lu = rcpIter(lu, x);
294 # endif
295 # if (GMX_SIMD_RCP_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
296 lu = rcpIter(lu, x);
297 # endif
298 return lu;
301 /*! \brief Calculate sqrt(x) for SIMD floats
303 * \tparam opt By default, this function checks if the input value is 0.0
304 * and masks this to return the correct result. If you are certain
305 * your argument will never be zero, and you know you need to save
306 * every single cycle you can, you can alternatively call the
307 * function as sqrt<MathOptimization::Unsafe>(x).
309 * \param x Argument that must be in range 0 <=x <= GMX_FLOAT_MAX, since the
310 * lookup step often has to be implemented in single precision.
311 * Arguments smaller than GMX_FLOAT_MIN will always lead to a zero
312 * result, even in double precision. If you are using the unsafe
313 * math optimization parameter, the argument must be in the range
314 * GMX_FLOAT_MIN <= x <= GMX_FLOAT_MAX.
316 * \return sqrt(x). The result is undefined if the input value does not fall
317 * in the allowed range specified for the argument.
319 template<MathOptimization opt = MathOptimization::Safe>
320 static inline SimdFloat gmx_simdcall sqrt(SimdFloat x)
322 if (opt == MathOptimization::Safe)
324 SimdFloat res = maskzInvsqrt(x, setZero() < x);
325 return res * x;
327 else
329 return x * invsqrt(x);
333 /*! \brief Cube root for SIMD floats
335 * \param x Argument to calculate cube root of. Can be negative or zero,
336 * but NaN or Inf values are not supported. Denormal values will
337 * be treated as 0.0.
338 * \return Cube root of x.
340 static inline SimdFloat gmx_simdcall cbrt(SimdFloat x)
342 const SimdFloat signBit(GMX_FLOAT_NEGZERO);
343 const SimdFloat minFloat(std::numeric_limits<float>::min());
344 // Bias is 128-1 = 127, which is not divisible by 3. Since the largest-magnitude
345 // negative exponent from frexp() is -126, we can subtract one more unit to get 126
346 // as offset, which is divisible by 3 (result 42). To avoid clang warnings about fragile integer
347 // division mixed with FP, we let the divided value (42) be the original constant.
348 const std::int32_t offsetDiv3(42);
349 const SimdFloat c2(-0.191502161678719066F);
350 const SimdFloat c1(0.697570460207922770F);
351 const SimdFloat c0(0.492659620528969547F);
352 const SimdFloat one(1.0F);
353 const SimdFloat two(2.0F);
354 const SimdFloat three(3.0F);
355 const SimdFloat oneThird(1.0F / 3.0F);
356 const SimdFloat cbrt2(1.2599210498948731648F);
357 const SimdFloat sqrCbrt2(1.5874010519681994748F);
359 // To calculate cbrt(x) we first take the absolute value of x but save the sign,
360 // since cbrt(-x) = -cbrt(x). Then we only need to consider positive values for
361 // the main step.
362 // A number x is represented in IEEE754 as fraction*2^e. We rewrite this as
363 // x=fraction*2^(3*n)*2^m, where e=3*n+m, and m is a remainder.
364 // The cube root can the be evaluated by calculating the cube root of the fraction
365 // limited to the mantissa range, multiplied by 2^mod (which is either 1, +/-2^(1/3) or
366 // +/-2^(2/3), and then we load this into a new IEEE754 fp number with the exponent 2^n, where
367 // n is the integer part of the original exponent divided by 3.
369 SimdFloat xSignBit = x & signBit; // create bit mask where the sign bit is 1 for x elements < 0
370 SimdFloat xAbs = andNot(signBit, x); // select everthing but the sign bit => abs(x)
371 SimdFBool xIsNonZero = (minFloat <= xAbs); // treat denormals as 0
373 SimdFInt32 exponent;
374 SimdFloat y = frexp(xAbs, &exponent);
375 // For the mantissa (y) we will use a limited-range approximation of cbrt(y),
376 // by first using a polynomial and then evaluating
377 // Transform y to z = c2*y^2 + c1*y + c0, then w = z^3, and finally
378 // evaluate the quotient q = z * (w + 2 * y) / (2 * w + y).
379 SimdFloat z = fma(fma(y, c2, c1), y, c0);
380 SimdFloat w = z * z * z;
381 SimdFloat nom = z * fma(two, y, w);
382 SimdFloat invDenom = inv(fma(two, w, y));
384 // Handle the exponent. In principle there are beautiful ways to do this with custom 16-bit
385 // division converted to multiplication... but we can't do that since our SIMD layer cannot
386 // assume the presence of integer shift operations!
387 // However, when I first worked with the integer algorithm I still came up with a neat
388 // optimization, so I'll describe the full algorithm here in case we ever want to use it
389 // in the future:
391 // Our dividend is signed, which is a complication, but let's consider the unsigned case
392 // first: Division by 3 corresponds to multiplication by 1010101... Since we also know
393 // our dividend is less than 16 bits (exponent range) we can accomplish this by
394 // multiplying with 21845 (which is almost 2^16/3 - 21845.333 would be exact) and then
395 // right-shifting by 16 bits to divide out the 2^16 part.
396 // If we add 1 to the dividend to handle the extra 0.333, the integer result will be correct.
397 // To handle the signed exponent one alternative would be to take absolute values, saving
398 // signs, etc - but that gets a bit complicated with 2-complement integers.
399 // Instead, we remember that we don't really want the exact division per se - what we're
400 // really after is only rewriting e = 3*n+m. That will actually be *easier* to handle if
401 // we require that m must be positive (fewer cases to handle) instead of having n as the
402 // strict e/3.
403 // To handle this we start by adding 127 to the exponent. This value corresponds to the
404 // exponent bias, minus 1 because frexp() has a different standard for the value it returns,
405 // but then we add 1 back to handle the extra 0.333 in 21845. So, we have offsetExp = e+127
406 // and then multiply by 21845 to get a division result offsetExpDiv3.
407 // A (signed) value for n is then recovered by subtracting 42 (bias-1)/3 from k.
408 // To calculate a strict remainder we should evaluate offsetExp - 3*offsetExpDiv3 - 1, where
409 // the extra 1 corrects for the value we added to the exponent to get correct division.
410 // This remainder would have the value 0,1, or 2, but since we only use it to select
411 // other numbers we can skip the last step and just handle the cases as 1,2 or 3 instead.
413 // OK; end of long detour. Here's how we actually do it in our implementation by using
414 // floating-point for the exponent instead to avoid needing integer shifts:
416 // 1) Convert the exponent (obtained from frexp) to a float
417 // 2) Calculate offsetExp = exp + offset. Note that we should not add the extra 1 here since we
418 // do floating-point division instead of our integer hack, so it's the exponent bias-1, or
419 // the largest exponent minus 2.
420 // 3) Divide the float by 3 by multiplying with 1/3
421 // 4) Truncate it to an integer to get the division result. This is potentially dangerous in
422 // combination with floating-point, because many integers cannot be represented exactly in
423 // floating point, and if we are just epsilon below the result might be truncated to a lower
424 // integer. I have not observed this on x86, but to have a safety margin we can add a small
425 // fraction - since we already know the fraction part should be either 0, 0.333..., or 0.666...
426 // We can even save this extra floating-point addition by adding a small fraction (0.1) when
427 // we introduce the exponent offset - that will correspond to a safety margin of 0.1/3, which is plenty.
428 // 5) Get the remainder part by subtracting the truncated floating-point part.
429 // Here too we will have a plain division, so the remainder is a strict modulus
430 // and will have the values 0, 1 or 2.
432 // Before worrying about the few wasted cycles due to longer fp latency, this has the
433 // additional advantage that we don't use a single integer operation, so the algorithm
434 // will work just A-OK on all SIMD implementations, which avoids diverging code paths.
436 // The 0.1 here is the safety margin due to truncation described in item 4 in the comments above.
437 SimdFloat offsetExp = cvtI2R(exponent) + SimdFloat(static_cast<float>(3 * offsetDiv3) + 0.1);
439 SimdFloat offsetExpDiv3 =
440 trunc(offsetExp * oneThird); // important to truncate here to mimic integer division
442 SimdFInt32 expDiv3 = cvtR2I(offsetExpDiv3 - SimdFloat(static_cast<float>(offsetDiv3)));
444 SimdFloat remainder = offsetExp - offsetExpDiv3 * three;
446 // If remainder is 0 we should just have the factor 1.0,
447 // so first pick 1.0 if it is below 0.5, and 2^(1/3) if it's above 0.5 (i.e., 1 or 2)
448 SimdFloat factor = blend(one, cbrt2, SimdFloat(0.5) < remainder);
449 // Second, we overwrite with 2^(2/3) if rem>1.5 (i.e., 2)
450 factor = blend(factor, sqrCbrt2, SimdFloat(1.5) < remainder);
452 // Assemble the non-signed fraction, and add the sign back by xor
453 SimdFloat fraction = (nom * invDenom * factor) ^ xSignBit;
454 // Load to IEEE754 number, and set result to 0.0 if x was 0.0 or denormal
455 SimdFloat result = selectByMask(ldexp(fraction, expDiv3), xIsNonZero);
457 return result;
460 /*! \brief Inverse cube root for SIMD floats
462 * \param x Argument to calculate cube root of. Can be positive or
463 * negative, but the magnitude cannot be lower than
464 * the smallest normal number.
465 * \return Cube root of x. Undefined for values that don't
466 * fulfill the restriction of abs(x) > minFloat.
468 static inline SimdFloat gmx_simdcall invcbrt(SimdFloat x)
470 const SimdFloat signBit(GMX_FLOAT_NEGZERO);
471 const SimdFloat minFloat(std::numeric_limits<float>::min());
472 // Bias is 128-1 = 127, which is not divisible by 3. Since the largest-magnitude
473 // negative exponent from frexp() is -126, we can subtract one more unit to get 126
474 // as offset, which is divisible by 3 (result 42). To avoid clang warnings about fragile integer
475 // division mixed with FP, we let the divided value (42) be the original constant.
476 const std::int32_t offsetDiv3(42);
477 const SimdFloat c2(-0.191502161678719066F);
478 const SimdFloat c1(0.697570460207922770F);
479 const SimdFloat c0(0.492659620528969547F);
480 const SimdFloat one(1.0F);
481 const SimdFloat two(2.0F);
482 const SimdFloat three(3.0F);
483 const SimdFloat oneThird(1.0F / 3.0F);
484 const SimdFloat invCbrt2(1.0F / 1.2599210498948731648F);
485 const SimdFloat invSqrCbrt2(1.0F / 1.5874010519681994748F);
487 // We use pretty much exactly the same implementation as for cbrt(x),
488 // but to compute the inverse we swap the nominator/denominator
489 // in the quotient, and also swap the sign of the exponent parts.
491 SimdFloat xSignBit = x & signBit; // create bit mask where the sign bit is 1 for x elements < 0
492 SimdFloat xAbs = andNot(signBit, x); // select everthing but the sign bit => abs(x)
494 SimdFInt32 exponent;
495 SimdFloat y = frexp(xAbs, &exponent);
496 // For the mantissa (y) we will use a limited-range approximation of cbrt(y),
497 // by first using a polynomial and then evaluating
498 // Transform y to z = c2*y^2 + c1*y + c0, then w = z^3, and finally
499 // evaluate the quotient q = z * (w + 2 * y) / (2 * w + y).
500 SimdFloat z = fma(fma(y, c2, c1), y, c0);
501 SimdFloat w = z * z * z;
502 SimdFloat nom = fma(two, w, y);
503 SimdFloat invDenom = inv(z * fma(two, y, w));
505 // The 0.1 here is the safety margin due to truncation described in item 4 in the comments above.
506 SimdFloat offsetExp = cvtI2R(exponent) + SimdFloat(static_cast<float>(3 * offsetDiv3) + 0.1);
507 SimdFloat offsetExpDiv3 =
508 trunc(offsetExp * oneThird); // important to truncate here to mimic integer division
510 // We should swap the sign here, so we change order of the terms in the subtraction
511 SimdFInt32 expDiv3 = cvtR2I(SimdFloat(static_cast<float>(offsetDiv3)) - offsetExpDiv3);
513 // Swap sign here too, so remainder is either 0, -1 or -2
514 SimdFloat remainder = offsetExpDiv3 * three - offsetExp;
516 // If remainder is 0 we should just have the factor 1.0,
517 // so first pick 1.0 if it is above -0.5, and 2^(-1/3) if it's below -0.5 (i.e., -1 or -2)
518 SimdFloat factor = blend(one, invCbrt2, remainder < SimdFloat(-0.5));
519 // Second, we overwrite with 2^(-2/3) if rem<-1.5 (i.e., -2)
520 factor = blend(factor, invSqrCbrt2, remainder < SimdFloat(-1.5));
522 // Assemble the non-signed fraction, and add the sign back by xor
523 SimdFloat fraction = (nom * invDenom * factor) ^ xSignBit;
524 // Load to IEEE754 number, and set result to 0.0 if x was 0.0 or denormal
525 SimdFloat result = ldexp(fraction, expDiv3);
527 return result;
530 /*! \brief SIMD float log2(x). This is the base-2 logarithm.
532 * \param x Argument, should be >0.
533 * \result The base-2 logarithm of x. Undefined if argument is invalid.
535 static inline SimdFloat gmx_simdcall log2(SimdFloat x)
537 // This implementation computes log2 by
538 // 1) Extracting the exponent and adding it to...
539 // 2) A 9th-order minimax approximation using only odd
540 // terms of (x-1)/(x+1), where x is the mantissa.
542 # if GMX_SIMD_HAVE_NATIVE_LOG_FLOAT
543 // Just rescale if native log2() is not present, but log() is.
544 return log(x) * SimdFloat(std::log2(std::exp(1.0)));
545 # else
546 const SimdFloat one(1.0F);
547 const SimdFloat two(2.0F);
548 const SimdFloat invsqrt2(1.0F / std::sqrt(2.0F));
549 const SimdFloat CL9(0.342149508897807708152F);
550 const SimdFloat CL7(0.411570606888219447939F);
551 const SimdFloat CL5(0.577085979152320294183F);
552 const SimdFloat CL3(0.961796550607099898222F);
553 const SimdFloat CL1(2.885390081777926774009F);
554 SimdFloat fExp, x2, p;
555 SimdFBool m;
556 SimdFInt32 iExp;
558 // For the log2() function, the argument can never be 0, so use the faster version of frexp()
559 x = frexp<MathOptimization::Unsafe>(x, &iExp);
560 fExp = cvtI2R(iExp);
562 m = x < invsqrt2;
563 // Adjust to non-IEEE format for x<1/sqrt(2): exponent -= 1, mantissa *= 2.0
564 fExp = fExp - selectByMask(one, m);
565 x = x * blend(one, two, m);
567 x = (x - one) * inv(x + one);
568 x2 = x * x;
570 p = fma(CL9, x2, CL7);
571 p = fma(p, x2, CL5);
572 p = fma(p, x2, CL3);
573 p = fma(p, x2, CL1);
574 p = fma(p, x, fExp);
576 return p;
577 # endif
580 # if !GMX_SIMD_HAVE_NATIVE_LOG_FLOAT
581 /*! \brief SIMD float log(x). This is the natural logarithm.
583 * \param x Argument, should be >0.
584 * \result The natural logarithm of x. Undefined if argument is invalid.
586 static inline SimdFloat gmx_simdcall log(SimdFloat x)
588 const SimdFloat one(1.0F);
589 const SimdFloat two(2.0F);
590 const SimdFloat invsqrt2(1.0F / std::sqrt(2.0F));
591 const SimdFloat corr(0.693147180559945286226764F);
592 const SimdFloat CL9(0.2371599674224853515625F);
593 const SimdFloat CL7(0.285279005765914916992188F);
594 const SimdFloat CL5(0.400005519390106201171875F);
595 const SimdFloat CL3(0.666666567325592041015625F);
596 const SimdFloat CL1(2.0F);
597 SimdFloat fExp, x2, p;
598 SimdFBool m;
599 SimdFInt32 iExp;
601 // For log(), the argument cannot be 0, so use the faster version of frexp()
602 x = frexp<MathOptimization::Unsafe>(x, &iExp);
603 fExp = cvtI2R(iExp);
605 m = x < invsqrt2;
606 // Adjust to non-IEEE format for x<1/sqrt(2): exponent -= 1, mantissa *= 2.0
607 fExp = fExp - selectByMask(one, m);
608 x = x * blend(one, two, m);
610 x = (x - one) * inv(x + one);
611 x2 = x * x;
613 p = fma(CL9, x2, CL7);
614 p = fma(p, x2, CL5);
615 p = fma(p, x2, CL3);
616 p = fma(p, x2, CL1);
617 p = fma(p, x, corr * fExp);
619 return p;
621 # endif
623 # if !GMX_SIMD_HAVE_NATIVE_EXP2_FLOAT
624 /*! \brief SIMD float 2^x
626 * \tparam opt If this is changed from the default (safe) into the unsafe
627 * option, input values that would otherwise lead to zero-clamped
628 * results are not allowed and will lead to undefined results.
630 * \param x Argument. For the default (safe) function version this can be
631 * arbitrarily small value, but the routine might clamp the result to
632 * zero for arguments that would produce subnormal IEEE754-2008 results.
633 * This corresponds to inputs below -126 in single or -1022 in double,
634 * and it might overflow for arguments reaching 127 (single) or
635 * 1023 (double). If you enable the unsafe math optimization,
636 * very small arguments will not necessarily be zero-clamped, but
637 * can produce undefined results.
639 * \result 2^x. The result is undefined for very large arguments that cause
640 * internal floating-point overflow. If unsafe optimizations are enabled,
641 * this is also true for very small values.
643 * \note The definition range of this function is just-so-slightly smaller
644 * than the allowed IEEE exponents for many architectures. This is due
645 * to the implementation, which will hopefully improve in the future.
647 * \warning You cannot rely on this implementation returning inf for arguments
648 * that cause overflow. If you have some very large
649 * values and need to rely on getting a valid numerical output,
650 * take the minimum of your variable and the largest valid argument
651 * before calling this routine.
653 template<MathOptimization opt = MathOptimization::Safe>
654 static inline SimdFloat gmx_simdcall exp2(SimdFloat x)
656 const SimdFloat CC6(0.0001534581200287996416911311F);
657 const SimdFloat CC5(0.001339993121934088894618990F);
658 const SimdFloat CC4(0.009618488957115180159497841F);
659 const SimdFloat CC3(0.05550328776964726865751735F);
660 const SimdFloat CC2(0.2402264689063408646490722F);
661 const SimdFloat CC1(0.6931472057372680777553816F);
662 const SimdFloat one(1.0F);
664 SimdFloat intpart;
665 SimdFloat fexppart;
666 SimdFloat p;
668 // Large negative values are valid arguments to exp2(), so there are two
669 // things we need to account for:
670 // 1. When the exponents reaches -127, the (biased) exponent field will be
671 // zero and we can no longer multiply with it. There are special IEEE
672 // formats to handle this range, but for now we have to accept that
673 // we cannot handle those arguments. If input value becomes even more
674 // negative, it will start to loop and we would end up with invalid
675 // exponents. Thus, we need to limit or mask this.
676 // 2. For VERY large negative values, we will have problems that the
677 // subtraction to get the fractional part loses accuracy, and then we
678 // can end up with overflows in the polynomial.
680 // For now, we handle this by forwarding the math optimization setting to
681 // ldexp, where the routine will return zero for very small arguments.
683 // However, before doing that we need to make sure we do not call cvtR2I
684 // with an argument that is so negative it cannot be converted to an integer.
685 if (opt == MathOptimization::Safe)
687 x = max(x, SimdFloat(std::numeric_limits<std::int32_t>::lowest()));
690 fexppart = ldexp<opt>(one, cvtR2I(x));
691 intpart = round(x);
692 x = x - intpart;
694 p = fma(CC6, x, CC5);
695 p = fma(p, x, CC4);
696 p = fma(p, x, CC3);
697 p = fma(p, x, CC2);
698 p = fma(p, x, CC1);
699 p = fma(p, x, one);
700 x = p * fexppart;
701 return x;
703 # endif
705 # if !GMX_SIMD_HAVE_NATIVE_EXP_FLOAT
706 /*! \brief SIMD float exp(x).
708 * In addition to scaling the argument for 2^x this routine correctly does
709 * extended precision arithmetics to improve accuracy.
711 * \tparam opt If this is changed from the default (safe) into the unsafe
712 * option, input values that would otherwise lead to zero-clamped
713 * results are not allowed and will lead to undefined results.
715 * \param x Argument. For the default (safe) function version this can be
716 * arbitrarily small value, but the routine might clamp the result to
717 * zero for arguments that would produce subnormal IEEE754-2008 results.
718 * This corresponds to input arguments reaching
719 * -126*ln(2)=-87.3 in single, or -1022*ln(2)=-708.4 (double).
720 * Similarly, it might overflow for arguments reaching
721 * 127*ln(2)=88.0 (single) or 1023*ln(2)=709.1 (double). If the
722 * unsafe math optimizations are enabled, small input values that would
723 * result in zero-clamped output are not allowed.
725 * \result exp(x). Overflowing arguments are likely to either return 0 or inf,
726 * depending on the underlying implementation. If unsafe optimizations
727 * are enabled, this is also true for very small values.
729 * \note The definition range of this function is just-so-slightly smaller
730 * than the allowed IEEE exponents for many architectures. This is due
731 * to the implementation, which will hopefully improve in the future.
733 * \warning You cannot rely on this implementation returning inf for arguments
734 * that cause overflow. If you have some very large
735 * values and need to rely on getting a valid numerical output,
736 * take the minimum of your variable and the largest valid argument
737 * before calling this routine.
739 template<MathOptimization opt = MathOptimization::Safe>
740 static inline SimdFloat gmx_simdcall exp(SimdFloat x)
742 const SimdFloat argscale(1.44269504088896341F);
743 const SimdFloat invargscale0(-0.693145751953125F);
744 const SimdFloat invargscale1(-1.428606765330187045e-06F);
745 const SimdFloat CC4(0.00136324646882712841033936F);
746 const SimdFloat CC3(0.00836596917361021041870117F);
747 const SimdFloat CC2(0.0416710823774337768554688F);
748 const SimdFloat CC1(0.166665524244308471679688F);
749 const SimdFloat CC0(0.499999850988388061523438F);
750 const SimdFloat one(1.0F);
751 SimdFloat fexppart;
752 SimdFloat intpart;
753 SimdFloat y, p;
755 // Large negative values are valid arguments to exp2(), so there are two
756 // things we need to account for:
757 // 1. When the exponents reaches -127, the (biased) exponent field will be
758 // zero and we can no longer multiply with it. There are special IEEE
759 // formats to handle this range, but for now we have to accept that
760 // we cannot handle those arguments. If input value becomes even more
761 // negative, it will start to loop and we would end up with invalid
762 // exponents. Thus, we need to limit or mask this.
763 // 2. For VERY large negative values, we will have problems that the
764 // subtraction to get the fractional part loses accuracy, and then we
765 // can end up with overflows in the polynomial.
767 // For now, we handle this by forwarding the math optimization setting to
768 // ldexp, where the routine will return zero for very small arguments.
770 // However, before doing that we need to make sure we do not call cvtR2I
771 // with an argument that is so negative it cannot be converted to an integer
772 // after being multiplied by argscale.
774 if (opt == MathOptimization::Safe)
776 x = max(x, SimdFloat(std::numeric_limits<std::int32_t>::lowest()) / argscale);
779 y = x * argscale;
782 fexppart = ldexp<opt>(one, cvtR2I(y));
783 intpart = round(y);
785 // Extended precision arithmetics
786 x = fma(invargscale0, intpart, x);
787 x = fma(invargscale1, intpart, x);
789 p = fma(CC4, x, CC3);
790 p = fma(p, x, CC2);
791 p = fma(p, x, CC1);
792 p = fma(p, x, CC0);
793 p = fma(x * x, p, x);
794 # if GMX_SIMD_HAVE_FMA
795 x = fma(p, fexppart, fexppart);
796 # else
797 x = (p + one) * fexppart;
798 # endif
799 return x;
801 # endif
803 /*! \brief SIMD float pow(x,y)
805 * This returns x^y for SIMD values.
807 * \tparam opt If this is changed from the default (safe) into the unsafe
808 * option, there are no guarantees about correct results for x==0.
810 * \param x Base.
812 * \param y exponent.
814 * \result x^y. Overflowing arguments are likely to either return 0 or inf,
815 * depending on the underlying implementation. If unsafe optimizations
816 * are enabled, this is also true for x==0.
818 * \warning You cannot rely on this implementation returning inf for arguments
819 * that cause overflow. If you have some very large
820 * values and need to rely on getting a valid numerical output,
821 * take the minimum of your variable and the largest valid argument
822 * before calling this routine.
824 template<MathOptimization opt = MathOptimization::Safe>
825 static inline SimdFloat gmx_simdcall pow(SimdFloat x, SimdFloat y)
827 SimdFloat xcorr;
829 if (opt == MathOptimization::Safe)
831 xcorr = max(x, SimdFloat(std::numeric_limits<float>::min()));
833 else
835 xcorr = x;
838 SimdFloat result = exp2<opt>(y * log2(xcorr));
840 if (opt == MathOptimization::Safe)
842 // if x==0 and y>0 we explicitly set the result to 0.0
843 // For any x with y==0, the result will already be 1.0 since we multiply by y (0.0) and call exp().
844 result = blend(result, setZero(), x == setZero() && setZero() < y);
847 return result;
851 /*! \brief SIMD float erf(x).
853 * \param x The value to calculate erf(x) for.
854 * \result erf(x)
856 * This routine achieves very close to full precision, but we do not care about
857 * the last bit or the subnormal result range.
859 static inline SimdFloat gmx_simdcall erf(SimdFloat x)
861 // Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1]
862 const SimdFloat CA6(7.853861353153693e-5F);
863 const SimdFloat CA5(-8.010193625184903e-4F);
864 const SimdFloat CA4(5.188327685732524e-3F);
865 const SimdFloat CA3(-2.685381193529856e-2F);
866 const SimdFloat CA2(1.128358514861418e-1F);
867 const SimdFloat CA1(-3.761262582423300e-1F);
868 const SimdFloat CA0(1.128379165726710F);
869 // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2]
870 const SimdFloat CB9(-0.0018629930017603923F);
871 const SimdFloat CB8(0.003909821287598495F);
872 const SimdFloat CB7(-0.0052094582210355615F);
873 const SimdFloat CB6(0.005685614362160572F);
874 const SimdFloat CB5(-0.0025367682853477272F);
875 const SimdFloat CB4(-0.010199799682318782F);
876 const SimdFloat CB3(0.04369575504816542F);
877 const SimdFloat CB2(-0.11884063474674492F);
878 const SimdFloat CB1(0.2732120154030589F);
879 const SimdFloat CB0(0.42758357702025784F);
880 // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19]
881 const SimdFloat CC10(-0.0445555913112064F);
882 const SimdFloat CC9(0.21376355144663348F);
883 const SimdFloat CC8(-0.3473187200259257F);
884 const SimdFloat CC7(0.016690861551248114F);
885 const SimdFloat CC6(0.7560973182491192F);
886 const SimdFloat CC5(-1.2137903600145787F);
887 const SimdFloat CC4(0.8411872321232948F);
888 const SimdFloat CC3(-0.08670413896296343F);
889 const SimdFloat CC2(-0.27124782687240334F);
890 const SimdFloat CC1(-0.0007502488047806069F);
891 const SimdFloat CC0(0.5642114853803148F);
892 const SimdFloat one(1.0F);
893 const SimdFloat two(2.0F);
895 SimdFloat x2, x4, y;
896 SimdFloat t, t2, w, w2;
897 SimdFloat pA0, pA1, pB0, pB1, pC0, pC1;
898 SimdFloat expmx2;
899 SimdFloat res_erf, res_erfc, res;
900 SimdFBool m, maskErf;
902 // Calculate erf()
903 x2 = x * x;
904 x4 = x2 * x2;
906 pA0 = fma(CA6, x4, CA4);
907 pA1 = fma(CA5, x4, CA3);
908 pA0 = fma(pA0, x4, CA2);
909 pA1 = fma(pA1, x4, CA1);
910 pA0 = pA0 * x4;
911 pA0 = fma(pA1, x2, pA0);
912 // Constant term must come last for precision reasons
913 pA0 = pA0 + CA0;
915 res_erf = x * pA0;
917 // Calculate erfc
918 y = abs(x);
919 maskErf = SimdFloat(0.75F) <= y;
920 t = maskzInv(y, maskErf);
921 w = t - one;
922 t2 = t * t;
923 w2 = w * w;
925 // No need for a floating-point sieve here (as in erfc), since erf()
926 // will never return values that are extremely small for large args.
927 expmx2 = exp(-y * y);
929 pB1 = fma(CB9, w2, CB7);
930 pB0 = fma(CB8, w2, CB6);
931 pB1 = fma(pB1, w2, CB5);
932 pB0 = fma(pB0, w2, CB4);
933 pB1 = fma(pB1, w2, CB3);
934 pB0 = fma(pB0, w2, CB2);
935 pB1 = fma(pB1, w2, CB1);
936 pB0 = fma(pB0, w2, CB0);
937 pB0 = fma(pB1, w, pB0);
939 pC0 = fma(CC10, t2, CC8);
940 pC1 = fma(CC9, t2, CC7);
941 pC0 = fma(pC0, t2, CC6);
942 pC1 = fma(pC1, t2, CC5);
943 pC0 = fma(pC0, t2, CC4);
944 pC1 = fma(pC1, t2, CC3);
945 pC0 = fma(pC0, t2, CC2);
946 pC1 = fma(pC1, t2, CC1);
948 pC0 = fma(pC0, t2, CC0);
949 pC0 = fma(pC1, t, pC0);
950 pC0 = pC0 * t;
952 // Select pB0 or pC0 for erfc()
953 m = two < y;
954 res_erfc = blend(pB0, pC0, m);
955 res_erfc = res_erfc * expmx2;
957 // erfc(x<0) = 2-erfc(|x|)
958 m = x < setZero();
959 res_erfc = blend(res_erfc, two - res_erfc, m);
961 // Select erf() or erfc()
962 res = blend(res_erf, one - res_erfc, maskErf);
964 return res;
967 /*! \brief SIMD float erfc(x).
969 * \param x The value to calculate erfc(x) for.
970 * \result erfc(x)
972 * This routine achieves full precision (bar the last bit) over most of the
973 * input range, but for large arguments where the result is getting close
974 * to the minimum representable numbers we accept slightly larger errors
975 * (think results that are in the ballpark of 10^-30 for single precision)
976 * since that is not relevant for MD.
978 static inline SimdFloat gmx_simdcall erfc(SimdFloat x)
980 // Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1]
981 const SimdFloat CA6(7.853861353153693e-5F);
982 const SimdFloat CA5(-8.010193625184903e-4F);
983 const SimdFloat CA4(5.188327685732524e-3F);
984 const SimdFloat CA3(-2.685381193529856e-2F);
985 const SimdFloat CA2(1.128358514861418e-1F);
986 const SimdFloat CA1(-3.761262582423300e-1F);
987 const SimdFloat CA0(1.128379165726710F);
988 // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2]
989 const SimdFloat CB9(-0.0018629930017603923F);
990 const SimdFloat CB8(0.003909821287598495F);
991 const SimdFloat CB7(-0.0052094582210355615F);
992 const SimdFloat CB6(0.005685614362160572F);
993 const SimdFloat CB5(-0.0025367682853477272F);
994 const SimdFloat CB4(-0.010199799682318782F);
995 const SimdFloat CB3(0.04369575504816542F);
996 const SimdFloat CB2(-0.11884063474674492F);
997 const SimdFloat CB1(0.2732120154030589F);
998 const SimdFloat CB0(0.42758357702025784F);
999 // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19]
1000 const SimdFloat CC10(-0.0445555913112064F);
1001 const SimdFloat CC9(0.21376355144663348F);
1002 const SimdFloat CC8(-0.3473187200259257F);
1003 const SimdFloat CC7(0.016690861551248114F);
1004 const SimdFloat CC6(0.7560973182491192F);
1005 const SimdFloat CC5(-1.2137903600145787F);
1006 const SimdFloat CC4(0.8411872321232948F);
1007 const SimdFloat CC3(-0.08670413896296343F);
1008 const SimdFloat CC2(-0.27124782687240334F);
1009 const SimdFloat CC1(-0.0007502488047806069F);
1010 const SimdFloat CC0(0.5642114853803148F);
1011 // Coefficients for expansion of exp(x) in [0,0.1]
1012 // CD0 and CD1 are both 1.0, so no need to declare them separately
1013 const SimdFloat CD2(0.5000066608081202F);
1014 const SimdFloat CD3(0.1664795422874624F);
1015 const SimdFloat CD4(0.04379839977652482F);
1016 const SimdFloat one(1.0F);
1017 const SimdFloat two(2.0F);
1019 /* We need to use a small trick here, since we cannot assume all SIMD
1020 * architectures support integers, and the flag we want (0xfffff000) would
1021 * evaluate to NaN (i.e., it cannot be expressed as a floating-point num).
1022 * Instead, we represent the flags 0xf0f0f000 and 0x0f0f0000 as valid
1023 * fp numbers, and perform a logical or. Since the expression is constant,
1024 * we can at least hope it is evaluated at compile-time.
1026 # if GMX_SIMD_HAVE_LOGICAL
1027 const SimdFloat sieve(SimdFloat(-5.965323564e+29F) | SimdFloat(7.05044434e-30F));
1028 # else
1029 const int isieve = 0xFFFFF000;
1030 alignas(GMX_SIMD_ALIGNMENT) float mem[GMX_SIMD_FLOAT_WIDTH];
1032 union {
1033 float f;
1034 int i;
1035 } conv;
1036 int i;
1037 # endif
1039 SimdFloat x2, x4, y;
1040 SimdFloat q, z, t, t2, w, w2;
1041 SimdFloat pA0, pA1, pB0, pB1, pC0, pC1;
1042 SimdFloat expmx2, corr;
1043 SimdFloat res_erf, res_erfc, res;
1044 SimdFBool m, msk_erf;
1046 // Calculate erf()
1047 x2 = x * x;
1048 x4 = x2 * x2;
1050 pA0 = fma(CA6, x4, CA4);
1051 pA1 = fma(CA5, x4, CA3);
1052 pA0 = fma(pA0, x4, CA2);
1053 pA1 = fma(pA1, x4, CA1);
1054 pA1 = pA1 * x2;
1055 pA0 = fma(pA0, x4, pA1);
1056 // Constant term must come last for precision reasons
1057 pA0 = pA0 + CA0;
1059 res_erf = x * pA0;
1061 // Calculate erfc
1062 y = abs(x);
1063 msk_erf = SimdFloat(0.75F) <= y;
1064 t = maskzInv(y, msk_erf);
1065 w = t - one;
1066 t2 = t * t;
1067 w2 = w * w;
1069 * We cannot simply calculate exp(-y2) directly in single precision, since
1070 * that will lose a couple of bits of precision due to the multiplication.
1071 * Instead, we introduce y=z+w, where the last 12 bits of precision are in w.
1072 * Then we get exp(-y2) = exp(-z2)*exp((z-y)*(z+y)).
1074 * The only drawback with this is that it requires TWO separate exponential
1075 * evaluations, which would be horrible performance-wise. However, the argument
1076 * for the second exp() call is always small, so there we simply use a
1077 * low-order minimax expansion on [0,0.1].
1079 * However, this neat idea requires support for logical ops (and) on
1080 * FP numbers, which some vendors decided isn't necessary in their SIMD
1081 * instruction sets (Hi, IBM VSX!). In principle we could use some tricks
1082 * in double, but we still need memory as a backup when that is not available,
1083 * and this case is rare enough that we go directly there...
1085 # if GMX_SIMD_HAVE_LOGICAL
1086 z = y & sieve;
1087 # else
1088 store(mem, y);
1089 for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
1091 conv.f = mem[i];
1092 conv.i = conv.i & isieve;
1093 mem[i] = conv.f;
1095 z = load<SimdFloat>(mem);
1096 # endif
1097 q = (z - y) * (z + y);
1098 corr = fma(CD4, q, CD3);
1099 corr = fma(corr, q, CD2);
1100 corr = fma(corr, q, one);
1101 corr = fma(corr, q, one);
1103 expmx2 = exp(-z * z);
1104 expmx2 = expmx2 * corr;
1106 pB1 = fma(CB9, w2, CB7);
1107 pB0 = fma(CB8, w2, CB6);
1108 pB1 = fma(pB1, w2, CB5);
1109 pB0 = fma(pB0, w2, CB4);
1110 pB1 = fma(pB1, w2, CB3);
1111 pB0 = fma(pB0, w2, CB2);
1112 pB1 = fma(pB1, w2, CB1);
1113 pB0 = fma(pB0, w2, CB0);
1114 pB0 = fma(pB1, w, pB0);
1116 pC0 = fma(CC10, t2, CC8);
1117 pC1 = fma(CC9, t2, CC7);
1118 pC0 = fma(pC0, t2, CC6);
1119 pC1 = fma(pC1, t2, CC5);
1120 pC0 = fma(pC0, t2, CC4);
1121 pC1 = fma(pC1, t2, CC3);
1122 pC0 = fma(pC0, t2, CC2);
1123 pC1 = fma(pC1, t2, CC1);
1125 pC0 = fma(pC0, t2, CC0);
1126 pC0 = fma(pC1, t, pC0);
1127 pC0 = pC0 * t;
1129 // Select pB0 or pC0 for erfc()
1130 m = two < y;
1131 res_erfc = blend(pB0, pC0, m);
1132 res_erfc = res_erfc * expmx2;
1134 // erfc(x<0) = 2-erfc(|x|)
1135 m = x < setZero();
1136 res_erfc = blend(res_erfc, two - res_erfc, m);
1138 // Select erf() or erfc()
1139 res = blend(one - res_erf, res_erfc, msk_erf);
1141 return res;
1144 /*! \brief SIMD float sin \& cos.
1146 * \param x The argument to evaluate sin/cos for
1147 * \param[out] sinval Sin(x)
1148 * \param[out] cosval Cos(x)
1150 * This version achieves close to machine precision, but for very large
1151 * magnitudes of the argument we inherently begin to lose accuracy due to the
1152 * argument reduction, despite using extended precision arithmetics internally.
1154 static inline void gmx_simdcall sincos(SimdFloat x, SimdFloat* sinval, SimdFloat* cosval)
1156 // Constants to subtract Pi/4*x from y while minimizing precision loss
1157 const SimdFloat argred0(-1.5703125);
1158 const SimdFloat argred1(-4.83751296997070312500e-04F);
1159 const SimdFloat argred2(-7.54953362047672271729e-08F);
1160 const SimdFloat argred3(-2.56334406825708960298e-12F);
1161 const SimdFloat two_over_pi(static_cast<float>(2.0F / M_PI));
1162 const SimdFloat const_sin2(-1.9515295891e-4F);
1163 const SimdFloat const_sin1(8.3321608736e-3F);
1164 const SimdFloat const_sin0(-1.6666654611e-1F);
1165 const SimdFloat const_cos2(2.443315711809948e-5F);
1166 const SimdFloat const_cos1(-1.388731625493765e-3F);
1167 const SimdFloat const_cos0(4.166664568298827e-2F);
1168 const SimdFloat half(0.5F);
1169 const SimdFloat one(1.0F);
1170 SimdFloat ssign, csign;
1171 SimdFloat x2, y, z, psin, pcos, sss, ccc;
1172 SimdFBool m;
1174 # if GMX_SIMD_HAVE_FINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
1175 const SimdFInt32 ione(1);
1176 const SimdFInt32 itwo(2);
1177 SimdFInt32 iy;
1179 z = x * two_over_pi;
1180 iy = cvtR2I(z);
1181 y = round(z);
1183 m = cvtIB2B((iy & ione) == SimdFInt32(0));
1184 ssign = selectByMask(SimdFloat(GMX_FLOAT_NEGZERO), cvtIB2B((iy & itwo) == itwo));
1185 csign = selectByMask(SimdFloat(GMX_FLOAT_NEGZERO), cvtIB2B(((iy + ione) & itwo) == itwo));
1186 # else
1187 const SimdFloat quarter(0.25f);
1188 const SimdFloat minusquarter(-0.25f);
1189 SimdFloat q;
1190 SimdFBool m1, m2, m3;
1192 /* The most obvious way to find the arguments quadrant in the unit circle
1193 * to calculate the sign is to use integer arithmetic, but that is not
1194 * present in all SIMD implementations. As an alternative, we have devised a
1195 * pure floating-point algorithm that uses truncation for argument reduction
1196 * so that we get a new value 0<=q<1 over the unit circle, and then
1197 * do floating-point comparisons with fractions. This is likely to be
1198 * slightly slower (~10%) due to the longer latencies of floating-point, so
1199 * we only use it when integer SIMD arithmetic is not present.
1201 ssign = x;
1202 x = abs(x);
1203 // It is critical that half-way cases are rounded down
1204 z = fma(x, two_over_pi, half);
1205 y = trunc(z);
1206 q = z * quarter;
1207 q = q - trunc(q);
1208 /* z now starts at 0.0 for x=-pi/4 (although neg. values cannot occur), and
1209 * then increased by 1.0 as x increases by 2*Pi, when it resets to 0.0.
1210 * This removes the 2*Pi periodicity without using any integer arithmetic.
1211 * First check if y had the value 2 or 3, set csign if true.
1213 q = q - half;
1214 /* If we have logical operations we can work directly on the signbit, which
1215 * saves instructions. Otherwise we need to represent signs as +1.0/-1.0.
1216 * Thus, if you are altering defines to debug alternative code paths, the
1217 * two GMX_SIMD_HAVE_LOGICAL sections in this routine must either both be
1218 * active or inactive - you will get errors if only one is used.
1220 # if GMX_SIMD_HAVE_LOGICAL
1221 ssign = ssign & SimdFloat(GMX_FLOAT_NEGZERO);
1222 csign = andNot(q, SimdFloat(GMX_FLOAT_NEGZERO));
1223 ssign = ssign ^ csign;
1224 # else
1225 ssign = copysign(SimdFloat(1.0f), ssign);
1226 csign = copysign(SimdFloat(1.0f), q);
1227 csign = -csign;
1228 ssign = ssign * csign; // swap ssign if csign was set.
1229 # endif
1230 // Check if y had value 1 or 3 (remember we subtracted 0.5 from q)
1231 m1 = (q < minusquarter);
1232 m2 = (setZero() <= q);
1233 m3 = (q < quarter);
1234 m2 = m2 && m3;
1235 m = m1 || m2;
1236 // where mask is FALSE, swap sign.
1237 csign = csign * blend(SimdFloat(-1.0f), one, m);
1238 # endif
1239 x = fma(y, argred0, x);
1240 x = fma(y, argred1, x);
1241 x = fma(y, argred2, x);
1242 x = fma(y, argred3, x);
1243 x2 = x * x;
1245 psin = fma(const_sin2, x2, const_sin1);
1246 psin = fma(psin, x2, const_sin0);
1247 psin = fma(psin, x * x2, x);
1248 pcos = fma(const_cos2, x2, const_cos1);
1249 pcos = fma(pcos, x2, const_cos0);
1250 pcos = fms(pcos, x2, half);
1251 pcos = fma(pcos, x2, one);
1253 sss = blend(pcos, psin, m);
1254 ccc = blend(psin, pcos, m);
1255 // See comment for GMX_SIMD_HAVE_LOGICAL section above.
1256 # if GMX_SIMD_HAVE_LOGICAL
1257 *sinval = sss ^ ssign;
1258 *cosval = ccc ^ csign;
1259 # else
1260 *sinval = sss * ssign;
1261 *cosval = ccc * csign;
1262 # endif
1265 /*! \brief SIMD float sin(x).
1267 * \param x The argument to evaluate sin for
1268 * \result Sin(x)
1270 * \attention Do NOT call both sin & cos if you need both results, since each of them
1271 * will then call \ref sincos and waste a factor 2 in performance.
1273 static inline SimdFloat gmx_simdcall sin(SimdFloat x)
1275 SimdFloat s, c;
1276 sincos(x, &s, &c);
1277 return s;
1280 /*! \brief SIMD float cos(x).
1282 * \param x The argument to evaluate cos for
1283 * \result Cos(x)
1285 * \attention Do NOT call both sin & cos if you need both results, since each of them
1286 * will then call \ref sincos and waste a factor 2 in performance.
1288 static inline SimdFloat gmx_simdcall cos(SimdFloat x)
1290 SimdFloat s, c;
1291 sincos(x, &s, &c);
1292 return c;
1295 /*! \brief SIMD float tan(x).
1297 * \param x The argument to evaluate tan for
1298 * \result Tan(x)
1300 static inline SimdFloat gmx_simdcall tan(SimdFloat x)
1302 const SimdFloat argred0(-1.5703125);
1303 const SimdFloat argred1(-4.83751296997070312500e-04F);
1304 const SimdFloat argred2(-7.54953362047672271729e-08F);
1305 const SimdFloat argred3(-2.56334406825708960298e-12F);
1306 const SimdFloat two_over_pi(static_cast<float>(2.0F / M_PI));
1307 const SimdFloat CT6(0.009498288995810566122993911);
1308 const SimdFloat CT5(0.002895755790837379295226923);
1309 const SimdFloat CT4(0.02460087336161924491836265);
1310 const SimdFloat CT3(0.05334912882656359828045988);
1311 const SimdFloat CT2(0.1333989091464957704418495);
1312 const SimdFloat CT1(0.3333307599244198227797507);
1314 SimdFloat x2, p, y, z;
1315 SimdFBool m;
1317 # if GMX_SIMD_HAVE_FINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
1318 SimdFInt32 iy;
1319 SimdFInt32 ione(1);
1321 z = x * two_over_pi;
1322 iy = cvtR2I(z);
1323 y = round(z);
1324 m = cvtIB2B((iy & ione) == ione);
1326 x = fma(y, argred0, x);
1327 x = fma(y, argred1, x);
1328 x = fma(y, argred2, x);
1329 x = fma(y, argred3, x);
1330 x = selectByMask(SimdFloat(GMX_FLOAT_NEGZERO), m) ^ x;
1331 # else
1332 const SimdFloat quarter(0.25f);
1333 const SimdFloat half(0.5f);
1334 const SimdFloat threequarter(0.75f);
1335 SimdFloat w, q;
1336 SimdFBool m1, m2, m3;
1338 w = abs(x);
1339 z = fma(w, two_over_pi, half);
1340 y = trunc(z);
1341 q = z * quarter;
1342 q = q - trunc(q);
1343 m1 = quarter <= q;
1344 m2 = q < half;
1345 m3 = threequarter <= q;
1346 m1 = m1 && m2;
1347 m = m1 || m3;
1348 w = fma(y, argred0, w);
1349 w = fma(y, argred1, w);
1350 w = fma(y, argred2, w);
1351 w = fma(y, argred3, w);
1352 w = blend(w, -w, m);
1353 x = w * copysign(SimdFloat(1.0), x);
1354 # endif
1355 x2 = x * x;
1356 p = fma(CT6, x2, CT5);
1357 p = fma(p, x2, CT4);
1358 p = fma(p, x2, CT3);
1359 p = fma(p, x2, CT2);
1360 p = fma(p, x2, CT1);
1361 p = fma(x2 * p, x, x);
1363 p = blend(p, maskzInv(p, m), m);
1364 return p;
1367 /*! \brief SIMD float asin(x).
1369 * \param x The argument to evaluate asin for
1370 * \result Asin(x)
1372 static inline SimdFloat gmx_simdcall asin(SimdFloat x)
1374 const SimdFloat limitlow(1e-4F);
1375 const SimdFloat half(0.5F);
1376 const SimdFloat one(1.0F);
1377 const SimdFloat halfpi(static_cast<float>(M_PI / 2.0F));
1378 const SimdFloat CC5(4.2163199048E-2F);
1379 const SimdFloat CC4(2.4181311049E-2F);
1380 const SimdFloat CC3(4.5470025998E-2F);
1381 const SimdFloat CC2(7.4953002686E-2F);
1382 const SimdFloat CC1(1.6666752422E-1F);
1383 SimdFloat xabs;
1384 SimdFloat z, z1, z2, q, q1, q2;
1385 SimdFloat pA, pB;
1386 SimdFBool m, m2;
1388 xabs = abs(x);
1389 m = half < xabs;
1390 z1 = half * (one - xabs);
1391 m2 = xabs < one;
1392 q1 = z1 * maskzInvsqrt(z1, m2);
1393 q2 = xabs;
1394 z2 = q2 * q2;
1395 z = blend(z2, z1, m);
1396 q = blend(q2, q1, m);
1398 z2 = z * z;
1399 pA = fma(CC5, z2, CC3);
1400 pB = fma(CC4, z2, CC2);
1401 pA = fma(pA, z2, CC1);
1402 pA = pA * z;
1403 z = fma(pB, z2, pA);
1404 z = fma(z, q, q);
1405 q2 = halfpi - z;
1406 q2 = q2 - z;
1407 z = blend(z, q2, m);
1409 m = limitlow < xabs;
1410 z = blend(xabs, z, m);
1411 z = copysign(z, x);
1413 return z;
1416 /*! \brief SIMD float acos(x).
1418 * \param x The argument to evaluate acos for
1419 * \result Acos(x)
1421 static inline SimdFloat gmx_simdcall acos(SimdFloat x)
1423 const SimdFloat one(1.0F);
1424 const SimdFloat half(0.5F);
1425 const SimdFloat pi(static_cast<float>(M_PI));
1426 const SimdFloat halfpi(static_cast<float>(M_PI / 2.0F));
1427 SimdFloat xabs;
1428 SimdFloat z, z1, z2, z3;
1429 SimdFBool m1, m2, m3;
1431 xabs = abs(x);
1432 m1 = half < xabs;
1433 m2 = setZero() < x;
1435 z = fnma(half, xabs, half);
1436 m3 = xabs < one;
1437 z = z * maskzInvsqrt(z, m3);
1438 z = blend(x, z, m1);
1439 z = asin(z);
1441 z2 = z + z;
1442 z1 = pi - z2;
1443 z3 = halfpi - z;
1444 z = blend(z1, z2, m2);
1445 z = blend(z3, z, m1);
1447 return z;
1450 /*! \brief SIMD float asin(x).
1452 * \param x The argument to evaluate atan for
1453 * \result Atan(x), same argument/value range as standard math library.
1455 static inline SimdFloat gmx_simdcall atan(SimdFloat x)
1457 const SimdFloat halfpi(static_cast<float>(M_PI / 2.0F));
1458 const SimdFloat CA17(0.002823638962581753730774F);
1459 const SimdFloat CA15(-0.01595690287649631500244F);
1460 const SimdFloat CA13(0.04250498861074447631836F);
1461 const SimdFloat CA11(-0.07489009201526641845703F);
1462 const SimdFloat CA9(0.1063479334115982055664F);
1463 const SimdFloat CA7(-0.1420273631811141967773F);
1464 const SimdFloat CA5(0.1999269574880599975585F);
1465 const SimdFloat CA3(-0.3333310186862945556640F);
1466 const SimdFloat one(1.0F);
1467 SimdFloat x2, x3, x4, pA, pB;
1468 SimdFBool m, m2;
1470 m = x < setZero();
1471 x = abs(x);
1472 m2 = one < x;
1473 x = blend(x, maskzInv(x, m2), m2);
1475 x2 = x * x;
1476 x3 = x2 * x;
1477 x4 = x2 * x2;
1478 pA = fma(CA17, x4, CA13);
1479 pB = fma(CA15, x4, CA11);
1480 pA = fma(pA, x4, CA9);
1481 pB = fma(pB, x4, CA7);
1482 pA = fma(pA, x4, CA5);
1483 pB = fma(pB, x4, CA3);
1484 pA = fma(pA, x2, pB);
1485 pA = fma(pA, x3, x);
1487 pA = blend(pA, halfpi - pA, m2);
1488 pA = blend(pA, -pA, m);
1490 return pA;
1493 /*! \brief SIMD float atan2(y,x).
1495 * \param y Y component of vector, any quartile
1496 * \param x X component of vector, any quartile
1497 * \result Atan(y,x), same argument/value range as standard math library.
1499 * \note This routine should provide correct results for all finite
1500 * non-zero or positive-zero arguments. However, negative zero arguments will
1501 * be treated as positive zero, which means the return value will deviate from
1502 * the standard math library atan2(y,x) for those cases. That should not be
1503 * of any concern in Gromacs, and in particular it will not affect calculations
1504 * of angles from vectors.
1506 static inline SimdFloat gmx_simdcall atan2(SimdFloat y, SimdFloat x)
1508 const SimdFloat pi(static_cast<float>(M_PI));
1509 const SimdFloat halfpi(static_cast<float>(M_PI / 2.0));
1510 SimdFloat xinv, p, aoffset;
1511 SimdFBool mask_xnz, mask_ynz, mask_xlt0, mask_ylt0;
1513 mask_xnz = x != setZero();
1514 mask_ynz = y != setZero();
1515 mask_xlt0 = x < setZero();
1516 mask_ylt0 = y < setZero();
1518 aoffset = selectByNotMask(halfpi, mask_xnz);
1519 aoffset = selectByMask(aoffset, mask_ynz);
1521 aoffset = blend(aoffset, pi, mask_xlt0);
1522 aoffset = blend(aoffset, -aoffset, mask_ylt0);
1524 xinv = maskzInv(x, mask_xnz);
1525 p = y * xinv;
1526 p = atan(p);
1527 p = p + aoffset;
1529 return p;
1532 /*! \brief Calculate the force correction due to PME analytically in SIMD float.
1534 * \param z2 \f$(r \beta)^2\f$ - see below for details.
1535 * \result Correction factor to coulomb force - see below for details.
1537 * This routine is meant to enable analytical evaluation of the
1538 * direct-space PME electrostatic force to avoid tables.
1540 * The direct-space potential should be \f$ \mbox{erfc}(\beta r)/r\f$, but there
1541 * are some problems evaluating that:
1543 * First, the error function is difficult (read: expensive) to
1544 * approxmiate accurately for intermediate to large arguments, and
1545 * this happens already in ranges of \f$(\beta r)\f$ that occur in simulations.
1546 * Second, we now try to avoid calculating potentials in Gromacs but
1547 * use forces directly.
1549 * We can simply things slight by noting that the PME part is really
1550 * a correction to the normal Coulomb force since \f$\mbox{erfc}(z)=1-\mbox{erf}(z)\f$, i.e.
1551 * \f[
1552 * V = \frac{1}{r} - \frac{\mbox{erf}(\beta r)}{r}
1553 * \f]
1554 * The first term we already have from the inverse square root, so
1555 * that we can leave out of this routine.
1557 * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
1558 * the argument \f$beta r\f$ will be in the range 0.15 to ~4, which is
1559 * the range used for the minimax fit. Use your favorite plotting program
1560 * to realize how well-behaved \f$\frac{\mbox{erf}(z)}{z}\f$ is in this range!
1562 * We approximate \f$f(z)=\mbox{erf}(z)/z\f$ with a rational minimax polynomial.
1563 * However, it turns out it is more efficient to approximate \f$f(z)/z\f$ and
1564 * then only use even powers. This is another minor optimization, since
1565 * we actually \a want \f$f(z)/z\f$, because it is going to be multiplied by
1566 * the vector between the two atoms to get the vectorial force. The
1567 * fastest flops are the ones we can avoid calculating!
1569 * So, here's how it should be used:
1571 * 1. Calculate \f$r^2\f$.
1572 * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=(\beta r)^2\f$.
1573 * 3. Evaluate this routine with \f$z^2\f$ as the argument.
1574 * 4. The return value is the expression:
1576 * \f[
1577 * \frac{2 \exp{-z^2}}{\sqrt{\pi} z^2}-\frac{\mbox{erf}(z)}{z^3}
1578 * \f]
1580 * 5. Multiply the entire expression by \f$\beta^3\f$. This will get you
1582 * \f[
1583 * \frac{2 \beta^3 \exp(-z^2)}{\sqrt{\pi} z^2} - \frac{\beta^3 \mbox{erf}(z)}{z^3}
1584 * \f]
1586 * or, switching back to \f$r\f$ (since \f$z=r \beta\f$):
1588 * \f[
1589 * \frac{2 \beta \exp(-r^2 \beta^2)}{\sqrt{\pi} r^2} - \frac{\mbox{erf}(r \beta)}{r^3}
1590 * \f]
1592 * With a bit of math exercise you should be able to confirm that
1593 * this is exactly
1595 * \f[
1596 * \frac{\frac{d}{dr}\left( \frac{\mbox{erf}(\beta r)}{r} \right)}{r}
1597 * \f]
1599 * 6. Add the result to \f$r^{-3}\f$, multiply by the product of the charges,
1600 * and you have your force (divided by \f$r\f$). A final multiplication
1601 * with the vector connecting the two particles and you have your
1602 * vectorial force to add to the particles.
1604 * This approximation achieves an error slightly lower than 1e-6
1605 * in single precision and 1e-11 in double precision
1606 * for arguments smaller than 16 (\f$\beta r \leq 4 \f$);
1607 * when added to \f$1/r\f$ the error will be insignificant.
1608 * For \f$\beta r \geq 7206\f$ the return value can be inf or NaN.
1611 static inline SimdFloat gmx_simdcall pmeForceCorrection(SimdFloat z2)
1613 const SimdFloat FN6(-1.7357322914161492954e-8F);
1614 const SimdFloat FN5(1.4703624142580877519e-6F);
1615 const SimdFloat FN4(-0.000053401640219807709149F);
1616 const SimdFloat FN3(0.0010054721316683106153F);
1617 const SimdFloat FN2(-0.019278317264888380590F);
1618 const SimdFloat FN1(0.069670166153766424023F);
1619 const SimdFloat FN0(-0.75225204789749321333F);
1621 const SimdFloat FD4(0.0011193462567257629232F);
1622 const SimdFloat FD3(0.014866955030185295499F);
1623 const SimdFloat FD2(0.11583842382862377919F);
1624 const SimdFloat FD1(0.50736591960530292870F);
1625 const SimdFloat FD0(1.0F);
1627 SimdFloat z4;
1628 SimdFloat polyFN0, polyFN1, polyFD0, polyFD1;
1630 z4 = z2 * z2;
1632 polyFD0 = fma(FD4, z4, FD2);
1633 polyFD1 = fma(FD3, z4, FD1);
1634 polyFD0 = fma(polyFD0, z4, FD0);
1635 polyFD0 = fma(polyFD1, z2, polyFD0);
1637 polyFD0 = inv(polyFD0);
1639 polyFN0 = fma(FN6, z4, FN4);
1640 polyFN1 = fma(FN5, z4, FN3);
1641 polyFN0 = fma(polyFN0, z4, FN2);
1642 polyFN1 = fma(polyFN1, z4, FN1);
1643 polyFN0 = fma(polyFN0, z4, FN0);
1644 polyFN0 = fma(polyFN1, z2, polyFN0);
1646 return polyFN0 * polyFD0;
1650 /*! \brief Calculate the potential correction due to PME analytically in SIMD float.
1652 * \param z2 \f$(r \beta)^2\f$ - see below for details.
1653 * \result Correction factor to coulomb potential - see below for details.
1655 * See \ref pmeForceCorrection for details about the approximation.
1657 * This routine calculates \f$\mbox{erf}(z)/z\f$, although you should provide \f$z^2\f$
1658 * as the input argument.
1660 * Here's how it should be used:
1662 * 1. Calculate \f$r^2\f$.
1663 * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=\beta^2*r^2\f$.
1664 * 3. Evaluate this routine with z^2 as the argument.
1665 * 4. The return value is the expression:
1667 * \f[
1668 * \frac{\mbox{erf}(z)}{z}
1669 * \f]
1671 * 5. Multiply the entire expression by beta and switching back to \f$r\f$ (since \f$z=r \beta\f$):
1673 * \f[
1674 * \frac{\mbox{erf}(r \beta)}{r}
1675 * \f]
1677 * 6. Subtract the result from \f$1/r\f$, multiply by the product of the charges,
1678 * and you have your potential.
1680 * This approximation achieves an error slightly lower than 1e-6
1681 * in single precision and 4e-11 in double precision
1682 * for arguments smaller than 16 (\f$ 0.15 \leq \beta r \leq 4 \f$);
1683 * for \f$ \beta r \leq 0.15\f$ the error can be twice as high;
1684 * when added to \f$1/r\f$ the error will be insignificant.
1685 * For \f$\beta r \geq 7142\f$ the return value can be inf or NaN.
1687 static inline SimdFloat gmx_simdcall pmePotentialCorrection(SimdFloat z2)
1689 const SimdFloat VN6(1.9296833005951166339e-8F);
1690 const SimdFloat VN5(-1.4213390571557850962e-6F);
1691 const SimdFloat VN4(0.000041603292906656984871F);
1692 const SimdFloat VN3(-0.00013134036773265025626F);
1693 const SimdFloat VN2(0.038657983986041781264F);
1694 const SimdFloat VN1(0.11285044772717598220F);
1695 const SimdFloat VN0(1.1283802385263030286F);
1697 const SimdFloat VD3(0.0066752224023576045451F);
1698 const SimdFloat VD2(0.078647795836373922256F);
1699 const SimdFloat VD1(0.43336185284710920150F);
1700 const SimdFloat VD0(1.0F);
1702 SimdFloat z4;
1703 SimdFloat polyVN0, polyVN1, polyVD0, polyVD1;
1705 z4 = z2 * z2;
1707 polyVD1 = fma(VD3, z4, VD1);
1708 polyVD0 = fma(VD2, z4, VD0);
1709 polyVD0 = fma(polyVD1, z2, polyVD0);
1711 polyVD0 = inv(polyVD0);
1713 polyVN0 = fma(VN6, z4, VN4);
1714 polyVN1 = fma(VN5, z4, VN3);
1715 polyVN0 = fma(polyVN0, z4, VN2);
1716 polyVN1 = fma(polyVN1, z4, VN1);
1717 polyVN0 = fma(polyVN0, z4, VN0);
1718 polyVN0 = fma(polyVN1, z2, polyVN0);
1720 return polyVN0 * polyVD0;
1722 # endif
1724 /*! \} */
1726 # if GMX_SIMD_HAVE_DOUBLE
1729 /*! \name Double precision SIMD math functions
1731 * \note In most cases you should use the real-precision functions instead.
1732 * \{
1735 /****************************************
1736 * DOUBLE PRECISION SIMD MATH FUNCTIONS *
1737 ****************************************/
1739 # if !GMX_SIMD_HAVE_NATIVE_COPYSIGN_DOUBLE
1740 /*! \brief Composes floating point value with the magnitude of x and the sign of y.
1742 * \param x Values to set sign for
1743 * \param y Values used to set sign
1744 * \return Magnitude of x, sign of y
1746 static inline SimdDouble gmx_simdcall copysign(SimdDouble x, SimdDouble y)
1748 # if GMX_SIMD_HAVE_LOGICAL
1749 return abs(x) | (SimdDouble(GMX_DOUBLE_NEGZERO) & y);
1750 # else
1751 return blend(abs(x), -abs(x), (y < setZero()));
1752 # endif
1754 # endif
1756 # if !GMX_SIMD_HAVE_NATIVE_RSQRT_ITER_DOUBLE
1757 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD double.
1759 * This is a low-level routine that should only be used by SIMD math routine
1760 * that evaluates the inverse square root.
1762 * \param lu Approximation of 1/sqrt(x), typically obtained from lookup.
1763 * \param x The reference (starting) value x for which we want 1/sqrt(x).
1764 * \return An improved approximation with roughly twice as many bits of accuracy.
1766 static inline SimdDouble gmx_simdcall rsqrtIter(SimdDouble lu, SimdDouble x)
1768 SimdDouble tmp1 = x * lu;
1769 SimdDouble tmp2 = SimdDouble(-0.5) * lu;
1770 tmp1 = fma(tmp1, lu, SimdDouble(-3.0));
1771 return tmp1 * tmp2;
1773 # endif
1775 /*! \brief Calculate 1/sqrt(x) for SIMD double.
1777 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
1778 * GMX_FLOAT_MAX, i.e. within the range of single precision.
1779 * For the single precision implementation this is obviously always
1780 * true for positive values, but for double precision it adds an
1781 * extra restriction since the first lookup step might have to be
1782 * performed in single precision on some architectures. Note that the
1783 * responsibility for checking falls on you - this routine does not
1784 * check arguments.
1786 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
1788 static inline SimdDouble gmx_simdcall invsqrt(SimdDouble x)
1790 SimdDouble lu = rsqrt(x);
1791 # if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1792 lu = rsqrtIter(lu, x);
1793 # endif
1794 # if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1795 lu = rsqrtIter(lu, x);
1796 # endif
1797 # if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1798 lu = rsqrtIter(lu, x);
1799 # endif
1800 # if (GMX_SIMD_RSQRT_BITS * 8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1801 lu = rsqrtIter(lu, x);
1802 # endif
1803 return lu;
1806 /*! \brief Calculate 1/sqrt(x) for two SIMD doubles.
1808 * \param x0 First set of arguments, x0 must be in single range (see below).
1809 * \param x1 Second set of arguments, x1 must be in single range (see below).
1810 * \param[out] out0 Result 1/sqrt(x0)
1811 * \param[out] out1 Result 1/sqrt(x1)
1813 * In particular for double precision we can sometimes calculate square root
1814 * pairs slightly faster by using single precision until the very last step.
1816 * \note Both arguments must be larger than GMX_FLOAT_MIN and smaller than
1817 * GMX_FLOAT_MAX, i.e. within the range of single precision.
1818 * For the single precision implementation this is obviously always
1819 * true for positive values, but for double precision it adds an
1820 * extra restriction since the first lookup step might have to be
1821 * performed in single precision on some architectures. Note that the
1822 * responsibility for checking falls on you - this routine does not
1823 * check arguments.
1825 static inline void gmx_simdcall invsqrtPair(SimdDouble x0, SimdDouble x1, SimdDouble* out0, SimdDouble* out1)
1827 # if GMX_SIMD_HAVE_FLOAT && (GMX_SIMD_FLOAT_WIDTH == 2 * GMX_SIMD_DOUBLE_WIDTH) \
1828 && (GMX_SIMD_RSQRT_BITS < 22)
1829 SimdFloat xf = cvtDD2F(x0, x1);
1830 SimdFloat luf = rsqrt(xf);
1831 SimdDouble lu0, lu1;
1832 // Intermediate target is single - mantissa+1 bits
1833 # if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
1834 luf = rsqrtIter(luf, xf);
1835 # endif
1836 # if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
1837 luf = rsqrtIter(luf, xf);
1838 # endif
1839 # if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
1840 luf = rsqrtIter(luf, xf);
1841 # endif
1842 cvtF2DD(luf, &lu0, &lu1);
1843 // Last iteration(s) performed in double - if we had 22 bits, this gets us to 44 (~1e-15)
1844 # if (GMX_SIMD_ACCURACY_BITS_SINGLE < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1845 lu0 = rsqrtIter(lu0, x0);
1846 lu1 = rsqrtIter(lu1, x1);
1847 # endif
1848 # if (GMX_SIMD_ACCURACY_BITS_SINGLE * 2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1849 lu0 = rsqrtIter(lu0, x0);
1850 lu1 = rsqrtIter(lu1, x1);
1851 # endif
1852 *out0 = lu0;
1853 *out1 = lu1;
1854 # else
1855 *out0 = invsqrt(x0);
1856 *out1 = invsqrt(x1);
1857 # endif
1860 # if !GMX_SIMD_HAVE_NATIVE_RCP_ITER_DOUBLE
1861 /*! \brief Perform one Newton-Raphson iteration to improve 1/x for SIMD double.
1863 * This is a low-level routine that should only be used by SIMD math routine
1864 * that evaluates the reciprocal.
1866 * \param lu Approximation of 1/x, typically obtained from lookup.
1867 * \param x The reference (starting) value x for which we want 1/x.
1868 * \return An improved approximation with roughly twice as many bits of accuracy.
1870 static inline SimdDouble gmx_simdcall rcpIter(SimdDouble lu, SimdDouble x)
1872 return lu * fnma(lu, x, SimdDouble(2.0));
1874 # endif
1876 /*! \brief Calculate 1/x for SIMD double.
1878 * \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
1879 * GMX_FLOAT_MAX, i.e. within the range of single precision.
1880 * For the single precision implementation this is obviously always
1881 * true for positive values, but for double precision it adds an
1882 * extra restriction since the first lookup step might have to be
1883 * performed in single precision on some architectures. Note that the
1884 * responsibility for checking falls on you - this routine does not
1885 * check arguments.
1887 * \return 1/x. Result is undefined if your argument was invalid.
1889 static inline SimdDouble gmx_simdcall inv(SimdDouble x)
1891 SimdDouble lu = rcp(x);
1892 # if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1893 lu = rcpIter(lu, x);
1894 # endif
1895 # if (GMX_SIMD_RCP_BITS * 2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1896 lu = rcpIter(lu, x);
1897 # endif
1898 # if (GMX_SIMD_RCP_BITS * 4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1899 lu = rcpIter(lu, x);
1900 # endif
1901 # if (GMX_SIMD_RCP_BITS * 8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1902 lu = rcpIter(lu, x);
1903 # endif
1904 return lu;
1907 /*! \brief Division for SIMD doubles
1909 * \param nom Nominator
1910 * \param denom Denominator, with magnitude in range (GMX_FLOAT_MIN,GMX_FLOAT_MAX).
1911 * For single precision this is equivalent to a nonzero argument,
1912 * but in double precision it adds an extra restriction since
1913 * the first lookup step might have to be performed in single
1914 * precision on some architectures. Note that the responsibility
1915 * for checking falls on you - this routine does not check arguments.
1917 * \return nom/denom
1919 * \note This function does not use any masking to avoid problems with
1920 * zero values in the denominator.
1922 static inline SimdDouble gmx_simdcall operator/(SimdDouble nom, SimdDouble denom)
1924 return nom * inv(denom);
1928 /*! \brief Calculate 1/sqrt(x) for masked entries of SIMD double.
1930 * This routine only evaluates 1/sqrt(x) for elements for which mask is true.
1931 * Illegal values in the masked-out elements will not lead to
1932 * floating-point exceptions.
1934 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
1935 * GMX_FLOAT_MAX for masked-in entries.
1936 * See \ref invsqrt for the discussion about argument restrictions.
1937 * \param m Mask
1938 * \return 1/sqrt(x). Result is undefined if your argument was invalid or
1939 * entry was not masked, and 0.0 for masked-out entries.
1941 static inline SimdDouble maskzInvsqrt(SimdDouble x, SimdDBool m)
1943 SimdDouble lu = maskzRsqrt(x, m);
1944 # if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1945 lu = rsqrtIter(lu, x);
1946 # endif
1947 # if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1948 lu = rsqrtIter(lu, x);
1949 # endif
1950 # if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1951 lu = rsqrtIter(lu, x);
1952 # endif
1953 # if (GMX_SIMD_RSQRT_BITS * 8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1954 lu = rsqrtIter(lu, x);
1955 # endif
1956 return lu;
1959 /*! \brief Calculate 1/x for SIMD double, masked version.
1961 * \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
1962 * GMX_FLOAT_MAX for masked-in entries.
1963 * See \ref invsqrt for the discussion about argument restrictions.
1964 * \param m Mask
1965 * \return 1/x for elements where m is true, or 0.0 for masked-out entries.
1967 static inline SimdDouble gmx_simdcall maskzInv(SimdDouble x, SimdDBool m)
1969 SimdDouble lu = maskzRcp(x, m);
1970 # if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1971 lu = rcpIter(lu, x);
1972 # endif
1973 # if (GMX_SIMD_RCP_BITS * 2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1974 lu = rcpIter(lu, x);
1975 # endif
1976 # if (GMX_SIMD_RCP_BITS * 4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1977 lu = rcpIter(lu, x);
1978 # endif
1979 # if (GMX_SIMD_RCP_BITS * 8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1980 lu = rcpIter(lu, x);
1981 # endif
1982 return lu;
1986 /*! \brief Calculate sqrt(x) for SIMD doubles.
1988 * \copydetails sqrt(SimdFloat)
1990 template<MathOptimization opt = MathOptimization::Safe>
1991 static inline SimdDouble gmx_simdcall sqrt(SimdDouble x)
1993 if (opt == MathOptimization::Safe)
1995 // As we might use a float version of rsqrt, we mask out small values
1996 SimdDouble res = maskzInvsqrt(x, SimdDouble(GMX_FLOAT_MIN) < x);
1997 return res * x;
1999 else
2001 return x * invsqrt(x);
2005 /*! \brief Cube root for SIMD doubles
2007 * \param x Argument to calculate cube root of. Can be negative or zero,
2008 * but NaN or Inf values are not supported. Denormal values will
2009 * be treated as 0.0.
2010 * \return Cube root of x.
2012 static inline SimdDouble gmx_simdcall cbrt(SimdDouble x)
2014 const SimdDouble signBit(GMX_DOUBLE_NEGZERO);
2015 const SimdDouble minDouble(std::numeric_limits<double>::min());
2016 // Bias is 1024-1 = 1023, which is divisible by 3, so no need to change it more.
2017 // To avoid clang warnings about fragile integer division mixed with FP, we let
2018 // the divided value (1023/3=341) be the original constant.
2019 const std::int32_t offsetDiv3(341);
2020 const SimdDouble c6(-0.145263899385486377);
2021 const SimdDouble c5(0.784932344976639262);
2022 const SimdDouble c4(-1.83469277483613086);
2023 const SimdDouble c3(2.44693122563534430);
2024 const SimdDouble c2(-2.11499494167371287);
2025 const SimdDouble c1(1.50819193781584896);
2026 const SimdDouble c0(0.354895765043919860);
2027 const SimdDouble one(1.0);
2028 const SimdDouble two(2.0);
2029 const SimdDouble three(3.0);
2030 const SimdDouble oneThird(1.0 / 3.0);
2031 const SimdDouble cbrt2(1.2599210498948731648);
2032 const SimdDouble sqrCbrt2(1.5874010519681994748);
2034 // See the single precision routines for documentation of the algorithm
2036 SimdDouble xSignBit = x & signBit; // create bit mask where the sign bit is 1 for x elements < 0
2037 SimdDouble xAbs = andNot(signBit, x); // select everthing but the sign bit => abs(x)
2038 SimdDBool xIsNonZero = (minDouble <= xAbs); // treat denormals as 0
2040 SimdDInt32 exponent;
2041 SimdDouble y = frexp(xAbs, &exponent);
2042 SimdDouble z = fma(y, c6, c5);
2043 z = fma(z, y, c4);
2044 z = fma(z, y, c3);
2045 z = fma(z, y, c2);
2046 z = fma(z, y, c1);
2047 z = fma(z, y, c0);
2048 SimdDouble w = z * z * z;
2049 SimdDouble nom = z * fma(two, y, w);
2050 SimdDouble invDenom = inv(fma(two, w, y));
2052 SimdDouble offsetExp = cvtI2R(exponent) + SimdDouble(static_cast<double>(3 * offsetDiv3) + 0.1);
2053 SimdDouble offsetExpDiv3 =
2054 trunc(offsetExp * oneThird); // important to truncate here to mimic integer division
2055 SimdDInt32 expDiv3 = cvtR2I(offsetExpDiv3 - SimdDouble(static_cast<double>(offsetDiv3)));
2056 SimdDouble remainder = offsetExp - offsetExpDiv3 * three;
2057 SimdDouble factor = blend(one, cbrt2, SimdDouble(0.5) < remainder);
2058 factor = blend(factor, sqrCbrt2, SimdDouble(1.5) < remainder);
2059 SimdDouble fraction = (nom * invDenom * factor) ^ xSignBit;
2060 SimdDouble result = selectByMask(ldexp(fraction, expDiv3), xIsNonZero);
2061 return result;
2064 /*! \brief Inverse cube root for SIMD doubles.
2066 * \param x Argument to calculate cube root of. Can be positive or
2067 * negative, but the magnitude cannot be lower than
2068 * the smallest normal number.
2069 * \return Cube root of x. Undefined for values that don't
2070 * fulfill the restriction of abs(x) > minDouble.
2072 static inline SimdDouble gmx_simdcall invcbrt(SimdDouble x)
2074 const SimdDouble signBit(GMX_DOUBLE_NEGZERO);
2075 // Bias is 1024-1 = 1023, which is divisible by 3, so no need to change it more.
2076 // To avoid clang warnings about fragile integer division mixed with FP, we let
2077 // the divided value (1023/3=341) be the original constant.
2078 const std::int32_t offsetDiv3(341);
2079 const SimdDouble c6(-0.145263899385486377);
2080 const SimdDouble c5(0.784932344976639262);
2081 const SimdDouble c4(-1.83469277483613086);
2082 const SimdDouble c3(2.44693122563534430);
2083 const SimdDouble c2(-2.11499494167371287);
2084 const SimdDouble c1(1.50819193781584896);
2085 const SimdDouble c0(0.354895765043919860);
2086 const SimdDouble one(1.0);
2087 const SimdDouble two(2.0);
2088 const SimdDouble three(3.0);
2089 const SimdDouble oneThird(1.0 / 3.0);
2090 const SimdDouble invCbrt2(1.0 / 1.2599210498948731648);
2091 const SimdDouble invSqrCbrt2(1.0F / 1.5874010519681994748);
2093 // See the single precision routines for documentation of the algorithm
2095 SimdDouble xSignBit = x & signBit; // create bit mask where the sign bit is 1 for x elements < 0
2096 SimdDouble xAbs = andNot(signBit, x); // select everthing but the sign bit => abs(x)
2098 SimdDInt32 exponent;
2099 SimdDouble y = frexp(xAbs, &exponent);
2100 SimdDouble z = fma(y, c6, c5);
2101 z = fma(z, y, c4);
2102 z = fma(z, y, c3);
2103 z = fma(z, y, c2);
2104 z = fma(z, y, c1);
2105 z = fma(z, y, c0);
2106 SimdDouble w = z * z * z;
2107 SimdDouble nom = fma(two, w, y);
2108 SimdDouble invDenom = inv(z * fma(two, y, w));
2109 SimdDouble offsetExp = cvtI2R(exponent) + SimdDouble(static_cast<double>(3 * offsetDiv3) + 0.1);
2110 SimdDouble offsetExpDiv3 =
2111 trunc(offsetExp * oneThird); // important to truncate here to mimic integer division
2112 SimdDInt32 expDiv3 = cvtR2I(SimdDouble(static_cast<double>(offsetDiv3)) - offsetExpDiv3);
2113 SimdDouble remainder = offsetExpDiv3 * three - offsetExp;
2114 SimdDouble factor = blend(one, invCbrt2, remainder < SimdDouble(-0.5));
2115 factor = blend(factor, invSqrCbrt2, remainder < SimdDouble(-1.5));
2116 SimdDouble fraction = (nom * invDenom * factor) ^ xSignBit;
2117 SimdDouble result = ldexp(fraction, expDiv3);
2118 return result;
2121 /*! \brief SIMD double log2(x). This is the base-2 logarithm.
2123 * \param x Argument, should be >0.
2124 * \result The base-2 logarithm of x. Undefined if argument is invalid.
2126 static inline SimdDouble gmx_simdcall log2(SimdDouble x)
2128 # if GMX_SIMD_HAVE_NATIVE_LOG_DOUBLE
2129 // Just rescale if native log2() is not present, but log is.
2130 return log(x) * SimdDouble(std::log2(std::exp(1.0)));
2131 # else
2132 const SimdDouble one(1.0);
2133 const SimdDouble two(2.0);
2134 const SimdDouble invsqrt2(1.0 / std::sqrt(2.0));
2135 const SimdDouble CL15(0.2138031565795550370534528);
2136 const SimdDouble CL13(0.2208884091496370882801159);
2137 const SimdDouble CL11(0.2623358279761824340958754);
2138 const SimdDouble CL9(0.3205984930182496084327681);
2139 const SimdDouble CL7(0.4121985864521960363227038);
2140 const SimdDouble CL5(0.5770780163410746954610886);
2141 const SimdDouble CL3(0.9617966939260027547931031);
2142 const SimdDouble CL1(2.885390081777926774009302);
2143 SimdDouble fExp, x2, p;
2144 SimdDBool m;
2145 SimdDInt32 iExp;
2147 // For log2(), the argument cannot be 0, so use the faster version of frexp()
2148 x = frexp<MathOptimization::Unsafe>(x, &iExp);
2149 fExp = cvtI2R(iExp);
2151 m = x < invsqrt2;
2152 // Adjust to non-IEEE format for x<1/sqrt(2): exponent -= 1, mantissa *= 2.0
2153 fExp = fExp - selectByMask(one, m);
2154 x = x * blend(one, two, m);
2156 x = (x - one) * inv(x + one);
2157 x2 = x * x;
2159 p = fma(CL15, x2, CL13);
2160 p = fma(p, x2, CL11);
2161 p = fma(p, x2, CL9);
2162 p = fma(p, x2, CL7);
2163 p = fma(p, x2, CL5);
2164 p = fma(p, x2, CL3);
2165 p = fma(p, x2, CL1);
2166 p = fma(p, x, fExp);
2168 return p;
2169 # endif
2172 # if !GMX_SIMD_HAVE_NATIVE_LOG_DOUBLE
2173 /*! \brief SIMD double log(x). This is the natural logarithm.
2175 * \param x Argument, should be >0.
2176 * \result The natural logarithm of x. Undefined if argument is invalid.
2178 static inline SimdDouble gmx_simdcall log(SimdDouble x)
2180 const SimdDouble one(1.0);
2181 const SimdDouble two(2.0);
2182 const SimdDouble invsqrt2(1.0 / std::sqrt(2.0));
2183 const SimdDouble corr(0.693147180559945286226764);
2184 const SimdDouble CL15(0.148197055177935105296783);
2185 const SimdDouble CL13(0.153108178020442575739679);
2186 const SimdDouble CL11(0.181837339521549679055568);
2187 const SimdDouble CL9(0.22222194152736701733275);
2188 const SimdDouble CL7(0.285714288030134544449368);
2189 const SimdDouble CL5(0.399999999989941956712869);
2190 const SimdDouble CL3(0.666666666666685503450651);
2191 const SimdDouble CL1(2.0);
2192 SimdDouble fExp, x2, p;
2193 SimdDBool m;
2194 SimdDInt32 iExp;
2196 // For log(), the argument cannot be 0, so use the faster version of frexp()
2197 x = frexp<MathOptimization::Unsafe>(x, &iExp);
2198 fExp = cvtI2R(iExp);
2200 m = x < invsqrt2;
2201 // Adjust to non-IEEE format for x<1/sqrt(2): exponent -= 1, mantissa *= 2.0
2202 fExp = fExp - selectByMask(one, m);
2203 x = x * blend(one, two, m);
2205 x = (x - one) * inv(x + one);
2206 x2 = x * x;
2208 p = fma(CL15, x2, CL13);
2209 p = fma(p, x2, CL11);
2210 p = fma(p, x2, CL9);
2211 p = fma(p, x2, CL7);
2212 p = fma(p, x2, CL5);
2213 p = fma(p, x2, CL3);
2214 p = fma(p, x2, CL1);
2215 p = fma(p, x, corr * fExp);
2217 return p;
2219 # endif
2221 # if !GMX_SIMD_HAVE_NATIVE_EXP2_DOUBLE
2222 /*! \brief SIMD double 2^x.
2224 * \copydetails exp2(SimdFloat)
2226 template<MathOptimization opt = MathOptimization::Safe>
2227 static inline SimdDouble gmx_simdcall exp2(SimdDouble x)
2229 const SimdDouble CE11(4.435280790452730022081181e-10);
2230 const SimdDouble CE10(7.074105630863314448024247e-09);
2231 const SimdDouble CE9(1.017819803432096698472621e-07);
2232 const SimdDouble CE8(1.321543308956718799557863e-06);
2233 const SimdDouble CE7(0.00001525273348995851746990884);
2234 const SimdDouble CE6(0.0001540353046251466849082632);
2235 const SimdDouble CE5(0.001333355814678995257307880);
2236 const SimdDouble CE4(0.009618129107588335039176502);
2237 const SimdDouble CE3(0.05550410866481992147457793);
2238 const SimdDouble CE2(0.2402265069591015620470894);
2239 const SimdDouble CE1(0.6931471805599453304615075);
2240 const SimdDouble one(1.0);
2242 SimdDouble intpart;
2243 SimdDouble fexppart;
2244 SimdDouble p;
2246 // Large negative values are valid arguments to exp2(), so there are two
2247 // things we need to account for:
2248 // 1. When the exponents reaches -1023, the (biased) exponent field will be
2249 // zero and we can no longer multiply with it. There are special IEEE
2250 // formats to handle this range, but for now we have to accept that
2251 // we cannot handle those arguments. If input value becomes even more
2252 // negative, it will start to loop and we would end up with invalid
2253 // exponents. Thus, we need to limit or mask this.
2254 // 2. For VERY large negative values, we will have problems that the
2255 // subtraction to get the fractional part loses accuracy, and then we
2256 // can end up with overflows in the polynomial.
2258 // For now, we handle this by forwarding the math optimization setting to
2259 // ldexp, where the routine will return zero for very small arguments.
2261 // However, before doing that we need to make sure we do not call cvtR2I
2262 // with an argument that is so negative it cannot be converted to an integer.
2263 if (opt == MathOptimization::Safe)
2265 x = max(x, SimdDouble(std::numeric_limits<std::int32_t>::lowest()));
2268 fexppart = ldexp<opt>(one, cvtR2I(x));
2269 intpart = round(x);
2270 x = x - intpart;
2272 p = fma(CE11, x, CE10);
2273 p = fma(p, x, CE9);
2274 p = fma(p, x, CE8);
2275 p = fma(p, x, CE7);
2276 p = fma(p, x, CE6);
2277 p = fma(p, x, CE5);
2278 p = fma(p, x, CE4);
2279 p = fma(p, x, CE3);
2280 p = fma(p, x, CE2);
2281 p = fma(p, x, CE1);
2282 p = fma(p, x, one);
2283 x = p * fexppart;
2284 return x;
2286 # endif
2288 # if !GMX_SIMD_HAVE_NATIVE_EXP_DOUBLE
2289 /*! \brief SIMD double exp(x).
2291 * \copydetails exp(SimdFloat)
2293 template<MathOptimization opt = MathOptimization::Safe>
2294 static inline SimdDouble gmx_simdcall exp(SimdDouble x)
2296 const SimdDouble argscale(1.44269504088896340735992468100);
2297 const SimdDouble invargscale0(-0.69314718055966295651160180568695068359375);
2298 const SimdDouble invargscale1(-2.8235290563031577122588448175013436025525412068e-13);
2299 const SimdDouble CE12(2.078375306791423699350304e-09);
2300 const SimdDouble CE11(2.518173854179933105218635e-08);
2301 const SimdDouble CE10(2.755842049600488770111608e-07);
2302 const SimdDouble CE9(2.755691815216689746619849e-06);
2303 const SimdDouble CE8(2.480158383706245033920920e-05);
2304 const SimdDouble CE7(0.0001984127043518048611841321);
2305 const SimdDouble CE6(0.001388888889360258341755930);
2306 const SimdDouble CE5(0.008333333332907368102819109);
2307 const SimdDouble CE4(0.04166666666663836745814631);
2308 const SimdDouble CE3(0.1666666666666796929434570);
2309 const SimdDouble CE2(0.5);
2310 const SimdDouble one(1.0);
2311 SimdDouble fexppart;
2312 SimdDouble intpart;
2313 SimdDouble y, p;
2315 // Large negative values are valid arguments to exp2(), so there are two
2316 // things we need to account for:
2317 // 1. When the exponents reaches -1023, the (biased) exponent field will be
2318 // zero and we can no longer multiply with it. There are special IEEE
2319 // formats to handle this range, but for now we have to accept that
2320 // we cannot handle those arguments. If input value becomes even more
2321 // negative, it will start to loop and we would end up with invalid
2322 // exponents. Thus, we need to limit or mask this.
2323 // 2. For VERY large negative values, we will have problems that the
2324 // subtraction to get the fractional part loses accuracy, and then we
2325 // can end up with overflows in the polynomial.
2327 // For now, we handle this by forwarding the math optimization setting to
2328 // ldexp, where the routine will return zero for very small arguments.
2330 // However, before doing that we need to make sure we do not call cvtR2I
2331 // with an argument that is so negative it cannot be converted to an integer
2332 // after being multiplied by argscale.
2334 if (opt == MathOptimization::Safe)
2336 x = max(x, SimdDouble(std::numeric_limits<std::int32_t>::lowest()) / argscale);
2339 y = x * argscale;
2341 fexppart = ldexp<opt>(one, cvtR2I(y));
2342 intpart = round(y);
2344 // Extended precision arithmetics
2345 x = fma(invargscale0, intpart, x);
2346 x = fma(invargscale1, intpart, x);
2348 p = fma(CE12, x, CE11);
2349 p = fma(p, x, CE10);
2350 p = fma(p, x, CE9);
2351 p = fma(p, x, CE8);
2352 p = fma(p, x, CE7);
2353 p = fma(p, x, CE6);
2354 p = fma(p, x, CE5);
2355 p = fma(p, x, CE4);
2356 p = fma(p, x, CE3);
2357 p = fma(p, x, CE2);
2358 p = fma(p, x * x, x);
2359 # if GMX_SIMD_HAVE_FMA
2360 x = fma(p, fexppart, fexppart);
2361 # else
2362 x = (p + one) * fexppart;
2363 # endif
2365 return x;
2367 # endif
2369 /*! \brief SIMD double pow(x,y)
2371 * This returns x^y for SIMD values.
2373 * \tparam opt If this is changed from the default (safe) into the unsafe
2374 * option, there are no guarantees about correct results for x==0.
2376 * \param x Base.
2378 * \param y exponent.
2380 * \result x^y. Overflowing arguments are likely to either return 0 or inf,
2381 * depending on the underlying implementation. If unsafe optimizations
2382 * are enabled, this is also true for x==0.
2384 * \warning You cannot rely on this implementation returning inf for arguments
2385 * that cause overflow. If you have some very large
2386 * values and need to rely on getting a valid numerical output,
2387 * take the minimum of your variable and the largest valid argument
2388 * before calling this routine.
2390 template<MathOptimization opt = MathOptimization::Safe>
2391 static inline SimdDouble gmx_simdcall pow(SimdDouble x, SimdDouble y)
2393 SimdDouble xcorr;
2395 if (opt == MathOptimization::Safe)
2397 xcorr = max(x, SimdDouble(std::numeric_limits<double>::min()));
2399 else
2401 xcorr = x;
2404 SimdDouble result = exp2<opt>(y * log2(xcorr));
2406 if (opt == MathOptimization::Safe)
2408 // if x==0 and y>0 we explicitly set the result to 0.0
2409 // For any x with y==0, the result will already be 1.0 since we multiply by y (0.0) and call exp().
2410 result = blend(result, setZero(), x == setZero() && setZero() < y);
2413 return result;
2417 /*! \brief SIMD double erf(x).
2419 * \param x The value to calculate erf(x) for.
2420 * \result erf(x)
2422 * This routine achieves very close to full precision, but we do not care about
2423 * the last bit or the subnormal result range.
2425 static inline SimdDouble gmx_simdcall erf(SimdDouble x)
2427 // Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75]
2428 const SimdDouble CAP4(-0.431780540597889301512e-4);
2429 const SimdDouble CAP3(-0.00578562306260059236059);
2430 const SimdDouble CAP2(-0.028593586920219752446);
2431 const SimdDouble CAP1(-0.315924962948621698209);
2432 const SimdDouble CAP0(0.14952975608477029151);
2434 const SimdDouble CAQ5(-0.374089300177174709737e-5);
2435 const SimdDouble CAQ4(0.00015126584532155383535);
2436 const SimdDouble CAQ3(0.00536692680669480725423);
2437 const SimdDouble CAQ2(0.0668686825594046122636);
2438 const SimdDouble CAQ1(0.402604990869284362773);
2439 // CAQ0 == 1.0
2440 const SimdDouble CAoffset(0.9788494110107421875);
2442 // Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5]
2443 const SimdDouble CBP6(2.49650423685462752497647637088e-10);
2444 const SimdDouble CBP5(0.00119770193298159629350136085658);
2445 const SimdDouble CBP4(0.0164944422378370965881008942733);
2446 const SimdDouble CBP3(0.0984581468691775932063932439252);
2447 const SimdDouble CBP2(0.317364595806937763843589437418);
2448 const SimdDouble CBP1(0.554167062641455850932670067075);
2449 const SimdDouble CBP0(0.427583576155807163756925301060);
2450 const SimdDouble CBQ7(0.00212288829699830145976198384930);
2451 const SimdDouble CBQ6(0.0334810979522685300554606393425);
2452 const SimdDouble CBQ5(0.2361713785181450957579508850717);
2453 const SimdDouble CBQ4(0.955364736493055670530981883072);
2454 const SimdDouble CBQ3(2.36815675631420037315349279199);
2455 const SimdDouble CBQ2(3.55261649184083035537184223542);
2456 const SimdDouble CBQ1(2.93501136050160872574376997993);
2457 // CBQ0 == 1.0
2459 // Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf]
2460 const SimdDouble CCP6(-2.8175401114513378771);
2461 const SimdDouble CCP5(-3.22729451764143718517);
2462 const SimdDouble CCP4(-2.5518551727311523996);
2463 const SimdDouble CCP3(-0.687717681153649930619);
2464 const SimdDouble CCP2(-0.212652252872804219852);
2465 const SimdDouble CCP1(0.0175389834052493308818);
2466 const SimdDouble CCP0(0.00628057170626964891937);
2468 const SimdDouble CCQ6(5.48409182238641741584);
2469 const SimdDouble CCQ5(13.5064170191802889145);
2470 const SimdDouble CCQ4(22.9367376522880577224);
2471 const SimdDouble CCQ3(15.930646027911794143);
2472 const SimdDouble CCQ2(11.0567237927800161565);
2473 const SimdDouble CCQ1(2.79257750980575282228);
2474 // CCQ0 == 1.0
2475 const SimdDouble CCoffset(0.5579090118408203125);
2477 const SimdDouble one(1.0);
2478 const SimdDouble two(2.0);
2479 const SimdDouble minFloat(std::numeric_limits<float>::min());
2481 SimdDouble xabs, x2, x4, t, t2, w, w2;
2482 SimdDouble PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
2483 SimdDouble PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
2484 SimdDouble PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
2485 SimdDouble res_erf, res_erfcB, res_erfcC, res_erfc, res;
2486 SimdDouble expmx2;
2487 SimdDBool mask, mask_erf, notmask_erf;
2489 // Calculate erf()
2490 xabs = abs(x);
2491 mask_erf = (xabs < one);
2492 notmask_erf = (one <= xabs);
2493 x2 = x * x;
2494 x4 = x2 * x2;
2496 PolyAP0 = fma(CAP4, x4, CAP2);
2497 PolyAP1 = fma(CAP3, x4, CAP1);
2498 PolyAP0 = fma(PolyAP0, x4, CAP0);
2499 PolyAP0 = fma(PolyAP1, x2, PolyAP0);
2501 PolyAQ1 = fma(CAQ5, x4, CAQ3);
2502 PolyAQ0 = fma(CAQ4, x4, CAQ2);
2503 PolyAQ1 = fma(PolyAQ1, x4, CAQ1);
2504 PolyAQ0 = fma(PolyAQ0, x4, one);
2505 PolyAQ0 = fma(PolyAQ1, x2, PolyAQ0);
2507 res_erf = PolyAP0 * maskzInv(PolyAQ0, mask_erf && (minFloat <= abs(PolyAQ0)));
2508 res_erf = CAoffset + res_erf;
2509 res_erf = x * res_erf;
2511 // Calculate erfc() in range [1,4.5]
2512 t = xabs - one;
2513 t2 = t * t;
2515 PolyBP0 = fma(CBP6, t2, CBP4);
2516 PolyBP1 = fma(CBP5, t2, CBP3);
2517 PolyBP0 = fma(PolyBP0, t2, CBP2);
2518 PolyBP1 = fma(PolyBP1, t2, CBP1);
2519 PolyBP0 = fma(PolyBP0, t2, CBP0);
2520 PolyBP0 = fma(PolyBP1, t, PolyBP0);
2522 PolyBQ1 = fma(CBQ7, t2, CBQ5);
2523 PolyBQ0 = fma(CBQ6, t2, CBQ4);
2524 PolyBQ1 = fma(PolyBQ1, t2, CBQ3);
2525 PolyBQ0 = fma(PolyBQ0, t2, CBQ2);
2526 PolyBQ1 = fma(PolyBQ1, t2, CBQ1);
2527 PolyBQ0 = fma(PolyBQ0, t2, one);
2528 PolyBQ0 = fma(PolyBQ1, t, PolyBQ0);
2530 // The denominator polynomial can be zero outside the range
2531 res_erfcB = PolyBP0 * maskzInv(PolyBQ0, notmask_erf && (minFloat <= abs(PolyBQ0)));
2533 res_erfcB = res_erfcB * xabs;
2535 // Calculate erfc() in range [4.5,inf]
2536 w = maskzInv(xabs, notmask_erf && (minFloat <= xabs));
2537 w2 = w * w;
2539 PolyCP0 = fma(CCP6, w2, CCP4);
2540 PolyCP1 = fma(CCP5, w2, CCP3);
2541 PolyCP0 = fma(PolyCP0, w2, CCP2);
2542 PolyCP1 = fma(PolyCP1, w2, CCP1);
2543 PolyCP0 = fma(PolyCP0, w2, CCP0);
2544 PolyCP0 = fma(PolyCP1, w, PolyCP0);
2546 PolyCQ0 = fma(CCQ6, w2, CCQ4);
2547 PolyCQ1 = fma(CCQ5, w2, CCQ3);
2548 PolyCQ0 = fma(PolyCQ0, w2, CCQ2);
2549 PolyCQ1 = fma(PolyCQ1, w2, CCQ1);
2550 PolyCQ0 = fma(PolyCQ0, w2, one);
2551 PolyCQ0 = fma(PolyCQ1, w, PolyCQ0);
2553 expmx2 = exp(-x2);
2555 // The denominator polynomial can be zero outside the range
2556 res_erfcC = PolyCP0 * maskzInv(PolyCQ0, notmask_erf && (minFloat <= abs(PolyCQ0)));
2557 res_erfcC = res_erfcC + CCoffset;
2558 res_erfcC = res_erfcC * w;
2560 mask = (SimdDouble(4.5) < xabs);
2561 res_erfc = blend(res_erfcB, res_erfcC, mask);
2563 res_erfc = res_erfc * expmx2;
2565 // erfc(x<0) = 2-erfc(|x|)
2566 mask = (x < setZero());
2567 res_erfc = blend(res_erfc, two - res_erfc, mask);
2569 // Select erf() or erfc()
2570 res = blend(one - res_erfc, res_erf, mask_erf);
2572 return res;
2575 /*! \brief SIMD double erfc(x).
2577 * \param x The value to calculate erfc(x) for.
2578 * \result erfc(x)
2580 * This routine achieves full precision (bar the last bit) over most of the
2581 * input range, but for large arguments where the result is getting close
2582 * to the minimum representable numbers we accept slightly larger errors
2583 * (think results that are in the ballpark of 10^-200 for double)
2584 * since that is not relevant for MD.
2586 static inline SimdDouble gmx_simdcall erfc(SimdDouble x)
2588 // Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75]
2589 const SimdDouble CAP4(-0.431780540597889301512e-4);
2590 const SimdDouble CAP3(-0.00578562306260059236059);
2591 const SimdDouble CAP2(-0.028593586920219752446);
2592 const SimdDouble CAP1(-0.315924962948621698209);
2593 const SimdDouble CAP0(0.14952975608477029151);
2595 const SimdDouble CAQ5(-0.374089300177174709737e-5);
2596 const SimdDouble CAQ4(0.00015126584532155383535);
2597 const SimdDouble CAQ3(0.00536692680669480725423);
2598 const SimdDouble CAQ2(0.0668686825594046122636);
2599 const SimdDouble CAQ1(0.402604990869284362773);
2600 // CAQ0 == 1.0
2601 const SimdDouble CAoffset(0.9788494110107421875);
2603 // Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5]
2604 const SimdDouble CBP6(2.49650423685462752497647637088e-10);
2605 const SimdDouble CBP5(0.00119770193298159629350136085658);
2606 const SimdDouble CBP4(0.0164944422378370965881008942733);
2607 const SimdDouble CBP3(0.0984581468691775932063932439252);
2608 const SimdDouble CBP2(0.317364595806937763843589437418);
2609 const SimdDouble CBP1(0.554167062641455850932670067075);
2610 const SimdDouble CBP0(0.427583576155807163756925301060);
2611 const SimdDouble CBQ7(0.00212288829699830145976198384930);
2612 const SimdDouble CBQ6(0.0334810979522685300554606393425);
2613 const SimdDouble CBQ5(0.2361713785181450957579508850717);
2614 const SimdDouble CBQ4(0.955364736493055670530981883072);
2615 const SimdDouble CBQ3(2.36815675631420037315349279199);
2616 const SimdDouble CBQ2(3.55261649184083035537184223542);
2617 const SimdDouble CBQ1(2.93501136050160872574376997993);
2618 // CBQ0 == 1.0
2620 // Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf]
2621 const SimdDouble CCP6(-2.8175401114513378771);
2622 const SimdDouble CCP5(-3.22729451764143718517);
2623 const SimdDouble CCP4(-2.5518551727311523996);
2624 const SimdDouble CCP3(-0.687717681153649930619);
2625 const SimdDouble CCP2(-0.212652252872804219852);
2626 const SimdDouble CCP1(0.0175389834052493308818);
2627 const SimdDouble CCP0(0.00628057170626964891937);
2629 const SimdDouble CCQ6(5.48409182238641741584);
2630 const SimdDouble CCQ5(13.5064170191802889145);
2631 const SimdDouble CCQ4(22.9367376522880577224);
2632 const SimdDouble CCQ3(15.930646027911794143);
2633 const SimdDouble CCQ2(11.0567237927800161565);
2634 const SimdDouble CCQ1(2.79257750980575282228);
2635 // CCQ0 == 1.0
2636 const SimdDouble CCoffset(0.5579090118408203125);
2638 const SimdDouble one(1.0);
2639 const SimdDouble two(2.0);
2640 const SimdDouble minFloat(std::numeric_limits<float>::min());
2642 SimdDouble xabs, x2, x4, t, t2, w, w2;
2643 SimdDouble PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
2644 SimdDouble PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
2645 SimdDouble PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
2646 SimdDouble res_erf, res_erfcB, res_erfcC, res_erfc, res;
2647 SimdDouble expmx2;
2648 SimdDBool mask, mask_erf, notmask_erf;
2650 // Calculate erf()
2651 xabs = abs(x);
2652 mask_erf = (xabs < one);
2653 notmask_erf = (one <= xabs);
2654 x2 = x * x;
2655 x4 = x2 * x2;
2657 PolyAP0 = fma(CAP4, x4, CAP2);
2658 PolyAP1 = fma(CAP3, x4, CAP1);
2659 PolyAP0 = fma(PolyAP0, x4, CAP0);
2660 PolyAP0 = fma(PolyAP1, x2, PolyAP0);
2661 PolyAQ1 = fma(CAQ5, x4, CAQ3);
2662 PolyAQ0 = fma(CAQ4, x4, CAQ2);
2663 PolyAQ1 = fma(PolyAQ1, x4, CAQ1);
2664 PolyAQ0 = fma(PolyAQ0, x4, one);
2665 PolyAQ0 = fma(PolyAQ1, x2, PolyAQ0);
2667 res_erf = PolyAP0 * maskzInv(PolyAQ0, mask_erf && (minFloat <= abs(PolyAQ0)));
2668 res_erf = CAoffset + res_erf;
2669 res_erf = x * res_erf;
2671 // Calculate erfc() in range [1,4.5]
2672 t = xabs - one;
2673 t2 = t * t;
2675 PolyBP0 = fma(CBP6, t2, CBP4);
2676 PolyBP1 = fma(CBP5, t2, CBP3);
2677 PolyBP0 = fma(PolyBP0, t2, CBP2);
2678 PolyBP1 = fma(PolyBP1, t2, CBP1);
2679 PolyBP0 = fma(PolyBP0, t2, CBP0);
2680 PolyBP0 = fma(PolyBP1, t, PolyBP0);
2682 PolyBQ1 = fma(CBQ7, t2, CBQ5);
2683 PolyBQ0 = fma(CBQ6, t2, CBQ4);
2684 PolyBQ1 = fma(PolyBQ1, t2, CBQ3);
2685 PolyBQ0 = fma(PolyBQ0, t2, CBQ2);
2686 PolyBQ1 = fma(PolyBQ1, t2, CBQ1);
2687 PolyBQ0 = fma(PolyBQ0, t2, one);
2688 PolyBQ0 = fma(PolyBQ1, t, PolyBQ0);
2690 // The denominator polynomial can be zero outside the range
2691 res_erfcB = PolyBP0 * maskzInv(PolyBQ0, notmask_erf && (minFloat <= abs(PolyBQ0)));
2693 res_erfcB = res_erfcB * xabs;
2695 // Calculate erfc() in range [4.5,inf]
2696 // Note that 1/x can only handle single precision!
2697 w = maskzInv(xabs, minFloat <= xabs);
2698 w2 = w * w;
2700 PolyCP0 = fma(CCP6, w2, CCP4);
2701 PolyCP1 = fma(CCP5, w2, CCP3);
2702 PolyCP0 = fma(PolyCP0, w2, CCP2);
2703 PolyCP1 = fma(PolyCP1, w2, CCP1);
2704 PolyCP0 = fma(PolyCP0, w2, CCP0);
2705 PolyCP0 = fma(PolyCP1, w, PolyCP0);
2707 PolyCQ0 = fma(CCQ6, w2, CCQ4);
2708 PolyCQ1 = fma(CCQ5, w2, CCQ3);
2709 PolyCQ0 = fma(PolyCQ0, w2, CCQ2);
2710 PolyCQ1 = fma(PolyCQ1, w2, CCQ1);
2711 PolyCQ0 = fma(PolyCQ0, w2, one);
2712 PolyCQ0 = fma(PolyCQ1, w, PolyCQ0);
2714 expmx2 = exp(-x2);
2716 // The denominator polynomial can be zero outside the range
2717 res_erfcC = PolyCP0 * maskzInv(PolyCQ0, notmask_erf && (minFloat <= abs(PolyCQ0)));
2718 res_erfcC = res_erfcC + CCoffset;
2719 res_erfcC = res_erfcC * w;
2721 mask = (SimdDouble(4.5) < xabs);
2722 res_erfc = blend(res_erfcB, res_erfcC, mask);
2724 res_erfc = res_erfc * expmx2;
2726 // erfc(x<0) = 2-erfc(|x|)
2727 mask = (x < setZero());
2728 res_erfc = blend(res_erfc, two - res_erfc, mask);
2730 // Select erf() or erfc()
2731 res = blend(res_erfc, one - res_erf, mask_erf);
2733 return res;
2736 /*! \brief SIMD double sin \& cos.
2738 * \param x The argument to evaluate sin/cos for
2739 * \param[out] sinval Sin(x)
2740 * \param[out] cosval Cos(x)
2742 * This version achieves close to machine precision, but for very large
2743 * magnitudes of the argument we inherently begin to lose accuracy due to the
2744 * argument reduction, despite using extended precision arithmetics internally.
2746 static inline void gmx_simdcall sincos(SimdDouble x, SimdDouble* sinval, SimdDouble* cosval)
2748 // Constants to subtract Pi/4*x from y while minimizing precision loss
2749 const SimdDouble argred0(-2 * 0.78539816290140151978);
2750 const SimdDouble argred1(-2 * 4.9604678871439933374e-10);
2751 const SimdDouble argred2(-2 * 1.1258708853173288931e-18);
2752 const SimdDouble argred3(-2 * 1.7607799325916000908e-27);
2753 const SimdDouble two_over_pi(2.0 / M_PI);
2754 const SimdDouble const_sin5(1.58938307283228937328511e-10);
2755 const SimdDouble const_sin4(-2.50506943502539773349318e-08);
2756 const SimdDouble const_sin3(2.75573131776846360512547e-06);
2757 const SimdDouble const_sin2(-0.000198412698278911770864914);
2758 const SimdDouble const_sin1(0.0083333333333191845961746);
2759 const SimdDouble const_sin0(-0.166666666666666130709393);
2761 const SimdDouble const_cos7(-1.13615350239097429531523e-11);
2762 const SimdDouble const_cos6(2.08757471207040055479366e-09);
2763 const SimdDouble const_cos5(-2.75573144028847567498567e-07);
2764 const SimdDouble const_cos4(2.48015872890001867311915e-05);
2765 const SimdDouble const_cos3(-0.00138888888888714019282329);
2766 const SimdDouble const_cos2(0.0416666666666665519592062);
2767 const SimdDouble half(0.5);
2768 const SimdDouble one(1.0);
2769 SimdDouble ssign, csign;
2770 SimdDouble x2, y, z, psin, pcos, sss, ccc;
2771 SimdDBool mask;
2772 # if GMX_SIMD_HAVE_DINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
2773 const SimdDInt32 ione(1);
2774 const SimdDInt32 itwo(2);
2775 SimdDInt32 iy;
2777 z = x * two_over_pi;
2778 iy = cvtR2I(z);
2779 y = round(z);
2781 mask = cvtIB2B((iy & ione) == setZero());
2782 ssign = selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO), cvtIB2B((iy & itwo) == itwo));
2783 csign = selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO), cvtIB2B(((iy + ione) & itwo) == itwo));
2784 # else
2785 const SimdDouble quarter(0.25);
2786 const SimdDouble minusquarter(-0.25);
2787 SimdDouble q;
2788 SimdDBool m1, m2, m3;
2790 /* The most obvious way to find the arguments quadrant in the unit circle
2791 * to calculate the sign is to use integer arithmetic, but that is not
2792 * present in all SIMD implementations. As an alternative, we have devised a
2793 * pure floating-point algorithm that uses truncation for argument reduction
2794 * so that we get a new value 0<=q<1 over the unit circle, and then
2795 * do floating-point comparisons with fractions. This is likely to be
2796 * slightly slower (~10%) due to the longer latencies of floating-point, so
2797 * we only use it when integer SIMD arithmetic is not present.
2799 ssign = x;
2800 x = abs(x);
2801 // It is critical that half-way cases are rounded down
2802 z = fma(x, two_over_pi, half);
2803 y = trunc(z);
2804 q = z * quarter;
2805 q = q - trunc(q);
2806 /* z now starts at 0.0 for x=-pi/4 (although neg. values cannot occur), and
2807 * then increased by 1.0 as x increases by 2*Pi, when it resets to 0.0.
2808 * This removes the 2*Pi periodicity without using any integer arithmetic.
2809 * First check if y had the value 2 or 3, set csign if true.
2811 q = q - half;
2812 /* If we have logical operations we can work directly on the signbit, which
2813 * saves instructions. Otherwise we need to represent signs as +1.0/-1.0.
2814 * Thus, if you are altering defines to debug alternative code paths, the
2815 * two GMX_SIMD_HAVE_LOGICAL sections in this routine must either both be
2816 * active or inactive - you will get errors if only one is used.
2818 # if GMX_SIMD_HAVE_LOGICAL
2819 ssign = ssign & SimdDouble(GMX_DOUBLE_NEGZERO);
2820 csign = andNot(q, SimdDouble(GMX_DOUBLE_NEGZERO));
2821 ssign = ssign ^ csign;
2822 # else
2823 ssign = copysign(SimdDouble(1.0), ssign);
2824 csign = copysign(SimdDouble(1.0), q);
2825 csign = -csign;
2826 ssign = ssign * csign; // swap ssign if csign was set.
2827 # endif
2828 // Check if y had value 1 or 3 (remember we subtracted 0.5 from q)
2829 m1 = (q < minusquarter);
2830 m2 = (setZero() <= q);
2831 m3 = (q < quarter);
2832 m2 = m2 && m3;
2833 mask = m1 || m2;
2834 // where mask is FALSE, swap sign.
2835 csign = csign * blend(SimdDouble(-1.0), one, mask);
2836 # endif
2837 x = fma(y, argred0, x);
2838 x = fma(y, argred1, x);
2839 x = fma(y, argred2, x);
2840 x = fma(y, argred3, x);
2841 x2 = x * x;
2843 psin = fma(const_sin5, x2, const_sin4);
2844 psin = fma(psin, x2, const_sin3);
2845 psin = fma(psin, x2, const_sin2);
2846 psin = fma(psin, x2, const_sin1);
2847 psin = fma(psin, x2, const_sin0);
2848 psin = fma(psin, x2 * x, x);
2850 pcos = fma(const_cos7, x2, const_cos6);
2851 pcos = fma(pcos, x2, const_cos5);
2852 pcos = fma(pcos, x2, const_cos4);
2853 pcos = fma(pcos, x2, const_cos3);
2854 pcos = fma(pcos, x2, const_cos2);
2855 pcos = fms(pcos, x2, half);
2856 pcos = fma(pcos, x2, one);
2858 sss = blend(pcos, psin, mask);
2859 ccc = blend(psin, pcos, mask);
2860 // See comment for GMX_SIMD_HAVE_LOGICAL section above.
2861 # if GMX_SIMD_HAVE_LOGICAL
2862 *sinval = sss ^ ssign;
2863 *cosval = ccc ^ csign;
2864 # else
2865 *sinval = sss * ssign;
2866 *cosval = ccc * csign;
2867 # endif
2870 /*! \brief SIMD double sin(x).
2872 * \param x The argument to evaluate sin for
2873 * \result Sin(x)
2875 * \attention Do NOT call both sin & cos if you need both results, since each of them
2876 * will then call \ref sincos and waste a factor 2 in performance.
2878 static inline SimdDouble gmx_simdcall sin(SimdDouble x)
2880 SimdDouble s, c;
2881 sincos(x, &s, &c);
2882 return s;
2885 /*! \brief SIMD double cos(x).
2887 * \param x The argument to evaluate cos for
2888 * \result Cos(x)
2890 * \attention Do NOT call both sin & cos if you need both results, since each of them
2891 * will then call \ref sincos and waste a factor 2 in performance.
2893 static inline SimdDouble gmx_simdcall cos(SimdDouble x)
2895 SimdDouble s, c;
2896 sincos(x, &s, &c);
2897 return c;
2900 /*! \brief SIMD double tan(x).
2902 * \param x The argument to evaluate tan for
2903 * \result Tan(x)
2905 static inline SimdDouble gmx_simdcall tan(SimdDouble x)
2907 const SimdDouble argred0(-2 * 0.78539816290140151978);
2908 const SimdDouble argred1(-2 * 4.9604678871439933374e-10);
2909 const SimdDouble argred2(-2 * 1.1258708853173288931e-18);
2910 const SimdDouble argred3(-2 * 1.7607799325916000908e-27);
2911 const SimdDouble two_over_pi(2.0 / M_PI);
2912 const SimdDouble CT15(1.01419718511083373224408e-05);
2913 const SimdDouble CT14(-2.59519791585924697698614e-05);
2914 const SimdDouble CT13(5.23388081915899855325186e-05);
2915 const SimdDouble CT12(-3.05033014433946488225616e-05);
2916 const SimdDouble CT11(7.14707504084242744267497e-05);
2917 const SimdDouble CT10(8.09674518280159187045078e-05);
2918 const SimdDouble CT9(0.000244884931879331847054404);
2919 const SimdDouble CT8(0.000588505168743587154904506);
2920 const SimdDouble CT7(0.00145612788922812427978848);
2921 const SimdDouble CT6(0.00359208743836906619142924);
2922 const SimdDouble CT5(0.00886323944362401618113356);
2923 const SimdDouble CT4(0.0218694882853846389592078);
2924 const SimdDouble CT3(0.0539682539781298417636002);
2925 const SimdDouble CT2(0.133333333333125941821962);
2926 const SimdDouble CT1(0.333333333333334980164153);
2927 const SimdDouble minFloat(std::numeric_limits<float>::min());
2929 SimdDouble x2, p, y, z;
2930 SimdDBool m;
2932 # if GMX_SIMD_HAVE_DINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
2933 SimdDInt32 iy;
2934 SimdDInt32 ione(1);
2936 z = x * two_over_pi;
2937 iy = cvtR2I(z);
2938 y = round(z);
2939 m = cvtIB2B((iy & ione) == ione);
2941 x = fma(y, argred0, x);
2942 x = fma(y, argred1, x);
2943 x = fma(y, argred2, x);
2944 x = fma(y, argred3, x);
2945 x = selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO), m) ^ x;
2946 # else
2947 const SimdDouble quarter(0.25);
2948 const SimdDouble half(0.5);
2949 const SimdDouble threequarter(0.75);
2950 const SimdDouble minFloat(std::numeric_limits<float>::min());
2951 SimdDouble w, q;
2952 SimdDBool m1, m2, m3;
2954 w = abs(x);
2955 z = fma(w, two_over_pi, half);
2956 y = trunc(z);
2957 q = z * quarter;
2958 q = q - trunc(q);
2959 m1 = (quarter <= q);
2960 m2 = (q < half);
2961 m3 = (threequarter <= q);
2962 m1 = m1 && m2;
2963 m = m1 || m3;
2964 w = fma(y, argred0, w);
2965 w = fma(y, argred1, w);
2966 w = fma(y, argred2, w);
2967 w = fma(y, argred3, w);
2969 w = blend(w, -w, m);
2970 x = w * copysign(SimdDouble(1.0), x);
2971 # endif
2972 x2 = x * x;
2973 p = fma(CT15, x2, CT14);
2974 p = fma(p, x2, CT13);
2975 p = fma(p, x2, CT12);
2976 p = fma(p, x2, CT11);
2977 p = fma(p, x2, CT10);
2978 p = fma(p, x2, CT9);
2979 p = fma(p, x2, CT8);
2980 p = fma(p, x2, CT7);
2981 p = fma(p, x2, CT6);
2982 p = fma(p, x2, CT5);
2983 p = fma(p, x2, CT4);
2984 p = fma(p, x2, CT3);
2985 p = fma(p, x2, CT2);
2986 p = fma(p, x2, CT1);
2987 p = fma(x2, p * x, x);
2989 p = blend(p, maskzInv(p, m && (minFloat < abs(p))), m);
2990 return p;
2993 /*! \brief SIMD double asin(x).
2995 * \param x The argument to evaluate asin for
2996 * \result Asin(x)
2998 static inline SimdDouble gmx_simdcall asin(SimdDouble x)
3000 // Same algorithm as cephes library
3001 const SimdDouble limit1(0.625);
3002 const SimdDouble limit2(1e-8);
3003 const SimdDouble one(1.0);
3004 const SimdDouble quarterpi(M_PI / 4.0);
3005 const SimdDouble morebits(6.123233995736765886130e-17);
3007 const SimdDouble P5(4.253011369004428248960e-3);
3008 const SimdDouble P4(-6.019598008014123785661e-1);
3009 const SimdDouble P3(5.444622390564711410273e0);
3010 const SimdDouble P2(-1.626247967210700244449e1);
3011 const SimdDouble P1(1.956261983317594739197e1);
3012 const SimdDouble P0(-8.198089802484824371615e0);
3014 const SimdDouble Q4(-1.474091372988853791896e1);
3015 const SimdDouble Q3(7.049610280856842141659e1);
3016 const SimdDouble Q2(-1.471791292232726029859e2);
3017 const SimdDouble Q1(1.395105614657485689735e2);
3018 const SimdDouble Q0(-4.918853881490881290097e1);
3020 const SimdDouble R4(2.967721961301243206100e-3);
3021 const SimdDouble R3(-5.634242780008963776856e-1);
3022 const SimdDouble R2(6.968710824104713396794e0);
3023 const SimdDouble R1(-2.556901049652824852289e1);
3024 const SimdDouble R0(2.853665548261061424989e1);
3026 const SimdDouble S3(-2.194779531642920639778e1);
3027 const SimdDouble S2(1.470656354026814941758e2);
3028 const SimdDouble S1(-3.838770957603691357202e2);
3029 const SimdDouble S0(3.424398657913078477438e2);
3031 SimdDouble xabs;
3032 SimdDouble zz, ww, z, q, w, zz2, ww2;
3033 SimdDouble PA, PB;
3034 SimdDouble QA, QB;
3035 SimdDouble RA, RB;
3036 SimdDouble SA, SB;
3037 SimdDouble nom, denom;
3038 SimdDBool mask, mask2;
3040 xabs = abs(x);
3042 mask = (limit1 < xabs);
3044 zz = one - xabs;
3045 ww = xabs * xabs;
3046 zz2 = zz * zz;
3047 ww2 = ww * ww;
3049 // R
3050 RA = fma(R4, zz2, R2);
3051 RB = fma(R3, zz2, R1);
3052 RA = fma(RA, zz2, R0);
3053 RA = fma(RB, zz, RA);
3055 // S, SA = zz2
3056 SB = fma(S3, zz2, S1);
3057 SA = zz2 + S2;
3058 SA = fma(SA, zz2, S0);
3059 SA = fma(SB, zz, SA);
3061 // P
3062 PA = fma(P5, ww2, P3);
3063 PB = fma(P4, ww2, P2);
3064 PA = fma(PA, ww2, P1);
3065 PB = fma(PB, ww2, P0);
3066 PA = fma(PA, ww, PB);
3068 // Q, QA = ww2
3069 QB = fma(Q4, ww2, Q2);
3070 QA = ww2 + Q3;
3071 QA = fma(QA, ww2, Q1);
3072 QB = fma(QB, ww2, Q0);
3073 QA = fma(QA, ww, QB);
3075 RA = RA * zz;
3076 PA = PA * ww;
3078 nom = blend(PA, RA, mask);
3079 denom = blend(QA, SA, mask);
3081 mask2 = (limit2 < xabs);
3082 q = nom * maskzInv(denom, mask2);
3084 zz = zz + zz;
3085 zz = sqrt(zz);
3086 z = quarterpi - zz;
3087 zz = fms(zz, q, morebits);
3088 z = z - zz;
3089 z = z + quarterpi;
3091 w = xabs * q;
3092 w = w + xabs;
3094 z = blend(w, z, mask);
3096 z = blend(xabs, z, mask2);
3098 z = copysign(z, x);
3100 return z;
3103 /*! \brief SIMD double acos(x).
3105 * \param x The argument to evaluate acos for
3106 * \result Acos(x)
3108 static inline SimdDouble gmx_simdcall acos(SimdDouble x)
3110 const SimdDouble one(1.0);
3111 const SimdDouble half(0.5);
3112 const SimdDouble quarterpi0(7.85398163397448309616e-1);
3113 const SimdDouble quarterpi1(6.123233995736765886130e-17);
3115 SimdDBool mask1;
3116 SimdDouble z, z1, z2;
3118 mask1 = (half < x);
3119 z1 = half * (one - x);
3120 z1 = sqrt(z1);
3121 z = blend(x, z1, mask1);
3123 z = asin(z);
3125 z1 = z + z;
3127 z2 = quarterpi0 - z;
3128 z2 = z2 + quarterpi1;
3129 z2 = z2 + quarterpi0;
3131 z = blend(z2, z1, mask1);
3133 return z;
3136 /*! \brief SIMD double asin(x).
3138 * \param x The argument to evaluate atan for
3139 * \result Atan(x), same argument/value range as standard math library.
3141 static inline SimdDouble gmx_simdcall atan(SimdDouble x)
3143 // Same algorithm as cephes library
3144 const SimdDouble limit1(0.66);
3145 const SimdDouble limit2(2.41421356237309504880);
3146 const SimdDouble quarterpi(M_PI / 4.0);
3147 const SimdDouble halfpi(M_PI / 2.0);
3148 const SimdDouble mone(-1.0);
3149 const SimdDouble morebits1(0.5 * 6.123233995736765886130E-17);
3150 const SimdDouble morebits2(6.123233995736765886130E-17);
3152 const SimdDouble P4(-8.750608600031904122785E-1);
3153 const SimdDouble P3(-1.615753718733365076637E1);
3154 const SimdDouble P2(-7.500855792314704667340E1);
3155 const SimdDouble P1(-1.228866684490136173410E2);
3156 const SimdDouble P0(-6.485021904942025371773E1);
3158 const SimdDouble Q4(2.485846490142306297962E1);
3159 const SimdDouble Q3(1.650270098316988542046E2);
3160 const SimdDouble Q2(4.328810604912902668951E2);
3161 const SimdDouble Q1(4.853903996359136964868E2);
3162 const SimdDouble Q0(1.945506571482613964425E2);
3164 SimdDouble y, xabs, t1, t2;
3165 SimdDouble z, z2;
3166 SimdDouble P_A, P_B, Q_A, Q_B;
3167 SimdDBool mask1, mask2;
3169 xabs = abs(x);
3171 mask1 = (limit1 < xabs);
3172 mask2 = (limit2 < xabs);
3174 t1 = (xabs + mone) * maskzInv(xabs - mone, mask1);
3175 t2 = mone * maskzInv(xabs, mask2);
3177 y = selectByMask(quarterpi, mask1);
3178 y = blend(y, halfpi, mask2);
3179 xabs = blend(xabs, t1, mask1);
3180 xabs = blend(xabs, t2, mask2);
3182 z = xabs * xabs;
3183 z2 = z * z;
3185 P_A = fma(P4, z2, P2);
3186 P_B = fma(P3, z2, P1);
3187 P_A = fma(P_A, z2, P0);
3188 P_A = fma(P_B, z, P_A);
3190 // Q_A = z2
3191 Q_B = fma(Q4, z2, Q2);
3192 Q_A = z2 + Q3;
3193 Q_A = fma(Q_A, z2, Q1);
3194 Q_B = fma(Q_B, z2, Q0);
3195 Q_A = fma(Q_A, z, Q_B);
3197 z = z * P_A;
3198 z = z * inv(Q_A);
3199 z = fma(z, xabs, xabs);
3201 t1 = selectByMask(morebits1, mask1);
3202 t1 = blend(t1, morebits2, mask2);
3204 z = z + t1;
3205 y = y + z;
3207 y = copysign(y, x);
3209 return y;
3212 /*! \brief SIMD double atan2(y,x).
3214 * \param y Y component of vector, any quartile
3215 * \param x X component of vector, any quartile
3216 * \result Atan(y,x), same argument/value range as standard math library.
3218 * \note This routine should provide correct results for all finite
3219 * non-zero or positive-zero arguments. However, negative zero arguments will
3220 * be treated as positive zero, which means the return value will deviate from
3221 * the standard math library atan2(y,x) for those cases. That should not be
3222 * of any concern in Gromacs, and in particular it will not affect calculations
3223 * of angles from vectors.
3225 static inline SimdDouble gmx_simdcall atan2(SimdDouble y, SimdDouble x)
3227 const SimdDouble pi(M_PI);
3228 const SimdDouble halfpi(M_PI / 2.0);
3229 const SimdDouble minFloat(std::numeric_limits<float>::min());
3230 SimdDouble xinv, p, aoffset;
3231 SimdDBool mask_xnz, mask_ynz, mask_xlt0, mask_ylt0;
3233 mask_xnz = x != setZero();
3234 mask_ynz = y != setZero();
3235 mask_xlt0 = (x < setZero());
3236 mask_ylt0 = (y < setZero());
3238 aoffset = selectByNotMask(halfpi, mask_xnz);
3239 aoffset = selectByMask(aoffset, mask_ynz);
3241 aoffset = blend(aoffset, pi, mask_xlt0);
3242 aoffset = blend(aoffset, -aoffset, mask_ylt0);
3244 xinv = maskzInv(x, mask_xnz && (minFloat <= abs(x)));
3245 p = y * xinv;
3246 p = atan(p);
3247 p = p + aoffset;
3249 return p;
3253 /*! \brief Calculate the force correction due to PME analytically in SIMD double.
3255 * \param z2 This should be the value \f$(r \beta)^2\f$, where r is your
3256 * interaction distance and beta the ewald splitting parameters.
3257 * \result Correction factor to coulomb force.
3259 * This routine is meant to enable analytical evaluation of the
3260 * direct-space PME electrostatic force to avoid tables. For details, see the
3261 * single precision function.
3263 static inline SimdDouble gmx_simdcall pmeForceCorrection(SimdDouble z2)
3265 const SimdDouble FN10(-8.0072854618360083154e-14);
3266 const SimdDouble FN9(1.1859116242260148027e-11);
3267 const SimdDouble FN8(-8.1490406329798423616e-10);
3268 const SimdDouble FN7(3.4404793543907847655e-8);
3269 const SimdDouble FN6(-9.9471420832602741006e-7);
3270 const SimdDouble FN5(0.000020740315999115847456);
3271 const SimdDouble FN4(-0.00031991745139313364005);
3272 const SimdDouble FN3(0.0035074449373659008203);
3273 const SimdDouble FN2(-0.031750380176100813405);
3274 const SimdDouble FN1(0.13884101728898463426);
3275 const SimdDouble FN0(-0.75225277815249618847);
3277 const SimdDouble FD5(0.000016009278224355026701);
3278 const SimdDouble FD4(0.00051055686934806966046);
3279 const SimdDouble FD3(0.0081803507497974289008);
3280 const SimdDouble FD2(0.077181146026670287235);
3281 const SimdDouble FD1(0.41543303143712535988);
3282 const SimdDouble FD0(1.0);
3284 SimdDouble z4;
3285 SimdDouble polyFN0, polyFN1, polyFD0, polyFD1;
3287 z4 = z2 * z2;
3289 polyFD1 = fma(FD5, z4, FD3);
3290 polyFD1 = fma(polyFD1, z4, FD1);
3291 polyFD1 = polyFD1 * z2;
3292 polyFD0 = fma(FD4, z4, FD2);
3293 polyFD0 = fma(polyFD0, z4, FD0);
3294 polyFD0 = polyFD0 + polyFD1;
3296 polyFD0 = inv(polyFD0);
3298 polyFN0 = fma(FN10, z4, FN8);
3299 polyFN0 = fma(polyFN0, z4, FN6);
3300 polyFN0 = fma(polyFN0, z4, FN4);
3301 polyFN0 = fma(polyFN0, z4, FN2);
3302 polyFN0 = fma(polyFN0, z4, FN0);
3303 polyFN1 = fma(FN9, z4, FN7);
3304 polyFN1 = fma(polyFN1, z4, FN5);
3305 polyFN1 = fma(polyFN1, z4, FN3);
3306 polyFN1 = fma(polyFN1, z4, FN1);
3307 polyFN0 = fma(polyFN1, z2, polyFN0);
3310 return polyFN0 * polyFD0;
3314 /*! \brief Calculate the potential correction due to PME analytically in SIMD double.
3316 * \param z2 This should be the value \f$(r \beta)^2\f$, where r is your
3317 * interaction distance and beta the ewald splitting parameters.
3318 * \result Correction factor to coulomb force.
3320 * This routine is meant to enable analytical evaluation of the
3321 * direct-space PME electrostatic potential to avoid tables. For details, see the
3322 * single precision function.
3324 static inline SimdDouble gmx_simdcall pmePotentialCorrection(SimdDouble z2)
3326 const SimdDouble VN9(-9.3723776169321855475e-13);
3327 const SimdDouble VN8(1.2280156762674215741e-10);
3328 const SimdDouble VN7(-7.3562157912251309487e-9);
3329 const SimdDouble VN6(2.6215886208032517509e-7);
3330 const SimdDouble VN5(-4.9532491651265819499e-6);
3331 const SimdDouble VN4(0.00025907400778966060389);
3332 const SimdDouble VN3(0.0010585044856156469792);
3333 const SimdDouble VN2(0.045247661136833092885);
3334 const SimdDouble VN1(0.11643931522926034421);
3335 const SimdDouble VN0(1.1283791671726767970);
3337 const SimdDouble VD5(0.000021784709867336150342);
3338 const SimdDouble VD4(0.00064293662010911388448);
3339 const SimdDouble VD3(0.0096311444822588683504);
3340 const SimdDouble VD2(0.085608012351550627051);
3341 const SimdDouble VD1(0.43652499166614811084);
3342 const SimdDouble VD0(1.0);
3344 SimdDouble z4;
3345 SimdDouble polyVN0, polyVN1, polyVD0, polyVD1;
3347 z4 = z2 * z2;
3349 polyVD1 = fma(VD5, z4, VD3);
3350 polyVD0 = fma(VD4, z4, VD2);
3351 polyVD1 = fma(polyVD1, z4, VD1);
3352 polyVD0 = fma(polyVD0, z4, VD0);
3353 polyVD0 = fma(polyVD1, z2, polyVD0);
3355 polyVD0 = inv(polyVD0);
3357 polyVN1 = fma(VN9, z4, VN7);
3358 polyVN0 = fma(VN8, z4, VN6);
3359 polyVN1 = fma(polyVN1, z4, VN5);
3360 polyVN0 = fma(polyVN0, z4, VN4);
3361 polyVN1 = fma(polyVN1, z4, VN3);
3362 polyVN0 = fma(polyVN0, z4, VN2);
3363 polyVN1 = fma(polyVN1, z4, VN1);
3364 polyVN0 = fma(polyVN0, z4, VN0);
3365 polyVN0 = fma(polyVN1, z2, polyVN0);
3367 return polyVN0 * polyVD0;
3370 /*! \} */
3373 /*! \name SIMD math functions for double prec. data, single prec. accuracy
3375 * \note In some cases we do not need full double accuracy of individual
3376 * SIMD math functions, although the data is stored in double precision
3377 * SIMD registers. This might be the case for special algorithms, or
3378 * if the architecture does not support single precision.
3379 * Since the full double precision evaluation of math functions
3380 * typically require much more expensive polynomial approximations
3381 * these functions implement the algorithms used in the single precision
3382 * SIMD math functions, but they operate on double precision
3383 * SIMD variables.
3385 * \{
3388 /*********************************************************************
3389 * SIMD MATH FUNCTIONS WITH DOUBLE PREC. DATA, SINGLE PREC. ACCURACY *
3390 *********************************************************************/
3392 /*! \brief Calculate 1/sqrt(x) for SIMD double, but in single accuracy.
3394 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
3395 * GMX_FLOAT_MAX, i.e. within the range of single precision.
3396 * For the single precision implementation this is obviously always
3397 * true for positive values, but for double precision it adds an
3398 * extra restriction since the first lookup step might have to be
3399 * performed in single precision on some architectures. Note that the
3400 * responsibility for checking falls on you - this routine does not
3401 * check arguments.
3403 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
3405 static inline SimdDouble gmx_simdcall invsqrtSingleAccuracy(SimdDouble x)
3407 SimdDouble lu = rsqrt(x);
3408 # if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
3409 lu = rsqrtIter(lu, x);
3410 # endif
3411 # if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3412 lu = rsqrtIter(lu, x);
3413 # endif
3414 # if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3415 lu = rsqrtIter(lu, x);
3416 # endif
3417 return lu;
3420 /*! \brief 1/sqrt(x) for masked-in entries of SIMD double, but in single accuracy.
3422 * This routine only evaluates 1/sqrt(x) for elements for which mask is true.
3423 * Illegal values in the masked-out elements will not lead to
3424 * floating-point exceptions.
3426 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
3427 * GMX_FLOAT_MAX, i.e. within the range of single precision.
3428 * For the single precision implementation this is obviously always
3429 * true for positive values, but for double precision it adds an
3430 * extra restriction since the first lookup step might have to be
3431 * performed in single precision on some architectures. Note that the
3432 * responsibility for checking falls on you - this routine does not
3433 * check arguments.
3435 * \param m Mask
3436 * \return 1/sqrt(x). Result is undefined if your argument was invalid or
3437 * entry was not masked, and 0.0 for masked-out entries.
3439 static inline SimdDouble maskzInvsqrtSingleAccuracy(SimdDouble x, SimdDBool m)
3441 SimdDouble lu = maskzRsqrt(x, m);
3442 # if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
3443 lu = rsqrtIter(lu, x);
3444 # endif
3445 # if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3446 lu = rsqrtIter(lu, x);
3447 # endif
3448 # if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3449 lu = rsqrtIter(lu, x);
3450 # endif
3451 return lu;
3454 /*! \brief Calculate 1/sqrt(x) for two SIMD doubles, but single accuracy.
3456 * \param x0 First set of arguments, x0 must be in single range (see below).
3457 * \param x1 Second set of arguments, x1 must be in single range (see below).
3458 * \param[out] out0 Result 1/sqrt(x0)
3459 * \param[out] out1 Result 1/sqrt(x1)
3461 * In particular for double precision we can sometimes calculate square root
3462 * pairs slightly faster by using single precision until the very last step.
3464 * \note Both arguments must be larger than GMX_FLOAT_MIN and smaller than
3465 * GMX_FLOAT_MAX, i.e. within the range of single precision.
3466 * For the single precision implementation this is obviously always
3467 * true for positive values, but for double precision it adds an
3468 * extra restriction since the first lookup step might have to be
3469 * performed in single precision on some architectures. Note that the
3470 * responsibility for checking falls on you - this routine does not
3471 * check arguments.
3473 static inline void gmx_simdcall invsqrtPairSingleAccuracy(SimdDouble x0,
3474 SimdDouble x1,
3475 SimdDouble* out0,
3476 SimdDouble* out1)
3478 # if GMX_SIMD_HAVE_FLOAT && (GMX_SIMD_FLOAT_WIDTH == 2 * GMX_SIMD_DOUBLE_WIDTH) \
3479 && (GMX_SIMD_RSQRT_BITS < 22)
3480 SimdFloat xf = cvtDD2F(x0, x1);
3481 SimdFloat luf = rsqrt(xf);
3482 SimdDouble lu0, lu1;
3483 // Intermediate target is single - mantissa+1 bits
3484 # if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
3485 luf = rsqrtIter(luf, xf);
3486 # endif
3487 # if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3488 luf = rsqrtIter(luf, xf);
3489 # endif
3490 # if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3491 luf = rsqrtIter(luf, xf);
3492 # endif
3493 cvtF2DD(luf, &lu0, &lu1);
3494 // We now have single-precision accuracy values in lu0/lu1
3495 *out0 = lu0;
3496 *out1 = lu1;
3497 # else
3498 *out0 = invsqrtSingleAccuracy(x0);
3499 *out1 = invsqrtSingleAccuracy(x1);
3500 # endif
3503 /*! \brief Calculate 1/x for SIMD double, but in single accuracy.
3505 * \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
3506 * GMX_FLOAT_MAX, i.e. within the range of single precision.
3507 * For the single precision implementation this is obviously always
3508 * true for positive values, but for double precision it adds an
3509 * extra restriction since the first lookup step might have to be
3510 * performed in single precision on some architectures. Note that the
3511 * responsibility for checking falls on you - this routine does not
3512 * check arguments.
3514 * \return 1/x. Result is undefined if your argument was invalid.
3516 static inline SimdDouble gmx_simdcall invSingleAccuracy(SimdDouble x)
3518 SimdDouble lu = rcp(x);
3519 # if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
3520 lu = rcpIter(lu, x);
3521 # endif
3522 # if (GMX_SIMD_RCP_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3523 lu = rcpIter(lu, x);
3524 # endif
3525 # if (GMX_SIMD_RCP_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3526 lu = rcpIter(lu, x);
3527 # endif
3528 return lu;
3531 /*! \brief 1/x for masked entries of SIMD double, single accuracy.
3533 * \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
3534 * GMX_FLOAT_MAX, i.e. within the range of single precision.
3535 * For the single precision implementation this is obviously always
3536 * true for positive values, but for double precision it adds an
3537 * extra restriction since the first lookup step might have to be
3538 * performed in single precision on some architectures. Note that the
3539 * responsibility for checking falls on you - this routine does not
3540 * check arguments.
3542 * \param m Mask
3543 * \return 1/x for elements where m is true, or 0.0 for masked-out entries.
3545 static inline SimdDouble gmx_simdcall maskzInvSingleAccuracy(SimdDouble x, SimdDBool m)
3547 SimdDouble lu = maskzRcp(x, m);
3548 # if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
3549 lu = rcpIter(lu, x);
3550 # endif
3551 # if (GMX_SIMD_RCP_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3552 lu = rcpIter(lu, x);
3553 # endif
3554 # if (GMX_SIMD_RCP_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3555 lu = rcpIter(lu, x);
3556 # endif
3557 return lu;
3561 /*! \brief Calculate sqrt(x) (correct for 0.0) for SIMD double, with single accuracy.
3563 * \copydetails sqrt(SimdFloat)
3565 template<MathOptimization opt = MathOptimization::Safe>
3566 static inline SimdDouble gmx_simdcall sqrtSingleAccuracy(SimdDouble x)
3568 if (opt == MathOptimization::Safe)
3570 SimdDouble res = maskzInvsqrt(x, SimdDouble(GMX_FLOAT_MIN) < x);
3571 return res * x;
3573 else
3575 return x * invsqrtSingleAccuracy(x);
3579 /*! \brief Cube root for SIMD doubles, single accuracy.
3581 * \param x Argument to calculate cube root of. Can be negative or zero,
3582 * but NaN or Inf values are not supported. Denormal values will
3583 * be treated as 0.0.
3584 * \return Cube root of x.
3586 static inline SimdDouble gmx_simdcall cbrtSingleAccuracy(SimdDouble x)
3588 const SimdDouble signBit(GMX_DOUBLE_NEGZERO);
3589 const SimdDouble minDouble(std::numeric_limits<double>::min());
3590 // Bias is 1024-1 = 1023, which is divisible by 3, so no need to change it more.
3591 // Use the divided value as original constant to avoid division warnings.
3592 const std::int32_t offsetDiv3(341);
3593 const SimdDouble c2(-0.191502161678719066);
3594 const SimdDouble c1(0.697570460207922770);
3595 const SimdDouble c0(0.492659620528969547);
3596 const SimdDouble one(1.0);
3597 const SimdDouble two(2.0);
3598 const SimdDouble three(3.0);
3599 const SimdDouble oneThird(1.0 / 3.0);
3600 const SimdDouble cbrt2(1.2599210498948731648);
3601 const SimdDouble sqrCbrt2(1.5874010519681994748);
3603 // See the single precision routines for documentation of the algorithm
3605 SimdDouble xSignBit = x & signBit; // create bit mask where the sign bit is 1 for x elements < 0
3606 SimdDouble xAbs = andNot(signBit, x); // select everthing but the sign bit => abs(x)
3607 SimdDBool xIsNonZero = (minDouble <= xAbs); // treat denormals as 0
3609 SimdDInt32 exponent;
3610 SimdDouble y = frexp(xAbs, &exponent);
3611 SimdDouble z = fma(fma(y, c2, c1), y, c0);
3612 SimdDouble w = z * z * z;
3613 SimdDouble nom = z * fma(two, y, w);
3614 SimdDouble invDenom = inv(fma(two, w, y));
3616 SimdDouble offsetExp = cvtI2R(exponent) + SimdDouble(static_cast<double>(3 * offsetDiv3) + 0.1);
3617 SimdDouble offsetExpDiv3 =
3618 trunc(offsetExp * oneThird); // important to truncate here to mimic integer division
3619 SimdDInt32 expDiv3 = cvtR2I(offsetExpDiv3 - SimdDouble(static_cast<double>(offsetDiv3)));
3620 SimdDouble remainder = offsetExp - offsetExpDiv3 * three;
3621 SimdDouble factor = blend(one, cbrt2, SimdDouble(0.5) < remainder);
3622 factor = blend(factor, sqrCbrt2, SimdDouble(1.5) < remainder);
3623 SimdDouble fraction = (nom * invDenom * factor) ^ xSignBit;
3624 SimdDouble result = selectByMask(ldexp(fraction, expDiv3), xIsNonZero);
3625 return result;
3628 /*! \brief Inverse cube root for SIMD doubles, single accuracy.
3630 * \param x Argument to calculate cube root of. Can be positive or
3631 * negative, but the magnitude cannot be lower than
3632 * the smallest normal number.
3633 * \return Cube root of x. Undefined for values that don't
3634 * fulfill the restriction of abs(x) > minDouble.
3636 static inline SimdDouble gmx_simdcall invcbrtSingleAccuracy(SimdDouble x)
3638 const SimdDouble signBit(GMX_DOUBLE_NEGZERO);
3639 // Bias is 1024-1 = 1023, which is divisible by 3, so no need to change it more.
3640 // Use the divided value as original constant to avoid division warnings.
3641 const std::int32_t offsetDiv3(341);
3642 const SimdDouble c2(-0.191502161678719066);
3643 const SimdDouble c1(0.697570460207922770);
3644 const SimdDouble c0(0.492659620528969547);
3645 const SimdDouble one(1.0);
3646 const SimdDouble two(2.0);
3647 const SimdDouble three(3.0);
3648 const SimdDouble oneThird(1.0 / 3.0);
3649 const SimdDouble invCbrt2(1.0 / 1.2599210498948731648);
3650 const SimdDouble invSqrCbrt2(1.0F / 1.5874010519681994748);
3652 // See the single precision routines for documentation of the algorithm
3654 SimdDouble xSignBit = x & signBit; // create bit mask where the sign bit is 1 for x elements < 0
3655 SimdDouble xAbs = andNot(signBit, x); // select everthing but the sign bit => abs(x)
3657 SimdDInt32 exponent;
3658 SimdDouble y = frexp(xAbs, &exponent);
3659 SimdDouble z = fma(fma(y, c2, c1), y, c0);
3660 SimdDouble w = z * z * z;
3661 SimdDouble nom = fma(two, w, y);
3662 SimdDouble invDenom = inv(z * fma(two, y, w));
3663 SimdDouble offsetExp = cvtI2R(exponent) + SimdDouble(static_cast<double>(3 * offsetDiv3) + 0.1);
3664 SimdDouble offsetExpDiv3 =
3665 trunc(offsetExp * oneThird); // important to truncate here to mimic integer division
3666 SimdDInt32 expDiv3 = cvtR2I(SimdDouble(static_cast<double>(offsetDiv3)) - offsetExpDiv3);
3667 SimdDouble remainder = offsetExpDiv3 * three - offsetExp;
3668 SimdDouble factor = blend(one, invCbrt2, remainder < SimdDouble(-0.5));
3669 factor = blend(factor, invSqrCbrt2, remainder < SimdDouble(-1.5));
3670 SimdDouble fraction = (nom * invDenom * factor) ^ xSignBit;
3671 SimdDouble result = ldexp(fraction, expDiv3);
3672 return result;
3675 /*! \brief SIMD log2(x). Double precision SIMD data, single accuracy.
3677 * \param x Argument, should be >0.
3678 * \result The base 2 logarithm of x. Undefined if argument is invalid.
3680 static inline SimdDouble gmx_simdcall log2SingleAccuracy(SimdDouble x)
3682 # if GMX_SIMD_HAVE_NATIVE_LOG_DOUBLE
3683 return log(x) * SimdDouble(std::log2(std::exp(1.0)));
3684 # else
3685 const SimdDouble one(1.0);
3686 const SimdDouble two(2.0);
3687 const SimdDouble sqrt2(std::sqrt(2.0));
3688 const SimdDouble CL9(0.342149508897807708152F);
3689 const SimdDouble CL7(0.411570606888219447939F);
3690 const SimdDouble CL5(0.577085979152320294183F);
3691 const SimdDouble CL3(0.961796550607099898222F);
3692 const SimdDouble CL1(2.885390081777926774009F);
3693 SimdDouble fexp, x2, p;
3694 SimdDInt32 iexp;
3695 SimdDBool mask;
3697 // For log2(), the argument cannot be 0, so use the faster version of frexp
3698 x = frexp<MathOptimization::Unsafe>(x, &iexp);
3699 fexp = cvtI2R(iexp);
3701 mask = (x < sqrt2);
3702 // Adjust to non-IEEE format for x<sqrt(2): exponent -= 1, mantissa *= 2.0
3703 fexp = fexp - selectByMask(one, mask);
3704 x = x * blend(one, two, mask);
3706 x = (x - one) * invSingleAccuracy(x + one);
3707 x2 = x * x;
3709 p = fma(CL9, x2, CL7);
3710 p = fma(p, x2, CL5);
3711 p = fma(p, x2, CL3);
3712 p = fma(p, x2, CL1);
3713 p = fma(p, x, fexp);
3715 return p;
3716 # endif
3719 /*! \brief SIMD log(x). Double precision SIMD data, single accuracy.
3721 * \param x Argument, should be >0.
3722 * \result The natural logarithm of x. Undefined if argument is invalid.
3724 static inline SimdDouble gmx_simdcall logSingleAccuracy(SimdDouble x)
3726 # if GMX_SIMD_HAVE_NATIVE_LOG_DOUBLE
3727 return log(x);
3728 # else
3729 const SimdDouble one(1.0);
3730 const SimdDouble two(2.0);
3731 const SimdDouble invsqrt2(1.0 / std::sqrt(2.0));
3732 const SimdDouble corr(0.693147180559945286226764);
3733 const SimdDouble CL9(0.2371599674224853515625);
3734 const SimdDouble CL7(0.285279005765914916992188);
3735 const SimdDouble CL5(0.400005519390106201171875);
3736 const SimdDouble CL3(0.666666567325592041015625);
3737 const SimdDouble CL1(2.0);
3738 SimdDouble fexp, x2, p;
3739 SimdDInt32 iexp;
3740 SimdDBool mask;
3742 // For log(), the argument cannot be 0, so use the faster version of frexp
3743 x = frexp<MathOptimization::Unsafe>(x, &iexp);
3744 fexp = cvtI2R(iexp);
3746 mask = x < invsqrt2;
3747 // Adjust to non-IEEE format for x<1/sqrt(2): exponent -= 1, mantissa *= 2.0
3748 fexp = fexp - selectByMask(one, mask);
3749 x = x * blend(one, two, mask);
3751 x = (x - one) * invSingleAccuracy(x + one);
3752 x2 = x * x;
3754 p = fma(CL9, x2, CL7);
3755 p = fma(p, x2, CL5);
3756 p = fma(p, x2, CL3);
3757 p = fma(p, x2, CL1);
3758 p = fma(p, x, corr * fexp);
3760 return p;
3761 # endif
3764 /*! \brief SIMD 2^x. Double precision SIMD, single accuracy.
3766 * \copydetails exp2(SimdFloat)
3768 template<MathOptimization opt = MathOptimization::Safe>
3769 static inline SimdDouble gmx_simdcall exp2SingleAccuracy(SimdDouble x)
3771 # if GMX_SIMD_HAVE_NATIVE_EXP2_DOUBLE
3772 return exp2(x);
3773 # else
3774 const SimdDouble CC6(0.0001534581200287996416911311);
3775 const SimdDouble CC5(0.001339993121934088894618990);
3776 const SimdDouble CC4(0.009618488957115180159497841);
3777 const SimdDouble CC3(0.05550328776964726865751735);
3778 const SimdDouble CC2(0.2402264689063408646490722);
3779 const SimdDouble CC1(0.6931472057372680777553816);
3780 const SimdDouble one(1.0);
3782 SimdDouble intpart;
3783 SimdDouble p;
3784 SimdDInt32 ix;
3786 // Large negative values are valid arguments to exp2(), so there are two
3787 // things we need to account for:
3788 // 1. When the exponents reaches -1023, the (biased) exponent field will be
3789 // zero and we can no longer multiply with it. There are special IEEE
3790 // formats to handle this range, but for now we have to accept that
3791 // we cannot handle those arguments. If input value becomes even more
3792 // negative, it will start to loop and we would end up with invalid
3793 // exponents. Thus, we need to limit or mask this.
3794 // 2. For VERY large negative values, we will have problems that the
3795 // subtraction to get the fractional part loses accuracy, and then we
3796 // can end up with overflows in the polynomial.
3798 // For now, we handle this by forwarding the math optimization setting to
3799 // ldexp, where the routine will return zero for very small arguments.
3801 // However, before doing that we need to make sure we do not call cvtR2I
3802 // with an argument that is so negative it cannot be converted to an integer.
3803 if (opt == MathOptimization::Safe)
3805 x = max(x, SimdDouble(std::numeric_limits<std::int32_t>::lowest()));
3808 ix = cvtR2I(x);
3809 intpart = round(x);
3810 x = x - intpart;
3812 p = fma(CC6, x, CC5);
3813 p = fma(p, x, CC4);
3814 p = fma(p, x, CC3);
3815 p = fma(p, x, CC2);
3816 p = fma(p, x, CC1);
3817 p = fma(p, x, one);
3818 x = ldexp<opt>(p, ix);
3820 return x;
3821 # endif
3825 /*! \brief SIMD exp(x). Double precision SIMD, single accuracy.
3827 * \copydetails exp(SimdFloat)
3829 template<MathOptimization opt = MathOptimization::Safe>
3830 static inline SimdDouble gmx_simdcall expSingleAccuracy(SimdDouble x)
3832 # if GMX_SIMD_HAVE_NATIVE_EXP_DOUBLE
3833 return exp(x);
3834 # else
3835 const SimdDouble argscale(1.44269504088896341);
3836 // Lower bound: Clamp args that would lead to an IEEE fp exponent below -1023.
3837 const SimdDouble smallArgLimit(-709.0895657128);
3838 const SimdDouble invargscale(-0.69314718055994528623);
3839 const SimdDouble CC4(0.00136324646882712841033936);
3840 const SimdDouble CC3(0.00836596917361021041870117);
3841 const SimdDouble CC2(0.0416710823774337768554688);
3842 const SimdDouble CC1(0.166665524244308471679688);
3843 const SimdDouble CC0(0.499999850988388061523438);
3844 const SimdDouble one(1.0);
3845 SimdDouble intpart;
3846 SimdDouble y, p;
3847 SimdDInt32 iy;
3849 // Large negative values are valid arguments to exp2(), so there are two
3850 // things we need to account for:
3851 // 1. When the exponents reaches -1023, the (biased) exponent field will be
3852 // zero and we can no longer multiply with it. There are special IEEE
3853 // formats to handle this range, but for now we have to accept that
3854 // we cannot handle those arguments. If input value becomes even more
3855 // negative, it will start to loop and we would end up with invalid
3856 // exponents. Thus, we need to limit or mask this.
3857 // 2. For VERY large negative values, we will have problems that the
3858 // subtraction to get the fractional part loses accuracy, and then we
3859 // can end up with overflows in the polynomial.
3861 // For now, we handle this by forwarding the math optimization setting to
3862 // ldexp, where the routine will return zero for very small arguments.
3864 // However, before doing that we need to make sure we do not call cvtR2I
3865 // with an argument that is so negative it cannot be converted to an integer
3866 // after being multiplied by argscale.
3868 if (opt == MathOptimization::Safe)
3870 x = max(x, SimdDouble(std::numeric_limits<std::int32_t>::lowest()) / argscale);
3873 y = x * argscale;
3875 iy = cvtR2I(y);
3876 intpart = round(y); // use same rounding algorithm here
3878 // Extended precision arithmetics not needed since
3879 // we have double precision and only need single accuracy.
3880 x = fma(invargscale, intpart, x);
3882 p = fma(CC4, x, CC3);
3883 p = fma(p, x, CC2);
3884 p = fma(p, x, CC1);
3885 p = fma(p, x, CC0);
3886 p = fma(x * x, p, x);
3887 p = p + one;
3888 x = ldexp<opt>(p, iy);
3889 return x;
3890 # endif
3893 /*! \brief SIMD pow(x,y). Double precision SIMD data, single accuracy.
3895 * This returns x^y for SIMD values.
3897 * \tparam opt If this is changed from the default (safe) into the unsafe
3898 * option, there are no guarantees about correct results for x==0.
3900 * \param x Base.
3902 * \param y exponent.
3904 * \result x^y. Overflowing arguments are likely to either return 0 or inf,
3905 * depending on the underlying implementation. If unsafe optimizations
3906 * are enabled, this is also true for x==0.
3908 * \warning You cannot rely on this implementation returning inf for arguments
3909 * that cause overflow. If you have some very large
3910 * values and need to rely on getting a valid numerical output,
3911 * take the minimum of your variable and the largest valid argument
3912 * before calling this routine.
3914 template<MathOptimization opt = MathOptimization::Safe>
3915 static inline SimdDouble gmx_simdcall powSingleAccuracy(SimdDouble x, SimdDouble y)
3917 SimdDouble xcorr;
3919 if (opt == MathOptimization::Safe)
3921 xcorr = max(x, SimdDouble(std::numeric_limits<double>::min()));
3923 else
3925 xcorr = x;
3928 SimdDouble result = exp2SingleAccuracy<opt>(y * log2SingleAccuracy(xcorr));
3930 if (opt == MathOptimization::Safe)
3932 // if x==0 and y>0 we explicitly set the result to 0.0
3933 // For any x with y==0, the result will already be 1.0 since we multiply by y (0.0) and call exp().
3934 result = blend(result, setZero(), x == setZero() && setZero() < y);
3937 return result;
3940 /*! \brief SIMD erf(x). Double precision SIMD data, single accuracy.
3942 * \param x The value to calculate erf(x) for.
3943 * \result erf(x)
3945 * This routine achieves very close to single precision, but we do not care about
3946 * the last bit or the subnormal result range.
3948 static inline SimdDouble gmx_simdcall erfSingleAccuracy(SimdDouble x)
3950 // Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1]
3951 const SimdDouble CA6(7.853861353153693e-5);
3952 const SimdDouble CA5(-8.010193625184903e-4);
3953 const SimdDouble CA4(5.188327685732524e-3);
3954 const SimdDouble CA3(-2.685381193529856e-2);
3955 const SimdDouble CA2(1.128358514861418e-1);
3956 const SimdDouble CA1(-3.761262582423300e-1);
3957 const SimdDouble CA0(1.128379165726710);
3958 // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2]
3959 const SimdDouble CB9(-0.0018629930017603923);
3960 const SimdDouble CB8(0.003909821287598495);
3961 const SimdDouble CB7(-0.0052094582210355615);
3962 const SimdDouble CB6(0.005685614362160572);
3963 const SimdDouble CB5(-0.0025367682853477272);
3964 const SimdDouble CB4(-0.010199799682318782);
3965 const SimdDouble CB3(0.04369575504816542);
3966 const SimdDouble CB2(-0.11884063474674492);
3967 const SimdDouble CB1(0.2732120154030589);
3968 const SimdDouble CB0(0.42758357702025784);
3969 // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19]
3970 const SimdDouble CC10(-0.0445555913112064);
3971 const SimdDouble CC9(0.21376355144663348);
3972 const SimdDouble CC8(-0.3473187200259257);
3973 const SimdDouble CC7(0.016690861551248114);
3974 const SimdDouble CC6(0.7560973182491192);
3975 const SimdDouble CC5(-1.2137903600145787);
3976 const SimdDouble CC4(0.8411872321232948);
3977 const SimdDouble CC3(-0.08670413896296343);
3978 const SimdDouble CC2(-0.27124782687240334);
3979 const SimdDouble CC1(-0.0007502488047806069);
3980 const SimdDouble CC0(0.5642114853803148);
3981 const SimdDouble one(1.0);
3982 const SimdDouble two(2.0);
3984 SimdDouble x2, x4, y;
3985 SimdDouble t, t2, w, w2;
3986 SimdDouble pA0, pA1, pB0, pB1, pC0, pC1;
3987 SimdDouble expmx2;
3988 SimdDouble res_erf, res_erfc, res;
3989 SimdDBool mask, msk_erf;
3991 // Calculate erf()
3992 x2 = x * x;
3993 x4 = x2 * x2;
3995 pA0 = fma(CA6, x4, CA4);
3996 pA1 = fma(CA5, x4, CA3);
3997 pA0 = fma(pA0, x4, CA2);
3998 pA1 = fma(pA1, x4, CA1);
3999 pA0 = pA0 * x4;
4000 pA0 = fma(pA1, x2, pA0);
4001 // Constant term must come last for precision reasons
4002 pA0 = pA0 + CA0;
4004 res_erf = x * pA0;
4006 // Calculate erfc
4007 y = abs(x);
4008 msk_erf = (SimdDouble(0.75) <= y);
4009 t = maskzInvSingleAccuracy(y, msk_erf);
4010 w = t - one;
4011 t2 = t * t;
4012 w2 = w * w;
4014 expmx2 = expSingleAccuracy(-y * y);
4016 pB1 = fma(CB9, w2, CB7);
4017 pB0 = fma(CB8, w2, CB6);
4018 pB1 = fma(pB1, w2, CB5);
4019 pB0 = fma(pB0, w2, CB4);
4020 pB1 = fma(pB1, w2, CB3);
4021 pB0 = fma(pB0, w2, CB2);
4022 pB1 = fma(pB1, w2, CB1);
4023 pB0 = fma(pB0, w2, CB0);
4024 pB0 = fma(pB1, w, pB0);
4026 pC0 = fma(CC10, t2, CC8);
4027 pC1 = fma(CC9, t2, CC7);
4028 pC0 = fma(pC0, t2, CC6);
4029 pC1 = fma(pC1, t2, CC5);
4030 pC0 = fma(pC0, t2, CC4);
4031 pC1 = fma(pC1, t2, CC3);
4032 pC0 = fma(pC0, t2, CC2);
4033 pC1 = fma(pC1, t2, CC1);
4035 pC0 = fma(pC0, t2, CC0);
4036 pC0 = fma(pC1, t, pC0);
4037 pC0 = pC0 * t;
4039 // Select pB0 or pC0 for erfc()
4040 mask = (two < y);
4041 res_erfc = blend(pB0, pC0, mask);
4042 res_erfc = res_erfc * expmx2;
4044 // erfc(x<0) = 2-erfc(|x|)
4045 mask = (x < setZero());
4046 res_erfc = blend(res_erfc, two - res_erfc, mask);
4048 // Select erf() or erfc()
4049 mask = (y < SimdDouble(0.75));
4050 res = blend(one - res_erfc, res_erf, mask);
4052 return res;
4055 /*! \brief SIMD erfc(x). Double precision SIMD data, single accuracy.
4057 * \param x The value to calculate erfc(x) for.
4058 * \result erfc(x)
4060 * This routine achieves singleprecision (bar the last bit) over most of the
4061 * input range, but for large arguments where the result is getting close
4062 * to the minimum representable numbers we accept slightly larger errors
4063 * (think results that are in the ballpark of 10^-30) since that is not
4064 * relevant for MD.
4066 static inline SimdDouble gmx_simdcall erfcSingleAccuracy(SimdDouble x)
4068 // Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1]
4069 const SimdDouble CA6(7.853861353153693e-5);
4070 const SimdDouble CA5(-8.010193625184903e-4);
4071 const SimdDouble CA4(5.188327685732524e-3);
4072 const SimdDouble CA3(-2.685381193529856e-2);
4073 const SimdDouble CA2(1.128358514861418e-1);
4074 const SimdDouble CA1(-3.761262582423300e-1);
4075 const SimdDouble CA0(1.128379165726710);
4076 // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2]
4077 const SimdDouble CB9(-0.0018629930017603923);
4078 const SimdDouble CB8(0.003909821287598495);
4079 const SimdDouble CB7(-0.0052094582210355615);
4080 const SimdDouble CB6(0.005685614362160572);
4081 const SimdDouble CB5(-0.0025367682853477272);
4082 const SimdDouble CB4(-0.010199799682318782);
4083 const SimdDouble CB3(0.04369575504816542);
4084 const SimdDouble CB2(-0.11884063474674492);
4085 const SimdDouble CB1(0.2732120154030589);
4086 const SimdDouble CB0(0.42758357702025784);
4087 // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19]
4088 const SimdDouble CC10(-0.0445555913112064);
4089 const SimdDouble CC9(0.21376355144663348);
4090 const SimdDouble CC8(-0.3473187200259257);
4091 const SimdDouble CC7(0.016690861551248114);
4092 const SimdDouble CC6(0.7560973182491192);
4093 const SimdDouble CC5(-1.2137903600145787);
4094 const SimdDouble CC4(0.8411872321232948);
4095 const SimdDouble CC3(-0.08670413896296343);
4096 const SimdDouble CC2(-0.27124782687240334);
4097 const SimdDouble CC1(-0.0007502488047806069);
4098 const SimdDouble CC0(0.5642114853803148);
4099 const SimdDouble one(1.0);
4100 const SimdDouble two(2.0);
4102 SimdDouble x2, x4, y;
4103 SimdDouble t, t2, w, w2;
4104 SimdDouble pA0, pA1, pB0, pB1, pC0, pC1;
4105 SimdDouble expmx2;
4106 SimdDouble res_erf, res_erfc, res;
4107 SimdDBool mask, msk_erf;
4109 // Calculate erf()
4110 x2 = x * x;
4111 x4 = x2 * x2;
4113 pA0 = fma(CA6, x4, CA4);
4114 pA1 = fma(CA5, x4, CA3);
4115 pA0 = fma(pA0, x4, CA2);
4116 pA1 = fma(pA1, x4, CA1);
4117 pA1 = pA1 * x2;
4118 pA0 = fma(pA0, x4, pA1);
4119 // Constant term must come last for precision reasons
4120 pA0 = pA0 + CA0;
4122 res_erf = x * pA0;
4124 // Calculate erfc
4125 y = abs(x);
4126 msk_erf = (SimdDouble(0.75) <= y);
4127 t = maskzInvSingleAccuracy(y, msk_erf);
4128 w = t - one;
4129 t2 = t * t;
4130 w2 = w * w;
4132 expmx2 = expSingleAccuracy(-y * y);
4134 pB1 = fma(CB9, w2, CB7);
4135 pB0 = fma(CB8, w2, CB6);
4136 pB1 = fma(pB1, w2, CB5);
4137 pB0 = fma(pB0, w2, CB4);
4138 pB1 = fma(pB1, w2, CB3);
4139 pB0 = fma(pB0, w2, CB2);
4140 pB1 = fma(pB1, w2, CB1);
4141 pB0 = fma(pB0, w2, CB0);
4142 pB0 = fma(pB1, w, pB0);
4144 pC0 = fma(CC10, t2, CC8);
4145 pC1 = fma(CC9, t2, CC7);
4146 pC0 = fma(pC0, t2, CC6);
4147 pC1 = fma(pC1, t2, CC5);
4148 pC0 = fma(pC0, t2, CC4);
4149 pC1 = fma(pC1, t2, CC3);
4150 pC0 = fma(pC0, t2, CC2);
4151 pC1 = fma(pC1, t2, CC1);
4153 pC0 = fma(pC0, t2, CC0);
4154 pC0 = fma(pC1, t, pC0);
4155 pC0 = pC0 * t;
4157 // Select pB0 or pC0 for erfc()
4158 mask = (two < y);
4159 res_erfc = blend(pB0, pC0, mask);
4160 res_erfc = res_erfc * expmx2;
4162 // erfc(x<0) = 2-erfc(|x|)
4163 mask = (x < setZero());
4164 res_erfc = blend(res_erfc, two - res_erfc, mask);
4166 // Select erf() or erfc()
4167 mask = (y < SimdDouble(0.75));
4168 res = blend(res_erfc, one - res_erf, mask);
4170 return res;
4173 /*! \brief SIMD sin \& cos. Double precision SIMD data, single accuracy.
4175 * \param x The argument to evaluate sin/cos for
4176 * \param[out] sinval Sin(x)
4177 * \param[out] cosval Cos(x)
4179 static inline void gmx_simdcall sinCosSingleAccuracy(SimdDouble x, SimdDouble* sinval, SimdDouble* cosval)
4181 // Constants to subtract Pi/4*x from y while minimizing precision loss
4182 const SimdDouble argred0(2 * 0.78539816290140151978);
4183 const SimdDouble argred1(2 * 4.9604678871439933374e-10);
4184 const SimdDouble argred2(2 * 1.1258708853173288931e-18);
4185 const SimdDouble two_over_pi(2.0 / M_PI);
4186 const SimdDouble const_sin2(-1.9515295891e-4);
4187 const SimdDouble const_sin1(8.3321608736e-3);
4188 const SimdDouble const_sin0(-1.6666654611e-1);
4189 const SimdDouble const_cos2(2.443315711809948e-5);
4190 const SimdDouble const_cos1(-1.388731625493765e-3);
4191 const SimdDouble const_cos0(4.166664568298827e-2);
4193 const SimdDouble half(0.5);
4194 const SimdDouble one(1.0);
4195 SimdDouble ssign, csign;
4196 SimdDouble x2, y, z, psin, pcos, sss, ccc;
4197 SimdDBool mask;
4199 # if GMX_SIMD_HAVE_DINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
4200 const SimdDInt32 ione(1);
4201 const SimdDInt32 itwo(2);
4202 SimdDInt32 iy;
4204 z = x * two_over_pi;
4205 iy = cvtR2I(z);
4206 y = round(z);
4208 mask = cvtIB2B((iy & ione) == setZero());
4209 ssign = selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO), cvtIB2B((iy & itwo) == itwo));
4210 csign = selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO), cvtIB2B(((iy + ione) & itwo) == itwo));
4211 # else
4212 const SimdDouble quarter(0.25);
4213 const SimdDouble minusquarter(-0.25);
4214 SimdDouble q;
4215 SimdDBool m1, m2, m3;
4217 /* The most obvious way to find the arguments quadrant in the unit circle
4218 * to calculate the sign is to use integer arithmetic, but that is not
4219 * present in all SIMD implementations. As an alternative, we have devised a
4220 * pure floating-point algorithm that uses truncation for argument reduction
4221 * so that we get a new value 0<=q<1 over the unit circle, and then
4222 * do floating-point comparisons with fractions. This is likely to be
4223 * slightly slower (~10%) due to the longer latencies of floating-point, so
4224 * we only use it when integer SIMD arithmetic is not present.
4226 ssign = x;
4227 x = abs(x);
4228 // It is critical that half-way cases are rounded down
4229 z = fma(x, two_over_pi, half);
4230 y = trunc(z);
4231 q = z * quarter;
4232 q = q - trunc(q);
4233 /* z now starts at 0.0 for x=-pi/4 (although neg. values cannot occur), and
4234 * then increased by 1.0 as x increases by 2*Pi, when it resets to 0.0.
4235 * This removes the 2*Pi periodicity without using any integer arithmetic.
4236 * First check if y had the value 2 or 3, set csign if true.
4238 q = q - half;
4239 /* If we have logical operations we can work directly on the signbit, which
4240 * saves instructions. Otherwise we need to represent signs as +1.0/-1.0.
4241 * Thus, if you are altering defines to debug alternative code paths, the
4242 * two GMX_SIMD_HAVE_LOGICAL sections in this routine must either both be
4243 * active or inactive - you will get errors if only one is used.
4245 # if GMX_SIMD_HAVE_LOGICAL
4246 ssign = ssign & SimdDouble(GMX_DOUBLE_NEGZERO);
4247 csign = andNot(q, SimdDouble(GMX_DOUBLE_NEGZERO));
4248 ssign = ssign ^ csign;
4249 # else
4250 ssign = copysign(SimdDouble(1.0), ssign);
4251 csign = copysign(SimdDouble(1.0), q);
4252 csign = -csign;
4253 ssign = ssign * csign; // swap ssign if csign was set.
4254 # endif
4255 // Check if y had value 1 or 3 (remember we subtracted 0.5 from q)
4256 m1 = (q < minusquarter);
4257 m2 = (setZero() <= q);
4258 m3 = (q < quarter);
4259 m2 = m2 && m3;
4260 mask = m1 || m2;
4261 // where mask is FALSE, swap sign.
4262 csign = csign * blend(SimdDouble(-1.0), one, mask);
4263 # endif
4264 x = fnma(y, argred0, x);
4265 x = fnma(y, argred1, x);
4266 x = fnma(y, argred2, x);
4267 x2 = x * x;
4269 psin = fma(const_sin2, x2, const_sin1);
4270 psin = fma(psin, x2, const_sin0);
4271 psin = fma(psin, x * x2, x);
4272 pcos = fma(const_cos2, x2, const_cos1);
4273 pcos = fma(pcos, x2, const_cos0);
4274 pcos = fms(pcos, x2, half);
4275 pcos = fma(pcos, x2, one);
4277 sss = blend(pcos, psin, mask);
4278 ccc = blend(psin, pcos, mask);
4279 // See comment for GMX_SIMD_HAVE_LOGICAL section above.
4280 # if GMX_SIMD_HAVE_LOGICAL
4281 *sinval = sss ^ ssign;
4282 *cosval = ccc ^ csign;
4283 # else
4284 *sinval = sss * ssign;
4285 *cosval = ccc * csign;
4286 # endif
4289 /*! \brief SIMD sin(x). Double precision SIMD data, single accuracy.
4291 * \param x The argument to evaluate sin for
4292 * \result Sin(x)
4294 * \attention Do NOT call both sin & cos if you need both results, since each of them
4295 * will then call \ref sincos and waste a factor 2 in performance.
4297 static inline SimdDouble gmx_simdcall sinSingleAccuracy(SimdDouble x)
4299 SimdDouble s, c;
4300 sinCosSingleAccuracy(x, &s, &c);
4301 return s;
4304 /*! \brief SIMD cos(x). Double precision SIMD data, single accuracy.
4306 * \param x The argument to evaluate cos for
4307 * \result Cos(x)
4309 * \attention Do NOT call both sin & cos if you need both results, since each of them
4310 * will then call \ref sincos and waste a factor 2 in performance.
4312 static inline SimdDouble gmx_simdcall cosSingleAccuracy(SimdDouble x)
4314 SimdDouble s, c;
4315 sinCosSingleAccuracy(x, &s, &c);
4316 return c;
4319 /*! \brief SIMD tan(x). Double precision SIMD data, single accuracy.
4321 * \param x The argument to evaluate tan for
4322 * \result Tan(x)
4324 static inline SimdDouble gmx_simdcall tanSingleAccuracy(SimdDouble x)
4326 const SimdDouble argred0(2 * 0.78539816290140151978);
4327 const SimdDouble argred1(2 * 4.9604678871439933374e-10);
4328 const SimdDouble argred2(2 * 1.1258708853173288931e-18);
4329 const SimdDouble two_over_pi(2.0 / M_PI);
4330 const SimdDouble CT6(0.009498288995810566122993911);
4331 const SimdDouble CT5(0.002895755790837379295226923);
4332 const SimdDouble CT4(0.02460087336161924491836265);
4333 const SimdDouble CT3(0.05334912882656359828045988);
4334 const SimdDouble CT2(0.1333989091464957704418495);
4335 const SimdDouble CT1(0.3333307599244198227797507);
4337 SimdDouble x2, p, y, z;
4338 SimdDBool mask;
4340 # if GMX_SIMD_HAVE_DINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
4341 SimdDInt32 iy;
4342 SimdDInt32 ione(1);
4344 z = x * two_over_pi;
4345 iy = cvtR2I(z);
4346 y = round(z);
4347 mask = cvtIB2B((iy & ione) == ione);
4349 x = fnma(y, argred0, x);
4350 x = fnma(y, argred1, x);
4351 x = fnma(y, argred2, x);
4352 x = selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO), mask) ^ x;
4353 # else
4354 const SimdDouble quarter(0.25);
4355 const SimdDouble half(0.5);
4356 const SimdDouble threequarter(0.75);
4357 SimdDouble w, q;
4358 SimdDBool m1, m2, m3;
4360 w = abs(x);
4361 z = fma(w, two_over_pi, half);
4362 y = trunc(z);
4363 q = z * quarter;
4364 q = q - trunc(q);
4365 m1 = (quarter <= q);
4366 m2 = (q < half);
4367 m3 = (threequarter <= q);
4368 m1 = m1 && m2;
4369 mask = m1 || m3;
4370 w = fnma(y, argred0, w);
4371 w = fnma(y, argred1, w);
4372 w = fnma(y, argred2, w);
4374 w = blend(w, -w, mask);
4375 x = w * copysign(SimdDouble(1.0), x);
4376 # endif
4377 x2 = x * x;
4378 p = fma(CT6, x2, CT5);
4379 p = fma(p, x2, CT4);
4380 p = fma(p, x2, CT3);
4381 p = fma(p, x2, CT2);
4382 p = fma(p, x2, CT1);
4383 p = fma(x2, p * x, x);
4385 p = blend(p, maskzInvSingleAccuracy(p, mask), mask);
4386 return p;
4389 /*! \brief SIMD asin(x). Double precision SIMD data, single accuracy.
4391 * \param x The argument to evaluate asin for
4392 * \result Asin(x)
4394 static inline SimdDouble gmx_simdcall asinSingleAccuracy(SimdDouble x)
4396 const SimdDouble limitlow(1e-4);
4397 const SimdDouble half(0.5);
4398 const SimdDouble one(1.0);
4399 const SimdDouble halfpi(M_PI / 2.0);
4400 const SimdDouble CC5(4.2163199048E-2);
4401 const SimdDouble CC4(2.4181311049E-2);
4402 const SimdDouble CC3(4.5470025998E-2);
4403 const SimdDouble CC2(7.4953002686E-2);
4404 const SimdDouble CC1(1.6666752422E-1);
4405 SimdDouble xabs;
4406 SimdDouble z, z1, z2, q, q1, q2;
4407 SimdDouble pA, pB;
4408 SimdDBool mask, mask2;
4410 xabs = abs(x);
4411 mask = (half < xabs);
4412 z1 = half * (one - xabs);
4413 mask2 = (xabs < one);
4414 q1 = z1 * maskzInvsqrtSingleAccuracy(z1, mask2);
4415 q2 = xabs;
4416 z2 = q2 * q2;
4417 z = blend(z2, z1, mask);
4418 q = blend(q2, q1, mask);
4420 z2 = z * z;
4421 pA = fma(CC5, z2, CC3);
4422 pB = fma(CC4, z2, CC2);
4423 pA = fma(pA, z2, CC1);
4424 pA = pA * z;
4425 z = fma(pB, z2, pA);
4426 z = fma(z, q, q);
4427 q2 = halfpi - z;
4428 q2 = q2 - z;
4429 z = blend(z, q2, mask);
4431 mask = (limitlow < xabs);
4432 z = blend(xabs, z, mask);
4433 z = copysign(z, x);
4435 return z;
4438 /*! \brief SIMD acos(x). Double precision SIMD data, single accuracy.
4440 * \param x The argument to evaluate acos for
4441 * \result Acos(x)
4443 static inline SimdDouble gmx_simdcall acosSingleAccuracy(SimdDouble x)
4445 const SimdDouble one(1.0);
4446 const SimdDouble half(0.5);
4447 const SimdDouble pi(M_PI);
4448 const SimdDouble halfpi(M_PI / 2.0);
4449 SimdDouble xabs;
4450 SimdDouble z, z1, z2, z3;
4451 SimdDBool mask1, mask2, mask3;
4453 xabs = abs(x);
4454 mask1 = (half < xabs);
4455 mask2 = (setZero() < x);
4457 z = half * (one - xabs);
4458 mask3 = (xabs < one);
4459 z = z * maskzInvsqrtSingleAccuracy(z, mask3);
4460 z = blend(x, z, mask1);
4461 z = asinSingleAccuracy(z);
4463 z2 = z + z;
4464 z1 = pi - z2;
4465 z3 = halfpi - z;
4466 z = blend(z1, z2, mask2);
4467 z = blend(z3, z, mask1);
4469 return z;
4472 /*! \brief SIMD asin(x). Double precision SIMD data, single accuracy.
4474 * \param x The argument to evaluate atan for
4475 * \result Atan(x), same argument/value range as standard math library.
4477 static inline SimdDouble gmx_simdcall atanSingleAccuracy(SimdDouble x)
4479 const SimdDouble halfpi(M_PI / 2);
4480 const SimdDouble CA17(0.002823638962581753730774);
4481 const SimdDouble CA15(-0.01595690287649631500244);
4482 const SimdDouble CA13(0.04250498861074447631836);
4483 const SimdDouble CA11(-0.07489009201526641845703);
4484 const SimdDouble CA9(0.1063479334115982055664);
4485 const SimdDouble CA7(-0.1420273631811141967773);
4486 const SimdDouble CA5(0.1999269574880599975585);
4487 const SimdDouble CA3(-0.3333310186862945556640);
4488 SimdDouble x2, x3, x4, pA, pB;
4489 SimdDBool mask, mask2;
4491 mask = (x < setZero());
4492 x = abs(x);
4493 mask2 = (SimdDouble(1.0) < x);
4494 x = blend(x, maskzInvSingleAccuracy(x, mask2), mask2);
4496 x2 = x * x;
4497 x3 = x2 * x;
4498 x4 = x2 * x2;
4499 pA = fma(CA17, x4, CA13);
4500 pB = fma(CA15, x4, CA11);
4501 pA = fma(pA, x4, CA9);
4502 pB = fma(pB, x4, CA7);
4503 pA = fma(pA, x4, CA5);
4504 pB = fma(pB, x4, CA3);
4505 pA = fma(pA, x2, pB);
4506 pA = fma(pA, x3, x);
4508 pA = blend(pA, halfpi - pA, mask2);
4509 pA = blend(pA, -pA, mask);
4511 return pA;
4514 /*! \brief SIMD atan2(y,x). Double precision SIMD data, single accuracy.
4516 * \param y Y component of vector, any quartile
4517 * \param x X component of vector, any quartile
4518 * \result Atan(y,x), same argument/value range as standard math library.
4520 * \note This routine should provide correct results for all finite
4521 * non-zero or positive-zero arguments. However, negative zero arguments will
4522 * be treated as positive zero, which means the return value will deviate from
4523 * the standard math library atan2(y,x) for those cases. That should not be
4524 * of any concern in Gromacs, and in particular it will not affect calculations
4525 * of angles from vectors.
4527 static inline SimdDouble gmx_simdcall atan2SingleAccuracy(SimdDouble y, SimdDouble x)
4529 const SimdDouble pi(M_PI);
4530 const SimdDouble halfpi(M_PI / 2.0);
4531 SimdDouble xinv, p, aoffset;
4532 SimdDBool mask_xnz, mask_ynz, mask_xlt0, mask_ylt0;
4534 mask_xnz = x != setZero();
4535 mask_ynz = y != setZero();
4536 mask_xlt0 = (x < setZero());
4537 mask_ylt0 = (y < setZero());
4539 aoffset = selectByNotMask(halfpi, mask_xnz);
4540 aoffset = selectByMask(aoffset, mask_ynz);
4542 aoffset = blend(aoffset, pi, mask_xlt0);
4543 aoffset = blend(aoffset, -aoffset, mask_ylt0);
4545 xinv = maskzInvSingleAccuracy(x, mask_xnz);
4546 p = y * xinv;
4547 p = atanSingleAccuracy(p);
4548 p = p + aoffset;
4550 return p;
4553 /*! \brief Analytical PME force correction, double SIMD data, single accuracy.
4555 * \param z2 \f$(r \beta)^2\f$ - see below for details.
4556 * \result Correction factor to coulomb force - see below for details.
4558 * This routine is meant to enable analytical evaluation of the
4559 * direct-space PME electrostatic force to avoid tables.
4561 * The direct-space potential should be \f$ \mbox{erfc}(\beta r)/r\f$, but there
4562 * are some problems evaluating that:
4564 * First, the error function is difficult (read: expensive) to
4565 * approxmiate accurately for intermediate to large arguments, and
4566 * this happens already in ranges of \f$(\beta r)\f$ that occur in simulations.
4567 * Second, we now try to avoid calculating potentials in Gromacs but
4568 * use forces directly.
4570 * We can simply things slight by noting that the PME part is really
4571 * a correction to the normal Coulomb force since \f$\mbox{erfc}(z)=1-\mbox{erf}(z)\f$, i.e.
4572 * \f[
4573 * V = \frac{1}{r} - \frac{\mbox{erf}(\beta r)}{r}
4574 * \f]
4575 * The first term we already have from the inverse square root, so
4576 * that we can leave out of this routine.
4578 * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
4579 * the argument \f$beta r\f$ will be in the range 0.15 to ~4. Use your
4580 * favorite plotting program to realize how well-behaved \f$\frac{\mbox{erf}(z)}{z}\f$ is
4581 * in this range!
4583 * We approximate \f$f(z)=\mbox{erf}(z)/z\f$ with a rational minimax polynomial.
4584 * However, it turns out it is more efficient to approximate \f$f(z)/z\f$ and
4585 * then only use even powers. This is another minor optimization, since
4586 * we actually \a want \f$f(z)/z\f$, because it is going to be multiplied by
4587 * the vector between the two atoms to get the vectorial force. The
4588 * fastest flops are the ones we can avoid calculating!
4590 * So, here's how it should be used:
4592 * 1. Calculate \f$r^2\f$.
4593 * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=(\beta r)^2\f$.
4594 * 3. Evaluate this routine with \f$z^2\f$ as the argument.
4595 * 4. The return value is the expression:
4597 * \f[
4598 * \frac{2 \exp{-z^2}}{\sqrt{\pi} z^2}-\frac{\mbox{erf}(z)}{z^3}
4599 * \f]
4601 * 5. Multiply the entire expression by \f$\beta^3\f$. This will get you
4603 * \f[
4604 * \frac{2 \beta^3 \exp(-z^2)}{\sqrt{\pi} z^2} - \frac{\beta^3 \mbox{erf}(z)}{z^3}
4605 * \f]
4607 * or, switching back to \f$r\f$ (since \f$z=r \beta\f$):
4609 * \f[
4610 * \frac{2 \beta \exp(-r^2 \beta^2)}{\sqrt{\pi} r^2} - \frac{\mbox{erf}(r \beta)}{r^3}
4611 * \f]
4613 * With a bit of math exercise you should be able to confirm that
4614 * this is exactly
4616 * \f[
4617 * \frac{\frac{d}{dr}\left( \frac{\mbox{erf}(\beta r)}{r} \right)}{r}
4618 * \f]
4620 * 6. Add the result to \f$r^{-3}\f$, multiply by the product of the charges,
4621 * and you have your force (divided by \f$r\f$). A final multiplication
4622 * with the vector connecting the two particles and you have your
4623 * vectorial force to add to the particles.
4625 * This approximation achieves an accuracy slightly lower than 1e-6; when
4626 * added to \f$1/r\f$ the error will be insignificant.
4629 static inline SimdDouble gmx_simdcall pmeForceCorrectionSingleAccuracy(SimdDouble z2)
4631 const SimdDouble FN6(-1.7357322914161492954e-8);
4632 const SimdDouble FN5(1.4703624142580877519e-6);
4633 const SimdDouble FN4(-0.000053401640219807709149);
4634 const SimdDouble FN3(0.0010054721316683106153);
4635 const SimdDouble FN2(-0.019278317264888380590);
4636 const SimdDouble FN1(0.069670166153766424023);
4637 const SimdDouble FN0(-0.75225204789749321333);
4639 const SimdDouble FD4(0.0011193462567257629232);
4640 const SimdDouble FD3(0.014866955030185295499);
4641 const SimdDouble FD2(0.11583842382862377919);
4642 const SimdDouble FD1(0.50736591960530292870);
4643 const SimdDouble FD0(1.0);
4645 SimdDouble z4;
4646 SimdDouble polyFN0, polyFN1, polyFD0, polyFD1;
4648 z4 = z2 * z2;
4650 polyFD0 = fma(FD4, z4, FD2);
4651 polyFD1 = fma(FD3, z4, FD1);
4652 polyFD0 = fma(polyFD0, z4, FD0);
4653 polyFD0 = fma(polyFD1, z2, polyFD0);
4655 polyFD0 = invSingleAccuracy(polyFD0);
4657 polyFN0 = fma(FN6, z4, FN4);
4658 polyFN1 = fma(FN5, z4, FN3);
4659 polyFN0 = fma(polyFN0, z4, FN2);
4660 polyFN1 = fma(polyFN1, z4, FN1);
4661 polyFN0 = fma(polyFN0, z4, FN0);
4662 polyFN0 = fma(polyFN1, z2, polyFN0);
4664 return polyFN0 * polyFD0;
4668 /*! \brief Analytical PME potential correction, double SIMD data, single accuracy.
4670 * \param z2 \f$(r \beta)^2\f$ - see below for details.
4671 * \result Correction factor to coulomb potential - see below for details.
4673 * This routine calculates \f$\mbox{erf}(z)/z\f$, although you should provide \f$z^2\f$
4674 * as the input argument.
4676 * Here's how it should be used:
4678 * 1. Calculate \f$r^2\f$.
4679 * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=\beta^2*r^2\f$.
4680 * 3. Evaluate this routine with z^2 as the argument.
4681 * 4. The return value is the expression:
4683 * \f[
4684 * \frac{\mbox{erf}(z)}{z}
4685 * \f]
4687 * 5. Multiply the entire expression by beta and switching back to \f$r\f$ (since \f$z=r \beta\f$):
4689 * \f[
4690 * \frac{\mbox{erf}(r \beta)}{r}
4691 * \f]
4693 * 6. Subtract the result from \f$1/r\f$, multiply by the product of the charges,
4694 * and you have your potential.
4696 * This approximation achieves an accuracy slightly lower than 1e-6; when
4697 * added to \f$1/r\f$ the error will be insignificant.
4699 static inline SimdDouble gmx_simdcall pmePotentialCorrectionSingleAccuracy(SimdDouble z2)
4701 const SimdDouble VN6(1.9296833005951166339e-8);
4702 const SimdDouble VN5(-1.4213390571557850962e-6);
4703 const SimdDouble VN4(0.000041603292906656984871);
4704 const SimdDouble VN3(-0.00013134036773265025626);
4705 const SimdDouble VN2(0.038657983986041781264);
4706 const SimdDouble VN1(0.11285044772717598220);
4707 const SimdDouble VN0(1.1283802385263030286);
4709 const SimdDouble VD3(0.0066752224023576045451);
4710 const SimdDouble VD2(0.078647795836373922256);
4711 const SimdDouble VD1(0.43336185284710920150);
4712 const SimdDouble VD0(1.0);
4714 SimdDouble z4;
4715 SimdDouble polyVN0, polyVN1, polyVD0, polyVD1;
4717 z4 = z2 * z2;
4719 polyVD1 = fma(VD3, z4, VD1);
4720 polyVD0 = fma(VD2, z4, VD0);
4721 polyVD0 = fma(polyVD1, z2, polyVD0);
4723 polyVD0 = invSingleAccuracy(polyVD0);
4725 polyVN0 = fma(VN6, z4, VN4);
4726 polyVN1 = fma(VN5, z4, VN3);
4727 polyVN0 = fma(polyVN0, z4, VN2);
4728 polyVN1 = fma(polyVN1, z4, VN1);
4729 polyVN0 = fma(polyVN0, z4, VN0);
4730 polyVN0 = fma(polyVN1, z2, polyVN0);
4732 return polyVN0 * polyVD0;
4735 # endif
4738 /*! \name SIMD4 math functions
4740 * \note Only a subset of the math functions are implemented for SIMD4.
4741 * \{
4745 # if GMX_SIMD4_HAVE_FLOAT
4747 /*************************************************************************
4748 * SINGLE PRECISION SIMD4 MATH FUNCTIONS - JUST A SMALL SUBSET SUPPORTED *
4749 *************************************************************************/
4751 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD4 float.
4753 * This is a low-level routine that should only be used by SIMD math routine
4754 * that evaluates the inverse square root.
4756 * \param lu Approximation of 1/sqrt(x), typically obtained from lookup.
4757 * \param x The reference (starting) value x for which we want 1/sqrt(x).
4758 * \return An improved approximation with roughly twice as many bits of accuracy.
4760 static inline Simd4Float gmx_simdcall rsqrtIter(Simd4Float lu, Simd4Float x)
4762 Simd4Float tmp1 = x * lu;
4763 Simd4Float tmp2 = Simd4Float(-0.5F) * lu;
4764 tmp1 = fma(tmp1, lu, Simd4Float(-3.0F));
4765 return tmp1 * tmp2;
4768 /*! \brief Calculate 1/sqrt(x) for SIMD4 float.
4770 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
4771 * GMX_FLOAT_MAX, i.e. within the range of single precision.
4772 * For the single precision implementation this is obviously always
4773 * true for positive values, but for double precision it adds an
4774 * extra restriction since the first lookup step might have to be
4775 * performed in single precision on some architectures. Note that the
4776 * responsibility for checking falls on you - this routine does not
4777 * check arguments.
4778 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
4780 static inline Simd4Float gmx_simdcall invsqrt(Simd4Float x)
4782 Simd4Float lu = rsqrt(x);
4783 # if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
4784 lu = rsqrtIter(lu, x);
4785 # endif
4786 # if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
4787 lu = rsqrtIter(lu, x);
4788 # endif
4789 # if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
4790 lu = rsqrtIter(lu, x);
4791 # endif
4792 return lu;
4796 # endif // GMX_SIMD4_HAVE_FLOAT
4799 # if GMX_SIMD4_HAVE_DOUBLE
4800 /*************************************************************************
4801 * DOUBLE PRECISION SIMD4 MATH FUNCTIONS - JUST A SMALL SUBSET SUPPORTED *
4802 *************************************************************************/
4804 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD4 double.
4806 * This is a low-level routine that should only be used by SIMD math routine
4807 * that evaluates the inverse square root.
4809 * \param lu Approximation of 1/sqrt(x), typically obtained from lookup.
4810 * \param x The reference (starting) value x for which we want 1/sqrt(x).
4811 * \return An improved approximation with roughly twice as many bits of accuracy.
4813 static inline Simd4Double gmx_simdcall rsqrtIter(Simd4Double lu, Simd4Double x)
4815 Simd4Double tmp1 = x * lu;
4816 Simd4Double tmp2 = Simd4Double(-0.5F) * lu;
4817 tmp1 = fma(tmp1, lu, Simd4Double(-3.0F));
4818 return tmp1 * tmp2;
4821 /*! \brief Calculate 1/sqrt(x) for SIMD4 double.
4823 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
4824 * GMX_FLOAT_MAX, i.e. within the range of single precision.
4825 * For the single precision implementation this is obviously always
4826 * true for positive values, but for double precision it adds an
4827 * extra restriction since the first lookup step might have to be
4828 * performed in single precision on some architectures. Note that the
4829 * responsibility for checking falls on you - this routine does not
4830 * check arguments.
4831 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
4833 static inline Simd4Double gmx_simdcall invsqrt(Simd4Double x)
4835 Simd4Double lu = rsqrt(x);
4836 # if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
4837 lu = rsqrtIter(lu, x);
4838 # endif
4839 # if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
4840 lu = rsqrtIter(lu, x);
4841 # endif
4842 # if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
4843 lu = rsqrtIter(lu, x);
4844 # endif
4845 # if (GMX_SIMD_RSQRT_BITS * 8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
4846 lu = rsqrtIter(lu, x);
4847 # endif
4848 return lu;
4852 /**********************************************************************
4853 * SIMD4 MATH FUNCTIONS WITH DOUBLE PREC. DATA, SINGLE PREC. ACCURACY *
4854 **********************************************************************/
4856 /*! \brief Calculate 1/sqrt(x) for SIMD4 double, but in single accuracy.
4858 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
4859 * GMX_FLOAT_MAX, i.e. within the range of single precision.
4860 * For the single precision implementation this is obviously always
4861 * true for positive values, but for double precision it adds an
4862 * extra restriction since the first lookup step might have to be
4863 * performed in single precision on some architectures. Note that the
4864 * responsibility for checking falls on you - this routine does not
4865 * check arguments.
4866 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
4868 static inline Simd4Double gmx_simdcall invsqrtSingleAccuracy(Simd4Double x)
4870 Simd4Double lu = rsqrt(x);
4871 # if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
4872 lu = rsqrtIter(lu, x);
4873 # endif
4874 # if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
4875 lu = rsqrtIter(lu, x);
4876 # endif
4877 # if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
4878 lu = rsqrtIter(lu, x);
4879 # endif
4880 return lu;
4884 # endif // GMX_SIMD4_HAVE_DOUBLE
4886 /*! \} */
4888 # if GMX_SIMD_HAVE_FLOAT
4889 /*! \brief Calculate 1/sqrt(x) for SIMD float, only targeting single accuracy.
4891 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
4892 * GMX_FLOAT_MAX, i.e. within the range of single precision.
4893 * For the single precision implementation this is obviously always
4894 * true for positive values, but for double precision it adds an
4895 * extra restriction since the first lookup step might have to be
4896 * performed in single precision on some architectures. Note that the
4897 * responsibility for checking falls on you - this routine does not
4898 * check arguments.
4899 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
4901 static inline SimdFloat gmx_simdcall invsqrtSingleAccuracy(SimdFloat x)
4903 return invsqrt(x);
4906 /*! \brief Calculate 1/sqrt(x) for masked SIMD floats, only targeting single accuracy.
4908 * This routine only evaluates 1/sqrt(x) for elements for which mask is true.
4909 * Illegal values in the masked-out elements will not lead to
4910 * floating-point exceptions.
4912 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
4913 * GMX_FLOAT_MAX, i.e. within the range of single precision.
4914 * For the single precision implementation this is obviously always
4915 * true for positive values, but for double precision it adds an
4916 * extra restriction since the first lookup step might have to be
4917 * performed in single precision on some architectures. Note that the
4918 * responsibility for checking falls on you - this routine does not
4919 * check arguments.
4920 * \param m Mask
4921 * \return 1/sqrt(x). Result is undefined if your argument was invalid or
4922 * entry was not masked, and 0.0 for masked-out entries.
4924 static inline SimdFloat maskzInvsqrtSingleAccuracy(SimdFloat x, SimdFBool m)
4926 return maskzInvsqrt(x, m);
4929 /*! \brief Calculate 1/sqrt(x) for two SIMD floats, only targeting single accuracy.
4931 * \param x0 First set of arguments, x0 must be in single range (see below).
4932 * \param x1 Second set of arguments, x1 must be in single range (see below).
4933 * \param[out] out0 Result 1/sqrt(x0)
4934 * \param[out] out1 Result 1/sqrt(x1)
4936 * In particular for double precision we can sometimes calculate square root
4937 * pairs slightly faster by using single precision until the very last step.
4939 * \note Both arguments must be larger than GMX_FLOAT_MIN and smaller than
4940 * GMX_FLOAT_MAX, i.e. within the range of single precision.
4941 * For the single precision implementation this is obviously always
4942 * true for positive values, but for double precision it adds an
4943 * extra restriction since the first lookup step might have to be
4944 * performed in single precision on some architectures. Note that the
4945 * responsibility for checking falls on you - this routine does not
4946 * check arguments.
4948 static inline void gmx_simdcall invsqrtPairSingleAccuracy(SimdFloat x0, SimdFloat x1, SimdFloat* out0, SimdFloat* out1)
4950 return invsqrtPair(x0, x1, out0, out1);
4953 /*! \brief Calculate 1/x for SIMD float, only targeting single accuracy.
4955 * \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
4956 * GMX_FLOAT_MAX, i.e. within the range of single precision.
4957 * For the single precision implementation this is obviously always
4958 * true for positive values, but for double precision it adds an
4959 * extra restriction since the first lookup step might have to be
4960 * performed in single precision on some architectures. Note that the
4961 * responsibility for checking falls on you - this routine does not
4962 * check arguments.
4963 * \return 1/x. Result is undefined if your argument was invalid.
4965 static inline SimdFloat gmx_simdcall invSingleAccuracy(SimdFloat x)
4967 return inv(x);
4971 /*! \brief Calculate 1/x for masked SIMD floats, only targeting single accuracy.
4973 * \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
4974 * GMX_FLOAT_MAX, i.e. within the range of single precision.
4975 * For the single precision implementation this is obviously always
4976 * true for positive values, but for double precision it adds an
4977 * extra restriction since the first lookup step might have to be
4978 * performed in single precision on some architectures. Note that the
4979 * responsibility for checking falls on you - this routine does not
4980 * check arguments.
4981 * \param m Mask
4982 * \return 1/x for elements where m is true, or 0.0 for masked-out entries.
4984 static inline SimdFloat maskzInvSingleAccuracy(SimdFloat x, SimdFBool m)
4986 return maskzInv(x, m);
4989 /*! \brief Calculate sqrt(x) for SIMD float, always targeting single accuracy.
4991 * \copydetails sqrt(SimdFloat)
4993 template<MathOptimization opt = MathOptimization::Safe>
4994 static inline SimdFloat gmx_simdcall sqrtSingleAccuracy(SimdFloat x)
4996 return sqrt<opt>(x);
4999 /*! \brief Calculate cbrt(x) for SIMD float, always targeting single accuracy.
5001 * \copydetails cbrt(SimdFloat)
5003 static inline SimdFloat gmx_simdcall cbrtSingleAccuracy(SimdFloat x)
5005 return cbrt(x);
5008 /*! \brief Calculate 1/cbrt(x) for SIMD float, always targeting single accuracy.
5010 * \copydetails cbrt(SimdFloat)
5012 static inline SimdFloat gmx_simdcall invcbrtSingleAccuracy(SimdFloat x)
5014 return invcbrt(x);
5017 /*! \brief SIMD float log2(x), only targeting single accuracy. This is the base-2 logarithm.
5019 * \param x Argument, should be >0.
5020 * \result The base-2 logarithm of x. Undefined if argument is invalid.
5022 static inline SimdFloat gmx_simdcall log2SingleAccuracy(SimdFloat x)
5024 return log2(x);
5027 /*! \brief SIMD float log(x), only targeting single accuracy. This is the natural logarithm.
5029 * \param x Argument, should be >0.
5030 * \result The natural logarithm of x. Undefined if argument is invalid.
5032 static inline SimdFloat gmx_simdcall logSingleAccuracy(SimdFloat x)
5034 return log(x);
5037 /*! \brief SIMD float 2^x, only targeting single accuracy.
5039 * \copydetails exp2(SimdFloat)
5041 template<MathOptimization opt = MathOptimization::Safe>
5042 static inline SimdFloat gmx_simdcall exp2SingleAccuracy(SimdFloat x)
5044 return exp2<opt>(x);
5047 /*! \brief SIMD float e^x, only targeting single accuracy.
5049 * \copydetails exp(SimdFloat)
5051 template<MathOptimization opt = MathOptimization::Safe>
5052 static inline SimdFloat gmx_simdcall expSingleAccuracy(SimdFloat x)
5054 return exp<opt>(x);
5057 /*! \brief SIMD pow(x,y), only targeting single accuracy.
5059 * \copydetails pow(SimdFloat,SimdFloat)
5061 template<MathOptimization opt = MathOptimization::Safe>
5062 static inline SimdFloat gmx_simdcall powSingleAccuracy(SimdFloat x, SimdFloat y)
5064 return pow<opt>(x, y);
5067 /*! \brief SIMD float erf(x), only targeting single accuracy.
5069 * \param x The value to calculate erf(x) for.
5070 * \result erf(x)
5072 * This routine achieves very close to single precision, but we do not care about
5073 * the last bit or the subnormal result range.
5075 static inline SimdFloat gmx_simdcall erfSingleAccuracy(SimdFloat x)
5077 return erf(x);
5080 /*! \brief SIMD float erfc(x), only targeting single accuracy.
5082 * \param x The value to calculate erfc(x) for.
5083 * \result erfc(x)
5085 * This routine achieves singleprecision (bar the last bit) over most of the
5086 * input range, but for large arguments where the result is getting close
5087 * to the minimum representable numbers we accept slightly larger errors
5088 * (think results that are in the ballpark of 10^-30) since that is not
5089 * relevant for MD.
5091 static inline SimdFloat gmx_simdcall erfcSingleAccuracy(SimdFloat x)
5093 return erfc(x);
5096 /*! \brief SIMD float sin \& cos, only targeting single accuracy.
5098 * \param x The argument to evaluate sin/cos for
5099 * \param[out] sinval Sin(x)
5100 * \param[out] cosval Cos(x)
5102 static inline void gmx_simdcall sinCosSingleAccuracy(SimdFloat x, SimdFloat* sinval, SimdFloat* cosval)
5104 sincos(x, sinval, cosval);
5107 /*! \brief SIMD float sin(x), only targeting single accuracy.
5109 * \param x The argument to evaluate sin for
5110 * \result Sin(x)
5112 * \attention Do NOT call both sin & cos if you need both results, since each of them
5113 * will then call \ref sincos and waste a factor 2 in performance.
5115 static inline SimdFloat gmx_simdcall sinSingleAccuracy(SimdFloat x)
5117 return sin(x);
5120 /*! \brief SIMD float cos(x), only targeting single accuracy.
5122 * \param x The argument to evaluate cos for
5123 * \result Cos(x)
5125 * \attention Do NOT call both sin & cos if you need both results, since each of them
5126 * will then call \ref sincos and waste a factor 2 in performance.
5128 static inline SimdFloat gmx_simdcall cosSingleAccuracy(SimdFloat x)
5130 return cos(x);
5133 /*! \brief SIMD float tan(x), only targeting single accuracy.
5135 * \param x The argument to evaluate tan for
5136 * \result Tan(x)
5138 static inline SimdFloat gmx_simdcall tanSingleAccuracy(SimdFloat x)
5140 return tan(x);
5143 /*! \brief SIMD float asin(x), only targeting single accuracy.
5145 * \param x The argument to evaluate asin for
5146 * \result Asin(x)
5148 static inline SimdFloat gmx_simdcall asinSingleAccuracy(SimdFloat x)
5150 return asin(x);
5153 /*! \brief SIMD float acos(x), only targeting single accuracy.
5155 * \param x The argument to evaluate acos for
5156 * \result Acos(x)
5158 static inline SimdFloat gmx_simdcall acosSingleAccuracy(SimdFloat x)
5160 return acos(x);
5163 /*! \brief SIMD float atan(x), only targeting single accuracy.
5165 * \param x The argument to evaluate atan for
5166 * \result Atan(x), same argument/value range as standard math library.
5168 static inline SimdFloat gmx_simdcall atanSingleAccuracy(SimdFloat x)
5170 return atan(x);
5173 /*! \brief SIMD float atan2(y,x), only targeting single accuracy.
5175 * \param y Y component of vector, any quartile
5176 * \param x X component of vector, any quartile
5177 * \result Atan(y,x), same argument/value range as standard math library.
5179 * \note This routine should provide correct results for all finite
5180 * non-zero or positive-zero arguments. However, negative zero arguments will
5181 * be treated as positive zero, which means the return value will deviate from
5182 * the standard math library atan2(y,x) for those cases. That should not be
5183 * of any concern in Gromacs, and in particular it will not affect calculations
5184 * of angles from vectors.
5186 static inline SimdFloat gmx_simdcall atan2SingleAccuracy(SimdFloat y, SimdFloat x)
5188 return atan2(y, x);
5191 /*! \brief SIMD Analytic PME force correction, only targeting single accuracy.
5193 * \param z2 \f$(r \beta)^2\f$ - see default single precision version for details.
5194 * \result Correction factor to coulomb force.
5196 static inline SimdFloat gmx_simdcall pmeForceCorrectionSingleAccuracy(SimdFloat z2)
5198 return pmeForceCorrection(z2);
5201 /*! \brief SIMD Analytic PME potential correction, only targeting single accuracy.
5203 * \param z2 \f$(r \beta)^2\f$ - see default single precision version for details.
5204 * \result Correction factor to coulomb force.
5206 static inline SimdFloat gmx_simdcall pmePotentialCorrectionSingleAccuracy(SimdFloat z2)
5208 return pmePotentialCorrection(z2);
5210 # endif // GMX_SIMD_HAVE_FLOAT
5212 # if GMX_SIMD4_HAVE_FLOAT
5213 /*! \brief Calculate 1/sqrt(x) for SIMD4 float, only targeting single accuracy.
5215 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
5216 * GMX_FLOAT_MAX, i.e. within the range of single precision.
5217 * For the single precision implementation this is obviously always
5218 * true for positive values, but for double precision it adds an
5219 * extra restriction since the first lookup step might have to be
5220 * performed in single precision on some architectures. Note that the
5221 * responsibility for checking falls on you - this routine does not
5222 * check arguments.
5223 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
5225 static inline Simd4Float gmx_simdcall invsqrtSingleAccuracy(Simd4Float x)
5227 return invsqrt(x);
5229 # endif // GMX_SIMD4_HAVE_FLOAT
5231 /*! \} end of addtogroup module_simd */
5232 /*! \endcond end of condition libabl */
5234 #endif // GMX_SIMD
5236 } // namespace gmx
5238 #endif // GMX_SIMD_SIMD_MATH_H