2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017 by the GROMACS development team.
5 * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
6 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
7 * and including many others, as listed in the AUTHORS file in the
8 * top-level source directory and at http://www.gromacs.org.
10 * GROMACS is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public License
12 * as published by the Free Software Foundation; either version 2.1
13 * of the License, or (at your option) any later version.
15 * GROMACS is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with GROMACS; if not, see
22 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
23 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 * If you want to redistribute modifications to GROMACS, please
26 * consider that scientific software is very special. Version
27 * control is crucial - bugs must be traceable. We will be happy to
28 * consider code for inclusion in the official distribution, but
29 * derived work must not be called official GROMACS. Details are found
30 * in the README & COPYING files - if they are missing, get the
31 * official version at http://www.gromacs.org.
33 * To help us fund GROMACS development, we humbly ask that you cite
34 * the research papers on the package. Check out http://www.gromacs.org.
36 #ifndef GMX_SIMD_SIMD_MATH_H
37 #define GMX_SIMD_SIMD_MATH_H
39 /*! \libinternal \file
41 * \brief Math functions for SIMD datatypes.
43 * \attention This file is generic for all SIMD architectures, so you cannot
44 * assume that any of the optional SIMD features (as defined in simd.h) are
45 * present. In particular, this means you cannot assume support for integers,
46 * logical operations (neither on floating-point nor integer values), shifts,
47 * and the architecture might only have SIMD for either float or double.
48 * Second, to keep this file clean and general, any additions to this file
49 * must work for all possible SIMD architectures in both single and double
50 * precision (if they support it), and you cannot make any assumptions about
53 * \author Erik Lindahl <erik.lindahl@scilifelab.se>
56 * \ingroup module_simd
65 #include "gromacs/math/utilities.h"
66 #include "gromacs/simd/simd.h"
67 #include "gromacs/utility/basedefinitions.h"
68 #include "gromacs/utility/real.h"
76 /*! \addtogroup module_simd */
79 /*! \name Implementation accuracy settings
85 # if GMX_SIMD_HAVE_FLOAT
87 /*! \name Single precision SIMD math functions
89 * \note In most cases you should use the real-precision functions instead.
93 /****************************************
94 * SINGLE PRECISION SIMD MATH FUNCTIONS *
95 ****************************************/
97 # if !GMX_SIMD_HAVE_NATIVE_COPYSIGN_FLOAT
98 /*! \brief Composes floating point value with the magnitude of x and the sign of y.
100 * \param x Values to set sign for
101 * \param y Values used to set sign
102 * \return Magnitude of x, sign of y
104 static inline SimdFloat gmx_simdcall
copysign(SimdFloat x
, SimdFloat y
)
106 # if GMX_SIMD_HAVE_LOGICAL
107 return abs(x
) | (SimdFloat(GMX_FLOAT_NEGZERO
) & y
);
109 return blend(abs(x
), -abs(x
), y
< setZero());
114 # if !GMX_SIMD_HAVE_NATIVE_RSQRT_ITER_FLOAT
115 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD float.
117 * This is a low-level routine that should only be used by SIMD math routine
118 * that evaluates the inverse square root.
120 * \param lu Approximation of 1/sqrt(x), typically obtained from lookup.
121 * \param x The reference (starting) value x for which we want 1/sqrt(x).
122 * \return An improved approximation with roughly twice as many bits of accuracy.
124 static inline SimdFloat gmx_simdcall
rsqrtIter(SimdFloat lu
, SimdFloat x
)
126 SimdFloat tmp1
= x
* lu
;
127 SimdFloat tmp2
= SimdFloat(-0.5F
) * lu
;
128 tmp1
= fma(tmp1
, lu
, SimdFloat(-3.0F
));
133 /*! \brief Calculate 1/sqrt(x) for SIMD float.
135 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
136 * GMX_FLOAT_MAX, i.e. within the range of single precision.
137 * For the single precision implementation this is obviously always
138 * true for positive values, but for double precision it adds an
139 * extra restriction since the first lookup step might have to be
140 * performed in single precision on some architectures. Note that the
141 * responsibility for checking falls on you - this routine does not
144 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
146 static inline SimdFloat gmx_simdcall
invsqrt(SimdFloat x
)
148 SimdFloat lu
= rsqrt(x
);
149 # if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
150 lu
= rsqrtIter(lu
, x
);
152 # if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
153 lu
= rsqrtIter(lu
, x
);
155 # if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
156 lu
= rsqrtIter(lu
, x
);
161 /*! \brief Calculate 1/sqrt(x) for two SIMD floats.
163 * \param x0 First set of arguments, x0 must be in single range (see below).
164 * \param x1 Second set of arguments, x1 must be in single range (see below).
165 * \param[out] out0 Result 1/sqrt(x0)
166 * \param[out] out1 Result 1/sqrt(x1)
168 * In particular for double precision we can sometimes calculate square root
169 * pairs slightly faster by using single precision until the very last step.
171 * \note Both arguments must be larger than GMX_FLOAT_MIN and smaller than
172 * GMX_FLOAT_MAX, i.e. within the range of single precision.
173 * For the single precision implementation this is obviously always
174 * true for positive values, but for double precision it adds an
175 * extra restriction since the first lookup step might have to be
176 * performed in single precision on some architectures. Note that the
177 * responsibility for checking falls on you - this routine does not
180 static inline void gmx_simdcall
invsqrtPair(SimdFloat x0
, SimdFloat x1
, SimdFloat
* out0
, SimdFloat
* out1
)
186 # if !GMX_SIMD_HAVE_NATIVE_RCP_ITER_FLOAT
187 /*! \brief Perform one Newton-Raphson iteration to improve 1/x for SIMD float.
189 * This is a low-level routine that should only be used by SIMD math routine
190 * that evaluates the reciprocal.
192 * \param lu Approximation of 1/x, typically obtained from lookup.
193 * \param x The reference (starting) value x for which we want 1/x.
194 * \return An improved approximation with roughly twice as many bits of accuracy.
196 static inline SimdFloat gmx_simdcall
rcpIter(SimdFloat lu
, SimdFloat x
)
198 return lu
* fnma(lu
, x
, SimdFloat(2.0F
));
202 /*! \brief Calculate 1/x for SIMD float.
204 * \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
205 * GMX_FLOAT_MAX, i.e. within the range of single precision.
206 * For the single precision implementation this is obviously always
207 * true for positive values, but for double precision it adds an
208 * extra restriction since the first lookup step might have to be
209 * performed in single precision on some architectures. Note that the
210 * responsibility for checking falls on you - this routine does not
213 * \return 1/x. Result is undefined if your argument was invalid.
215 static inline SimdFloat gmx_simdcall
inv(SimdFloat x
)
217 SimdFloat lu
= rcp(x
);
218 # if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
221 # if (GMX_SIMD_RCP_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
224 # if (GMX_SIMD_RCP_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
230 /*! \brief Division for SIMD floats
232 * \param nom Nominator
233 * \param denom Denominator, with magnitude in range (GMX_FLOAT_MIN,GMX_FLOAT_MAX).
234 * For single precision this is equivalent to a nonzero argument,
235 * but in double precision it adds an extra restriction since
236 * the first lookup step might have to be performed in single
237 * precision on some architectures. Note that the responsibility
238 * for checking falls on you - this routine does not check arguments.
242 * \note This function does not use any masking to avoid problems with
243 * zero values in the denominator.
245 static inline SimdFloat gmx_simdcall
operator/(SimdFloat nom
, SimdFloat denom
)
247 return nom
* inv(denom
);
250 /*! \brief Calculate 1/sqrt(x) for masked entries of SIMD float.
252 * This routine only evaluates 1/sqrt(x) for elements for which mask is true.
253 * Illegal values in the masked-out elements will not lead to
254 * floating-point exceptions.
256 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
257 * GMX_FLOAT_MAX for masked-in entries.
258 * See \ref invsqrt for the discussion about argument restrictions.
260 * \return 1/sqrt(x). Result is undefined if your argument was invalid or
261 * entry was not masked, and 0.0 for masked-out entries.
263 static inline SimdFloat
maskzInvsqrt(SimdFloat x
, SimdFBool m
)
265 SimdFloat lu
= maskzRsqrt(x
, m
);
266 # if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
267 lu
= rsqrtIter(lu
, x
);
269 # if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
270 lu
= rsqrtIter(lu
, x
);
272 # if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
273 lu
= rsqrtIter(lu
, x
);
278 /*! \brief Calculate 1/x for SIMD float, masked version.
280 * \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
281 * GMX_FLOAT_MAX for masked-in entries.
282 * See \ref invsqrt for the discussion about argument restrictions.
284 * \return 1/x for elements where m is true, or 0.0 for masked-out entries.
286 static inline SimdFloat gmx_simdcall
maskzInv(SimdFloat x
, SimdFBool m
)
288 SimdFloat lu
= maskzRcp(x
, m
);
289 # if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
292 # if (GMX_SIMD_RCP_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
295 # if (GMX_SIMD_RCP_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
301 /*! \brief Calculate sqrt(x) for SIMD floats
303 * \tparam opt By default, this function checks if the input value is 0.0
304 * and masks this to return the correct result. If you are certain
305 * your argument will never be zero, and you know you need to save
306 * every single cycle you can, you can alternatively call the
307 * function as sqrt<MathOptimization::Unsafe>(x).
309 * \param x Argument that must be in range 0 <=x <= GMX_FLOAT_MAX, since the
310 * lookup step often has to be implemented in single precision.
311 * Arguments smaller than GMX_FLOAT_MIN will always lead to a zero
312 * result, even in double precision. If you are using the unsafe
313 * math optimization parameter, the argument must be in the range
314 * GMX_FLOAT_MIN <= x <= GMX_FLOAT_MAX.
316 * \return sqrt(x). The result is undefined if the input value does not fall
317 * in the allowed range specified for the argument.
319 template<MathOptimization opt
= MathOptimization::Safe
>
320 static inline SimdFloat gmx_simdcall
sqrt(SimdFloat x
)
322 if (opt
== MathOptimization::Safe
)
324 SimdFloat res
= maskzInvsqrt(x
, setZero() < x
);
329 return x
* invsqrt(x
);
333 /*! \brief Cube root for SIMD floats
335 * \param x Argument to calculate cube root of. Can be negative or zero,
336 * but NaN or Inf values are not supported. Denormal values will
338 * \return Cube root of x.
340 static inline SimdFloat gmx_simdcall
cbrt(SimdFloat x
)
342 const SimdFloat
signBit(GMX_FLOAT_NEGZERO
);
343 const SimdFloat
minFloat(std::numeric_limits
<float>::min());
344 // Bias is 128-1 = 127, which is not divisible by 3. Since the largest-magnitude
345 // negative exponent from frexp() is -126, we can subtract one more unit to get 126
346 // as offset, which is divisible by 3 (result 42). To avoid clang warnings about fragile integer
347 // division mixed with FP, we let the divided value (42) be the original constant.
348 const std::int32_t offsetDiv3(42);
349 const SimdFloat
c2(-0.191502161678719066F
);
350 const SimdFloat
c1(0.697570460207922770F
);
351 const SimdFloat
c0(0.492659620528969547F
);
352 const SimdFloat
one(1.0F
);
353 const SimdFloat
two(2.0F
);
354 const SimdFloat
three(3.0F
);
355 const SimdFloat
oneThird(1.0F
/ 3.0F
);
356 const SimdFloat
cbrt2(1.2599210498948731648F
);
357 const SimdFloat
sqrCbrt2(1.5874010519681994748F
);
359 // To calculate cbrt(x) we first take the absolute value of x but save the sign,
360 // since cbrt(-x) = -cbrt(x). Then we only need to consider positive values for
362 // A number x is represented in IEEE754 as fraction*2^e. We rewrite this as
363 // x=fraction*2^(3*n)*2^m, where e=3*n+m, and m is a remainder.
364 // The cube root can the be evaluated by calculating the cube root of the fraction
365 // limited to the mantissa range, multiplied by 2^mod (which is either 1, +/-2^(1/3) or
366 // +/-2^(2/3), and then we load this into a new IEEE754 fp number with the exponent 2^n, where
367 // n is the integer part of the original exponent divided by 3.
369 SimdFloat xSignBit
= x
& signBit
; // create bit mask where the sign bit is 1 for x elements < 0
370 SimdFloat xAbs
= andNot(signBit
, x
); // select everthing but the sign bit => abs(x)
371 SimdFBool xIsNonZero
= (minFloat
<= xAbs
); // treat denormals as 0
374 SimdFloat y
= frexp(xAbs
, &exponent
);
375 // For the mantissa (y) we will use a limited-range approximation of cbrt(y),
376 // by first using a polynomial and then evaluating
377 // Transform y to z = c2*y^2 + c1*y + c0, then w = z^3, and finally
378 // evaluate the quotient q = z * (w + 2 * y) / (2 * w + y).
379 SimdFloat z
= fma(fma(y
, c2
, c1
), y
, c0
);
380 SimdFloat w
= z
* z
* z
;
381 SimdFloat nom
= z
* fma(two
, y
, w
);
382 SimdFloat invDenom
= inv(fma(two
, w
, y
));
384 // Handle the exponent. In principle there are beautiful ways to do this with custom 16-bit
385 // division converted to multiplication... but we can't do that since our SIMD layer cannot
386 // assume the presence of integer shift operations!
387 // However, when I first worked with the integer algorithm I still came up with a neat
388 // optimization, so I'll describe the full algorithm here in case we ever want to use it
391 // Our dividend is signed, which is a complication, but let's consider the unsigned case
392 // first: Division by 3 corresponds to multiplication by 1010101... Since we also know
393 // our dividend is less than 16 bits (exponent range) we can accomplish this by
394 // multiplying with 21845 (which is almost 2^16/3 - 21845.333 would be exact) and then
395 // right-shifting by 16 bits to divide out the 2^16 part.
396 // If we add 1 to the dividend to handle the extra 0.333, the integer result will be correct.
397 // To handle the signed exponent one alternative would be to take absolute values, saving
398 // signs, etc - but that gets a bit complicated with 2-complement integers.
399 // Instead, we remember that we don't really want the exact division per se - what we're
400 // really after is only rewriting e = 3*n+m. That will actually be *easier* to handle if
401 // we require that m must be positive (fewer cases to handle) instead of having n as the
403 // To handle this we start by adding 127 to the exponent. This value corresponds to the
404 // exponent bias, minus 1 because frexp() has a different standard for the value it returns,
405 // but then we add 1 back to handle the extra 0.333 in 21845. So, we have offsetExp = e+127
406 // and then multiply by 21845 to get a division result offsetExpDiv3.
407 // A (signed) value for n is then recovered by subtracting 42 (bias-1)/3 from k.
408 // To calculate a strict remainder we should evaluate offsetExp - 3*offsetExpDiv3 - 1, where
409 // the extra 1 corrects for the value we added to the exponent to get correct division.
410 // This remainder would have the value 0,1, or 2, but since we only use it to select
411 // other numbers we can skip the last step and just handle the cases as 1,2 or 3 instead.
413 // OK; end of long detour. Here's how we actually do it in our implementation by using
414 // floating-point for the exponent instead to avoid needing integer shifts:
416 // 1) Convert the exponent (obtained from frexp) to a float
417 // 2) Calculate offsetExp = exp + offset. Note that we should not add the extra 1 here since we
418 // do floating-point division instead of our integer hack, so it's the exponent bias-1, or
419 // the largest exponent minus 2.
420 // 3) Divide the float by 3 by multiplying with 1/3
421 // 4) Truncate it to an integer to get the division result. This is potentially dangerous in
422 // combination with floating-point, because many integers cannot be represented exactly in
423 // floating point, and if we are just epsilon below the result might be truncated to a lower
424 // integer. I have not observed this on x86, but to have a safety margin we can add a small
425 // fraction - since we already know the fraction part should be either 0, 0.333..., or 0.666...
426 // We can even save this extra floating-point addition by adding a small fraction (0.1) when
427 // we introduce the exponent offset - that will correspond to a safety margin of 0.1/3, which is plenty.
428 // 5) Get the remainder part by subtracting the truncated floating-point part.
429 // Here too we will have a plain division, so the remainder is a strict modulus
430 // and will have the values 0, 1 or 2.
432 // Before worrying about the few wasted cycles due to longer fp latency, this has the
433 // additional advantage that we don't use a single integer operation, so the algorithm
434 // will work just A-OK on all SIMD implementations, which avoids diverging code paths.
436 // The 0.1 here is the safety margin due to truncation described in item 4 in the comments above.
437 SimdFloat offsetExp
= cvtI2R(exponent
) + SimdFloat(static_cast<float>(3 * offsetDiv3
) + 0.1);
439 SimdFloat offsetExpDiv3
=
440 trunc(offsetExp
* oneThird
); // important to truncate here to mimic integer division
442 SimdFInt32 expDiv3
= cvtR2I(offsetExpDiv3
- SimdFloat(static_cast<float>(offsetDiv3
)));
444 SimdFloat remainder
= offsetExp
- offsetExpDiv3
* three
;
446 // If remainder is 0 we should just have the factor 1.0,
447 // so first pick 1.0 if it is below 0.5, and 2^(1/3) if it's above 0.5 (i.e., 1 or 2)
448 SimdFloat factor
= blend(one
, cbrt2
, SimdFloat(0.5) < remainder
);
449 // Second, we overwrite with 2^(2/3) if rem>1.5 (i.e., 2)
450 factor
= blend(factor
, sqrCbrt2
, SimdFloat(1.5) < remainder
);
452 // Assemble the non-signed fraction, and add the sign back by xor
453 SimdFloat fraction
= (nom
* invDenom
* factor
) ^ xSignBit
;
454 // Load to IEEE754 number, and set result to 0.0 if x was 0.0 or denormal
455 SimdFloat result
= selectByMask(ldexp(fraction
, expDiv3
), xIsNonZero
);
460 /*! \brief Inverse cube root for SIMD floats
462 * \param x Argument to calculate cube root of. Can be positive or
463 * negative, but the magnitude cannot be lower than
464 * the smallest normal number.
465 * \return Cube root of x. Undefined for values that don't
466 * fulfill the restriction of abs(x) > minFloat.
468 static inline SimdFloat gmx_simdcall
invcbrt(SimdFloat x
)
470 const SimdFloat
signBit(GMX_FLOAT_NEGZERO
);
471 const SimdFloat
minFloat(std::numeric_limits
<float>::min());
472 // Bias is 128-1 = 127, which is not divisible by 3. Since the largest-magnitude
473 // negative exponent from frexp() is -126, we can subtract one more unit to get 126
474 // as offset, which is divisible by 3 (result 42). To avoid clang warnings about fragile integer
475 // division mixed with FP, we let the divided value (42) be the original constant.
476 const std::int32_t offsetDiv3(42);
477 const SimdFloat
c2(-0.191502161678719066F
);
478 const SimdFloat
c1(0.697570460207922770F
);
479 const SimdFloat
c0(0.492659620528969547F
);
480 const SimdFloat
one(1.0F
);
481 const SimdFloat
two(2.0F
);
482 const SimdFloat
three(3.0F
);
483 const SimdFloat
oneThird(1.0F
/ 3.0F
);
484 const SimdFloat
invCbrt2(1.0F
/ 1.2599210498948731648F
);
485 const SimdFloat
invSqrCbrt2(1.0F
/ 1.5874010519681994748F
);
487 // We use pretty much exactly the same implementation as for cbrt(x),
488 // but to compute the inverse we swap the nominator/denominator
489 // in the quotient, and also swap the sign of the exponent parts.
491 SimdFloat xSignBit
= x
& signBit
; // create bit mask where the sign bit is 1 for x elements < 0
492 SimdFloat xAbs
= andNot(signBit
, x
); // select everthing but the sign bit => abs(x)
495 SimdFloat y
= frexp(xAbs
, &exponent
);
496 // For the mantissa (y) we will use a limited-range approximation of cbrt(y),
497 // by first using a polynomial and then evaluating
498 // Transform y to z = c2*y^2 + c1*y + c0, then w = z^3, and finally
499 // evaluate the quotient q = z * (w + 2 * y) / (2 * w + y).
500 SimdFloat z
= fma(fma(y
, c2
, c1
), y
, c0
);
501 SimdFloat w
= z
* z
* z
;
502 SimdFloat nom
= fma(two
, w
, y
);
503 SimdFloat invDenom
= inv(z
* fma(two
, y
, w
));
505 // The 0.1 here is the safety margin due to truncation described in item 4 in the comments above.
506 SimdFloat offsetExp
= cvtI2R(exponent
) + SimdFloat(static_cast<float>(3 * offsetDiv3
) + 0.1);
507 SimdFloat offsetExpDiv3
=
508 trunc(offsetExp
* oneThird
); // important to truncate here to mimic integer division
510 // We should swap the sign here, so we change order of the terms in the subtraction
511 SimdFInt32 expDiv3
= cvtR2I(SimdFloat(static_cast<float>(offsetDiv3
)) - offsetExpDiv3
);
513 // Swap sign here too, so remainder is either 0, -1 or -2
514 SimdFloat remainder
= offsetExpDiv3
* three
- offsetExp
;
516 // If remainder is 0 we should just have the factor 1.0,
517 // so first pick 1.0 if it is above -0.5, and 2^(-1/3) if it's below -0.5 (i.e., -1 or -2)
518 SimdFloat factor
= blend(one
, invCbrt2
, remainder
< SimdFloat(-0.5));
519 // Second, we overwrite with 2^(-2/3) if rem<-1.5 (i.e., -2)
520 factor
= blend(factor
, invSqrCbrt2
, remainder
< SimdFloat(-1.5));
522 // Assemble the non-signed fraction, and add the sign back by xor
523 SimdFloat fraction
= (nom
* invDenom
* factor
) ^ xSignBit
;
524 // Load to IEEE754 number, and set result to 0.0 if x was 0.0 or denormal
525 SimdFloat result
= ldexp(fraction
, expDiv3
);
530 /*! \brief SIMD float log2(x). This is the base-2 logarithm.
532 * \param x Argument, should be >0.
533 * \result The base-2 logarithm of x. Undefined if argument is invalid.
535 static inline SimdFloat gmx_simdcall
log2(SimdFloat x
)
537 // This implementation computes log2 by
538 // 1) Extracting the exponent and adding it to...
539 // 2) A 9th-order minimax approximation using only odd
540 // terms of (x-1)/(x+1), where x is the mantissa.
542 # if GMX_SIMD_HAVE_NATIVE_LOG_FLOAT
543 // Just rescale if native log2() is not present, but log() is.
544 return log(x
) * SimdFloat(std::log2(std::exp(1.0)));
546 const SimdFloat
one(1.0F
);
547 const SimdFloat
two(2.0F
);
548 const SimdFloat
invsqrt2(1.0F
/ std::sqrt(2.0F
));
549 const SimdFloat
CL9(0.342149508897807708152F
);
550 const SimdFloat
CL7(0.411570606888219447939F
);
551 const SimdFloat
CL5(0.577085979152320294183F
);
552 const SimdFloat
CL3(0.961796550607099898222F
);
553 const SimdFloat
CL1(2.885390081777926774009F
);
554 SimdFloat fExp
, x2
, p
;
558 // For the log2() function, the argument can never be 0, so use the faster version of frexp()
559 x
= frexp
<MathOptimization::Unsafe
>(x
, &iExp
);
563 // Adjust to non-IEEE format for x<1/sqrt(2): exponent -= 1, mantissa *= 2.0
564 fExp
= fExp
- selectByMask(one
, m
);
565 x
= x
* blend(one
, two
, m
);
567 x
= (x
- one
) * inv(x
+ one
);
570 p
= fma(CL9
, x2
, CL7
);
580 # if !GMX_SIMD_HAVE_NATIVE_LOG_FLOAT
581 /*! \brief SIMD float log(x). This is the natural logarithm.
583 * \param x Argument, should be >0.
584 * \result The natural logarithm of x. Undefined if argument is invalid.
586 static inline SimdFloat gmx_simdcall
log(SimdFloat x
)
588 const SimdFloat
one(1.0F
);
589 const SimdFloat
two(2.0F
);
590 const SimdFloat
invsqrt2(1.0F
/ std::sqrt(2.0F
));
591 const SimdFloat
corr(0.693147180559945286226764F
);
592 const SimdFloat
CL9(0.2371599674224853515625F
);
593 const SimdFloat
CL7(0.285279005765914916992188F
);
594 const SimdFloat
CL5(0.400005519390106201171875F
);
595 const SimdFloat
CL3(0.666666567325592041015625F
);
596 const SimdFloat
CL1(2.0F
);
597 SimdFloat fExp
, x2
, p
;
601 // For log(), the argument cannot be 0, so use the faster version of frexp()
602 x
= frexp
<MathOptimization::Unsafe
>(x
, &iExp
);
606 // Adjust to non-IEEE format for x<1/sqrt(2): exponent -= 1, mantissa *= 2.0
607 fExp
= fExp
- selectByMask(one
, m
);
608 x
= x
* blend(one
, two
, m
);
610 x
= (x
- one
) * inv(x
+ one
);
613 p
= fma(CL9
, x2
, CL7
);
617 p
= fma(p
, x
, corr
* fExp
);
623 # if !GMX_SIMD_HAVE_NATIVE_EXP2_FLOAT
624 /*! \brief SIMD float 2^x
626 * \tparam opt If this is changed from the default (safe) into the unsafe
627 * option, input values that would otherwise lead to zero-clamped
628 * results are not allowed and will lead to undefined results.
630 * \param x Argument. For the default (safe) function version this can be
631 * arbitrarily small value, but the routine might clamp the result to
632 * zero for arguments that would produce subnormal IEEE754-2008 results.
633 * This corresponds to inputs below -126 in single or -1022 in double,
634 * and it might overflow for arguments reaching 127 (single) or
635 * 1023 (double). If you enable the unsafe math optimization,
636 * very small arguments will not necessarily be zero-clamped, but
637 * can produce undefined results.
639 * \result 2^x. The result is undefined for very large arguments that cause
640 * internal floating-point overflow. If unsafe optimizations are enabled,
641 * this is also true for very small values.
643 * \note The definition range of this function is just-so-slightly smaller
644 * than the allowed IEEE exponents for many architectures. This is due
645 * to the implementation, which will hopefully improve in the future.
647 * \warning You cannot rely on this implementation returning inf for arguments
648 * that cause overflow. If you have some very large
649 * values and need to rely on getting a valid numerical output,
650 * take the minimum of your variable and the largest valid argument
651 * before calling this routine.
653 template<MathOptimization opt
= MathOptimization::Safe
>
654 static inline SimdFloat gmx_simdcall
exp2(SimdFloat x
)
656 const SimdFloat
CC6(0.0001534581200287996416911311F
);
657 const SimdFloat
CC5(0.001339993121934088894618990F
);
658 const SimdFloat
CC4(0.009618488957115180159497841F
);
659 const SimdFloat
CC3(0.05550328776964726865751735F
);
660 const SimdFloat
CC2(0.2402264689063408646490722F
);
661 const SimdFloat
CC1(0.6931472057372680777553816F
);
662 const SimdFloat
one(1.0F
);
668 // Large negative values are valid arguments to exp2(), so there are two
669 // things we need to account for:
670 // 1. When the exponents reaches -127, the (biased) exponent field will be
671 // zero and we can no longer multiply with it. There are special IEEE
672 // formats to handle this range, but for now we have to accept that
673 // we cannot handle those arguments. If input value becomes even more
674 // negative, it will start to loop and we would end up with invalid
675 // exponents. Thus, we need to limit or mask this.
676 // 2. For VERY large negative values, we will have problems that the
677 // subtraction to get the fractional part loses accuracy, and then we
678 // can end up with overflows in the polynomial.
680 // For now, we handle this by forwarding the math optimization setting to
681 // ldexp, where the routine will return zero for very small arguments.
683 // However, before doing that we need to make sure we do not call cvtR2I
684 // with an argument that is so negative it cannot be converted to an integer.
685 if (opt
== MathOptimization::Safe
)
687 x
= max(x
, SimdFloat(std::numeric_limits
<std::int32_t>::lowest()));
690 fexppart
= ldexp
<opt
>(one
, cvtR2I(x
));
694 p
= fma(CC6
, x
, CC5
);
705 # if !GMX_SIMD_HAVE_NATIVE_EXP_FLOAT
706 /*! \brief SIMD float exp(x).
708 * In addition to scaling the argument for 2^x this routine correctly does
709 * extended precision arithmetics to improve accuracy.
711 * \tparam opt If this is changed from the default (safe) into the unsafe
712 * option, input values that would otherwise lead to zero-clamped
713 * results are not allowed and will lead to undefined results.
715 * \param x Argument. For the default (safe) function version this can be
716 * arbitrarily small value, but the routine might clamp the result to
717 * zero for arguments that would produce subnormal IEEE754-2008 results.
718 * This corresponds to input arguments reaching
719 * -126*ln(2)=-87.3 in single, or -1022*ln(2)=-708.4 (double).
720 * Similarly, it might overflow for arguments reaching
721 * 127*ln(2)=88.0 (single) or 1023*ln(2)=709.1 (double). If the
722 * unsafe math optimizations are enabled, small input values that would
723 * result in zero-clamped output are not allowed.
725 * \result exp(x). Overflowing arguments are likely to either return 0 or inf,
726 * depending on the underlying implementation. If unsafe optimizations
727 * are enabled, this is also true for very small values.
729 * \note The definition range of this function is just-so-slightly smaller
730 * than the allowed IEEE exponents for many architectures. This is due
731 * to the implementation, which will hopefully improve in the future.
733 * \warning You cannot rely on this implementation returning inf for arguments
734 * that cause overflow. If you have some very large
735 * values and need to rely on getting a valid numerical output,
736 * take the minimum of your variable and the largest valid argument
737 * before calling this routine.
739 template<MathOptimization opt
= MathOptimization::Safe
>
740 static inline SimdFloat gmx_simdcall
exp(SimdFloat x
)
742 const SimdFloat
argscale(1.44269504088896341F
);
743 const SimdFloat
invargscale0(-0.693145751953125F
);
744 const SimdFloat
invargscale1(-1.428606765330187045e-06F
);
745 const SimdFloat
CC4(0.00136324646882712841033936F
);
746 const SimdFloat
CC3(0.00836596917361021041870117F
);
747 const SimdFloat
CC2(0.0416710823774337768554688F
);
748 const SimdFloat
CC1(0.166665524244308471679688F
);
749 const SimdFloat
CC0(0.499999850988388061523438F
);
750 const SimdFloat
one(1.0F
);
755 // Large negative values are valid arguments to exp2(), so there are two
756 // things we need to account for:
757 // 1. When the exponents reaches -127, the (biased) exponent field will be
758 // zero and we can no longer multiply with it. There are special IEEE
759 // formats to handle this range, but for now we have to accept that
760 // we cannot handle those arguments. If input value becomes even more
761 // negative, it will start to loop and we would end up with invalid
762 // exponents. Thus, we need to limit or mask this.
763 // 2. For VERY large negative values, we will have problems that the
764 // subtraction to get the fractional part loses accuracy, and then we
765 // can end up with overflows in the polynomial.
767 // For now, we handle this by forwarding the math optimization setting to
768 // ldexp, where the routine will return zero for very small arguments.
770 // However, before doing that we need to make sure we do not call cvtR2I
771 // with an argument that is so negative it cannot be converted to an integer
772 // after being multiplied by argscale.
774 if (opt
== MathOptimization::Safe
)
776 x
= max(x
, SimdFloat(std::numeric_limits
<std::int32_t>::lowest()) / argscale
);
782 fexppart
= ldexp
<opt
>(one
, cvtR2I(y
));
785 // Extended precision arithmetics
786 x
= fma(invargscale0
, intpart
, x
);
787 x
= fma(invargscale1
, intpart
, x
);
789 p
= fma(CC4
, x
, CC3
);
793 p
= fma(x
* x
, p
, x
);
794 # if GMX_SIMD_HAVE_FMA
795 x
= fma(p
, fexppart
, fexppart
);
797 x
= (p
+ one
) * fexppart
;
803 /*! \brief SIMD float pow(x,y)
805 * This returns x^y for SIMD values.
807 * \tparam opt If this is changed from the default (safe) into the unsafe
808 * option, there are no guarantees about correct results for x==0.
814 * \result x^y. Overflowing arguments are likely to either return 0 or inf,
815 * depending on the underlying implementation. If unsafe optimizations
816 * are enabled, this is also true for x==0.
818 * \warning You cannot rely on this implementation returning inf for arguments
819 * that cause overflow. If you have some very large
820 * values and need to rely on getting a valid numerical output,
821 * take the minimum of your variable and the largest valid argument
822 * before calling this routine.
824 template<MathOptimization opt
= MathOptimization::Safe
>
825 static inline SimdFloat gmx_simdcall
pow(SimdFloat x
, SimdFloat y
)
829 if (opt
== MathOptimization::Safe
)
831 xcorr
= max(x
, SimdFloat(std::numeric_limits
<float>::min()));
838 SimdFloat result
= exp2
<opt
>(y
* log2(xcorr
));
840 if (opt
== MathOptimization::Safe
)
842 // if x==0 and y>0 we explicitly set the result to 0.0
843 // For any x with y==0, the result will already be 1.0 since we multiply by y (0.0) and call exp().
844 result
= blend(result
, setZero(), x
== setZero() && setZero() < y
);
851 /*! \brief SIMD float erf(x).
853 * \param x The value to calculate erf(x) for.
856 * This routine achieves very close to full precision, but we do not care about
857 * the last bit or the subnormal result range.
859 static inline SimdFloat gmx_simdcall
erf(SimdFloat x
)
861 // Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1]
862 const SimdFloat
CA6(7.853861353153693e-5F
);
863 const SimdFloat
CA5(-8.010193625184903e-4F
);
864 const SimdFloat
CA4(5.188327685732524e-3F
);
865 const SimdFloat
CA3(-2.685381193529856e-2F
);
866 const SimdFloat
CA2(1.128358514861418e-1F
);
867 const SimdFloat
CA1(-3.761262582423300e-1F
);
868 const SimdFloat
CA0(1.128379165726710F
);
869 // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2]
870 const SimdFloat
CB9(-0.0018629930017603923F
);
871 const SimdFloat
CB8(0.003909821287598495F
);
872 const SimdFloat
CB7(-0.0052094582210355615F
);
873 const SimdFloat
CB6(0.005685614362160572F
);
874 const SimdFloat
CB5(-0.0025367682853477272F
);
875 const SimdFloat
CB4(-0.010199799682318782F
);
876 const SimdFloat
CB3(0.04369575504816542F
);
877 const SimdFloat
CB2(-0.11884063474674492F
);
878 const SimdFloat
CB1(0.2732120154030589F
);
879 const SimdFloat
CB0(0.42758357702025784F
);
880 // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19]
881 const SimdFloat
CC10(-0.0445555913112064F
);
882 const SimdFloat
CC9(0.21376355144663348F
);
883 const SimdFloat
CC8(-0.3473187200259257F
);
884 const SimdFloat
CC7(0.016690861551248114F
);
885 const SimdFloat
CC6(0.7560973182491192F
);
886 const SimdFloat
CC5(-1.2137903600145787F
);
887 const SimdFloat
CC4(0.8411872321232948F
);
888 const SimdFloat
CC3(-0.08670413896296343F
);
889 const SimdFloat
CC2(-0.27124782687240334F
);
890 const SimdFloat
CC1(-0.0007502488047806069F
);
891 const SimdFloat
CC0(0.5642114853803148F
);
892 const SimdFloat
one(1.0F
);
893 const SimdFloat
two(2.0F
);
896 SimdFloat t
, t2
, w
, w2
;
897 SimdFloat pA0
, pA1
, pB0
, pB1
, pC0
, pC1
;
899 SimdFloat res_erf
, res_erfc
, res
;
900 SimdFBool m
, maskErf
;
906 pA0
= fma(CA6
, x4
, CA4
);
907 pA1
= fma(CA5
, x4
, CA3
);
908 pA0
= fma(pA0
, x4
, CA2
);
909 pA1
= fma(pA1
, x4
, CA1
);
911 pA0
= fma(pA1
, x2
, pA0
);
912 // Constant term must come last for precision reasons
919 maskErf
= SimdFloat(0.75F
) <= y
;
920 t
= maskzInv(y
, maskErf
);
925 // No need for a floating-point sieve here (as in erfc), since erf()
926 // will never return values that are extremely small for large args.
927 expmx2
= exp(-y
* y
);
929 pB1
= fma(CB9
, w2
, CB7
);
930 pB0
= fma(CB8
, w2
, CB6
);
931 pB1
= fma(pB1
, w2
, CB5
);
932 pB0
= fma(pB0
, w2
, CB4
);
933 pB1
= fma(pB1
, w2
, CB3
);
934 pB0
= fma(pB0
, w2
, CB2
);
935 pB1
= fma(pB1
, w2
, CB1
);
936 pB0
= fma(pB0
, w2
, CB0
);
937 pB0
= fma(pB1
, w
, pB0
);
939 pC0
= fma(CC10
, t2
, CC8
);
940 pC1
= fma(CC9
, t2
, CC7
);
941 pC0
= fma(pC0
, t2
, CC6
);
942 pC1
= fma(pC1
, t2
, CC5
);
943 pC0
= fma(pC0
, t2
, CC4
);
944 pC1
= fma(pC1
, t2
, CC3
);
945 pC0
= fma(pC0
, t2
, CC2
);
946 pC1
= fma(pC1
, t2
, CC1
);
948 pC0
= fma(pC0
, t2
, CC0
);
949 pC0
= fma(pC1
, t
, pC0
);
952 // Select pB0 or pC0 for erfc()
954 res_erfc
= blend(pB0
, pC0
, m
);
955 res_erfc
= res_erfc
* expmx2
;
957 // erfc(x<0) = 2-erfc(|x|)
959 res_erfc
= blend(res_erfc
, two
- res_erfc
, m
);
961 // Select erf() or erfc()
962 res
= blend(res_erf
, one
- res_erfc
, maskErf
);
967 /*! \brief SIMD float erfc(x).
969 * \param x The value to calculate erfc(x) for.
972 * This routine achieves full precision (bar the last bit) over most of the
973 * input range, but for large arguments where the result is getting close
974 * to the minimum representable numbers we accept slightly larger errors
975 * (think results that are in the ballpark of 10^-30 for single precision)
976 * since that is not relevant for MD.
978 static inline SimdFloat gmx_simdcall
erfc(SimdFloat x
)
980 // Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1]
981 const SimdFloat
CA6(7.853861353153693e-5F
);
982 const SimdFloat
CA5(-8.010193625184903e-4F
);
983 const SimdFloat
CA4(5.188327685732524e-3F
);
984 const SimdFloat
CA3(-2.685381193529856e-2F
);
985 const SimdFloat
CA2(1.128358514861418e-1F
);
986 const SimdFloat
CA1(-3.761262582423300e-1F
);
987 const SimdFloat
CA0(1.128379165726710F
);
988 // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2]
989 const SimdFloat
CB9(-0.0018629930017603923F
);
990 const SimdFloat
CB8(0.003909821287598495F
);
991 const SimdFloat
CB7(-0.0052094582210355615F
);
992 const SimdFloat
CB6(0.005685614362160572F
);
993 const SimdFloat
CB5(-0.0025367682853477272F
);
994 const SimdFloat
CB4(-0.010199799682318782F
);
995 const SimdFloat
CB3(0.04369575504816542F
);
996 const SimdFloat
CB2(-0.11884063474674492F
);
997 const SimdFloat
CB1(0.2732120154030589F
);
998 const SimdFloat
CB0(0.42758357702025784F
);
999 // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19]
1000 const SimdFloat
CC10(-0.0445555913112064F
);
1001 const SimdFloat
CC9(0.21376355144663348F
);
1002 const SimdFloat
CC8(-0.3473187200259257F
);
1003 const SimdFloat
CC7(0.016690861551248114F
);
1004 const SimdFloat
CC6(0.7560973182491192F
);
1005 const SimdFloat
CC5(-1.2137903600145787F
);
1006 const SimdFloat
CC4(0.8411872321232948F
);
1007 const SimdFloat
CC3(-0.08670413896296343F
);
1008 const SimdFloat
CC2(-0.27124782687240334F
);
1009 const SimdFloat
CC1(-0.0007502488047806069F
);
1010 const SimdFloat
CC0(0.5642114853803148F
);
1011 // Coefficients for expansion of exp(x) in [0,0.1]
1012 // CD0 and CD1 are both 1.0, so no need to declare them separately
1013 const SimdFloat
CD2(0.5000066608081202F
);
1014 const SimdFloat
CD3(0.1664795422874624F
);
1015 const SimdFloat
CD4(0.04379839977652482F
);
1016 const SimdFloat
one(1.0F
);
1017 const SimdFloat
two(2.0F
);
1019 /* We need to use a small trick here, since we cannot assume all SIMD
1020 * architectures support integers, and the flag we want (0xfffff000) would
1021 * evaluate to NaN (i.e., it cannot be expressed as a floating-point num).
1022 * Instead, we represent the flags 0xf0f0f000 and 0x0f0f0000 as valid
1023 * fp numbers, and perform a logical or. Since the expression is constant,
1024 * we can at least hope it is evaluated at compile-time.
1026 # if GMX_SIMD_HAVE_LOGICAL
1027 const SimdFloat
sieve(SimdFloat(-5.965323564e+29F
) | SimdFloat(7.05044434e-30F
));
1029 const int isieve
= 0xFFFFF000;
1030 alignas(GMX_SIMD_ALIGNMENT
) float mem
[GMX_SIMD_FLOAT_WIDTH
];
1039 SimdFloat x2
, x4
, y
;
1040 SimdFloat q
, z
, t
, t2
, w
, w2
;
1041 SimdFloat pA0
, pA1
, pB0
, pB1
, pC0
, pC1
;
1042 SimdFloat expmx2
, corr
;
1043 SimdFloat res_erf
, res_erfc
, res
;
1044 SimdFBool m
, msk_erf
;
1050 pA0
= fma(CA6
, x4
, CA4
);
1051 pA1
= fma(CA5
, x4
, CA3
);
1052 pA0
= fma(pA0
, x4
, CA2
);
1053 pA1
= fma(pA1
, x4
, CA1
);
1055 pA0
= fma(pA0
, x4
, pA1
);
1056 // Constant term must come last for precision reasons
1063 msk_erf
= SimdFloat(0.75F
) <= y
;
1064 t
= maskzInv(y
, msk_erf
);
1069 * We cannot simply calculate exp(-y2) directly in single precision, since
1070 * that will lose a couple of bits of precision due to the multiplication.
1071 * Instead, we introduce y=z+w, where the last 12 bits of precision are in w.
1072 * Then we get exp(-y2) = exp(-z2)*exp((z-y)*(z+y)).
1074 * The only drawback with this is that it requires TWO separate exponential
1075 * evaluations, which would be horrible performance-wise. However, the argument
1076 * for the second exp() call is always small, so there we simply use a
1077 * low-order minimax expansion on [0,0.1].
1079 * However, this neat idea requires support for logical ops (and) on
1080 * FP numbers, which some vendors decided isn't necessary in their SIMD
1081 * instruction sets (Hi, IBM VSX!). In principle we could use some tricks
1082 * in double, but we still need memory as a backup when that is not available,
1083 * and this case is rare enough that we go directly there...
1085 # if GMX_SIMD_HAVE_LOGICAL
1089 for (i
= 0; i
< GMX_SIMD_FLOAT_WIDTH
; i
++)
1092 conv
.i
= conv
.i
& isieve
;
1095 z
= load
<SimdFloat
>(mem
);
1097 q
= (z
- y
) * (z
+ y
);
1098 corr
= fma(CD4
, q
, CD3
);
1099 corr
= fma(corr
, q
, CD2
);
1100 corr
= fma(corr
, q
, one
);
1101 corr
= fma(corr
, q
, one
);
1103 expmx2
= exp(-z
* z
);
1104 expmx2
= expmx2
* corr
;
1106 pB1
= fma(CB9
, w2
, CB7
);
1107 pB0
= fma(CB8
, w2
, CB6
);
1108 pB1
= fma(pB1
, w2
, CB5
);
1109 pB0
= fma(pB0
, w2
, CB4
);
1110 pB1
= fma(pB1
, w2
, CB3
);
1111 pB0
= fma(pB0
, w2
, CB2
);
1112 pB1
= fma(pB1
, w2
, CB1
);
1113 pB0
= fma(pB0
, w2
, CB0
);
1114 pB0
= fma(pB1
, w
, pB0
);
1116 pC0
= fma(CC10
, t2
, CC8
);
1117 pC1
= fma(CC9
, t2
, CC7
);
1118 pC0
= fma(pC0
, t2
, CC6
);
1119 pC1
= fma(pC1
, t2
, CC5
);
1120 pC0
= fma(pC0
, t2
, CC4
);
1121 pC1
= fma(pC1
, t2
, CC3
);
1122 pC0
= fma(pC0
, t2
, CC2
);
1123 pC1
= fma(pC1
, t2
, CC1
);
1125 pC0
= fma(pC0
, t2
, CC0
);
1126 pC0
= fma(pC1
, t
, pC0
);
1129 // Select pB0 or pC0 for erfc()
1131 res_erfc
= blend(pB0
, pC0
, m
);
1132 res_erfc
= res_erfc
* expmx2
;
1134 // erfc(x<0) = 2-erfc(|x|)
1136 res_erfc
= blend(res_erfc
, two
- res_erfc
, m
);
1138 // Select erf() or erfc()
1139 res
= blend(one
- res_erf
, res_erfc
, msk_erf
);
1144 /*! \brief SIMD float sin \& cos.
1146 * \param x The argument to evaluate sin/cos for
1147 * \param[out] sinval Sin(x)
1148 * \param[out] cosval Cos(x)
1150 * This version achieves close to machine precision, but for very large
1151 * magnitudes of the argument we inherently begin to lose accuracy due to the
1152 * argument reduction, despite using extended precision arithmetics internally.
1154 static inline void gmx_simdcall
sincos(SimdFloat x
, SimdFloat
* sinval
, SimdFloat
* cosval
)
1156 // Constants to subtract Pi/4*x from y while minimizing precision loss
1157 const SimdFloat
argred0(-1.5703125);
1158 const SimdFloat
argred1(-4.83751296997070312500e-04F
);
1159 const SimdFloat
argred2(-7.54953362047672271729e-08F
);
1160 const SimdFloat
argred3(-2.56334406825708960298e-12F
);
1161 const SimdFloat
two_over_pi(static_cast<float>(2.0F
/ M_PI
));
1162 const SimdFloat
const_sin2(-1.9515295891e-4F
);
1163 const SimdFloat
const_sin1(8.3321608736e-3F
);
1164 const SimdFloat
const_sin0(-1.6666654611e-1F
);
1165 const SimdFloat
const_cos2(2.443315711809948e-5F
);
1166 const SimdFloat
const_cos1(-1.388731625493765e-3F
);
1167 const SimdFloat
const_cos0(4.166664568298827e-2F
);
1168 const SimdFloat
half(0.5F
);
1169 const SimdFloat
one(1.0F
);
1170 SimdFloat ssign
, csign
;
1171 SimdFloat x2
, y
, z
, psin
, pcos
, sss
, ccc
;
1174 # if GMX_SIMD_HAVE_FINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
1175 const SimdFInt32
ione(1);
1176 const SimdFInt32
itwo(2);
1179 z
= x
* two_over_pi
;
1183 m
= cvtIB2B((iy
& ione
) == SimdFInt32(0));
1184 ssign
= selectByMask(SimdFloat(GMX_FLOAT_NEGZERO
), cvtIB2B((iy
& itwo
) == itwo
));
1185 csign
= selectByMask(SimdFloat(GMX_FLOAT_NEGZERO
), cvtIB2B(((iy
+ ione
) & itwo
) == itwo
));
1187 const SimdFloat
quarter(0.25f
);
1188 const SimdFloat
minusquarter(-0.25f
);
1190 SimdFBool m1
, m2
, m3
;
1192 /* The most obvious way to find the arguments quadrant in the unit circle
1193 * to calculate the sign is to use integer arithmetic, but that is not
1194 * present in all SIMD implementations. As an alternative, we have devised a
1195 * pure floating-point algorithm that uses truncation for argument reduction
1196 * so that we get a new value 0<=q<1 over the unit circle, and then
1197 * do floating-point comparisons with fractions. This is likely to be
1198 * slightly slower (~10%) due to the longer latencies of floating-point, so
1199 * we only use it when integer SIMD arithmetic is not present.
1203 // It is critical that half-way cases are rounded down
1204 z
= fma(x
, two_over_pi
, half
);
1208 /* z now starts at 0.0 for x=-pi/4 (although neg. values cannot occur), and
1209 * then increased by 1.0 as x increases by 2*Pi, when it resets to 0.0.
1210 * This removes the 2*Pi periodicity without using any integer arithmetic.
1211 * First check if y had the value 2 or 3, set csign if true.
1214 /* If we have logical operations we can work directly on the signbit, which
1215 * saves instructions. Otherwise we need to represent signs as +1.0/-1.0.
1216 * Thus, if you are altering defines to debug alternative code paths, the
1217 * two GMX_SIMD_HAVE_LOGICAL sections in this routine must either both be
1218 * active or inactive - you will get errors if only one is used.
1220 # if GMX_SIMD_HAVE_LOGICAL
1221 ssign
= ssign
& SimdFloat(GMX_FLOAT_NEGZERO
);
1222 csign
= andNot(q
, SimdFloat(GMX_FLOAT_NEGZERO
));
1223 ssign
= ssign
^ csign
;
1225 ssign
= copysign(SimdFloat(1.0f
), ssign
);
1226 csign
= copysign(SimdFloat(1.0f
), q
);
1228 ssign
= ssign
* csign
; // swap ssign if csign was set.
1230 // Check if y had value 1 or 3 (remember we subtracted 0.5 from q)
1231 m1
= (q
< minusquarter
);
1232 m2
= (setZero() <= q
);
1236 // where mask is FALSE, swap sign.
1237 csign
= csign
* blend(SimdFloat(-1.0f
), one
, m
);
1239 x
= fma(y
, argred0
, x
);
1240 x
= fma(y
, argred1
, x
);
1241 x
= fma(y
, argred2
, x
);
1242 x
= fma(y
, argred3
, x
);
1245 psin
= fma(const_sin2
, x2
, const_sin1
);
1246 psin
= fma(psin
, x2
, const_sin0
);
1247 psin
= fma(psin
, x
* x2
, x
);
1248 pcos
= fma(const_cos2
, x2
, const_cos1
);
1249 pcos
= fma(pcos
, x2
, const_cos0
);
1250 pcos
= fms(pcos
, x2
, half
);
1251 pcos
= fma(pcos
, x2
, one
);
1253 sss
= blend(pcos
, psin
, m
);
1254 ccc
= blend(psin
, pcos
, m
);
1255 // See comment for GMX_SIMD_HAVE_LOGICAL section above.
1256 # if GMX_SIMD_HAVE_LOGICAL
1257 *sinval
= sss
^ ssign
;
1258 *cosval
= ccc
^ csign
;
1260 *sinval
= sss
* ssign
;
1261 *cosval
= ccc
* csign
;
1265 /*! \brief SIMD float sin(x).
1267 * \param x The argument to evaluate sin for
1270 * \attention Do NOT call both sin & cos if you need both results, since each of them
1271 * will then call \ref sincos and waste a factor 2 in performance.
1273 static inline SimdFloat gmx_simdcall
sin(SimdFloat x
)
1280 /*! \brief SIMD float cos(x).
1282 * \param x The argument to evaluate cos for
1285 * \attention Do NOT call both sin & cos if you need both results, since each of them
1286 * will then call \ref sincos and waste a factor 2 in performance.
1288 static inline SimdFloat gmx_simdcall
cos(SimdFloat x
)
1295 /*! \brief SIMD float tan(x).
1297 * \param x The argument to evaluate tan for
1300 static inline SimdFloat gmx_simdcall
tan(SimdFloat x
)
1302 const SimdFloat
argred0(-1.5703125);
1303 const SimdFloat
argred1(-4.83751296997070312500e-04F
);
1304 const SimdFloat
argred2(-7.54953362047672271729e-08F
);
1305 const SimdFloat
argred3(-2.56334406825708960298e-12F
);
1306 const SimdFloat
two_over_pi(static_cast<float>(2.0F
/ M_PI
));
1307 const SimdFloat
CT6(0.009498288995810566122993911);
1308 const SimdFloat
CT5(0.002895755790837379295226923);
1309 const SimdFloat
CT4(0.02460087336161924491836265);
1310 const SimdFloat
CT3(0.05334912882656359828045988);
1311 const SimdFloat
CT2(0.1333989091464957704418495);
1312 const SimdFloat
CT1(0.3333307599244198227797507);
1314 SimdFloat x2
, p
, y
, z
;
1317 # if GMX_SIMD_HAVE_FINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
1321 z
= x
* two_over_pi
;
1324 m
= cvtIB2B((iy
& ione
) == ione
);
1326 x
= fma(y
, argred0
, x
);
1327 x
= fma(y
, argred1
, x
);
1328 x
= fma(y
, argred2
, x
);
1329 x
= fma(y
, argred3
, x
);
1330 x
= selectByMask(SimdFloat(GMX_FLOAT_NEGZERO
), m
) ^ x
;
1332 const SimdFloat
quarter(0.25f
);
1333 const SimdFloat
half(0.5f
);
1334 const SimdFloat
threequarter(0.75f
);
1336 SimdFBool m1
, m2
, m3
;
1339 z
= fma(w
, two_over_pi
, half
);
1345 m3
= threequarter
<= q
;
1348 w
= fma(y
, argred0
, w
);
1349 w
= fma(y
, argred1
, w
);
1350 w
= fma(y
, argred2
, w
);
1351 w
= fma(y
, argred3
, w
);
1352 w
= blend(w
, -w
, m
);
1353 x
= w
* copysign(SimdFloat(1.0), x
);
1356 p
= fma(CT6
, x2
, CT5
);
1357 p
= fma(p
, x2
, CT4
);
1358 p
= fma(p
, x2
, CT3
);
1359 p
= fma(p
, x2
, CT2
);
1360 p
= fma(p
, x2
, CT1
);
1361 p
= fma(x2
* p
, x
, x
);
1363 p
= blend(p
, maskzInv(p
, m
), m
);
1367 /*! \brief SIMD float asin(x).
1369 * \param x The argument to evaluate asin for
1372 static inline SimdFloat gmx_simdcall
asin(SimdFloat x
)
1374 const SimdFloat
limitlow(1e-4F
);
1375 const SimdFloat
half(0.5F
);
1376 const SimdFloat
one(1.0F
);
1377 const SimdFloat
halfpi(static_cast<float>(M_PI
/ 2.0F
));
1378 const SimdFloat
CC5(4.2163199048E-2F
);
1379 const SimdFloat
CC4(2.4181311049E-2F
);
1380 const SimdFloat
CC3(4.5470025998E-2F
);
1381 const SimdFloat
CC2(7.4953002686E-2F
);
1382 const SimdFloat
CC1(1.6666752422E-1F
);
1384 SimdFloat z
, z1
, z2
, q
, q1
, q2
;
1390 z1
= half
* (one
- xabs
);
1392 q1
= z1
* maskzInvsqrt(z1
, m2
);
1395 z
= blend(z2
, z1
, m
);
1396 q
= blend(q2
, q1
, m
);
1399 pA
= fma(CC5
, z2
, CC3
);
1400 pB
= fma(CC4
, z2
, CC2
);
1401 pA
= fma(pA
, z2
, CC1
);
1403 z
= fma(pB
, z2
, pA
);
1407 z
= blend(z
, q2
, m
);
1409 m
= limitlow
< xabs
;
1410 z
= blend(xabs
, z
, m
);
1416 /*! \brief SIMD float acos(x).
1418 * \param x The argument to evaluate acos for
1421 static inline SimdFloat gmx_simdcall
acos(SimdFloat x
)
1423 const SimdFloat
one(1.0F
);
1424 const SimdFloat
half(0.5F
);
1425 const SimdFloat
pi(static_cast<float>(M_PI
));
1426 const SimdFloat
halfpi(static_cast<float>(M_PI
/ 2.0F
));
1428 SimdFloat z
, z1
, z2
, z3
;
1429 SimdFBool m1
, m2
, m3
;
1435 z
= fnma(half
, xabs
, half
);
1437 z
= z
* maskzInvsqrt(z
, m3
);
1438 z
= blend(x
, z
, m1
);
1444 z
= blend(z1
, z2
, m2
);
1445 z
= blend(z3
, z
, m1
);
1450 /*! \brief SIMD float asin(x).
1452 * \param x The argument to evaluate atan for
1453 * \result Atan(x), same argument/value range as standard math library.
1455 static inline SimdFloat gmx_simdcall
atan(SimdFloat x
)
1457 const SimdFloat
halfpi(static_cast<float>(M_PI
/ 2.0F
));
1458 const SimdFloat
CA17(0.002823638962581753730774F
);
1459 const SimdFloat
CA15(-0.01595690287649631500244F
);
1460 const SimdFloat
CA13(0.04250498861074447631836F
);
1461 const SimdFloat
CA11(-0.07489009201526641845703F
);
1462 const SimdFloat
CA9(0.1063479334115982055664F
);
1463 const SimdFloat
CA7(-0.1420273631811141967773F
);
1464 const SimdFloat
CA5(0.1999269574880599975585F
);
1465 const SimdFloat
CA3(-0.3333310186862945556640F
);
1466 const SimdFloat
one(1.0F
);
1467 SimdFloat x2
, x3
, x4
, pA
, pB
;
1473 x
= blend(x
, maskzInv(x
, m2
), m2
);
1478 pA
= fma(CA17
, x4
, CA13
);
1479 pB
= fma(CA15
, x4
, CA11
);
1480 pA
= fma(pA
, x4
, CA9
);
1481 pB
= fma(pB
, x4
, CA7
);
1482 pA
= fma(pA
, x4
, CA5
);
1483 pB
= fma(pB
, x4
, CA3
);
1484 pA
= fma(pA
, x2
, pB
);
1485 pA
= fma(pA
, x3
, x
);
1487 pA
= blend(pA
, halfpi
- pA
, m2
);
1488 pA
= blend(pA
, -pA
, m
);
1493 /*! \brief SIMD float atan2(y,x).
1495 * \param y Y component of vector, any quartile
1496 * \param x X component of vector, any quartile
1497 * \result Atan(y,x), same argument/value range as standard math library.
1499 * \note This routine should provide correct results for all finite
1500 * non-zero or positive-zero arguments. However, negative zero arguments will
1501 * be treated as positive zero, which means the return value will deviate from
1502 * the standard math library atan2(y,x) for those cases. That should not be
1503 * of any concern in Gromacs, and in particular it will not affect calculations
1504 * of angles from vectors.
1506 static inline SimdFloat gmx_simdcall
atan2(SimdFloat y
, SimdFloat x
)
1508 const SimdFloat
pi(static_cast<float>(M_PI
));
1509 const SimdFloat
halfpi(static_cast<float>(M_PI
/ 2.0));
1510 SimdFloat xinv
, p
, aoffset
;
1511 SimdFBool mask_xnz
, mask_ynz
, mask_xlt0
, mask_ylt0
;
1513 mask_xnz
= x
!= setZero();
1514 mask_ynz
= y
!= setZero();
1515 mask_xlt0
= x
< setZero();
1516 mask_ylt0
= y
< setZero();
1518 aoffset
= selectByNotMask(halfpi
, mask_xnz
);
1519 aoffset
= selectByMask(aoffset
, mask_ynz
);
1521 aoffset
= blend(aoffset
, pi
, mask_xlt0
);
1522 aoffset
= blend(aoffset
, -aoffset
, mask_ylt0
);
1524 xinv
= maskzInv(x
, mask_xnz
);
1532 /*! \brief Calculate the force correction due to PME analytically in SIMD float.
1534 * \param z2 \f$(r \beta)^2\f$ - see below for details.
1535 * \result Correction factor to coulomb force - see below for details.
1537 * This routine is meant to enable analytical evaluation of the
1538 * direct-space PME electrostatic force to avoid tables.
1540 * The direct-space potential should be \f$ \mbox{erfc}(\beta r)/r\f$, but there
1541 * are some problems evaluating that:
1543 * First, the error function is difficult (read: expensive) to
1544 * approxmiate accurately for intermediate to large arguments, and
1545 * this happens already in ranges of \f$(\beta r)\f$ that occur in simulations.
1546 * Second, we now try to avoid calculating potentials in Gromacs but
1547 * use forces directly.
1549 * We can simply things slight by noting that the PME part is really
1550 * a correction to the normal Coulomb force since \f$\mbox{erfc}(z)=1-\mbox{erf}(z)\f$, i.e.
1552 * V = \frac{1}{r} - \frac{\mbox{erf}(\beta r)}{r}
1554 * The first term we already have from the inverse square root, so
1555 * that we can leave out of this routine.
1557 * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
1558 * the argument \f$beta r\f$ will be in the range 0.15 to ~4, which is
1559 * the range used for the minimax fit. Use your favorite plotting program
1560 * to realize how well-behaved \f$\frac{\mbox{erf}(z)}{z}\f$ is in this range!
1562 * We approximate \f$f(z)=\mbox{erf}(z)/z\f$ with a rational minimax polynomial.
1563 * However, it turns out it is more efficient to approximate \f$f(z)/z\f$ and
1564 * then only use even powers. This is another minor optimization, since
1565 * we actually \a want \f$f(z)/z\f$, because it is going to be multiplied by
1566 * the vector between the two atoms to get the vectorial force. The
1567 * fastest flops are the ones we can avoid calculating!
1569 * So, here's how it should be used:
1571 * 1. Calculate \f$r^2\f$.
1572 * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=(\beta r)^2\f$.
1573 * 3. Evaluate this routine with \f$z^2\f$ as the argument.
1574 * 4. The return value is the expression:
1577 * \frac{2 \exp{-z^2}}{\sqrt{\pi} z^2}-\frac{\mbox{erf}(z)}{z^3}
1580 * 5. Multiply the entire expression by \f$\beta^3\f$. This will get you
1583 * \frac{2 \beta^3 \exp(-z^2)}{\sqrt{\pi} z^2} - \frac{\beta^3 \mbox{erf}(z)}{z^3}
1586 * or, switching back to \f$r\f$ (since \f$z=r \beta\f$):
1589 * \frac{2 \beta \exp(-r^2 \beta^2)}{\sqrt{\pi} r^2} - \frac{\mbox{erf}(r \beta)}{r^3}
1592 * With a bit of math exercise you should be able to confirm that
1596 * \frac{\frac{d}{dr}\left( \frac{\mbox{erf}(\beta r)}{r} \right)}{r}
1599 * 6. Add the result to \f$r^{-3}\f$, multiply by the product of the charges,
1600 * and you have your force (divided by \f$r\f$). A final multiplication
1601 * with the vector connecting the two particles and you have your
1602 * vectorial force to add to the particles.
1604 * This approximation achieves an error slightly lower than 1e-6
1605 * in single precision and 1e-11 in double precision
1606 * for arguments smaller than 16 (\f$\beta r \leq 4 \f$);
1607 * when added to \f$1/r\f$ the error will be insignificant.
1608 * For \f$\beta r \geq 7206\f$ the return value can be inf or NaN.
1611 static inline SimdFloat gmx_simdcall
pmeForceCorrection(SimdFloat z2
)
1613 const SimdFloat
FN6(-1.7357322914161492954e-8F
);
1614 const SimdFloat
FN5(1.4703624142580877519e-6F
);
1615 const SimdFloat
FN4(-0.000053401640219807709149F
);
1616 const SimdFloat
FN3(0.0010054721316683106153F
);
1617 const SimdFloat
FN2(-0.019278317264888380590F
);
1618 const SimdFloat
FN1(0.069670166153766424023F
);
1619 const SimdFloat
FN0(-0.75225204789749321333F
);
1621 const SimdFloat
FD4(0.0011193462567257629232F
);
1622 const SimdFloat
FD3(0.014866955030185295499F
);
1623 const SimdFloat
FD2(0.11583842382862377919F
);
1624 const SimdFloat
FD1(0.50736591960530292870F
);
1625 const SimdFloat
FD0(1.0F
);
1628 SimdFloat polyFN0
, polyFN1
, polyFD0
, polyFD1
;
1632 polyFD0
= fma(FD4
, z4
, FD2
);
1633 polyFD1
= fma(FD3
, z4
, FD1
);
1634 polyFD0
= fma(polyFD0
, z4
, FD0
);
1635 polyFD0
= fma(polyFD1
, z2
, polyFD0
);
1637 polyFD0
= inv(polyFD0
);
1639 polyFN0
= fma(FN6
, z4
, FN4
);
1640 polyFN1
= fma(FN5
, z4
, FN3
);
1641 polyFN0
= fma(polyFN0
, z4
, FN2
);
1642 polyFN1
= fma(polyFN1
, z4
, FN1
);
1643 polyFN0
= fma(polyFN0
, z4
, FN0
);
1644 polyFN0
= fma(polyFN1
, z2
, polyFN0
);
1646 return polyFN0
* polyFD0
;
1650 /*! \brief Calculate the potential correction due to PME analytically in SIMD float.
1652 * \param z2 \f$(r \beta)^2\f$ - see below for details.
1653 * \result Correction factor to coulomb potential - see below for details.
1655 * See \ref pmeForceCorrection for details about the approximation.
1657 * This routine calculates \f$\mbox{erf}(z)/z\f$, although you should provide \f$z^2\f$
1658 * as the input argument.
1660 * Here's how it should be used:
1662 * 1. Calculate \f$r^2\f$.
1663 * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=\beta^2*r^2\f$.
1664 * 3. Evaluate this routine with z^2 as the argument.
1665 * 4. The return value is the expression:
1668 * \frac{\mbox{erf}(z)}{z}
1671 * 5. Multiply the entire expression by beta and switching back to \f$r\f$ (since \f$z=r \beta\f$):
1674 * \frac{\mbox{erf}(r \beta)}{r}
1677 * 6. Subtract the result from \f$1/r\f$, multiply by the product of the charges,
1678 * and you have your potential.
1680 * This approximation achieves an error slightly lower than 1e-6
1681 * in single precision and 4e-11 in double precision
1682 * for arguments smaller than 16 (\f$ 0.15 \leq \beta r \leq 4 \f$);
1683 * for \f$ \beta r \leq 0.15\f$ the error can be twice as high;
1684 * when added to \f$1/r\f$ the error will be insignificant.
1685 * For \f$\beta r \geq 7142\f$ the return value can be inf or NaN.
1687 static inline SimdFloat gmx_simdcall
pmePotentialCorrection(SimdFloat z2
)
1689 const SimdFloat
VN6(1.9296833005951166339e-8F
);
1690 const SimdFloat
VN5(-1.4213390571557850962e-6F
);
1691 const SimdFloat
VN4(0.000041603292906656984871F
);
1692 const SimdFloat
VN3(-0.00013134036773265025626F
);
1693 const SimdFloat
VN2(0.038657983986041781264F
);
1694 const SimdFloat
VN1(0.11285044772717598220F
);
1695 const SimdFloat
VN0(1.1283802385263030286F
);
1697 const SimdFloat
VD3(0.0066752224023576045451F
);
1698 const SimdFloat
VD2(0.078647795836373922256F
);
1699 const SimdFloat
VD1(0.43336185284710920150F
);
1700 const SimdFloat
VD0(1.0F
);
1703 SimdFloat polyVN0
, polyVN1
, polyVD0
, polyVD1
;
1707 polyVD1
= fma(VD3
, z4
, VD1
);
1708 polyVD0
= fma(VD2
, z4
, VD0
);
1709 polyVD0
= fma(polyVD1
, z2
, polyVD0
);
1711 polyVD0
= inv(polyVD0
);
1713 polyVN0
= fma(VN6
, z4
, VN4
);
1714 polyVN1
= fma(VN5
, z4
, VN3
);
1715 polyVN0
= fma(polyVN0
, z4
, VN2
);
1716 polyVN1
= fma(polyVN1
, z4
, VN1
);
1717 polyVN0
= fma(polyVN0
, z4
, VN0
);
1718 polyVN0
= fma(polyVN1
, z2
, polyVN0
);
1720 return polyVN0
* polyVD0
;
1726 # if GMX_SIMD_HAVE_DOUBLE
1729 /*! \name Double precision SIMD math functions
1731 * \note In most cases you should use the real-precision functions instead.
1735 /****************************************
1736 * DOUBLE PRECISION SIMD MATH FUNCTIONS *
1737 ****************************************/
1739 # if !GMX_SIMD_HAVE_NATIVE_COPYSIGN_DOUBLE
1740 /*! \brief Composes floating point value with the magnitude of x and the sign of y.
1742 * \param x Values to set sign for
1743 * \param y Values used to set sign
1744 * \return Magnitude of x, sign of y
1746 static inline SimdDouble gmx_simdcall
copysign(SimdDouble x
, SimdDouble y
)
1748 # if GMX_SIMD_HAVE_LOGICAL
1749 return abs(x
) | (SimdDouble(GMX_DOUBLE_NEGZERO
) & y
);
1751 return blend(abs(x
), -abs(x
), (y
< setZero()));
1756 # if !GMX_SIMD_HAVE_NATIVE_RSQRT_ITER_DOUBLE
1757 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD double.
1759 * This is a low-level routine that should only be used by SIMD math routine
1760 * that evaluates the inverse square root.
1762 * \param lu Approximation of 1/sqrt(x), typically obtained from lookup.
1763 * \param x The reference (starting) value x for which we want 1/sqrt(x).
1764 * \return An improved approximation with roughly twice as many bits of accuracy.
1766 static inline SimdDouble gmx_simdcall
rsqrtIter(SimdDouble lu
, SimdDouble x
)
1768 SimdDouble tmp1
= x
* lu
;
1769 SimdDouble tmp2
= SimdDouble(-0.5) * lu
;
1770 tmp1
= fma(tmp1
, lu
, SimdDouble(-3.0));
1775 /*! \brief Calculate 1/sqrt(x) for SIMD double.
1777 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
1778 * GMX_FLOAT_MAX, i.e. within the range of single precision.
1779 * For the single precision implementation this is obviously always
1780 * true for positive values, but for double precision it adds an
1781 * extra restriction since the first lookup step might have to be
1782 * performed in single precision on some architectures. Note that the
1783 * responsibility for checking falls on you - this routine does not
1786 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
1788 static inline SimdDouble gmx_simdcall
invsqrt(SimdDouble x
)
1790 SimdDouble lu
= rsqrt(x
);
1791 # if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1792 lu
= rsqrtIter(lu
, x
);
1794 # if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1795 lu
= rsqrtIter(lu
, x
);
1797 # if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1798 lu
= rsqrtIter(lu
, x
);
1800 # if (GMX_SIMD_RSQRT_BITS * 8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1801 lu
= rsqrtIter(lu
, x
);
1806 /*! \brief Calculate 1/sqrt(x) for two SIMD doubles.
1808 * \param x0 First set of arguments, x0 must be in single range (see below).
1809 * \param x1 Second set of arguments, x1 must be in single range (see below).
1810 * \param[out] out0 Result 1/sqrt(x0)
1811 * \param[out] out1 Result 1/sqrt(x1)
1813 * In particular for double precision we can sometimes calculate square root
1814 * pairs slightly faster by using single precision until the very last step.
1816 * \note Both arguments must be larger than GMX_FLOAT_MIN and smaller than
1817 * GMX_FLOAT_MAX, i.e. within the range of single precision.
1818 * For the single precision implementation this is obviously always
1819 * true for positive values, but for double precision it adds an
1820 * extra restriction since the first lookup step might have to be
1821 * performed in single precision on some architectures. Note that the
1822 * responsibility for checking falls on you - this routine does not
1825 static inline void gmx_simdcall
invsqrtPair(SimdDouble x0
, SimdDouble x1
, SimdDouble
* out0
, SimdDouble
* out1
)
1827 # if GMX_SIMD_HAVE_FLOAT && (GMX_SIMD_FLOAT_WIDTH == 2 * GMX_SIMD_DOUBLE_WIDTH) \
1828 && (GMX_SIMD_RSQRT_BITS < 22)
1829 SimdFloat xf
= cvtDD2F(x0
, x1
);
1830 SimdFloat luf
= rsqrt(xf
);
1831 SimdDouble lu0
, lu1
;
1832 // Intermediate target is single - mantissa+1 bits
1833 # if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
1834 luf
= rsqrtIter(luf
, xf
);
1836 # if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
1837 luf
= rsqrtIter(luf
, xf
);
1839 # if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
1840 luf
= rsqrtIter(luf
, xf
);
1842 cvtF2DD(luf
, &lu0
, &lu1
);
1843 // Last iteration(s) performed in double - if we had 22 bits, this gets us to 44 (~1e-15)
1844 # if (GMX_SIMD_ACCURACY_BITS_SINGLE < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1845 lu0
= rsqrtIter(lu0
, x0
);
1846 lu1
= rsqrtIter(lu1
, x1
);
1848 # if (GMX_SIMD_ACCURACY_BITS_SINGLE * 2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1849 lu0
= rsqrtIter(lu0
, x0
);
1850 lu1
= rsqrtIter(lu1
, x1
);
1855 *out0
= invsqrt(x0
);
1856 *out1
= invsqrt(x1
);
1860 # if !GMX_SIMD_HAVE_NATIVE_RCP_ITER_DOUBLE
1861 /*! \brief Perform one Newton-Raphson iteration to improve 1/x for SIMD double.
1863 * This is a low-level routine that should only be used by SIMD math routine
1864 * that evaluates the reciprocal.
1866 * \param lu Approximation of 1/x, typically obtained from lookup.
1867 * \param x The reference (starting) value x for which we want 1/x.
1868 * \return An improved approximation with roughly twice as many bits of accuracy.
1870 static inline SimdDouble gmx_simdcall
rcpIter(SimdDouble lu
, SimdDouble x
)
1872 return lu
* fnma(lu
, x
, SimdDouble(2.0));
1876 /*! \brief Calculate 1/x for SIMD double.
1878 * \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
1879 * GMX_FLOAT_MAX, i.e. within the range of single precision.
1880 * For the single precision implementation this is obviously always
1881 * true for positive values, but for double precision it adds an
1882 * extra restriction since the first lookup step might have to be
1883 * performed in single precision on some architectures. Note that the
1884 * responsibility for checking falls on you - this routine does not
1887 * \return 1/x. Result is undefined if your argument was invalid.
1889 static inline SimdDouble gmx_simdcall
inv(SimdDouble x
)
1891 SimdDouble lu
= rcp(x
);
1892 # if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1893 lu
= rcpIter(lu
, x
);
1895 # if (GMX_SIMD_RCP_BITS * 2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1896 lu
= rcpIter(lu
, x
);
1898 # if (GMX_SIMD_RCP_BITS * 4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1899 lu
= rcpIter(lu
, x
);
1901 # if (GMX_SIMD_RCP_BITS * 8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1902 lu
= rcpIter(lu
, x
);
1907 /*! \brief Division for SIMD doubles
1909 * \param nom Nominator
1910 * \param denom Denominator, with magnitude in range (GMX_FLOAT_MIN,GMX_FLOAT_MAX).
1911 * For single precision this is equivalent to a nonzero argument,
1912 * but in double precision it adds an extra restriction since
1913 * the first lookup step might have to be performed in single
1914 * precision on some architectures. Note that the responsibility
1915 * for checking falls on you - this routine does not check arguments.
1919 * \note This function does not use any masking to avoid problems with
1920 * zero values in the denominator.
1922 static inline SimdDouble gmx_simdcall
operator/(SimdDouble nom
, SimdDouble denom
)
1924 return nom
* inv(denom
);
1928 /*! \brief Calculate 1/sqrt(x) for masked entries of SIMD double.
1930 * This routine only evaluates 1/sqrt(x) for elements for which mask is true.
1931 * Illegal values in the masked-out elements will not lead to
1932 * floating-point exceptions.
1934 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
1935 * GMX_FLOAT_MAX for masked-in entries.
1936 * See \ref invsqrt for the discussion about argument restrictions.
1938 * \return 1/sqrt(x). Result is undefined if your argument was invalid or
1939 * entry was not masked, and 0.0 for masked-out entries.
1941 static inline SimdDouble
maskzInvsqrt(SimdDouble x
, SimdDBool m
)
1943 SimdDouble lu
= maskzRsqrt(x
, m
);
1944 # if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1945 lu
= rsqrtIter(lu
, x
);
1947 # if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1948 lu
= rsqrtIter(lu
, x
);
1950 # if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1951 lu
= rsqrtIter(lu
, x
);
1953 # if (GMX_SIMD_RSQRT_BITS * 8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1954 lu
= rsqrtIter(lu
, x
);
1959 /*! \brief Calculate 1/x for SIMD double, masked version.
1961 * \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
1962 * GMX_FLOAT_MAX for masked-in entries.
1963 * See \ref invsqrt for the discussion about argument restrictions.
1965 * \return 1/x for elements where m is true, or 0.0 for masked-out entries.
1967 static inline SimdDouble gmx_simdcall
maskzInv(SimdDouble x
, SimdDBool m
)
1969 SimdDouble lu
= maskzRcp(x
, m
);
1970 # if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1971 lu
= rcpIter(lu
, x
);
1973 # if (GMX_SIMD_RCP_BITS * 2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1974 lu
= rcpIter(lu
, x
);
1976 # if (GMX_SIMD_RCP_BITS * 4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1977 lu
= rcpIter(lu
, x
);
1979 # if (GMX_SIMD_RCP_BITS * 8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1980 lu
= rcpIter(lu
, x
);
1986 /*! \brief Calculate sqrt(x) for SIMD doubles.
1988 * \copydetails sqrt(SimdFloat)
1990 template<MathOptimization opt
= MathOptimization::Safe
>
1991 static inline SimdDouble gmx_simdcall
sqrt(SimdDouble x
)
1993 if (opt
== MathOptimization::Safe
)
1995 // As we might use a float version of rsqrt, we mask out small values
1996 SimdDouble res
= maskzInvsqrt(x
, SimdDouble(GMX_FLOAT_MIN
) < x
);
2001 return x
* invsqrt(x
);
2005 /*! \brief Cube root for SIMD doubles
2007 * \param x Argument to calculate cube root of. Can be negative or zero,
2008 * but NaN or Inf values are not supported. Denormal values will
2009 * be treated as 0.0.
2010 * \return Cube root of x.
2012 static inline SimdDouble gmx_simdcall
cbrt(SimdDouble x
)
2014 const SimdDouble
signBit(GMX_DOUBLE_NEGZERO
);
2015 const SimdDouble
minDouble(std::numeric_limits
<double>::min());
2016 // Bias is 1024-1 = 1023, which is divisible by 3, so no need to change it more.
2017 // To avoid clang warnings about fragile integer division mixed with FP, we let
2018 // the divided value (1023/3=341) be the original constant.
2019 const std::int32_t offsetDiv3(341);
2020 const SimdDouble
c6(-0.145263899385486377);
2021 const SimdDouble
c5(0.784932344976639262);
2022 const SimdDouble
c4(-1.83469277483613086);
2023 const SimdDouble
c3(2.44693122563534430);
2024 const SimdDouble
c2(-2.11499494167371287);
2025 const SimdDouble
c1(1.50819193781584896);
2026 const SimdDouble
c0(0.354895765043919860);
2027 const SimdDouble
one(1.0);
2028 const SimdDouble
two(2.0);
2029 const SimdDouble
three(3.0);
2030 const SimdDouble
oneThird(1.0 / 3.0);
2031 const SimdDouble
cbrt2(1.2599210498948731648);
2032 const SimdDouble
sqrCbrt2(1.5874010519681994748);
2034 // See the single precision routines for documentation of the algorithm
2036 SimdDouble xSignBit
= x
& signBit
; // create bit mask where the sign bit is 1 for x elements < 0
2037 SimdDouble xAbs
= andNot(signBit
, x
); // select everthing but the sign bit => abs(x)
2038 SimdDBool xIsNonZero
= (minDouble
<= xAbs
); // treat denormals as 0
2040 SimdDInt32 exponent
;
2041 SimdDouble y
= frexp(xAbs
, &exponent
);
2042 SimdDouble z
= fma(y
, c6
, c5
);
2048 SimdDouble w
= z
* z
* z
;
2049 SimdDouble nom
= z
* fma(two
, y
, w
);
2050 SimdDouble invDenom
= inv(fma(two
, w
, y
));
2052 SimdDouble offsetExp
= cvtI2R(exponent
) + SimdDouble(static_cast<double>(3 * offsetDiv3
) + 0.1);
2053 SimdDouble offsetExpDiv3
=
2054 trunc(offsetExp
* oneThird
); // important to truncate here to mimic integer division
2055 SimdDInt32 expDiv3
= cvtR2I(offsetExpDiv3
- SimdDouble(static_cast<double>(offsetDiv3
)));
2056 SimdDouble remainder
= offsetExp
- offsetExpDiv3
* three
;
2057 SimdDouble factor
= blend(one
, cbrt2
, SimdDouble(0.5) < remainder
);
2058 factor
= blend(factor
, sqrCbrt2
, SimdDouble(1.5) < remainder
);
2059 SimdDouble fraction
= (nom
* invDenom
* factor
) ^ xSignBit
;
2060 SimdDouble result
= selectByMask(ldexp(fraction
, expDiv3
), xIsNonZero
);
2064 /*! \brief Inverse cube root for SIMD doubles.
2066 * \param x Argument to calculate cube root of. Can be positive or
2067 * negative, but the magnitude cannot be lower than
2068 * the smallest normal number.
2069 * \return Cube root of x. Undefined for values that don't
2070 * fulfill the restriction of abs(x) > minDouble.
2072 static inline SimdDouble gmx_simdcall
invcbrt(SimdDouble x
)
2074 const SimdDouble
signBit(GMX_DOUBLE_NEGZERO
);
2075 // Bias is 1024-1 = 1023, which is divisible by 3, so no need to change it more.
2076 // To avoid clang warnings about fragile integer division mixed with FP, we let
2077 // the divided value (1023/3=341) be the original constant.
2078 const std::int32_t offsetDiv3(341);
2079 const SimdDouble
c6(-0.145263899385486377);
2080 const SimdDouble
c5(0.784932344976639262);
2081 const SimdDouble
c4(-1.83469277483613086);
2082 const SimdDouble
c3(2.44693122563534430);
2083 const SimdDouble
c2(-2.11499494167371287);
2084 const SimdDouble
c1(1.50819193781584896);
2085 const SimdDouble
c0(0.354895765043919860);
2086 const SimdDouble
one(1.0);
2087 const SimdDouble
two(2.0);
2088 const SimdDouble
three(3.0);
2089 const SimdDouble
oneThird(1.0 / 3.0);
2090 const SimdDouble
invCbrt2(1.0 / 1.2599210498948731648);
2091 const SimdDouble
invSqrCbrt2(1.0F
/ 1.5874010519681994748);
2093 // See the single precision routines for documentation of the algorithm
2095 SimdDouble xSignBit
= x
& signBit
; // create bit mask where the sign bit is 1 for x elements < 0
2096 SimdDouble xAbs
= andNot(signBit
, x
); // select everthing but the sign bit => abs(x)
2098 SimdDInt32 exponent
;
2099 SimdDouble y
= frexp(xAbs
, &exponent
);
2100 SimdDouble z
= fma(y
, c6
, c5
);
2106 SimdDouble w
= z
* z
* z
;
2107 SimdDouble nom
= fma(two
, w
, y
);
2108 SimdDouble invDenom
= inv(z
* fma(two
, y
, w
));
2109 SimdDouble offsetExp
= cvtI2R(exponent
) + SimdDouble(static_cast<double>(3 * offsetDiv3
) + 0.1);
2110 SimdDouble offsetExpDiv3
=
2111 trunc(offsetExp
* oneThird
); // important to truncate here to mimic integer division
2112 SimdDInt32 expDiv3
= cvtR2I(SimdDouble(static_cast<double>(offsetDiv3
)) - offsetExpDiv3
);
2113 SimdDouble remainder
= offsetExpDiv3
* three
- offsetExp
;
2114 SimdDouble factor
= blend(one
, invCbrt2
, remainder
< SimdDouble(-0.5));
2115 factor
= blend(factor
, invSqrCbrt2
, remainder
< SimdDouble(-1.5));
2116 SimdDouble fraction
= (nom
* invDenom
* factor
) ^ xSignBit
;
2117 SimdDouble result
= ldexp(fraction
, expDiv3
);
2121 /*! \brief SIMD double log2(x). This is the base-2 logarithm.
2123 * \param x Argument, should be >0.
2124 * \result The base-2 logarithm of x. Undefined if argument is invalid.
2126 static inline SimdDouble gmx_simdcall
log2(SimdDouble x
)
2128 # if GMX_SIMD_HAVE_NATIVE_LOG_DOUBLE
2129 // Just rescale if native log2() is not present, but log is.
2130 return log(x
) * SimdDouble(std::log2(std::exp(1.0)));
2132 const SimdDouble
one(1.0);
2133 const SimdDouble
two(2.0);
2134 const SimdDouble
invsqrt2(1.0 / std::sqrt(2.0));
2135 const SimdDouble
CL15(0.2138031565795550370534528);
2136 const SimdDouble
CL13(0.2208884091496370882801159);
2137 const SimdDouble
CL11(0.2623358279761824340958754);
2138 const SimdDouble
CL9(0.3205984930182496084327681);
2139 const SimdDouble
CL7(0.4121985864521960363227038);
2140 const SimdDouble
CL5(0.5770780163410746954610886);
2141 const SimdDouble
CL3(0.9617966939260027547931031);
2142 const SimdDouble
CL1(2.885390081777926774009302);
2143 SimdDouble fExp
, x2
, p
;
2147 // For log2(), the argument cannot be 0, so use the faster version of frexp()
2148 x
= frexp
<MathOptimization::Unsafe
>(x
, &iExp
);
2149 fExp
= cvtI2R(iExp
);
2152 // Adjust to non-IEEE format for x<1/sqrt(2): exponent -= 1, mantissa *= 2.0
2153 fExp
= fExp
- selectByMask(one
, m
);
2154 x
= x
* blend(one
, two
, m
);
2156 x
= (x
- one
) * inv(x
+ one
);
2159 p
= fma(CL15
, x2
, CL13
);
2160 p
= fma(p
, x2
, CL11
);
2161 p
= fma(p
, x2
, CL9
);
2162 p
= fma(p
, x2
, CL7
);
2163 p
= fma(p
, x2
, CL5
);
2164 p
= fma(p
, x2
, CL3
);
2165 p
= fma(p
, x2
, CL1
);
2166 p
= fma(p
, x
, fExp
);
2172 # if !GMX_SIMD_HAVE_NATIVE_LOG_DOUBLE
2173 /*! \brief SIMD double log(x). This is the natural logarithm.
2175 * \param x Argument, should be >0.
2176 * \result The natural logarithm of x. Undefined if argument is invalid.
2178 static inline SimdDouble gmx_simdcall
log(SimdDouble x
)
2180 const SimdDouble
one(1.0);
2181 const SimdDouble
two(2.0);
2182 const SimdDouble
invsqrt2(1.0 / std::sqrt(2.0));
2183 const SimdDouble
corr(0.693147180559945286226764);
2184 const SimdDouble
CL15(0.148197055177935105296783);
2185 const SimdDouble
CL13(0.153108178020442575739679);
2186 const SimdDouble
CL11(0.181837339521549679055568);
2187 const SimdDouble
CL9(0.22222194152736701733275);
2188 const SimdDouble
CL7(0.285714288030134544449368);
2189 const SimdDouble
CL5(0.399999999989941956712869);
2190 const SimdDouble
CL3(0.666666666666685503450651);
2191 const SimdDouble
CL1(2.0);
2192 SimdDouble fExp
, x2
, p
;
2196 // For log(), the argument cannot be 0, so use the faster version of frexp()
2197 x
= frexp
<MathOptimization::Unsafe
>(x
, &iExp
);
2198 fExp
= cvtI2R(iExp
);
2201 // Adjust to non-IEEE format for x<1/sqrt(2): exponent -= 1, mantissa *= 2.0
2202 fExp
= fExp
- selectByMask(one
, m
);
2203 x
= x
* blend(one
, two
, m
);
2205 x
= (x
- one
) * inv(x
+ one
);
2208 p
= fma(CL15
, x2
, CL13
);
2209 p
= fma(p
, x2
, CL11
);
2210 p
= fma(p
, x2
, CL9
);
2211 p
= fma(p
, x2
, CL7
);
2212 p
= fma(p
, x2
, CL5
);
2213 p
= fma(p
, x2
, CL3
);
2214 p
= fma(p
, x2
, CL1
);
2215 p
= fma(p
, x
, corr
* fExp
);
2221 # if !GMX_SIMD_HAVE_NATIVE_EXP2_DOUBLE
2222 /*! \brief SIMD double 2^x.
2224 * \copydetails exp2(SimdFloat)
2226 template<MathOptimization opt
= MathOptimization::Safe
>
2227 static inline SimdDouble gmx_simdcall
exp2(SimdDouble x
)
2229 const SimdDouble
CE11(4.435280790452730022081181e-10);
2230 const SimdDouble
CE10(7.074105630863314448024247e-09);
2231 const SimdDouble
CE9(1.017819803432096698472621e-07);
2232 const SimdDouble
CE8(1.321543308956718799557863e-06);
2233 const SimdDouble
CE7(0.00001525273348995851746990884);
2234 const SimdDouble
CE6(0.0001540353046251466849082632);
2235 const SimdDouble
CE5(0.001333355814678995257307880);
2236 const SimdDouble
CE4(0.009618129107588335039176502);
2237 const SimdDouble
CE3(0.05550410866481992147457793);
2238 const SimdDouble
CE2(0.2402265069591015620470894);
2239 const SimdDouble
CE1(0.6931471805599453304615075);
2240 const SimdDouble
one(1.0);
2243 SimdDouble fexppart
;
2246 // Large negative values are valid arguments to exp2(), so there are two
2247 // things we need to account for:
2248 // 1. When the exponents reaches -1023, the (biased) exponent field will be
2249 // zero and we can no longer multiply with it. There are special IEEE
2250 // formats to handle this range, but for now we have to accept that
2251 // we cannot handle those arguments. If input value becomes even more
2252 // negative, it will start to loop and we would end up with invalid
2253 // exponents. Thus, we need to limit or mask this.
2254 // 2. For VERY large negative values, we will have problems that the
2255 // subtraction to get the fractional part loses accuracy, and then we
2256 // can end up with overflows in the polynomial.
2258 // For now, we handle this by forwarding the math optimization setting to
2259 // ldexp, where the routine will return zero for very small arguments.
2261 // However, before doing that we need to make sure we do not call cvtR2I
2262 // with an argument that is so negative it cannot be converted to an integer.
2263 if (opt
== MathOptimization::Safe
)
2265 x
= max(x
, SimdDouble(std::numeric_limits
<std::int32_t>::lowest()));
2268 fexppart
= ldexp
<opt
>(one
, cvtR2I(x
));
2272 p
= fma(CE11
, x
, CE10
);
2288 # if !GMX_SIMD_HAVE_NATIVE_EXP_DOUBLE
2289 /*! \brief SIMD double exp(x).
2291 * \copydetails exp(SimdFloat)
2293 template<MathOptimization opt
= MathOptimization::Safe
>
2294 static inline SimdDouble gmx_simdcall
exp(SimdDouble x
)
2296 const SimdDouble
argscale(1.44269504088896340735992468100);
2297 const SimdDouble
invargscale0(-0.69314718055966295651160180568695068359375);
2298 const SimdDouble
invargscale1(-2.8235290563031577122588448175013436025525412068e-13);
2299 const SimdDouble
CE12(2.078375306791423699350304e-09);
2300 const SimdDouble
CE11(2.518173854179933105218635e-08);
2301 const SimdDouble
CE10(2.755842049600488770111608e-07);
2302 const SimdDouble
CE9(2.755691815216689746619849e-06);
2303 const SimdDouble
CE8(2.480158383706245033920920e-05);
2304 const SimdDouble
CE7(0.0001984127043518048611841321);
2305 const SimdDouble
CE6(0.001388888889360258341755930);
2306 const SimdDouble
CE5(0.008333333332907368102819109);
2307 const SimdDouble
CE4(0.04166666666663836745814631);
2308 const SimdDouble
CE3(0.1666666666666796929434570);
2309 const SimdDouble
CE2(0.5);
2310 const SimdDouble
one(1.0);
2311 SimdDouble fexppart
;
2315 // Large negative values are valid arguments to exp2(), so there are two
2316 // things we need to account for:
2317 // 1. When the exponents reaches -1023, the (biased) exponent field will be
2318 // zero and we can no longer multiply with it. There are special IEEE
2319 // formats to handle this range, but for now we have to accept that
2320 // we cannot handle those arguments. If input value becomes even more
2321 // negative, it will start to loop and we would end up with invalid
2322 // exponents. Thus, we need to limit or mask this.
2323 // 2. For VERY large negative values, we will have problems that the
2324 // subtraction to get the fractional part loses accuracy, and then we
2325 // can end up with overflows in the polynomial.
2327 // For now, we handle this by forwarding the math optimization setting to
2328 // ldexp, where the routine will return zero for very small arguments.
2330 // However, before doing that we need to make sure we do not call cvtR2I
2331 // with an argument that is so negative it cannot be converted to an integer
2332 // after being multiplied by argscale.
2334 if (opt
== MathOptimization::Safe
)
2336 x
= max(x
, SimdDouble(std::numeric_limits
<std::int32_t>::lowest()) / argscale
);
2341 fexppart
= ldexp
<opt
>(one
, cvtR2I(y
));
2344 // Extended precision arithmetics
2345 x
= fma(invargscale0
, intpart
, x
);
2346 x
= fma(invargscale1
, intpart
, x
);
2348 p
= fma(CE12
, x
, CE11
);
2349 p
= fma(p
, x
, CE10
);
2358 p
= fma(p
, x
* x
, x
);
2359 # if GMX_SIMD_HAVE_FMA
2360 x
= fma(p
, fexppart
, fexppart
);
2362 x
= (p
+ one
) * fexppart
;
2369 /*! \brief SIMD double pow(x,y)
2371 * This returns x^y for SIMD values.
2373 * \tparam opt If this is changed from the default (safe) into the unsafe
2374 * option, there are no guarantees about correct results for x==0.
2378 * \param y exponent.
2380 * \result x^y. Overflowing arguments are likely to either return 0 or inf,
2381 * depending on the underlying implementation. If unsafe optimizations
2382 * are enabled, this is also true for x==0.
2384 * \warning You cannot rely on this implementation returning inf for arguments
2385 * that cause overflow. If you have some very large
2386 * values and need to rely on getting a valid numerical output,
2387 * take the minimum of your variable and the largest valid argument
2388 * before calling this routine.
2390 template<MathOptimization opt
= MathOptimization::Safe
>
2391 static inline SimdDouble gmx_simdcall
pow(SimdDouble x
, SimdDouble y
)
2395 if (opt
== MathOptimization::Safe
)
2397 xcorr
= max(x
, SimdDouble(std::numeric_limits
<double>::min()));
2404 SimdDouble result
= exp2
<opt
>(y
* log2(xcorr
));
2406 if (opt
== MathOptimization::Safe
)
2408 // if x==0 and y>0 we explicitly set the result to 0.0
2409 // For any x with y==0, the result will already be 1.0 since we multiply by y (0.0) and call exp().
2410 result
= blend(result
, setZero(), x
== setZero() && setZero() < y
);
2417 /*! \brief SIMD double erf(x).
2419 * \param x The value to calculate erf(x) for.
2422 * This routine achieves very close to full precision, but we do not care about
2423 * the last bit or the subnormal result range.
2425 static inline SimdDouble gmx_simdcall
erf(SimdDouble x
)
2427 // Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75]
2428 const SimdDouble
CAP4(-0.431780540597889301512e-4);
2429 const SimdDouble
CAP3(-0.00578562306260059236059);
2430 const SimdDouble
CAP2(-0.028593586920219752446);
2431 const SimdDouble
CAP1(-0.315924962948621698209);
2432 const SimdDouble
CAP0(0.14952975608477029151);
2434 const SimdDouble
CAQ5(-0.374089300177174709737e-5);
2435 const SimdDouble
CAQ4(0.00015126584532155383535);
2436 const SimdDouble
CAQ3(0.00536692680669480725423);
2437 const SimdDouble
CAQ2(0.0668686825594046122636);
2438 const SimdDouble
CAQ1(0.402604990869284362773);
2440 const SimdDouble
CAoffset(0.9788494110107421875);
2442 // Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5]
2443 const SimdDouble
CBP6(2.49650423685462752497647637088e-10);
2444 const SimdDouble
CBP5(0.00119770193298159629350136085658);
2445 const SimdDouble
CBP4(0.0164944422378370965881008942733);
2446 const SimdDouble
CBP3(0.0984581468691775932063932439252);
2447 const SimdDouble
CBP2(0.317364595806937763843589437418);
2448 const SimdDouble
CBP1(0.554167062641455850932670067075);
2449 const SimdDouble
CBP0(0.427583576155807163756925301060);
2450 const SimdDouble
CBQ7(0.00212288829699830145976198384930);
2451 const SimdDouble
CBQ6(0.0334810979522685300554606393425);
2452 const SimdDouble
CBQ5(0.2361713785181450957579508850717);
2453 const SimdDouble
CBQ4(0.955364736493055670530981883072);
2454 const SimdDouble
CBQ3(2.36815675631420037315349279199);
2455 const SimdDouble
CBQ2(3.55261649184083035537184223542);
2456 const SimdDouble
CBQ1(2.93501136050160872574376997993);
2459 // Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf]
2460 const SimdDouble
CCP6(-2.8175401114513378771);
2461 const SimdDouble
CCP5(-3.22729451764143718517);
2462 const SimdDouble
CCP4(-2.5518551727311523996);
2463 const SimdDouble
CCP3(-0.687717681153649930619);
2464 const SimdDouble
CCP2(-0.212652252872804219852);
2465 const SimdDouble
CCP1(0.0175389834052493308818);
2466 const SimdDouble
CCP0(0.00628057170626964891937);
2468 const SimdDouble
CCQ6(5.48409182238641741584);
2469 const SimdDouble
CCQ5(13.5064170191802889145);
2470 const SimdDouble
CCQ4(22.9367376522880577224);
2471 const SimdDouble
CCQ3(15.930646027911794143);
2472 const SimdDouble
CCQ2(11.0567237927800161565);
2473 const SimdDouble
CCQ1(2.79257750980575282228);
2475 const SimdDouble
CCoffset(0.5579090118408203125);
2477 const SimdDouble
one(1.0);
2478 const SimdDouble
two(2.0);
2479 const SimdDouble
minFloat(std::numeric_limits
<float>::min());
2481 SimdDouble xabs
, x2
, x4
, t
, t2
, w
, w2
;
2482 SimdDouble PolyAP0
, PolyAP1
, PolyAQ0
, PolyAQ1
;
2483 SimdDouble PolyBP0
, PolyBP1
, PolyBQ0
, PolyBQ1
;
2484 SimdDouble PolyCP0
, PolyCP1
, PolyCQ0
, PolyCQ1
;
2485 SimdDouble res_erf
, res_erfcB
, res_erfcC
, res_erfc
, res
;
2487 SimdDBool mask
, mask_erf
, notmask_erf
;
2491 mask_erf
= (xabs
< one
);
2492 notmask_erf
= (one
<= xabs
);
2496 PolyAP0
= fma(CAP4
, x4
, CAP2
);
2497 PolyAP1
= fma(CAP3
, x4
, CAP1
);
2498 PolyAP0
= fma(PolyAP0
, x4
, CAP0
);
2499 PolyAP0
= fma(PolyAP1
, x2
, PolyAP0
);
2501 PolyAQ1
= fma(CAQ5
, x4
, CAQ3
);
2502 PolyAQ0
= fma(CAQ4
, x4
, CAQ2
);
2503 PolyAQ1
= fma(PolyAQ1
, x4
, CAQ1
);
2504 PolyAQ0
= fma(PolyAQ0
, x4
, one
);
2505 PolyAQ0
= fma(PolyAQ1
, x2
, PolyAQ0
);
2507 res_erf
= PolyAP0
* maskzInv(PolyAQ0
, mask_erf
&& (minFloat
<= abs(PolyAQ0
)));
2508 res_erf
= CAoffset
+ res_erf
;
2509 res_erf
= x
* res_erf
;
2511 // Calculate erfc() in range [1,4.5]
2515 PolyBP0
= fma(CBP6
, t2
, CBP4
);
2516 PolyBP1
= fma(CBP5
, t2
, CBP3
);
2517 PolyBP0
= fma(PolyBP0
, t2
, CBP2
);
2518 PolyBP1
= fma(PolyBP1
, t2
, CBP1
);
2519 PolyBP0
= fma(PolyBP0
, t2
, CBP0
);
2520 PolyBP0
= fma(PolyBP1
, t
, PolyBP0
);
2522 PolyBQ1
= fma(CBQ7
, t2
, CBQ5
);
2523 PolyBQ0
= fma(CBQ6
, t2
, CBQ4
);
2524 PolyBQ1
= fma(PolyBQ1
, t2
, CBQ3
);
2525 PolyBQ0
= fma(PolyBQ0
, t2
, CBQ2
);
2526 PolyBQ1
= fma(PolyBQ1
, t2
, CBQ1
);
2527 PolyBQ0
= fma(PolyBQ0
, t2
, one
);
2528 PolyBQ0
= fma(PolyBQ1
, t
, PolyBQ0
);
2530 // The denominator polynomial can be zero outside the range
2531 res_erfcB
= PolyBP0
* maskzInv(PolyBQ0
, notmask_erf
&& (minFloat
<= abs(PolyBQ0
)));
2533 res_erfcB
= res_erfcB
* xabs
;
2535 // Calculate erfc() in range [4.5,inf]
2536 w
= maskzInv(xabs
, notmask_erf
&& (minFloat
<= xabs
));
2539 PolyCP0
= fma(CCP6
, w2
, CCP4
);
2540 PolyCP1
= fma(CCP5
, w2
, CCP3
);
2541 PolyCP0
= fma(PolyCP0
, w2
, CCP2
);
2542 PolyCP1
= fma(PolyCP1
, w2
, CCP1
);
2543 PolyCP0
= fma(PolyCP0
, w2
, CCP0
);
2544 PolyCP0
= fma(PolyCP1
, w
, PolyCP0
);
2546 PolyCQ0
= fma(CCQ6
, w2
, CCQ4
);
2547 PolyCQ1
= fma(CCQ5
, w2
, CCQ3
);
2548 PolyCQ0
= fma(PolyCQ0
, w2
, CCQ2
);
2549 PolyCQ1
= fma(PolyCQ1
, w2
, CCQ1
);
2550 PolyCQ0
= fma(PolyCQ0
, w2
, one
);
2551 PolyCQ0
= fma(PolyCQ1
, w
, PolyCQ0
);
2555 // The denominator polynomial can be zero outside the range
2556 res_erfcC
= PolyCP0
* maskzInv(PolyCQ0
, notmask_erf
&& (minFloat
<= abs(PolyCQ0
)));
2557 res_erfcC
= res_erfcC
+ CCoffset
;
2558 res_erfcC
= res_erfcC
* w
;
2560 mask
= (SimdDouble(4.5) < xabs
);
2561 res_erfc
= blend(res_erfcB
, res_erfcC
, mask
);
2563 res_erfc
= res_erfc
* expmx2
;
2565 // erfc(x<0) = 2-erfc(|x|)
2566 mask
= (x
< setZero());
2567 res_erfc
= blend(res_erfc
, two
- res_erfc
, mask
);
2569 // Select erf() or erfc()
2570 res
= blend(one
- res_erfc
, res_erf
, mask_erf
);
2575 /*! \brief SIMD double erfc(x).
2577 * \param x The value to calculate erfc(x) for.
2580 * This routine achieves full precision (bar the last bit) over most of the
2581 * input range, but for large arguments where the result is getting close
2582 * to the minimum representable numbers we accept slightly larger errors
2583 * (think results that are in the ballpark of 10^-200 for double)
2584 * since that is not relevant for MD.
2586 static inline SimdDouble gmx_simdcall
erfc(SimdDouble x
)
2588 // Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75]
2589 const SimdDouble
CAP4(-0.431780540597889301512e-4);
2590 const SimdDouble
CAP3(-0.00578562306260059236059);
2591 const SimdDouble
CAP2(-0.028593586920219752446);
2592 const SimdDouble
CAP1(-0.315924962948621698209);
2593 const SimdDouble
CAP0(0.14952975608477029151);
2595 const SimdDouble
CAQ5(-0.374089300177174709737e-5);
2596 const SimdDouble
CAQ4(0.00015126584532155383535);
2597 const SimdDouble
CAQ3(0.00536692680669480725423);
2598 const SimdDouble
CAQ2(0.0668686825594046122636);
2599 const SimdDouble
CAQ1(0.402604990869284362773);
2601 const SimdDouble
CAoffset(0.9788494110107421875);
2603 // Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5]
2604 const SimdDouble
CBP6(2.49650423685462752497647637088e-10);
2605 const SimdDouble
CBP5(0.00119770193298159629350136085658);
2606 const SimdDouble
CBP4(0.0164944422378370965881008942733);
2607 const SimdDouble
CBP3(0.0984581468691775932063932439252);
2608 const SimdDouble
CBP2(0.317364595806937763843589437418);
2609 const SimdDouble
CBP1(0.554167062641455850932670067075);
2610 const SimdDouble
CBP0(0.427583576155807163756925301060);
2611 const SimdDouble
CBQ7(0.00212288829699830145976198384930);
2612 const SimdDouble
CBQ6(0.0334810979522685300554606393425);
2613 const SimdDouble
CBQ5(0.2361713785181450957579508850717);
2614 const SimdDouble
CBQ4(0.955364736493055670530981883072);
2615 const SimdDouble
CBQ3(2.36815675631420037315349279199);
2616 const SimdDouble
CBQ2(3.55261649184083035537184223542);
2617 const SimdDouble
CBQ1(2.93501136050160872574376997993);
2620 // Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf]
2621 const SimdDouble
CCP6(-2.8175401114513378771);
2622 const SimdDouble
CCP5(-3.22729451764143718517);
2623 const SimdDouble
CCP4(-2.5518551727311523996);
2624 const SimdDouble
CCP3(-0.687717681153649930619);
2625 const SimdDouble
CCP2(-0.212652252872804219852);
2626 const SimdDouble
CCP1(0.0175389834052493308818);
2627 const SimdDouble
CCP0(0.00628057170626964891937);
2629 const SimdDouble
CCQ6(5.48409182238641741584);
2630 const SimdDouble
CCQ5(13.5064170191802889145);
2631 const SimdDouble
CCQ4(22.9367376522880577224);
2632 const SimdDouble
CCQ3(15.930646027911794143);
2633 const SimdDouble
CCQ2(11.0567237927800161565);
2634 const SimdDouble
CCQ1(2.79257750980575282228);
2636 const SimdDouble
CCoffset(0.5579090118408203125);
2638 const SimdDouble
one(1.0);
2639 const SimdDouble
two(2.0);
2640 const SimdDouble
minFloat(std::numeric_limits
<float>::min());
2642 SimdDouble xabs
, x2
, x4
, t
, t2
, w
, w2
;
2643 SimdDouble PolyAP0
, PolyAP1
, PolyAQ0
, PolyAQ1
;
2644 SimdDouble PolyBP0
, PolyBP1
, PolyBQ0
, PolyBQ1
;
2645 SimdDouble PolyCP0
, PolyCP1
, PolyCQ0
, PolyCQ1
;
2646 SimdDouble res_erf
, res_erfcB
, res_erfcC
, res_erfc
, res
;
2648 SimdDBool mask
, mask_erf
, notmask_erf
;
2652 mask_erf
= (xabs
< one
);
2653 notmask_erf
= (one
<= xabs
);
2657 PolyAP0
= fma(CAP4
, x4
, CAP2
);
2658 PolyAP1
= fma(CAP3
, x4
, CAP1
);
2659 PolyAP0
= fma(PolyAP0
, x4
, CAP0
);
2660 PolyAP0
= fma(PolyAP1
, x2
, PolyAP0
);
2661 PolyAQ1
= fma(CAQ5
, x4
, CAQ3
);
2662 PolyAQ0
= fma(CAQ4
, x4
, CAQ2
);
2663 PolyAQ1
= fma(PolyAQ1
, x4
, CAQ1
);
2664 PolyAQ0
= fma(PolyAQ0
, x4
, one
);
2665 PolyAQ0
= fma(PolyAQ1
, x2
, PolyAQ0
);
2667 res_erf
= PolyAP0
* maskzInv(PolyAQ0
, mask_erf
&& (minFloat
<= abs(PolyAQ0
)));
2668 res_erf
= CAoffset
+ res_erf
;
2669 res_erf
= x
* res_erf
;
2671 // Calculate erfc() in range [1,4.5]
2675 PolyBP0
= fma(CBP6
, t2
, CBP4
);
2676 PolyBP1
= fma(CBP5
, t2
, CBP3
);
2677 PolyBP0
= fma(PolyBP0
, t2
, CBP2
);
2678 PolyBP1
= fma(PolyBP1
, t2
, CBP1
);
2679 PolyBP0
= fma(PolyBP0
, t2
, CBP0
);
2680 PolyBP0
= fma(PolyBP1
, t
, PolyBP0
);
2682 PolyBQ1
= fma(CBQ7
, t2
, CBQ5
);
2683 PolyBQ0
= fma(CBQ6
, t2
, CBQ4
);
2684 PolyBQ1
= fma(PolyBQ1
, t2
, CBQ3
);
2685 PolyBQ0
= fma(PolyBQ0
, t2
, CBQ2
);
2686 PolyBQ1
= fma(PolyBQ1
, t2
, CBQ1
);
2687 PolyBQ0
= fma(PolyBQ0
, t2
, one
);
2688 PolyBQ0
= fma(PolyBQ1
, t
, PolyBQ0
);
2690 // The denominator polynomial can be zero outside the range
2691 res_erfcB
= PolyBP0
* maskzInv(PolyBQ0
, notmask_erf
&& (minFloat
<= abs(PolyBQ0
)));
2693 res_erfcB
= res_erfcB
* xabs
;
2695 // Calculate erfc() in range [4.5,inf]
2696 // Note that 1/x can only handle single precision!
2697 w
= maskzInv(xabs
, minFloat
<= xabs
);
2700 PolyCP0
= fma(CCP6
, w2
, CCP4
);
2701 PolyCP1
= fma(CCP5
, w2
, CCP3
);
2702 PolyCP0
= fma(PolyCP0
, w2
, CCP2
);
2703 PolyCP1
= fma(PolyCP1
, w2
, CCP1
);
2704 PolyCP0
= fma(PolyCP0
, w2
, CCP0
);
2705 PolyCP0
= fma(PolyCP1
, w
, PolyCP0
);
2707 PolyCQ0
= fma(CCQ6
, w2
, CCQ4
);
2708 PolyCQ1
= fma(CCQ5
, w2
, CCQ3
);
2709 PolyCQ0
= fma(PolyCQ0
, w2
, CCQ2
);
2710 PolyCQ1
= fma(PolyCQ1
, w2
, CCQ1
);
2711 PolyCQ0
= fma(PolyCQ0
, w2
, one
);
2712 PolyCQ0
= fma(PolyCQ1
, w
, PolyCQ0
);
2716 // The denominator polynomial can be zero outside the range
2717 res_erfcC
= PolyCP0
* maskzInv(PolyCQ0
, notmask_erf
&& (minFloat
<= abs(PolyCQ0
)));
2718 res_erfcC
= res_erfcC
+ CCoffset
;
2719 res_erfcC
= res_erfcC
* w
;
2721 mask
= (SimdDouble(4.5) < xabs
);
2722 res_erfc
= blend(res_erfcB
, res_erfcC
, mask
);
2724 res_erfc
= res_erfc
* expmx2
;
2726 // erfc(x<0) = 2-erfc(|x|)
2727 mask
= (x
< setZero());
2728 res_erfc
= blend(res_erfc
, two
- res_erfc
, mask
);
2730 // Select erf() or erfc()
2731 res
= blend(res_erfc
, one
- res_erf
, mask_erf
);
2736 /*! \brief SIMD double sin \& cos.
2738 * \param x The argument to evaluate sin/cos for
2739 * \param[out] sinval Sin(x)
2740 * \param[out] cosval Cos(x)
2742 * This version achieves close to machine precision, but for very large
2743 * magnitudes of the argument we inherently begin to lose accuracy due to the
2744 * argument reduction, despite using extended precision arithmetics internally.
2746 static inline void gmx_simdcall
sincos(SimdDouble x
, SimdDouble
* sinval
, SimdDouble
* cosval
)
2748 // Constants to subtract Pi/4*x from y while minimizing precision loss
2749 const SimdDouble
argred0(-2 * 0.78539816290140151978);
2750 const SimdDouble
argred1(-2 * 4.9604678871439933374e-10);
2751 const SimdDouble
argred2(-2 * 1.1258708853173288931e-18);
2752 const SimdDouble
argred3(-2 * 1.7607799325916000908e-27);
2753 const SimdDouble
two_over_pi(2.0 / M_PI
);
2754 const SimdDouble
const_sin5(1.58938307283228937328511e-10);
2755 const SimdDouble
const_sin4(-2.50506943502539773349318e-08);
2756 const SimdDouble
const_sin3(2.75573131776846360512547e-06);
2757 const SimdDouble
const_sin2(-0.000198412698278911770864914);
2758 const SimdDouble
const_sin1(0.0083333333333191845961746);
2759 const SimdDouble
const_sin0(-0.166666666666666130709393);
2761 const SimdDouble
const_cos7(-1.13615350239097429531523e-11);
2762 const SimdDouble
const_cos6(2.08757471207040055479366e-09);
2763 const SimdDouble
const_cos5(-2.75573144028847567498567e-07);
2764 const SimdDouble
const_cos4(2.48015872890001867311915e-05);
2765 const SimdDouble
const_cos3(-0.00138888888888714019282329);
2766 const SimdDouble
const_cos2(0.0416666666666665519592062);
2767 const SimdDouble
half(0.5);
2768 const SimdDouble
one(1.0);
2769 SimdDouble ssign
, csign
;
2770 SimdDouble x2
, y
, z
, psin
, pcos
, sss
, ccc
;
2772 # if GMX_SIMD_HAVE_DINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
2773 const SimdDInt32
ione(1);
2774 const SimdDInt32
itwo(2);
2777 z
= x
* two_over_pi
;
2781 mask
= cvtIB2B((iy
& ione
) == setZero());
2782 ssign
= selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO
), cvtIB2B((iy
& itwo
) == itwo
));
2783 csign
= selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO
), cvtIB2B(((iy
+ ione
) & itwo
) == itwo
));
2785 const SimdDouble
quarter(0.25);
2786 const SimdDouble
minusquarter(-0.25);
2788 SimdDBool m1
, m2
, m3
;
2790 /* The most obvious way to find the arguments quadrant in the unit circle
2791 * to calculate the sign is to use integer arithmetic, but that is not
2792 * present in all SIMD implementations. As an alternative, we have devised a
2793 * pure floating-point algorithm that uses truncation for argument reduction
2794 * so that we get a new value 0<=q<1 over the unit circle, and then
2795 * do floating-point comparisons with fractions. This is likely to be
2796 * slightly slower (~10%) due to the longer latencies of floating-point, so
2797 * we only use it when integer SIMD arithmetic is not present.
2801 // It is critical that half-way cases are rounded down
2802 z
= fma(x
, two_over_pi
, half
);
2806 /* z now starts at 0.0 for x=-pi/4 (although neg. values cannot occur), and
2807 * then increased by 1.0 as x increases by 2*Pi, when it resets to 0.0.
2808 * This removes the 2*Pi periodicity without using any integer arithmetic.
2809 * First check if y had the value 2 or 3, set csign if true.
2812 /* If we have logical operations we can work directly on the signbit, which
2813 * saves instructions. Otherwise we need to represent signs as +1.0/-1.0.
2814 * Thus, if you are altering defines to debug alternative code paths, the
2815 * two GMX_SIMD_HAVE_LOGICAL sections in this routine must either both be
2816 * active or inactive - you will get errors if only one is used.
2818 # if GMX_SIMD_HAVE_LOGICAL
2819 ssign
= ssign
& SimdDouble(GMX_DOUBLE_NEGZERO
);
2820 csign
= andNot(q
, SimdDouble(GMX_DOUBLE_NEGZERO
));
2821 ssign
= ssign
^ csign
;
2823 ssign
= copysign(SimdDouble(1.0), ssign
);
2824 csign
= copysign(SimdDouble(1.0), q
);
2826 ssign
= ssign
* csign
; // swap ssign if csign was set.
2828 // Check if y had value 1 or 3 (remember we subtracted 0.5 from q)
2829 m1
= (q
< minusquarter
);
2830 m2
= (setZero() <= q
);
2834 // where mask is FALSE, swap sign.
2835 csign
= csign
* blend(SimdDouble(-1.0), one
, mask
);
2837 x
= fma(y
, argred0
, x
);
2838 x
= fma(y
, argred1
, x
);
2839 x
= fma(y
, argred2
, x
);
2840 x
= fma(y
, argred3
, x
);
2843 psin
= fma(const_sin5
, x2
, const_sin4
);
2844 psin
= fma(psin
, x2
, const_sin3
);
2845 psin
= fma(psin
, x2
, const_sin2
);
2846 psin
= fma(psin
, x2
, const_sin1
);
2847 psin
= fma(psin
, x2
, const_sin0
);
2848 psin
= fma(psin
, x2
* x
, x
);
2850 pcos
= fma(const_cos7
, x2
, const_cos6
);
2851 pcos
= fma(pcos
, x2
, const_cos5
);
2852 pcos
= fma(pcos
, x2
, const_cos4
);
2853 pcos
= fma(pcos
, x2
, const_cos3
);
2854 pcos
= fma(pcos
, x2
, const_cos2
);
2855 pcos
= fms(pcos
, x2
, half
);
2856 pcos
= fma(pcos
, x2
, one
);
2858 sss
= blend(pcos
, psin
, mask
);
2859 ccc
= blend(psin
, pcos
, mask
);
2860 // See comment for GMX_SIMD_HAVE_LOGICAL section above.
2861 # if GMX_SIMD_HAVE_LOGICAL
2862 *sinval
= sss
^ ssign
;
2863 *cosval
= ccc
^ csign
;
2865 *sinval
= sss
* ssign
;
2866 *cosval
= ccc
* csign
;
2870 /*! \brief SIMD double sin(x).
2872 * \param x The argument to evaluate sin for
2875 * \attention Do NOT call both sin & cos if you need both results, since each of them
2876 * will then call \ref sincos and waste a factor 2 in performance.
2878 static inline SimdDouble gmx_simdcall
sin(SimdDouble x
)
2885 /*! \brief SIMD double cos(x).
2887 * \param x The argument to evaluate cos for
2890 * \attention Do NOT call both sin & cos if you need both results, since each of them
2891 * will then call \ref sincos and waste a factor 2 in performance.
2893 static inline SimdDouble gmx_simdcall
cos(SimdDouble x
)
2900 /*! \brief SIMD double tan(x).
2902 * \param x The argument to evaluate tan for
2905 static inline SimdDouble gmx_simdcall
tan(SimdDouble x
)
2907 const SimdDouble
argred0(-2 * 0.78539816290140151978);
2908 const SimdDouble
argred1(-2 * 4.9604678871439933374e-10);
2909 const SimdDouble
argred2(-2 * 1.1258708853173288931e-18);
2910 const SimdDouble
argred3(-2 * 1.7607799325916000908e-27);
2911 const SimdDouble
two_over_pi(2.0 / M_PI
);
2912 const SimdDouble
CT15(1.01419718511083373224408e-05);
2913 const SimdDouble
CT14(-2.59519791585924697698614e-05);
2914 const SimdDouble
CT13(5.23388081915899855325186e-05);
2915 const SimdDouble
CT12(-3.05033014433946488225616e-05);
2916 const SimdDouble
CT11(7.14707504084242744267497e-05);
2917 const SimdDouble
CT10(8.09674518280159187045078e-05);
2918 const SimdDouble
CT9(0.000244884931879331847054404);
2919 const SimdDouble
CT8(0.000588505168743587154904506);
2920 const SimdDouble
CT7(0.00145612788922812427978848);
2921 const SimdDouble
CT6(0.00359208743836906619142924);
2922 const SimdDouble
CT5(0.00886323944362401618113356);
2923 const SimdDouble
CT4(0.0218694882853846389592078);
2924 const SimdDouble
CT3(0.0539682539781298417636002);
2925 const SimdDouble
CT2(0.133333333333125941821962);
2926 const SimdDouble
CT1(0.333333333333334980164153);
2927 const SimdDouble
minFloat(std::numeric_limits
<float>::min());
2929 SimdDouble x2
, p
, y
, z
;
2932 # if GMX_SIMD_HAVE_DINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
2936 z
= x
* two_over_pi
;
2939 m
= cvtIB2B((iy
& ione
) == ione
);
2941 x
= fma(y
, argred0
, x
);
2942 x
= fma(y
, argred1
, x
);
2943 x
= fma(y
, argred2
, x
);
2944 x
= fma(y
, argred3
, x
);
2945 x
= selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO
), m
) ^ x
;
2947 const SimdDouble
quarter(0.25);
2948 const SimdDouble
half(0.5);
2949 const SimdDouble
threequarter(0.75);
2950 const SimdDouble
minFloat(std::numeric_limits
<float>::min());
2952 SimdDBool m1
, m2
, m3
;
2955 z
= fma(w
, two_over_pi
, half
);
2959 m1
= (quarter
<= q
);
2961 m3
= (threequarter
<= q
);
2964 w
= fma(y
, argred0
, w
);
2965 w
= fma(y
, argred1
, w
);
2966 w
= fma(y
, argred2
, w
);
2967 w
= fma(y
, argred3
, w
);
2969 w
= blend(w
, -w
, m
);
2970 x
= w
* copysign(SimdDouble(1.0), x
);
2973 p
= fma(CT15
, x2
, CT14
);
2974 p
= fma(p
, x2
, CT13
);
2975 p
= fma(p
, x2
, CT12
);
2976 p
= fma(p
, x2
, CT11
);
2977 p
= fma(p
, x2
, CT10
);
2978 p
= fma(p
, x2
, CT9
);
2979 p
= fma(p
, x2
, CT8
);
2980 p
= fma(p
, x2
, CT7
);
2981 p
= fma(p
, x2
, CT6
);
2982 p
= fma(p
, x2
, CT5
);
2983 p
= fma(p
, x2
, CT4
);
2984 p
= fma(p
, x2
, CT3
);
2985 p
= fma(p
, x2
, CT2
);
2986 p
= fma(p
, x2
, CT1
);
2987 p
= fma(x2
, p
* x
, x
);
2989 p
= blend(p
, maskzInv(p
, m
&& (minFloat
< abs(p
))), m
);
2993 /*! \brief SIMD double asin(x).
2995 * \param x The argument to evaluate asin for
2998 static inline SimdDouble gmx_simdcall
asin(SimdDouble x
)
3000 // Same algorithm as cephes library
3001 const SimdDouble
limit1(0.625);
3002 const SimdDouble
limit2(1e-8);
3003 const SimdDouble
one(1.0);
3004 const SimdDouble
quarterpi(M_PI
/ 4.0);
3005 const SimdDouble
morebits(6.123233995736765886130e-17);
3007 const SimdDouble
P5(4.253011369004428248960e-3);
3008 const SimdDouble
P4(-6.019598008014123785661e-1);
3009 const SimdDouble
P3(5.444622390564711410273e0
);
3010 const SimdDouble
P2(-1.626247967210700244449e1
);
3011 const SimdDouble
P1(1.956261983317594739197e1
);
3012 const SimdDouble
P0(-8.198089802484824371615e0
);
3014 const SimdDouble
Q4(-1.474091372988853791896e1
);
3015 const SimdDouble
Q3(7.049610280856842141659e1
);
3016 const SimdDouble
Q2(-1.471791292232726029859e2
);
3017 const SimdDouble
Q1(1.395105614657485689735e2
);
3018 const SimdDouble
Q0(-4.918853881490881290097e1
);
3020 const SimdDouble
R4(2.967721961301243206100e-3);
3021 const SimdDouble
R3(-5.634242780008963776856e-1);
3022 const SimdDouble
R2(6.968710824104713396794e0
);
3023 const SimdDouble
R1(-2.556901049652824852289e1
);
3024 const SimdDouble
R0(2.853665548261061424989e1
);
3026 const SimdDouble
S3(-2.194779531642920639778e1
);
3027 const SimdDouble
S2(1.470656354026814941758e2
);
3028 const SimdDouble
S1(-3.838770957603691357202e2
);
3029 const SimdDouble
S0(3.424398657913078477438e2
);
3032 SimdDouble zz
, ww
, z
, q
, w
, zz2
, ww2
;
3037 SimdDouble nom
, denom
;
3038 SimdDBool mask
, mask2
;
3042 mask
= (limit1
< xabs
);
3050 RA
= fma(R4
, zz2
, R2
);
3051 RB
= fma(R3
, zz2
, R1
);
3052 RA
= fma(RA
, zz2
, R0
);
3053 RA
= fma(RB
, zz
, RA
);
3056 SB
= fma(S3
, zz2
, S1
);
3058 SA
= fma(SA
, zz2
, S0
);
3059 SA
= fma(SB
, zz
, SA
);
3062 PA
= fma(P5
, ww2
, P3
);
3063 PB
= fma(P4
, ww2
, P2
);
3064 PA
= fma(PA
, ww2
, P1
);
3065 PB
= fma(PB
, ww2
, P0
);
3066 PA
= fma(PA
, ww
, PB
);
3069 QB
= fma(Q4
, ww2
, Q2
);
3071 QA
= fma(QA
, ww2
, Q1
);
3072 QB
= fma(QB
, ww2
, Q0
);
3073 QA
= fma(QA
, ww
, QB
);
3078 nom
= blend(PA
, RA
, mask
);
3079 denom
= blend(QA
, SA
, mask
);
3081 mask2
= (limit2
< xabs
);
3082 q
= nom
* maskzInv(denom
, mask2
);
3087 zz
= fms(zz
, q
, morebits
);
3094 z
= blend(w
, z
, mask
);
3096 z
= blend(xabs
, z
, mask2
);
3103 /*! \brief SIMD double acos(x).
3105 * \param x The argument to evaluate acos for
3108 static inline SimdDouble gmx_simdcall
acos(SimdDouble x
)
3110 const SimdDouble
one(1.0);
3111 const SimdDouble
half(0.5);
3112 const SimdDouble
quarterpi0(7.85398163397448309616e-1);
3113 const SimdDouble
quarterpi1(6.123233995736765886130e-17);
3116 SimdDouble z
, z1
, z2
;
3119 z1
= half
* (one
- x
);
3121 z
= blend(x
, z1
, mask1
);
3127 z2
= quarterpi0
- z
;
3128 z2
= z2
+ quarterpi1
;
3129 z2
= z2
+ quarterpi0
;
3131 z
= blend(z2
, z1
, mask1
);
3136 /*! \brief SIMD double asin(x).
3138 * \param x The argument to evaluate atan for
3139 * \result Atan(x), same argument/value range as standard math library.
3141 static inline SimdDouble gmx_simdcall
atan(SimdDouble x
)
3143 // Same algorithm as cephes library
3144 const SimdDouble
limit1(0.66);
3145 const SimdDouble
limit2(2.41421356237309504880);
3146 const SimdDouble
quarterpi(M_PI
/ 4.0);
3147 const SimdDouble
halfpi(M_PI
/ 2.0);
3148 const SimdDouble
mone(-1.0);
3149 const SimdDouble
morebits1(0.5 * 6.123233995736765886130E-17);
3150 const SimdDouble
morebits2(6.123233995736765886130E-17);
3152 const SimdDouble
P4(-8.750608600031904122785E-1);
3153 const SimdDouble
P3(-1.615753718733365076637E1
);
3154 const SimdDouble
P2(-7.500855792314704667340E1
);
3155 const SimdDouble
P1(-1.228866684490136173410E2
);
3156 const SimdDouble
P0(-6.485021904942025371773E1
);
3158 const SimdDouble
Q4(2.485846490142306297962E1
);
3159 const SimdDouble
Q3(1.650270098316988542046E2
);
3160 const SimdDouble
Q2(4.328810604912902668951E2
);
3161 const SimdDouble
Q1(4.853903996359136964868E2
);
3162 const SimdDouble
Q0(1.945506571482613964425E2
);
3164 SimdDouble y
, xabs
, t1
, t2
;
3166 SimdDouble P_A
, P_B
, Q_A
, Q_B
;
3167 SimdDBool mask1
, mask2
;
3171 mask1
= (limit1
< xabs
);
3172 mask2
= (limit2
< xabs
);
3174 t1
= (xabs
+ mone
) * maskzInv(xabs
- mone
, mask1
);
3175 t2
= mone
* maskzInv(xabs
, mask2
);
3177 y
= selectByMask(quarterpi
, mask1
);
3178 y
= blend(y
, halfpi
, mask2
);
3179 xabs
= blend(xabs
, t1
, mask1
);
3180 xabs
= blend(xabs
, t2
, mask2
);
3185 P_A
= fma(P4
, z2
, P2
);
3186 P_B
= fma(P3
, z2
, P1
);
3187 P_A
= fma(P_A
, z2
, P0
);
3188 P_A
= fma(P_B
, z
, P_A
);
3191 Q_B
= fma(Q4
, z2
, Q2
);
3193 Q_A
= fma(Q_A
, z2
, Q1
);
3194 Q_B
= fma(Q_B
, z2
, Q0
);
3195 Q_A
= fma(Q_A
, z
, Q_B
);
3199 z
= fma(z
, xabs
, xabs
);
3201 t1
= selectByMask(morebits1
, mask1
);
3202 t1
= blend(t1
, morebits2
, mask2
);
3212 /*! \brief SIMD double atan2(y,x).
3214 * \param y Y component of vector, any quartile
3215 * \param x X component of vector, any quartile
3216 * \result Atan(y,x), same argument/value range as standard math library.
3218 * \note This routine should provide correct results for all finite
3219 * non-zero or positive-zero arguments. However, negative zero arguments will
3220 * be treated as positive zero, which means the return value will deviate from
3221 * the standard math library atan2(y,x) for those cases. That should not be
3222 * of any concern in Gromacs, and in particular it will not affect calculations
3223 * of angles from vectors.
3225 static inline SimdDouble gmx_simdcall
atan2(SimdDouble y
, SimdDouble x
)
3227 const SimdDouble
pi(M_PI
);
3228 const SimdDouble
halfpi(M_PI
/ 2.0);
3229 const SimdDouble
minFloat(std::numeric_limits
<float>::min());
3230 SimdDouble xinv
, p
, aoffset
;
3231 SimdDBool mask_xnz
, mask_ynz
, mask_xlt0
, mask_ylt0
;
3233 mask_xnz
= x
!= setZero();
3234 mask_ynz
= y
!= setZero();
3235 mask_xlt0
= (x
< setZero());
3236 mask_ylt0
= (y
< setZero());
3238 aoffset
= selectByNotMask(halfpi
, mask_xnz
);
3239 aoffset
= selectByMask(aoffset
, mask_ynz
);
3241 aoffset
= blend(aoffset
, pi
, mask_xlt0
);
3242 aoffset
= blend(aoffset
, -aoffset
, mask_ylt0
);
3244 xinv
= maskzInv(x
, mask_xnz
&& (minFloat
<= abs(x
)));
3253 /*! \brief Calculate the force correction due to PME analytically in SIMD double.
3255 * \param z2 This should be the value \f$(r \beta)^2\f$, where r is your
3256 * interaction distance and beta the ewald splitting parameters.
3257 * \result Correction factor to coulomb force.
3259 * This routine is meant to enable analytical evaluation of the
3260 * direct-space PME electrostatic force to avoid tables. For details, see the
3261 * single precision function.
3263 static inline SimdDouble gmx_simdcall
pmeForceCorrection(SimdDouble z2
)
3265 const SimdDouble
FN10(-8.0072854618360083154e-14);
3266 const SimdDouble
FN9(1.1859116242260148027e-11);
3267 const SimdDouble
FN8(-8.1490406329798423616e-10);
3268 const SimdDouble
FN7(3.4404793543907847655e-8);
3269 const SimdDouble
FN6(-9.9471420832602741006e-7);
3270 const SimdDouble
FN5(0.000020740315999115847456);
3271 const SimdDouble
FN4(-0.00031991745139313364005);
3272 const SimdDouble
FN3(0.0035074449373659008203);
3273 const SimdDouble
FN2(-0.031750380176100813405);
3274 const SimdDouble
FN1(0.13884101728898463426);
3275 const SimdDouble
FN0(-0.75225277815249618847);
3277 const SimdDouble
FD5(0.000016009278224355026701);
3278 const SimdDouble
FD4(0.00051055686934806966046);
3279 const SimdDouble
FD3(0.0081803507497974289008);
3280 const SimdDouble
FD2(0.077181146026670287235);
3281 const SimdDouble
FD1(0.41543303143712535988);
3282 const SimdDouble
FD0(1.0);
3285 SimdDouble polyFN0
, polyFN1
, polyFD0
, polyFD1
;
3289 polyFD1
= fma(FD5
, z4
, FD3
);
3290 polyFD1
= fma(polyFD1
, z4
, FD1
);
3291 polyFD1
= polyFD1
* z2
;
3292 polyFD0
= fma(FD4
, z4
, FD2
);
3293 polyFD0
= fma(polyFD0
, z4
, FD0
);
3294 polyFD0
= polyFD0
+ polyFD1
;
3296 polyFD0
= inv(polyFD0
);
3298 polyFN0
= fma(FN10
, z4
, FN8
);
3299 polyFN0
= fma(polyFN0
, z4
, FN6
);
3300 polyFN0
= fma(polyFN0
, z4
, FN4
);
3301 polyFN0
= fma(polyFN0
, z4
, FN2
);
3302 polyFN0
= fma(polyFN0
, z4
, FN0
);
3303 polyFN1
= fma(FN9
, z4
, FN7
);
3304 polyFN1
= fma(polyFN1
, z4
, FN5
);
3305 polyFN1
= fma(polyFN1
, z4
, FN3
);
3306 polyFN1
= fma(polyFN1
, z4
, FN1
);
3307 polyFN0
= fma(polyFN1
, z2
, polyFN0
);
3310 return polyFN0
* polyFD0
;
3314 /*! \brief Calculate the potential correction due to PME analytically in SIMD double.
3316 * \param z2 This should be the value \f$(r \beta)^2\f$, where r is your
3317 * interaction distance and beta the ewald splitting parameters.
3318 * \result Correction factor to coulomb force.
3320 * This routine is meant to enable analytical evaluation of the
3321 * direct-space PME electrostatic potential to avoid tables. For details, see the
3322 * single precision function.
3324 static inline SimdDouble gmx_simdcall
pmePotentialCorrection(SimdDouble z2
)
3326 const SimdDouble
VN9(-9.3723776169321855475e-13);
3327 const SimdDouble
VN8(1.2280156762674215741e-10);
3328 const SimdDouble
VN7(-7.3562157912251309487e-9);
3329 const SimdDouble
VN6(2.6215886208032517509e-7);
3330 const SimdDouble
VN5(-4.9532491651265819499e-6);
3331 const SimdDouble
VN4(0.00025907400778966060389);
3332 const SimdDouble
VN3(0.0010585044856156469792);
3333 const SimdDouble
VN2(0.045247661136833092885);
3334 const SimdDouble
VN1(0.11643931522926034421);
3335 const SimdDouble
VN0(1.1283791671726767970);
3337 const SimdDouble
VD5(0.000021784709867336150342);
3338 const SimdDouble
VD4(0.00064293662010911388448);
3339 const SimdDouble
VD3(0.0096311444822588683504);
3340 const SimdDouble
VD2(0.085608012351550627051);
3341 const SimdDouble
VD1(0.43652499166614811084);
3342 const SimdDouble
VD0(1.0);
3345 SimdDouble polyVN0
, polyVN1
, polyVD0
, polyVD1
;
3349 polyVD1
= fma(VD5
, z4
, VD3
);
3350 polyVD0
= fma(VD4
, z4
, VD2
);
3351 polyVD1
= fma(polyVD1
, z4
, VD1
);
3352 polyVD0
= fma(polyVD0
, z4
, VD0
);
3353 polyVD0
= fma(polyVD1
, z2
, polyVD0
);
3355 polyVD0
= inv(polyVD0
);
3357 polyVN1
= fma(VN9
, z4
, VN7
);
3358 polyVN0
= fma(VN8
, z4
, VN6
);
3359 polyVN1
= fma(polyVN1
, z4
, VN5
);
3360 polyVN0
= fma(polyVN0
, z4
, VN4
);
3361 polyVN1
= fma(polyVN1
, z4
, VN3
);
3362 polyVN0
= fma(polyVN0
, z4
, VN2
);
3363 polyVN1
= fma(polyVN1
, z4
, VN1
);
3364 polyVN0
= fma(polyVN0
, z4
, VN0
);
3365 polyVN0
= fma(polyVN1
, z2
, polyVN0
);
3367 return polyVN0
* polyVD0
;
3373 /*! \name SIMD math functions for double prec. data, single prec. accuracy
3375 * \note In some cases we do not need full double accuracy of individual
3376 * SIMD math functions, although the data is stored in double precision
3377 * SIMD registers. This might be the case for special algorithms, or
3378 * if the architecture does not support single precision.
3379 * Since the full double precision evaluation of math functions
3380 * typically require much more expensive polynomial approximations
3381 * these functions implement the algorithms used in the single precision
3382 * SIMD math functions, but they operate on double precision
3388 /*********************************************************************
3389 * SIMD MATH FUNCTIONS WITH DOUBLE PREC. DATA, SINGLE PREC. ACCURACY *
3390 *********************************************************************/
3392 /*! \brief Calculate 1/sqrt(x) for SIMD double, but in single accuracy.
3394 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
3395 * GMX_FLOAT_MAX, i.e. within the range of single precision.
3396 * For the single precision implementation this is obviously always
3397 * true for positive values, but for double precision it adds an
3398 * extra restriction since the first lookup step might have to be
3399 * performed in single precision on some architectures. Note that the
3400 * responsibility for checking falls on you - this routine does not
3403 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
3405 static inline SimdDouble gmx_simdcall
invsqrtSingleAccuracy(SimdDouble x
)
3407 SimdDouble lu
= rsqrt(x
);
3408 # if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
3409 lu
= rsqrtIter(lu
, x
);
3411 # if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3412 lu
= rsqrtIter(lu
, x
);
3414 # if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3415 lu
= rsqrtIter(lu
, x
);
3420 /*! \brief 1/sqrt(x) for masked-in entries of SIMD double, but in single accuracy.
3422 * This routine only evaluates 1/sqrt(x) for elements for which mask is true.
3423 * Illegal values in the masked-out elements will not lead to
3424 * floating-point exceptions.
3426 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
3427 * GMX_FLOAT_MAX, i.e. within the range of single precision.
3428 * For the single precision implementation this is obviously always
3429 * true for positive values, but for double precision it adds an
3430 * extra restriction since the first lookup step might have to be
3431 * performed in single precision on some architectures. Note that the
3432 * responsibility for checking falls on you - this routine does not
3436 * \return 1/sqrt(x). Result is undefined if your argument was invalid or
3437 * entry was not masked, and 0.0 for masked-out entries.
3439 static inline SimdDouble
maskzInvsqrtSingleAccuracy(SimdDouble x
, SimdDBool m
)
3441 SimdDouble lu
= maskzRsqrt(x
, m
);
3442 # if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
3443 lu
= rsqrtIter(lu
, x
);
3445 # if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3446 lu
= rsqrtIter(lu
, x
);
3448 # if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3449 lu
= rsqrtIter(lu
, x
);
3454 /*! \brief Calculate 1/sqrt(x) for two SIMD doubles, but single accuracy.
3456 * \param x0 First set of arguments, x0 must be in single range (see below).
3457 * \param x1 Second set of arguments, x1 must be in single range (see below).
3458 * \param[out] out0 Result 1/sqrt(x0)
3459 * \param[out] out1 Result 1/sqrt(x1)
3461 * In particular for double precision we can sometimes calculate square root
3462 * pairs slightly faster by using single precision until the very last step.
3464 * \note Both arguments must be larger than GMX_FLOAT_MIN and smaller than
3465 * GMX_FLOAT_MAX, i.e. within the range of single precision.
3466 * For the single precision implementation this is obviously always
3467 * true for positive values, but for double precision it adds an
3468 * extra restriction since the first lookup step might have to be
3469 * performed in single precision on some architectures. Note that the
3470 * responsibility for checking falls on you - this routine does not
3473 static inline void gmx_simdcall
invsqrtPairSingleAccuracy(SimdDouble x0
,
3478 # if GMX_SIMD_HAVE_FLOAT && (GMX_SIMD_FLOAT_WIDTH == 2 * GMX_SIMD_DOUBLE_WIDTH) \
3479 && (GMX_SIMD_RSQRT_BITS < 22)
3480 SimdFloat xf
= cvtDD2F(x0
, x1
);
3481 SimdFloat luf
= rsqrt(xf
);
3482 SimdDouble lu0
, lu1
;
3483 // Intermediate target is single - mantissa+1 bits
3484 # if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
3485 luf
= rsqrtIter(luf
, xf
);
3487 # if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3488 luf
= rsqrtIter(luf
, xf
);
3490 # if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3491 luf
= rsqrtIter(luf
, xf
);
3493 cvtF2DD(luf
, &lu0
, &lu1
);
3494 // We now have single-precision accuracy values in lu0/lu1
3498 *out0
= invsqrtSingleAccuracy(x0
);
3499 *out1
= invsqrtSingleAccuracy(x1
);
3503 /*! \brief Calculate 1/x for SIMD double, but in single accuracy.
3505 * \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
3506 * GMX_FLOAT_MAX, i.e. within the range of single precision.
3507 * For the single precision implementation this is obviously always
3508 * true for positive values, but for double precision it adds an
3509 * extra restriction since the first lookup step might have to be
3510 * performed in single precision on some architectures. Note that the
3511 * responsibility for checking falls on you - this routine does not
3514 * \return 1/x. Result is undefined if your argument was invalid.
3516 static inline SimdDouble gmx_simdcall
invSingleAccuracy(SimdDouble x
)
3518 SimdDouble lu
= rcp(x
);
3519 # if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
3520 lu
= rcpIter(lu
, x
);
3522 # if (GMX_SIMD_RCP_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3523 lu
= rcpIter(lu
, x
);
3525 # if (GMX_SIMD_RCP_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3526 lu
= rcpIter(lu
, x
);
3531 /*! \brief 1/x for masked entries of SIMD double, single accuracy.
3533 * \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
3534 * GMX_FLOAT_MAX, i.e. within the range of single precision.
3535 * For the single precision implementation this is obviously always
3536 * true for positive values, but for double precision it adds an
3537 * extra restriction since the first lookup step might have to be
3538 * performed in single precision on some architectures. Note that the
3539 * responsibility for checking falls on you - this routine does not
3543 * \return 1/x for elements where m is true, or 0.0 for masked-out entries.
3545 static inline SimdDouble gmx_simdcall
maskzInvSingleAccuracy(SimdDouble x
, SimdDBool m
)
3547 SimdDouble lu
= maskzRcp(x
, m
);
3548 # if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
3549 lu
= rcpIter(lu
, x
);
3551 # if (GMX_SIMD_RCP_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3552 lu
= rcpIter(lu
, x
);
3554 # if (GMX_SIMD_RCP_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3555 lu
= rcpIter(lu
, x
);
3561 /*! \brief Calculate sqrt(x) (correct for 0.0) for SIMD double, with single accuracy.
3563 * \copydetails sqrt(SimdFloat)
3565 template<MathOptimization opt
= MathOptimization::Safe
>
3566 static inline SimdDouble gmx_simdcall
sqrtSingleAccuracy(SimdDouble x
)
3568 if (opt
== MathOptimization::Safe
)
3570 SimdDouble res
= maskzInvsqrt(x
, SimdDouble(GMX_FLOAT_MIN
) < x
);
3575 return x
* invsqrtSingleAccuracy(x
);
3579 /*! \brief Cube root for SIMD doubles, single accuracy.
3581 * \param x Argument to calculate cube root of. Can be negative or zero,
3582 * but NaN or Inf values are not supported. Denormal values will
3583 * be treated as 0.0.
3584 * \return Cube root of x.
3586 static inline SimdDouble gmx_simdcall
cbrtSingleAccuracy(SimdDouble x
)
3588 const SimdDouble
signBit(GMX_DOUBLE_NEGZERO
);
3589 const SimdDouble
minDouble(std::numeric_limits
<double>::min());
3590 // Bias is 1024-1 = 1023, which is divisible by 3, so no need to change it more.
3591 // Use the divided value as original constant to avoid division warnings.
3592 const std::int32_t offsetDiv3(341);
3593 const SimdDouble
c2(-0.191502161678719066);
3594 const SimdDouble
c1(0.697570460207922770);
3595 const SimdDouble
c0(0.492659620528969547);
3596 const SimdDouble
one(1.0);
3597 const SimdDouble
two(2.0);
3598 const SimdDouble
three(3.0);
3599 const SimdDouble
oneThird(1.0 / 3.0);
3600 const SimdDouble
cbrt2(1.2599210498948731648);
3601 const SimdDouble
sqrCbrt2(1.5874010519681994748);
3603 // See the single precision routines for documentation of the algorithm
3605 SimdDouble xSignBit
= x
& signBit
; // create bit mask where the sign bit is 1 for x elements < 0
3606 SimdDouble xAbs
= andNot(signBit
, x
); // select everthing but the sign bit => abs(x)
3607 SimdDBool xIsNonZero
= (minDouble
<= xAbs
); // treat denormals as 0
3609 SimdDInt32 exponent
;
3610 SimdDouble y
= frexp(xAbs
, &exponent
);
3611 SimdDouble z
= fma(fma(y
, c2
, c1
), y
, c0
);
3612 SimdDouble w
= z
* z
* z
;
3613 SimdDouble nom
= z
* fma(two
, y
, w
);
3614 SimdDouble invDenom
= inv(fma(two
, w
, y
));
3616 SimdDouble offsetExp
= cvtI2R(exponent
) + SimdDouble(static_cast<double>(3 * offsetDiv3
) + 0.1);
3617 SimdDouble offsetExpDiv3
=
3618 trunc(offsetExp
* oneThird
); // important to truncate here to mimic integer division
3619 SimdDInt32 expDiv3
= cvtR2I(offsetExpDiv3
- SimdDouble(static_cast<double>(offsetDiv3
)));
3620 SimdDouble remainder
= offsetExp
- offsetExpDiv3
* three
;
3621 SimdDouble factor
= blend(one
, cbrt2
, SimdDouble(0.5) < remainder
);
3622 factor
= blend(factor
, sqrCbrt2
, SimdDouble(1.5) < remainder
);
3623 SimdDouble fraction
= (nom
* invDenom
* factor
) ^ xSignBit
;
3624 SimdDouble result
= selectByMask(ldexp(fraction
, expDiv3
), xIsNonZero
);
3628 /*! \brief Inverse cube root for SIMD doubles, single accuracy.
3630 * \param x Argument to calculate cube root of. Can be positive or
3631 * negative, but the magnitude cannot be lower than
3632 * the smallest normal number.
3633 * \return Cube root of x. Undefined for values that don't
3634 * fulfill the restriction of abs(x) > minDouble.
3636 static inline SimdDouble gmx_simdcall
invcbrtSingleAccuracy(SimdDouble x
)
3638 const SimdDouble
signBit(GMX_DOUBLE_NEGZERO
);
3639 // Bias is 1024-1 = 1023, which is divisible by 3, so no need to change it more.
3640 // Use the divided value as original constant to avoid division warnings.
3641 const std::int32_t offsetDiv3(341);
3642 const SimdDouble
c2(-0.191502161678719066);
3643 const SimdDouble
c1(0.697570460207922770);
3644 const SimdDouble
c0(0.492659620528969547);
3645 const SimdDouble
one(1.0);
3646 const SimdDouble
two(2.0);
3647 const SimdDouble
three(3.0);
3648 const SimdDouble
oneThird(1.0 / 3.0);
3649 const SimdDouble
invCbrt2(1.0 / 1.2599210498948731648);
3650 const SimdDouble
invSqrCbrt2(1.0F
/ 1.5874010519681994748);
3652 // See the single precision routines for documentation of the algorithm
3654 SimdDouble xSignBit
= x
& signBit
; // create bit mask where the sign bit is 1 for x elements < 0
3655 SimdDouble xAbs
= andNot(signBit
, x
); // select everthing but the sign bit => abs(x)
3657 SimdDInt32 exponent
;
3658 SimdDouble y
= frexp(xAbs
, &exponent
);
3659 SimdDouble z
= fma(fma(y
, c2
, c1
), y
, c0
);
3660 SimdDouble w
= z
* z
* z
;
3661 SimdDouble nom
= fma(two
, w
, y
);
3662 SimdDouble invDenom
= inv(z
* fma(two
, y
, w
));
3663 SimdDouble offsetExp
= cvtI2R(exponent
) + SimdDouble(static_cast<double>(3 * offsetDiv3
) + 0.1);
3664 SimdDouble offsetExpDiv3
=
3665 trunc(offsetExp
* oneThird
); // important to truncate here to mimic integer division
3666 SimdDInt32 expDiv3
= cvtR2I(SimdDouble(static_cast<double>(offsetDiv3
)) - offsetExpDiv3
);
3667 SimdDouble remainder
= offsetExpDiv3
* three
- offsetExp
;
3668 SimdDouble factor
= blend(one
, invCbrt2
, remainder
< SimdDouble(-0.5));
3669 factor
= blend(factor
, invSqrCbrt2
, remainder
< SimdDouble(-1.5));
3670 SimdDouble fraction
= (nom
* invDenom
* factor
) ^ xSignBit
;
3671 SimdDouble result
= ldexp(fraction
, expDiv3
);
3675 /*! \brief SIMD log2(x). Double precision SIMD data, single accuracy.
3677 * \param x Argument, should be >0.
3678 * \result The base 2 logarithm of x. Undefined if argument is invalid.
3680 static inline SimdDouble gmx_simdcall
log2SingleAccuracy(SimdDouble x
)
3682 # if GMX_SIMD_HAVE_NATIVE_LOG_DOUBLE
3683 return log(x
) * SimdDouble(std::log2(std::exp(1.0)));
3685 const SimdDouble
one(1.0);
3686 const SimdDouble
two(2.0);
3687 const SimdDouble
sqrt2(std::sqrt(2.0));
3688 const SimdDouble
CL9(0.342149508897807708152F
);
3689 const SimdDouble
CL7(0.411570606888219447939F
);
3690 const SimdDouble
CL5(0.577085979152320294183F
);
3691 const SimdDouble
CL3(0.961796550607099898222F
);
3692 const SimdDouble
CL1(2.885390081777926774009F
);
3693 SimdDouble fexp
, x2
, p
;
3697 // For log2(), the argument cannot be 0, so use the faster version of frexp
3698 x
= frexp
<MathOptimization::Unsafe
>(x
, &iexp
);
3699 fexp
= cvtI2R(iexp
);
3702 // Adjust to non-IEEE format for x<sqrt(2): exponent -= 1, mantissa *= 2.0
3703 fexp
= fexp
- selectByMask(one
, mask
);
3704 x
= x
* blend(one
, two
, mask
);
3706 x
= (x
- one
) * invSingleAccuracy(x
+ one
);
3709 p
= fma(CL9
, x2
, CL7
);
3710 p
= fma(p
, x2
, CL5
);
3711 p
= fma(p
, x2
, CL3
);
3712 p
= fma(p
, x2
, CL1
);
3713 p
= fma(p
, x
, fexp
);
3719 /*! \brief SIMD log(x). Double precision SIMD data, single accuracy.
3721 * \param x Argument, should be >0.
3722 * \result The natural logarithm of x. Undefined if argument is invalid.
3724 static inline SimdDouble gmx_simdcall
logSingleAccuracy(SimdDouble x
)
3726 # if GMX_SIMD_HAVE_NATIVE_LOG_DOUBLE
3729 const SimdDouble
one(1.0);
3730 const SimdDouble
two(2.0);
3731 const SimdDouble
invsqrt2(1.0 / std::sqrt(2.0));
3732 const SimdDouble
corr(0.693147180559945286226764);
3733 const SimdDouble
CL9(0.2371599674224853515625);
3734 const SimdDouble
CL7(0.285279005765914916992188);
3735 const SimdDouble
CL5(0.400005519390106201171875);
3736 const SimdDouble
CL3(0.666666567325592041015625);
3737 const SimdDouble
CL1(2.0);
3738 SimdDouble fexp
, x2
, p
;
3742 // For log(), the argument cannot be 0, so use the faster version of frexp
3743 x
= frexp
<MathOptimization::Unsafe
>(x
, &iexp
);
3744 fexp
= cvtI2R(iexp
);
3746 mask
= x
< invsqrt2
;
3747 // Adjust to non-IEEE format for x<1/sqrt(2): exponent -= 1, mantissa *= 2.0
3748 fexp
= fexp
- selectByMask(one
, mask
);
3749 x
= x
* blend(one
, two
, mask
);
3751 x
= (x
- one
) * invSingleAccuracy(x
+ one
);
3754 p
= fma(CL9
, x2
, CL7
);
3755 p
= fma(p
, x2
, CL5
);
3756 p
= fma(p
, x2
, CL3
);
3757 p
= fma(p
, x2
, CL1
);
3758 p
= fma(p
, x
, corr
* fexp
);
3764 /*! \brief SIMD 2^x. Double precision SIMD, single accuracy.
3766 * \copydetails exp2(SimdFloat)
3768 template<MathOptimization opt
= MathOptimization::Safe
>
3769 static inline SimdDouble gmx_simdcall
exp2SingleAccuracy(SimdDouble x
)
3771 # if GMX_SIMD_HAVE_NATIVE_EXP2_DOUBLE
3774 const SimdDouble
CC6(0.0001534581200287996416911311);
3775 const SimdDouble
CC5(0.001339993121934088894618990);
3776 const SimdDouble
CC4(0.009618488957115180159497841);
3777 const SimdDouble
CC3(0.05550328776964726865751735);
3778 const SimdDouble
CC2(0.2402264689063408646490722);
3779 const SimdDouble
CC1(0.6931472057372680777553816);
3780 const SimdDouble
one(1.0);
3786 // Large negative values are valid arguments to exp2(), so there are two
3787 // things we need to account for:
3788 // 1. When the exponents reaches -1023, the (biased) exponent field will be
3789 // zero and we can no longer multiply with it. There are special IEEE
3790 // formats to handle this range, but for now we have to accept that
3791 // we cannot handle those arguments. If input value becomes even more
3792 // negative, it will start to loop and we would end up with invalid
3793 // exponents. Thus, we need to limit or mask this.
3794 // 2. For VERY large negative values, we will have problems that the
3795 // subtraction to get the fractional part loses accuracy, and then we
3796 // can end up with overflows in the polynomial.
3798 // For now, we handle this by forwarding the math optimization setting to
3799 // ldexp, where the routine will return zero for very small arguments.
3801 // However, before doing that we need to make sure we do not call cvtR2I
3802 // with an argument that is so negative it cannot be converted to an integer.
3803 if (opt
== MathOptimization::Safe
)
3805 x
= max(x
, SimdDouble(std::numeric_limits
<std::int32_t>::lowest()));
3812 p
= fma(CC6
, x
, CC5
);
3818 x
= ldexp
<opt
>(p
, ix
);
3825 /*! \brief SIMD exp(x). Double precision SIMD, single accuracy.
3827 * \copydetails exp(SimdFloat)
3829 template<MathOptimization opt
= MathOptimization::Safe
>
3830 static inline SimdDouble gmx_simdcall
expSingleAccuracy(SimdDouble x
)
3832 # if GMX_SIMD_HAVE_NATIVE_EXP_DOUBLE
3835 const SimdDouble
argscale(1.44269504088896341);
3836 // Lower bound: Clamp args that would lead to an IEEE fp exponent below -1023.
3837 const SimdDouble
smallArgLimit(-709.0895657128);
3838 const SimdDouble
invargscale(-0.69314718055994528623);
3839 const SimdDouble
CC4(0.00136324646882712841033936);
3840 const SimdDouble
CC3(0.00836596917361021041870117);
3841 const SimdDouble
CC2(0.0416710823774337768554688);
3842 const SimdDouble
CC1(0.166665524244308471679688);
3843 const SimdDouble
CC0(0.499999850988388061523438);
3844 const SimdDouble
one(1.0);
3849 // Large negative values are valid arguments to exp2(), so there are two
3850 // things we need to account for:
3851 // 1. When the exponents reaches -1023, the (biased) exponent field will be
3852 // zero and we can no longer multiply with it. There are special IEEE
3853 // formats to handle this range, but for now we have to accept that
3854 // we cannot handle those arguments. If input value becomes even more
3855 // negative, it will start to loop and we would end up with invalid
3856 // exponents. Thus, we need to limit or mask this.
3857 // 2. For VERY large negative values, we will have problems that the
3858 // subtraction to get the fractional part loses accuracy, and then we
3859 // can end up with overflows in the polynomial.
3861 // For now, we handle this by forwarding the math optimization setting to
3862 // ldexp, where the routine will return zero for very small arguments.
3864 // However, before doing that we need to make sure we do not call cvtR2I
3865 // with an argument that is so negative it cannot be converted to an integer
3866 // after being multiplied by argscale.
3868 if (opt
== MathOptimization::Safe
)
3870 x
= max(x
, SimdDouble(std::numeric_limits
<std::int32_t>::lowest()) / argscale
);
3876 intpart
= round(y
); // use same rounding algorithm here
3878 // Extended precision arithmetics not needed since
3879 // we have double precision and only need single accuracy.
3880 x
= fma(invargscale
, intpart
, x
);
3882 p
= fma(CC4
, x
, CC3
);
3886 p
= fma(x
* x
, p
, x
);
3888 x
= ldexp
<opt
>(p
, iy
);
3893 /*! \brief SIMD pow(x,y). Double precision SIMD data, single accuracy.
3895 * This returns x^y for SIMD values.
3897 * \tparam opt If this is changed from the default (safe) into the unsafe
3898 * option, there are no guarantees about correct results for x==0.
3902 * \param y exponent.
3904 * \result x^y. Overflowing arguments are likely to either return 0 or inf,
3905 * depending on the underlying implementation. If unsafe optimizations
3906 * are enabled, this is also true for x==0.
3908 * \warning You cannot rely on this implementation returning inf for arguments
3909 * that cause overflow. If you have some very large
3910 * values and need to rely on getting a valid numerical output,
3911 * take the minimum of your variable and the largest valid argument
3912 * before calling this routine.
3914 template<MathOptimization opt
= MathOptimization::Safe
>
3915 static inline SimdDouble gmx_simdcall
powSingleAccuracy(SimdDouble x
, SimdDouble y
)
3919 if (opt
== MathOptimization::Safe
)
3921 xcorr
= max(x
, SimdDouble(std::numeric_limits
<double>::min()));
3928 SimdDouble result
= exp2SingleAccuracy
<opt
>(y
* log2SingleAccuracy(xcorr
));
3930 if (opt
== MathOptimization::Safe
)
3932 // if x==0 and y>0 we explicitly set the result to 0.0
3933 // For any x with y==0, the result will already be 1.0 since we multiply by y (0.0) and call exp().
3934 result
= blend(result
, setZero(), x
== setZero() && setZero() < y
);
3940 /*! \brief SIMD erf(x). Double precision SIMD data, single accuracy.
3942 * \param x The value to calculate erf(x) for.
3945 * This routine achieves very close to single precision, but we do not care about
3946 * the last bit or the subnormal result range.
3948 static inline SimdDouble gmx_simdcall
erfSingleAccuracy(SimdDouble x
)
3950 // Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1]
3951 const SimdDouble
CA6(7.853861353153693e-5);
3952 const SimdDouble
CA5(-8.010193625184903e-4);
3953 const SimdDouble
CA4(5.188327685732524e-3);
3954 const SimdDouble
CA3(-2.685381193529856e-2);
3955 const SimdDouble
CA2(1.128358514861418e-1);
3956 const SimdDouble
CA1(-3.761262582423300e-1);
3957 const SimdDouble
CA0(1.128379165726710);
3958 // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2]
3959 const SimdDouble
CB9(-0.0018629930017603923);
3960 const SimdDouble
CB8(0.003909821287598495);
3961 const SimdDouble
CB7(-0.0052094582210355615);
3962 const SimdDouble
CB6(0.005685614362160572);
3963 const SimdDouble
CB5(-0.0025367682853477272);
3964 const SimdDouble
CB4(-0.010199799682318782);
3965 const SimdDouble
CB3(0.04369575504816542);
3966 const SimdDouble
CB2(-0.11884063474674492);
3967 const SimdDouble
CB1(0.2732120154030589);
3968 const SimdDouble
CB0(0.42758357702025784);
3969 // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19]
3970 const SimdDouble
CC10(-0.0445555913112064);
3971 const SimdDouble
CC9(0.21376355144663348);
3972 const SimdDouble
CC8(-0.3473187200259257);
3973 const SimdDouble
CC7(0.016690861551248114);
3974 const SimdDouble
CC6(0.7560973182491192);
3975 const SimdDouble
CC5(-1.2137903600145787);
3976 const SimdDouble
CC4(0.8411872321232948);
3977 const SimdDouble
CC3(-0.08670413896296343);
3978 const SimdDouble
CC2(-0.27124782687240334);
3979 const SimdDouble
CC1(-0.0007502488047806069);
3980 const SimdDouble
CC0(0.5642114853803148);
3981 const SimdDouble
one(1.0);
3982 const SimdDouble
two(2.0);
3984 SimdDouble x2
, x4
, y
;
3985 SimdDouble t
, t2
, w
, w2
;
3986 SimdDouble pA0
, pA1
, pB0
, pB1
, pC0
, pC1
;
3988 SimdDouble res_erf
, res_erfc
, res
;
3989 SimdDBool mask
, msk_erf
;
3995 pA0
= fma(CA6
, x4
, CA4
);
3996 pA1
= fma(CA5
, x4
, CA3
);
3997 pA0
= fma(pA0
, x4
, CA2
);
3998 pA1
= fma(pA1
, x4
, CA1
);
4000 pA0
= fma(pA1
, x2
, pA0
);
4001 // Constant term must come last for precision reasons
4008 msk_erf
= (SimdDouble(0.75) <= y
);
4009 t
= maskzInvSingleAccuracy(y
, msk_erf
);
4014 expmx2
= expSingleAccuracy(-y
* y
);
4016 pB1
= fma(CB9
, w2
, CB7
);
4017 pB0
= fma(CB8
, w2
, CB6
);
4018 pB1
= fma(pB1
, w2
, CB5
);
4019 pB0
= fma(pB0
, w2
, CB4
);
4020 pB1
= fma(pB1
, w2
, CB3
);
4021 pB0
= fma(pB0
, w2
, CB2
);
4022 pB1
= fma(pB1
, w2
, CB1
);
4023 pB0
= fma(pB0
, w2
, CB0
);
4024 pB0
= fma(pB1
, w
, pB0
);
4026 pC0
= fma(CC10
, t2
, CC8
);
4027 pC1
= fma(CC9
, t2
, CC7
);
4028 pC0
= fma(pC0
, t2
, CC6
);
4029 pC1
= fma(pC1
, t2
, CC5
);
4030 pC0
= fma(pC0
, t2
, CC4
);
4031 pC1
= fma(pC1
, t2
, CC3
);
4032 pC0
= fma(pC0
, t2
, CC2
);
4033 pC1
= fma(pC1
, t2
, CC1
);
4035 pC0
= fma(pC0
, t2
, CC0
);
4036 pC0
= fma(pC1
, t
, pC0
);
4039 // Select pB0 or pC0 for erfc()
4041 res_erfc
= blend(pB0
, pC0
, mask
);
4042 res_erfc
= res_erfc
* expmx2
;
4044 // erfc(x<0) = 2-erfc(|x|)
4045 mask
= (x
< setZero());
4046 res_erfc
= blend(res_erfc
, two
- res_erfc
, mask
);
4048 // Select erf() or erfc()
4049 mask
= (y
< SimdDouble(0.75));
4050 res
= blend(one
- res_erfc
, res_erf
, mask
);
4055 /*! \brief SIMD erfc(x). Double precision SIMD data, single accuracy.
4057 * \param x The value to calculate erfc(x) for.
4060 * This routine achieves singleprecision (bar the last bit) over most of the
4061 * input range, but for large arguments where the result is getting close
4062 * to the minimum representable numbers we accept slightly larger errors
4063 * (think results that are in the ballpark of 10^-30) since that is not
4066 static inline SimdDouble gmx_simdcall
erfcSingleAccuracy(SimdDouble x
)
4068 // Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1]
4069 const SimdDouble
CA6(7.853861353153693e-5);
4070 const SimdDouble
CA5(-8.010193625184903e-4);
4071 const SimdDouble
CA4(5.188327685732524e-3);
4072 const SimdDouble
CA3(-2.685381193529856e-2);
4073 const SimdDouble
CA2(1.128358514861418e-1);
4074 const SimdDouble
CA1(-3.761262582423300e-1);
4075 const SimdDouble
CA0(1.128379165726710);
4076 // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2]
4077 const SimdDouble
CB9(-0.0018629930017603923);
4078 const SimdDouble
CB8(0.003909821287598495);
4079 const SimdDouble
CB7(-0.0052094582210355615);
4080 const SimdDouble
CB6(0.005685614362160572);
4081 const SimdDouble
CB5(-0.0025367682853477272);
4082 const SimdDouble
CB4(-0.010199799682318782);
4083 const SimdDouble
CB3(0.04369575504816542);
4084 const SimdDouble
CB2(-0.11884063474674492);
4085 const SimdDouble
CB1(0.2732120154030589);
4086 const SimdDouble
CB0(0.42758357702025784);
4087 // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19]
4088 const SimdDouble
CC10(-0.0445555913112064);
4089 const SimdDouble
CC9(0.21376355144663348);
4090 const SimdDouble
CC8(-0.3473187200259257);
4091 const SimdDouble
CC7(0.016690861551248114);
4092 const SimdDouble
CC6(0.7560973182491192);
4093 const SimdDouble
CC5(-1.2137903600145787);
4094 const SimdDouble
CC4(0.8411872321232948);
4095 const SimdDouble
CC3(-0.08670413896296343);
4096 const SimdDouble
CC2(-0.27124782687240334);
4097 const SimdDouble
CC1(-0.0007502488047806069);
4098 const SimdDouble
CC0(0.5642114853803148);
4099 const SimdDouble
one(1.0);
4100 const SimdDouble
two(2.0);
4102 SimdDouble x2
, x4
, y
;
4103 SimdDouble t
, t2
, w
, w2
;
4104 SimdDouble pA0
, pA1
, pB0
, pB1
, pC0
, pC1
;
4106 SimdDouble res_erf
, res_erfc
, res
;
4107 SimdDBool mask
, msk_erf
;
4113 pA0
= fma(CA6
, x4
, CA4
);
4114 pA1
= fma(CA5
, x4
, CA3
);
4115 pA0
= fma(pA0
, x4
, CA2
);
4116 pA1
= fma(pA1
, x4
, CA1
);
4118 pA0
= fma(pA0
, x4
, pA1
);
4119 // Constant term must come last for precision reasons
4126 msk_erf
= (SimdDouble(0.75) <= y
);
4127 t
= maskzInvSingleAccuracy(y
, msk_erf
);
4132 expmx2
= expSingleAccuracy(-y
* y
);
4134 pB1
= fma(CB9
, w2
, CB7
);
4135 pB0
= fma(CB8
, w2
, CB6
);
4136 pB1
= fma(pB1
, w2
, CB5
);
4137 pB0
= fma(pB0
, w2
, CB4
);
4138 pB1
= fma(pB1
, w2
, CB3
);
4139 pB0
= fma(pB0
, w2
, CB2
);
4140 pB1
= fma(pB1
, w2
, CB1
);
4141 pB0
= fma(pB0
, w2
, CB0
);
4142 pB0
= fma(pB1
, w
, pB0
);
4144 pC0
= fma(CC10
, t2
, CC8
);
4145 pC1
= fma(CC9
, t2
, CC7
);
4146 pC0
= fma(pC0
, t2
, CC6
);
4147 pC1
= fma(pC1
, t2
, CC5
);
4148 pC0
= fma(pC0
, t2
, CC4
);
4149 pC1
= fma(pC1
, t2
, CC3
);
4150 pC0
= fma(pC0
, t2
, CC2
);
4151 pC1
= fma(pC1
, t2
, CC1
);
4153 pC0
= fma(pC0
, t2
, CC0
);
4154 pC0
= fma(pC1
, t
, pC0
);
4157 // Select pB0 or pC0 for erfc()
4159 res_erfc
= blend(pB0
, pC0
, mask
);
4160 res_erfc
= res_erfc
* expmx2
;
4162 // erfc(x<0) = 2-erfc(|x|)
4163 mask
= (x
< setZero());
4164 res_erfc
= blend(res_erfc
, two
- res_erfc
, mask
);
4166 // Select erf() or erfc()
4167 mask
= (y
< SimdDouble(0.75));
4168 res
= blend(res_erfc
, one
- res_erf
, mask
);
4173 /*! \brief SIMD sin \& cos. Double precision SIMD data, single accuracy.
4175 * \param x The argument to evaluate sin/cos for
4176 * \param[out] sinval Sin(x)
4177 * \param[out] cosval Cos(x)
4179 static inline void gmx_simdcall
sinCosSingleAccuracy(SimdDouble x
, SimdDouble
* sinval
, SimdDouble
* cosval
)
4181 // Constants to subtract Pi/4*x from y while minimizing precision loss
4182 const SimdDouble
argred0(2 * 0.78539816290140151978);
4183 const SimdDouble
argred1(2 * 4.9604678871439933374e-10);
4184 const SimdDouble
argred2(2 * 1.1258708853173288931e-18);
4185 const SimdDouble
two_over_pi(2.0 / M_PI
);
4186 const SimdDouble
const_sin2(-1.9515295891e-4);
4187 const SimdDouble
const_sin1(8.3321608736e-3);
4188 const SimdDouble
const_sin0(-1.6666654611e-1);
4189 const SimdDouble
const_cos2(2.443315711809948e-5);
4190 const SimdDouble
const_cos1(-1.388731625493765e-3);
4191 const SimdDouble
const_cos0(4.166664568298827e-2);
4193 const SimdDouble
half(0.5);
4194 const SimdDouble
one(1.0);
4195 SimdDouble ssign
, csign
;
4196 SimdDouble x2
, y
, z
, psin
, pcos
, sss
, ccc
;
4199 # if GMX_SIMD_HAVE_DINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
4200 const SimdDInt32
ione(1);
4201 const SimdDInt32
itwo(2);
4204 z
= x
* two_over_pi
;
4208 mask
= cvtIB2B((iy
& ione
) == setZero());
4209 ssign
= selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO
), cvtIB2B((iy
& itwo
) == itwo
));
4210 csign
= selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO
), cvtIB2B(((iy
+ ione
) & itwo
) == itwo
));
4212 const SimdDouble
quarter(0.25);
4213 const SimdDouble
minusquarter(-0.25);
4215 SimdDBool m1
, m2
, m3
;
4217 /* The most obvious way to find the arguments quadrant in the unit circle
4218 * to calculate the sign is to use integer arithmetic, but that is not
4219 * present in all SIMD implementations. As an alternative, we have devised a
4220 * pure floating-point algorithm that uses truncation for argument reduction
4221 * so that we get a new value 0<=q<1 over the unit circle, and then
4222 * do floating-point comparisons with fractions. This is likely to be
4223 * slightly slower (~10%) due to the longer latencies of floating-point, so
4224 * we only use it when integer SIMD arithmetic is not present.
4228 // It is critical that half-way cases are rounded down
4229 z
= fma(x
, two_over_pi
, half
);
4233 /* z now starts at 0.0 for x=-pi/4 (although neg. values cannot occur), and
4234 * then increased by 1.0 as x increases by 2*Pi, when it resets to 0.0.
4235 * This removes the 2*Pi periodicity without using any integer arithmetic.
4236 * First check if y had the value 2 or 3, set csign if true.
4239 /* If we have logical operations we can work directly on the signbit, which
4240 * saves instructions. Otherwise we need to represent signs as +1.0/-1.0.
4241 * Thus, if you are altering defines to debug alternative code paths, the
4242 * two GMX_SIMD_HAVE_LOGICAL sections in this routine must either both be
4243 * active or inactive - you will get errors if only one is used.
4245 # if GMX_SIMD_HAVE_LOGICAL
4246 ssign
= ssign
& SimdDouble(GMX_DOUBLE_NEGZERO
);
4247 csign
= andNot(q
, SimdDouble(GMX_DOUBLE_NEGZERO
));
4248 ssign
= ssign
^ csign
;
4250 ssign
= copysign(SimdDouble(1.0), ssign
);
4251 csign
= copysign(SimdDouble(1.0), q
);
4253 ssign
= ssign
* csign
; // swap ssign if csign was set.
4255 // Check if y had value 1 or 3 (remember we subtracted 0.5 from q)
4256 m1
= (q
< minusquarter
);
4257 m2
= (setZero() <= q
);
4261 // where mask is FALSE, swap sign.
4262 csign
= csign
* blend(SimdDouble(-1.0), one
, mask
);
4264 x
= fnma(y
, argred0
, x
);
4265 x
= fnma(y
, argred1
, x
);
4266 x
= fnma(y
, argred2
, x
);
4269 psin
= fma(const_sin2
, x2
, const_sin1
);
4270 psin
= fma(psin
, x2
, const_sin0
);
4271 psin
= fma(psin
, x
* x2
, x
);
4272 pcos
= fma(const_cos2
, x2
, const_cos1
);
4273 pcos
= fma(pcos
, x2
, const_cos0
);
4274 pcos
= fms(pcos
, x2
, half
);
4275 pcos
= fma(pcos
, x2
, one
);
4277 sss
= blend(pcos
, psin
, mask
);
4278 ccc
= blend(psin
, pcos
, mask
);
4279 // See comment for GMX_SIMD_HAVE_LOGICAL section above.
4280 # if GMX_SIMD_HAVE_LOGICAL
4281 *sinval
= sss
^ ssign
;
4282 *cosval
= ccc
^ csign
;
4284 *sinval
= sss
* ssign
;
4285 *cosval
= ccc
* csign
;
4289 /*! \brief SIMD sin(x). Double precision SIMD data, single accuracy.
4291 * \param x The argument to evaluate sin for
4294 * \attention Do NOT call both sin & cos if you need both results, since each of them
4295 * will then call \ref sincos and waste a factor 2 in performance.
4297 static inline SimdDouble gmx_simdcall
sinSingleAccuracy(SimdDouble x
)
4300 sinCosSingleAccuracy(x
, &s
, &c
);
4304 /*! \brief SIMD cos(x). Double precision SIMD data, single accuracy.
4306 * \param x The argument to evaluate cos for
4309 * \attention Do NOT call both sin & cos if you need both results, since each of them
4310 * will then call \ref sincos and waste a factor 2 in performance.
4312 static inline SimdDouble gmx_simdcall
cosSingleAccuracy(SimdDouble x
)
4315 sinCosSingleAccuracy(x
, &s
, &c
);
4319 /*! \brief SIMD tan(x). Double precision SIMD data, single accuracy.
4321 * \param x The argument to evaluate tan for
4324 static inline SimdDouble gmx_simdcall
tanSingleAccuracy(SimdDouble x
)
4326 const SimdDouble
argred0(2 * 0.78539816290140151978);
4327 const SimdDouble
argred1(2 * 4.9604678871439933374e-10);
4328 const SimdDouble
argred2(2 * 1.1258708853173288931e-18);
4329 const SimdDouble
two_over_pi(2.0 / M_PI
);
4330 const SimdDouble
CT6(0.009498288995810566122993911);
4331 const SimdDouble
CT5(0.002895755790837379295226923);
4332 const SimdDouble
CT4(0.02460087336161924491836265);
4333 const SimdDouble
CT3(0.05334912882656359828045988);
4334 const SimdDouble
CT2(0.1333989091464957704418495);
4335 const SimdDouble
CT1(0.3333307599244198227797507);
4337 SimdDouble x2
, p
, y
, z
;
4340 # if GMX_SIMD_HAVE_DINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
4344 z
= x
* two_over_pi
;
4347 mask
= cvtIB2B((iy
& ione
) == ione
);
4349 x
= fnma(y
, argred0
, x
);
4350 x
= fnma(y
, argred1
, x
);
4351 x
= fnma(y
, argred2
, x
);
4352 x
= selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO
), mask
) ^ x
;
4354 const SimdDouble
quarter(0.25);
4355 const SimdDouble
half(0.5);
4356 const SimdDouble
threequarter(0.75);
4358 SimdDBool m1
, m2
, m3
;
4361 z
= fma(w
, two_over_pi
, half
);
4365 m1
= (quarter
<= q
);
4367 m3
= (threequarter
<= q
);
4370 w
= fnma(y
, argred0
, w
);
4371 w
= fnma(y
, argred1
, w
);
4372 w
= fnma(y
, argred2
, w
);
4374 w
= blend(w
, -w
, mask
);
4375 x
= w
* copysign(SimdDouble(1.0), x
);
4378 p
= fma(CT6
, x2
, CT5
);
4379 p
= fma(p
, x2
, CT4
);
4380 p
= fma(p
, x2
, CT3
);
4381 p
= fma(p
, x2
, CT2
);
4382 p
= fma(p
, x2
, CT1
);
4383 p
= fma(x2
, p
* x
, x
);
4385 p
= blend(p
, maskzInvSingleAccuracy(p
, mask
), mask
);
4389 /*! \brief SIMD asin(x). Double precision SIMD data, single accuracy.
4391 * \param x The argument to evaluate asin for
4394 static inline SimdDouble gmx_simdcall
asinSingleAccuracy(SimdDouble x
)
4396 const SimdDouble
limitlow(1e-4);
4397 const SimdDouble
half(0.5);
4398 const SimdDouble
one(1.0);
4399 const SimdDouble
halfpi(M_PI
/ 2.0);
4400 const SimdDouble
CC5(4.2163199048E-2);
4401 const SimdDouble
CC4(2.4181311049E-2);
4402 const SimdDouble
CC3(4.5470025998E-2);
4403 const SimdDouble
CC2(7.4953002686E-2);
4404 const SimdDouble
CC1(1.6666752422E-1);
4406 SimdDouble z
, z1
, z2
, q
, q1
, q2
;
4408 SimdDBool mask
, mask2
;
4411 mask
= (half
< xabs
);
4412 z1
= half
* (one
- xabs
);
4413 mask2
= (xabs
< one
);
4414 q1
= z1
* maskzInvsqrtSingleAccuracy(z1
, mask2
);
4417 z
= blend(z2
, z1
, mask
);
4418 q
= blend(q2
, q1
, mask
);
4421 pA
= fma(CC5
, z2
, CC3
);
4422 pB
= fma(CC4
, z2
, CC2
);
4423 pA
= fma(pA
, z2
, CC1
);
4425 z
= fma(pB
, z2
, pA
);
4429 z
= blend(z
, q2
, mask
);
4431 mask
= (limitlow
< xabs
);
4432 z
= blend(xabs
, z
, mask
);
4438 /*! \brief SIMD acos(x). Double precision SIMD data, single accuracy.
4440 * \param x The argument to evaluate acos for
4443 static inline SimdDouble gmx_simdcall
acosSingleAccuracy(SimdDouble x
)
4445 const SimdDouble
one(1.0);
4446 const SimdDouble
half(0.5);
4447 const SimdDouble
pi(M_PI
);
4448 const SimdDouble
halfpi(M_PI
/ 2.0);
4450 SimdDouble z
, z1
, z2
, z3
;
4451 SimdDBool mask1
, mask2
, mask3
;
4454 mask1
= (half
< xabs
);
4455 mask2
= (setZero() < x
);
4457 z
= half
* (one
- xabs
);
4458 mask3
= (xabs
< one
);
4459 z
= z
* maskzInvsqrtSingleAccuracy(z
, mask3
);
4460 z
= blend(x
, z
, mask1
);
4461 z
= asinSingleAccuracy(z
);
4466 z
= blend(z1
, z2
, mask2
);
4467 z
= blend(z3
, z
, mask1
);
4472 /*! \brief SIMD asin(x). Double precision SIMD data, single accuracy.
4474 * \param x The argument to evaluate atan for
4475 * \result Atan(x), same argument/value range as standard math library.
4477 static inline SimdDouble gmx_simdcall
atanSingleAccuracy(SimdDouble x
)
4479 const SimdDouble
halfpi(M_PI
/ 2);
4480 const SimdDouble
CA17(0.002823638962581753730774);
4481 const SimdDouble
CA15(-0.01595690287649631500244);
4482 const SimdDouble
CA13(0.04250498861074447631836);
4483 const SimdDouble
CA11(-0.07489009201526641845703);
4484 const SimdDouble
CA9(0.1063479334115982055664);
4485 const SimdDouble
CA7(-0.1420273631811141967773);
4486 const SimdDouble
CA5(0.1999269574880599975585);
4487 const SimdDouble
CA3(-0.3333310186862945556640);
4488 SimdDouble x2
, x3
, x4
, pA
, pB
;
4489 SimdDBool mask
, mask2
;
4491 mask
= (x
< setZero());
4493 mask2
= (SimdDouble(1.0) < x
);
4494 x
= blend(x
, maskzInvSingleAccuracy(x
, mask2
), mask2
);
4499 pA
= fma(CA17
, x4
, CA13
);
4500 pB
= fma(CA15
, x4
, CA11
);
4501 pA
= fma(pA
, x4
, CA9
);
4502 pB
= fma(pB
, x4
, CA7
);
4503 pA
= fma(pA
, x4
, CA5
);
4504 pB
= fma(pB
, x4
, CA3
);
4505 pA
= fma(pA
, x2
, pB
);
4506 pA
= fma(pA
, x3
, x
);
4508 pA
= blend(pA
, halfpi
- pA
, mask2
);
4509 pA
= blend(pA
, -pA
, mask
);
4514 /*! \brief SIMD atan2(y,x). Double precision SIMD data, single accuracy.
4516 * \param y Y component of vector, any quartile
4517 * \param x X component of vector, any quartile
4518 * \result Atan(y,x), same argument/value range as standard math library.
4520 * \note This routine should provide correct results for all finite
4521 * non-zero or positive-zero arguments. However, negative zero arguments will
4522 * be treated as positive zero, which means the return value will deviate from
4523 * the standard math library atan2(y,x) for those cases. That should not be
4524 * of any concern in Gromacs, and in particular it will not affect calculations
4525 * of angles from vectors.
4527 static inline SimdDouble gmx_simdcall
atan2SingleAccuracy(SimdDouble y
, SimdDouble x
)
4529 const SimdDouble
pi(M_PI
);
4530 const SimdDouble
halfpi(M_PI
/ 2.0);
4531 SimdDouble xinv
, p
, aoffset
;
4532 SimdDBool mask_xnz
, mask_ynz
, mask_xlt0
, mask_ylt0
;
4534 mask_xnz
= x
!= setZero();
4535 mask_ynz
= y
!= setZero();
4536 mask_xlt0
= (x
< setZero());
4537 mask_ylt0
= (y
< setZero());
4539 aoffset
= selectByNotMask(halfpi
, mask_xnz
);
4540 aoffset
= selectByMask(aoffset
, mask_ynz
);
4542 aoffset
= blend(aoffset
, pi
, mask_xlt0
);
4543 aoffset
= blend(aoffset
, -aoffset
, mask_ylt0
);
4545 xinv
= maskzInvSingleAccuracy(x
, mask_xnz
);
4547 p
= atanSingleAccuracy(p
);
4553 /*! \brief Analytical PME force correction, double SIMD data, single accuracy.
4555 * \param z2 \f$(r \beta)^2\f$ - see below for details.
4556 * \result Correction factor to coulomb force - see below for details.
4558 * This routine is meant to enable analytical evaluation of the
4559 * direct-space PME electrostatic force to avoid tables.
4561 * The direct-space potential should be \f$ \mbox{erfc}(\beta r)/r\f$, but there
4562 * are some problems evaluating that:
4564 * First, the error function is difficult (read: expensive) to
4565 * approxmiate accurately for intermediate to large arguments, and
4566 * this happens already in ranges of \f$(\beta r)\f$ that occur in simulations.
4567 * Second, we now try to avoid calculating potentials in Gromacs but
4568 * use forces directly.
4570 * We can simply things slight by noting that the PME part is really
4571 * a correction to the normal Coulomb force since \f$\mbox{erfc}(z)=1-\mbox{erf}(z)\f$, i.e.
4573 * V = \frac{1}{r} - \frac{\mbox{erf}(\beta r)}{r}
4575 * The first term we already have from the inverse square root, so
4576 * that we can leave out of this routine.
4578 * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
4579 * the argument \f$beta r\f$ will be in the range 0.15 to ~4. Use your
4580 * favorite plotting program to realize how well-behaved \f$\frac{\mbox{erf}(z)}{z}\f$ is
4583 * We approximate \f$f(z)=\mbox{erf}(z)/z\f$ with a rational minimax polynomial.
4584 * However, it turns out it is more efficient to approximate \f$f(z)/z\f$ and
4585 * then only use even powers. This is another minor optimization, since
4586 * we actually \a want \f$f(z)/z\f$, because it is going to be multiplied by
4587 * the vector between the two atoms to get the vectorial force. The
4588 * fastest flops are the ones we can avoid calculating!
4590 * So, here's how it should be used:
4592 * 1. Calculate \f$r^2\f$.
4593 * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=(\beta r)^2\f$.
4594 * 3. Evaluate this routine with \f$z^2\f$ as the argument.
4595 * 4. The return value is the expression:
4598 * \frac{2 \exp{-z^2}}{\sqrt{\pi} z^2}-\frac{\mbox{erf}(z)}{z^3}
4601 * 5. Multiply the entire expression by \f$\beta^3\f$. This will get you
4604 * \frac{2 \beta^3 \exp(-z^2)}{\sqrt{\pi} z^2} - \frac{\beta^3 \mbox{erf}(z)}{z^3}
4607 * or, switching back to \f$r\f$ (since \f$z=r \beta\f$):
4610 * \frac{2 \beta \exp(-r^2 \beta^2)}{\sqrt{\pi} r^2} - \frac{\mbox{erf}(r \beta)}{r^3}
4613 * With a bit of math exercise you should be able to confirm that
4617 * \frac{\frac{d}{dr}\left( \frac{\mbox{erf}(\beta r)}{r} \right)}{r}
4620 * 6. Add the result to \f$r^{-3}\f$, multiply by the product of the charges,
4621 * and you have your force (divided by \f$r\f$). A final multiplication
4622 * with the vector connecting the two particles and you have your
4623 * vectorial force to add to the particles.
4625 * This approximation achieves an accuracy slightly lower than 1e-6; when
4626 * added to \f$1/r\f$ the error will be insignificant.
4629 static inline SimdDouble gmx_simdcall
pmeForceCorrectionSingleAccuracy(SimdDouble z2
)
4631 const SimdDouble
FN6(-1.7357322914161492954e-8);
4632 const SimdDouble
FN5(1.4703624142580877519e-6);
4633 const SimdDouble
FN4(-0.000053401640219807709149);
4634 const SimdDouble
FN3(0.0010054721316683106153);
4635 const SimdDouble
FN2(-0.019278317264888380590);
4636 const SimdDouble
FN1(0.069670166153766424023);
4637 const SimdDouble
FN0(-0.75225204789749321333);
4639 const SimdDouble
FD4(0.0011193462567257629232);
4640 const SimdDouble
FD3(0.014866955030185295499);
4641 const SimdDouble
FD2(0.11583842382862377919);
4642 const SimdDouble
FD1(0.50736591960530292870);
4643 const SimdDouble
FD0(1.0);
4646 SimdDouble polyFN0
, polyFN1
, polyFD0
, polyFD1
;
4650 polyFD0
= fma(FD4
, z4
, FD2
);
4651 polyFD1
= fma(FD3
, z4
, FD1
);
4652 polyFD0
= fma(polyFD0
, z4
, FD0
);
4653 polyFD0
= fma(polyFD1
, z2
, polyFD0
);
4655 polyFD0
= invSingleAccuracy(polyFD0
);
4657 polyFN0
= fma(FN6
, z4
, FN4
);
4658 polyFN1
= fma(FN5
, z4
, FN3
);
4659 polyFN0
= fma(polyFN0
, z4
, FN2
);
4660 polyFN1
= fma(polyFN1
, z4
, FN1
);
4661 polyFN0
= fma(polyFN0
, z4
, FN0
);
4662 polyFN0
= fma(polyFN1
, z2
, polyFN0
);
4664 return polyFN0
* polyFD0
;
4668 /*! \brief Analytical PME potential correction, double SIMD data, single accuracy.
4670 * \param z2 \f$(r \beta)^2\f$ - see below for details.
4671 * \result Correction factor to coulomb potential - see below for details.
4673 * This routine calculates \f$\mbox{erf}(z)/z\f$, although you should provide \f$z^2\f$
4674 * as the input argument.
4676 * Here's how it should be used:
4678 * 1. Calculate \f$r^2\f$.
4679 * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=\beta^2*r^2\f$.
4680 * 3. Evaluate this routine with z^2 as the argument.
4681 * 4. The return value is the expression:
4684 * \frac{\mbox{erf}(z)}{z}
4687 * 5. Multiply the entire expression by beta and switching back to \f$r\f$ (since \f$z=r \beta\f$):
4690 * \frac{\mbox{erf}(r \beta)}{r}
4693 * 6. Subtract the result from \f$1/r\f$, multiply by the product of the charges,
4694 * and you have your potential.
4696 * This approximation achieves an accuracy slightly lower than 1e-6; when
4697 * added to \f$1/r\f$ the error will be insignificant.
4699 static inline SimdDouble gmx_simdcall
pmePotentialCorrectionSingleAccuracy(SimdDouble z2
)
4701 const SimdDouble
VN6(1.9296833005951166339e-8);
4702 const SimdDouble
VN5(-1.4213390571557850962e-6);
4703 const SimdDouble
VN4(0.000041603292906656984871);
4704 const SimdDouble
VN3(-0.00013134036773265025626);
4705 const SimdDouble
VN2(0.038657983986041781264);
4706 const SimdDouble
VN1(0.11285044772717598220);
4707 const SimdDouble
VN0(1.1283802385263030286);
4709 const SimdDouble
VD3(0.0066752224023576045451);
4710 const SimdDouble
VD2(0.078647795836373922256);
4711 const SimdDouble
VD1(0.43336185284710920150);
4712 const SimdDouble
VD0(1.0);
4715 SimdDouble polyVN0
, polyVN1
, polyVD0
, polyVD1
;
4719 polyVD1
= fma(VD3
, z4
, VD1
);
4720 polyVD0
= fma(VD2
, z4
, VD0
);
4721 polyVD0
= fma(polyVD1
, z2
, polyVD0
);
4723 polyVD0
= invSingleAccuracy(polyVD0
);
4725 polyVN0
= fma(VN6
, z4
, VN4
);
4726 polyVN1
= fma(VN5
, z4
, VN3
);
4727 polyVN0
= fma(polyVN0
, z4
, VN2
);
4728 polyVN1
= fma(polyVN1
, z4
, VN1
);
4729 polyVN0
= fma(polyVN0
, z4
, VN0
);
4730 polyVN0
= fma(polyVN1
, z2
, polyVN0
);
4732 return polyVN0
* polyVD0
;
4738 /*! \name SIMD4 math functions
4740 * \note Only a subset of the math functions are implemented for SIMD4.
4745 # if GMX_SIMD4_HAVE_FLOAT
4747 /*************************************************************************
4748 * SINGLE PRECISION SIMD4 MATH FUNCTIONS - JUST A SMALL SUBSET SUPPORTED *
4749 *************************************************************************/
4751 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD4 float.
4753 * This is a low-level routine that should only be used by SIMD math routine
4754 * that evaluates the inverse square root.
4756 * \param lu Approximation of 1/sqrt(x), typically obtained from lookup.
4757 * \param x The reference (starting) value x for which we want 1/sqrt(x).
4758 * \return An improved approximation with roughly twice as many bits of accuracy.
4760 static inline Simd4Float gmx_simdcall
rsqrtIter(Simd4Float lu
, Simd4Float x
)
4762 Simd4Float tmp1
= x
* lu
;
4763 Simd4Float tmp2
= Simd4Float(-0.5F
) * lu
;
4764 tmp1
= fma(tmp1
, lu
, Simd4Float(-3.0F
));
4768 /*! \brief Calculate 1/sqrt(x) for SIMD4 float.
4770 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
4771 * GMX_FLOAT_MAX, i.e. within the range of single precision.
4772 * For the single precision implementation this is obviously always
4773 * true for positive values, but for double precision it adds an
4774 * extra restriction since the first lookup step might have to be
4775 * performed in single precision on some architectures. Note that the
4776 * responsibility for checking falls on you - this routine does not
4778 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
4780 static inline Simd4Float gmx_simdcall
invsqrt(Simd4Float x
)
4782 Simd4Float lu
= rsqrt(x
);
4783 # if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
4784 lu
= rsqrtIter(lu
, x
);
4786 # if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
4787 lu
= rsqrtIter(lu
, x
);
4789 # if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
4790 lu
= rsqrtIter(lu
, x
);
4796 # endif // GMX_SIMD4_HAVE_FLOAT
4799 # if GMX_SIMD4_HAVE_DOUBLE
4800 /*************************************************************************
4801 * DOUBLE PRECISION SIMD4 MATH FUNCTIONS - JUST A SMALL SUBSET SUPPORTED *
4802 *************************************************************************/
4804 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD4 double.
4806 * This is a low-level routine that should only be used by SIMD math routine
4807 * that evaluates the inverse square root.
4809 * \param lu Approximation of 1/sqrt(x), typically obtained from lookup.
4810 * \param x The reference (starting) value x for which we want 1/sqrt(x).
4811 * \return An improved approximation with roughly twice as many bits of accuracy.
4813 static inline Simd4Double gmx_simdcall
rsqrtIter(Simd4Double lu
, Simd4Double x
)
4815 Simd4Double tmp1
= x
* lu
;
4816 Simd4Double tmp2
= Simd4Double(-0.5F
) * lu
;
4817 tmp1
= fma(tmp1
, lu
, Simd4Double(-3.0F
));
4821 /*! \brief Calculate 1/sqrt(x) for SIMD4 double.
4823 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
4824 * GMX_FLOAT_MAX, i.e. within the range of single precision.
4825 * For the single precision implementation this is obviously always
4826 * true for positive values, but for double precision it adds an
4827 * extra restriction since the first lookup step might have to be
4828 * performed in single precision on some architectures. Note that the
4829 * responsibility for checking falls on you - this routine does not
4831 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
4833 static inline Simd4Double gmx_simdcall
invsqrt(Simd4Double x
)
4835 Simd4Double lu
= rsqrt(x
);
4836 # if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
4837 lu
= rsqrtIter(lu
, x
);
4839 # if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
4840 lu
= rsqrtIter(lu
, x
);
4842 # if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
4843 lu
= rsqrtIter(lu
, x
);
4845 # if (GMX_SIMD_RSQRT_BITS * 8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
4846 lu
= rsqrtIter(lu
, x
);
4852 /**********************************************************************
4853 * SIMD4 MATH FUNCTIONS WITH DOUBLE PREC. DATA, SINGLE PREC. ACCURACY *
4854 **********************************************************************/
4856 /*! \brief Calculate 1/sqrt(x) for SIMD4 double, but in single accuracy.
4858 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
4859 * GMX_FLOAT_MAX, i.e. within the range of single precision.
4860 * For the single precision implementation this is obviously always
4861 * true for positive values, but for double precision it adds an
4862 * extra restriction since the first lookup step might have to be
4863 * performed in single precision on some architectures. Note that the
4864 * responsibility for checking falls on you - this routine does not
4866 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
4868 static inline Simd4Double gmx_simdcall
invsqrtSingleAccuracy(Simd4Double x
)
4870 Simd4Double lu
= rsqrt(x
);
4871 # if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
4872 lu
= rsqrtIter(lu
, x
);
4874 # if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
4875 lu
= rsqrtIter(lu
, x
);
4877 # if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
4878 lu
= rsqrtIter(lu
, x
);
4884 # endif // GMX_SIMD4_HAVE_DOUBLE
4888 # if GMX_SIMD_HAVE_FLOAT
4889 /*! \brief Calculate 1/sqrt(x) for SIMD float, only targeting single accuracy.
4891 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
4892 * GMX_FLOAT_MAX, i.e. within the range of single precision.
4893 * For the single precision implementation this is obviously always
4894 * true for positive values, but for double precision it adds an
4895 * extra restriction since the first lookup step might have to be
4896 * performed in single precision on some architectures. Note that the
4897 * responsibility for checking falls on you - this routine does not
4899 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
4901 static inline SimdFloat gmx_simdcall
invsqrtSingleAccuracy(SimdFloat x
)
4906 /*! \brief Calculate 1/sqrt(x) for masked SIMD floats, only targeting single accuracy.
4908 * This routine only evaluates 1/sqrt(x) for elements for which mask is true.
4909 * Illegal values in the masked-out elements will not lead to
4910 * floating-point exceptions.
4912 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
4913 * GMX_FLOAT_MAX, i.e. within the range of single precision.
4914 * For the single precision implementation this is obviously always
4915 * true for positive values, but for double precision it adds an
4916 * extra restriction since the first lookup step might have to be
4917 * performed in single precision on some architectures. Note that the
4918 * responsibility for checking falls on you - this routine does not
4921 * \return 1/sqrt(x). Result is undefined if your argument was invalid or
4922 * entry was not masked, and 0.0 for masked-out entries.
4924 static inline SimdFloat
maskzInvsqrtSingleAccuracy(SimdFloat x
, SimdFBool m
)
4926 return maskzInvsqrt(x
, m
);
4929 /*! \brief Calculate 1/sqrt(x) for two SIMD floats, only targeting single accuracy.
4931 * \param x0 First set of arguments, x0 must be in single range (see below).
4932 * \param x1 Second set of arguments, x1 must be in single range (see below).
4933 * \param[out] out0 Result 1/sqrt(x0)
4934 * \param[out] out1 Result 1/sqrt(x1)
4936 * In particular for double precision we can sometimes calculate square root
4937 * pairs slightly faster by using single precision until the very last step.
4939 * \note Both arguments must be larger than GMX_FLOAT_MIN and smaller than
4940 * GMX_FLOAT_MAX, i.e. within the range of single precision.
4941 * For the single precision implementation this is obviously always
4942 * true for positive values, but for double precision it adds an
4943 * extra restriction since the first lookup step might have to be
4944 * performed in single precision on some architectures. Note that the
4945 * responsibility for checking falls on you - this routine does not
4948 static inline void gmx_simdcall
invsqrtPairSingleAccuracy(SimdFloat x0
, SimdFloat x1
, SimdFloat
* out0
, SimdFloat
* out1
)
4950 return invsqrtPair(x0
, x1
, out0
, out1
);
4953 /*! \brief Calculate 1/x for SIMD float, only targeting single accuracy.
4955 * \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
4956 * GMX_FLOAT_MAX, i.e. within the range of single precision.
4957 * For the single precision implementation this is obviously always
4958 * true for positive values, but for double precision it adds an
4959 * extra restriction since the first lookup step might have to be
4960 * performed in single precision on some architectures. Note that the
4961 * responsibility for checking falls on you - this routine does not
4963 * \return 1/x. Result is undefined if your argument was invalid.
4965 static inline SimdFloat gmx_simdcall
invSingleAccuracy(SimdFloat x
)
4971 /*! \brief Calculate 1/x for masked SIMD floats, only targeting single accuracy.
4973 * \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
4974 * GMX_FLOAT_MAX, i.e. within the range of single precision.
4975 * For the single precision implementation this is obviously always
4976 * true for positive values, but for double precision it adds an
4977 * extra restriction since the first lookup step might have to be
4978 * performed in single precision on some architectures. Note that the
4979 * responsibility for checking falls on you - this routine does not
4982 * \return 1/x for elements where m is true, or 0.0 for masked-out entries.
4984 static inline SimdFloat
maskzInvSingleAccuracy(SimdFloat x
, SimdFBool m
)
4986 return maskzInv(x
, m
);
4989 /*! \brief Calculate sqrt(x) for SIMD float, always targeting single accuracy.
4991 * \copydetails sqrt(SimdFloat)
4993 template<MathOptimization opt
= MathOptimization::Safe
>
4994 static inline SimdFloat gmx_simdcall
sqrtSingleAccuracy(SimdFloat x
)
4996 return sqrt
<opt
>(x
);
4999 /*! \brief Calculate cbrt(x) for SIMD float, always targeting single accuracy.
5001 * \copydetails cbrt(SimdFloat)
5003 static inline SimdFloat gmx_simdcall
cbrtSingleAccuracy(SimdFloat x
)
5008 /*! \brief Calculate 1/cbrt(x) for SIMD float, always targeting single accuracy.
5010 * \copydetails cbrt(SimdFloat)
5012 static inline SimdFloat gmx_simdcall
invcbrtSingleAccuracy(SimdFloat x
)
5017 /*! \brief SIMD float log2(x), only targeting single accuracy. This is the base-2 logarithm.
5019 * \param x Argument, should be >0.
5020 * \result The base-2 logarithm of x. Undefined if argument is invalid.
5022 static inline SimdFloat gmx_simdcall
log2SingleAccuracy(SimdFloat x
)
5027 /*! \brief SIMD float log(x), only targeting single accuracy. This is the natural logarithm.
5029 * \param x Argument, should be >0.
5030 * \result The natural logarithm of x. Undefined if argument is invalid.
5032 static inline SimdFloat gmx_simdcall
logSingleAccuracy(SimdFloat x
)
5037 /*! \brief SIMD float 2^x, only targeting single accuracy.
5039 * \copydetails exp2(SimdFloat)
5041 template<MathOptimization opt
= MathOptimization::Safe
>
5042 static inline SimdFloat gmx_simdcall
exp2SingleAccuracy(SimdFloat x
)
5044 return exp2
<opt
>(x
);
5047 /*! \brief SIMD float e^x, only targeting single accuracy.
5049 * \copydetails exp(SimdFloat)
5051 template<MathOptimization opt
= MathOptimization::Safe
>
5052 static inline SimdFloat gmx_simdcall
expSingleAccuracy(SimdFloat x
)
5057 /*! \brief SIMD pow(x,y), only targeting single accuracy.
5059 * \copydetails pow(SimdFloat,SimdFloat)
5061 template<MathOptimization opt
= MathOptimization::Safe
>
5062 static inline SimdFloat gmx_simdcall
powSingleAccuracy(SimdFloat x
, SimdFloat y
)
5064 return pow
<opt
>(x
, y
);
5067 /*! \brief SIMD float erf(x), only targeting single accuracy.
5069 * \param x The value to calculate erf(x) for.
5072 * This routine achieves very close to single precision, but we do not care about
5073 * the last bit or the subnormal result range.
5075 static inline SimdFloat gmx_simdcall
erfSingleAccuracy(SimdFloat x
)
5080 /*! \brief SIMD float erfc(x), only targeting single accuracy.
5082 * \param x The value to calculate erfc(x) for.
5085 * This routine achieves singleprecision (bar the last bit) over most of the
5086 * input range, but for large arguments where the result is getting close
5087 * to the minimum representable numbers we accept slightly larger errors
5088 * (think results that are in the ballpark of 10^-30) since that is not
5091 static inline SimdFloat gmx_simdcall
erfcSingleAccuracy(SimdFloat x
)
5096 /*! \brief SIMD float sin \& cos, only targeting single accuracy.
5098 * \param x The argument to evaluate sin/cos for
5099 * \param[out] sinval Sin(x)
5100 * \param[out] cosval Cos(x)
5102 static inline void gmx_simdcall
sinCosSingleAccuracy(SimdFloat x
, SimdFloat
* sinval
, SimdFloat
* cosval
)
5104 sincos(x
, sinval
, cosval
);
5107 /*! \brief SIMD float sin(x), only targeting single accuracy.
5109 * \param x The argument to evaluate sin for
5112 * \attention Do NOT call both sin & cos if you need both results, since each of them
5113 * will then call \ref sincos and waste a factor 2 in performance.
5115 static inline SimdFloat gmx_simdcall
sinSingleAccuracy(SimdFloat x
)
5120 /*! \brief SIMD float cos(x), only targeting single accuracy.
5122 * \param x The argument to evaluate cos for
5125 * \attention Do NOT call both sin & cos if you need both results, since each of them
5126 * will then call \ref sincos and waste a factor 2 in performance.
5128 static inline SimdFloat gmx_simdcall
cosSingleAccuracy(SimdFloat x
)
5133 /*! \brief SIMD float tan(x), only targeting single accuracy.
5135 * \param x The argument to evaluate tan for
5138 static inline SimdFloat gmx_simdcall
tanSingleAccuracy(SimdFloat x
)
5143 /*! \brief SIMD float asin(x), only targeting single accuracy.
5145 * \param x The argument to evaluate asin for
5148 static inline SimdFloat gmx_simdcall
asinSingleAccuracy(SimdFloat x
)
5153 /*! \brief SIMD float acos(x), only targeting single accuracy.
5155 * \param x The argument to evaluate acos for
5158 static inline SimdFloat gmx_simdcall
acosSingleAccuracy(SimdFloat x
)
5163 /*! \brief SIMD float atan(x), only targeting single accuracy.
5165 * \param x The argument to evaluate atan for
5166 * \result Atan(x), same argument/value range as standard math library.
5168 static inline SimdFloat gmx_simdcall
atanSingleAccuracy(SimdFloat x
)
5173 /*! \brief SIMD float atan2(y,x), only targeting single accuracy.
5175 * \param y Y component of vector, any quartile
5176 * \param x X component of vector, any quartile
5177 * \result Atan(y,x), same argument/value range as standard math library.
5179 * \note This routine should provide correct results for all finite
5180 * non-zero or positive-zero arguments. However, negative zero arguments will
5181 * be treated as positive zero, which means the return value will deviate from
5182 * the standard math library atan2(y,x) for those cases. That should not be
5183 * of any concern in Gromacs, and in particular it will not affect calculations
5184 * of angles from vectors.
5186 static inline SimdFloat gmx_simdcall
atan2SingleAccuracy(SimdFloat y
, SimdFloat x
)
5191 /*! \brief SIMD Analytic PME force correction, only targeting single accuracy.
5193 * \param z2 \f$(r \beta)^2\f$ - see default single precision version for details.
5194 * \result Correction factor to coulomb force.
5196 static inline SimdFloat gmx_simdcall
pmeForceCorrectionSingleAccuracy(SimdFloat z2
)
5198 return pmeForceCorrection(z2
);
5201 /*! \brief SIMD Analytic PME potential correction, only targeting single accuracy.
5203 * \param z2 \f$(r \beta)^2\f$ - see default single precision version for details.
5204 * \result Correction factor to coulomb force.
5206 static inline SimdFloat gmx_simdcall
pmePotentialCorrectionSingleAccuracy(SimdFloat z2
)
5208 return pmePotentialCorrection(z2
);
5210 # endif // GMX_SIMD_HAVE_FLOAT
5212 # if GMX_SIMD4_HAVE_FLOAT
5213 /*! \brief Calculate 1/sqrt(x) for SIMD4 float, only targeting single accuracy.
5215 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
5216 * GMX_FLOAT_MAX, i.e. within the range of single precision.
5217 * For the single precision implementation this is obviously always
5218 * true for positive values, but for double precision it adds an
5219 * extra restriction since the first lookup step might have to be
5220 * performed in single precision on some architectures. Note that the
5221 * responsibility for checking falls on you - this routine does not
5223 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
5225 static inline Simd4Float gmx_simdcall
invsqrtSingleAccuracy(Simd4Float x
)
5229 # endif // GMX_SIMD4_HAVE_FLOAT
5231 /*! \} end of addtogroup module_simd */
5232 /*! \endcond end of condition libabl */
5238 #endif // GMX_SIMD_SIMD_MATH_H