2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
35 #ifndef GMX_SIMD_SIMD_MATH_H
36 #define GMX_SIMD_SIMD_MATH_H
38 /*! \libinternal \file
40 * \brief Math functions for SIMD datatypes.
42 * \attention This file is generic for all SIMD architectures, so you cannot
43 * assume that any of the optional SIMD features (as defined in simd.h) are
44 * present. In particular, this means you cannot assume support for integers,
45 * logical operations (neither on floating-point nor integer values), shifts,
46 * and the architecture might only have SIMD for either float or double.
47 * Second, to keep this file clean and general, any additions to this file
48 * must work for all possible SIMD architectures in both single and double
49 * precision (if they support it), and you cannot make any assumptions about
52 * \author Erik Lindahl <erik.lindahl@scilifelab.se>
55 * \ingroup module_simd
64 #include "gromacs/math/utilities.h"
65 #include "gromacs/simd/simd.h"
66 #include "gromacs/utility/basedefinitions.h"
67 #include "gromacs/utility/real.h"
75 /*! \addtogroup module_simd */
78 /*! \name Implementation accuracy settings
84 #if GMX_SIMD_HAVE_FLOAT
86 /*! \name Single precision SIMD math functions
88 * \note In most cases you should use the real-precision functions instead.
92 /****************************************
93 * SINGLE PRECISION SIMD MATH FUNCTIONS *
94 ****************************************/
96 #if !GMX_SIMD_HAVE_NATIVE_COPYSIGN_FLOAT
97 /*! \brief Composes floating point value with the magnitude of x and the sign of y.
99 * \param x Values to set sign for
100 * \param y Values used to set sign
101 * \return Magnitude of x, sign of y
103 static inline SimdFloat gmx_simdcall
104 copysign(SimdFloat x
, SimdFloat y
)
106 #if GMX_SIMD_HAVE_LOGICAL
107 return abs(x
) | ( SimdFloat(GMX_FLOAT_NEGZERO
) & y
);
109 return blend(abs(x
), -abs(x
), y
< setZero());
114 #if !GMX_SIMD_HAVE_NATIVE_RSQRT_ITER_FLOAT
115 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD float.
117 * This is a low-level routine that should only be used by SIMD math routine
118 * that evaluates the inverse square root.
120 * \param lu Approximation of 1/sqrt(x), typically obtained from lookup.
121 * \param x The reference (starting) value x for which we want 1/sqrt(x).
122 * \return An improved approximation with roughly twice as many bits of accuracy.
124 static inline SimdFloat gmx_simdcall
125 rsqrtIter(SimdFloat lu
, SimdFloat x
)
127 SimdFloat tmp1
= x
*lu
;
128 SimdFloat tmp2
= SimdFloat(-0.5f
)*lu
;
129 tmp1
= fma(tmp1
, lu
, SimdFloat(-3.0f
));
134 /*! \brief Calculate 1/sqrt(x) for SIMD float.
136 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
137 * GMX_FLOAT_MAX, i.e. within the range of single precision.
138 * For the single precision implementation this is obviously always
139 * true for positive values, but for double precision it adds an
140 * extra restriction since the first lookup step might have to be
141 * performed in single precision on some architectures. Note that the
142 * responsibility for checking falls on you - this routine does not
145 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
147 static inline SimdFloat gmx_simdcall
150 SimdFloat lu
= rsqrt(x
);
151 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
152 lu
= rsqrtIter(lu
, x
);
154 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
155 lu
= rsqrtIter(lu
, x
);
157 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
158 lu
= rsqrtIter(lu
, x
);
163 /*! \brief Calculate 1/sqrt(x) for two SIMD floats.
165 * \param x0 First set of arguments, x0 must be in single range (see below).
166 * \param x1 Second set of arguments, x1 must be in single range (see below).
167 * \param[out] out0 Result 1/sqrt(x0)
168 * \param[out] out1 Result 1/sqrt(x1)
170 * In particular for double precision we can sometimes calculate square root
171 * pairs slightly faster by using single precision until the very last step.
173 * \note Both arguments must be larger than GMX_FLOAT_MIN and smaller than
174 * GMX_FLOAT_MAX, i.e. within the range of single precision.
175 * For the single precision implementation this is obviously always
176 * true for positive values, but for double precision it adds an
177 * extra restriction since the first lookup step might have to be
178 * performed in single precision on some architectures. Note that the
179 * responsibility for checking falls on you - this routine does not
182 static inline void gmx_simdcall
183 invsqrtPair(SimdFloat x0
, SimdFloat x1
,
184 SimdFloat
*out0
, SimdFloat
*out1
)
190 #if !GMX_SIMD_HAVE_NATIVE_RCP_ITER_FLOAT
191 /*! \brief Perform one Newton-Raphson iteration to improve 1/x for SIMD float.
193 * This is a low-level routine that should only be used by SIMD math routine
194 * that evaluates the reciprocal.
196 * \param lu Approximation of 1/x, typically obtained from lookup.
197 * \param x The reference (starting) value x for which we want 1/x.
198 * \return An improved approximation with roughly twice as many bits of accuracy.
200 static inline SimdFloat gmx_simdcall
201 rcpIter(SimdFloat lu
, SimdFloat x
)
203 return lu
*fnma(lu
, x
, SimdFloat(2.0f
));
207 /*! \brief Calculate 1/x for SIMD float.
209 * \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
210 * GMX_FLOAT_MAX, i.e. within the range of single precision.
211 * For the single precision implementation this is obviously always
212 * true for positive values, but for double precision it adds an
213 * extra restriction since the first lookup step might have to be
214 * performed in single precision on some architectures. Note that the
215 * responsibility for checking falls on you - this routine does not
218 * \return 1/x. Result is undefined if your argument was invalid.
220 static inline SimdFloat gmx_simdcall
223 SimdFloat lu
= rcp(x
);
224 #if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
227 #if (GMX_SIMD_RCP_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
230 #if (GMX_SIMD_RCP_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
236 /*! \brief Division for SIMD floats
238 * \param nom Nominator
239 * \param denom Denominator, with magnitude in range (GMX_FLOAT_MIN,GMX_FLOAT_MAX).
240 * For single precision this is equivalent to a nonzero argument,
241 * but in double precision it adds an extra restriction since
242 * the first lookup step might have to be performed in single
243 * precision on some architectures. Note that the responsibility
244 * for checking falls on you - this routine does not check arguments.
248 * \note This function does not use any masking to avoid problems with
249 * zero values in the denominator.
251 static inline SimdFloat gmx_simdcall
252 operator/(SimdFloat nom
, SimdFloat denom
)
254 return nom
*inv(denom
);
257 /*! \brief Calculate 1/sqrt(x) for masked entries of SIMD float.
259 * This routine only evaluates 1/sqrt(x) for elements for which mask is true.
260 * Illegal values in the masked-out elements will not lead to
261 * floating-point exceptions.
263 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
264 * GMX_FLOAT_MAX for masked-in entries.
265 * See \ref invsqrt for the discussion about argument restrictions.
267 * \return 1/sqrt(x). Result is undefined if your argument was invalid or
268 * entry was not masked, and 0.0 for masked-out entries.
270 static inline SimdFloat
271 maskzInvsqrt(SimdFloat x
, SimdFBool m
)
273 SimdFloat lu
= maskzRsqrt(x
, m
);
274 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
275 lu
= rsqrtIter(lu
, x
);
277 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
278 lu
= rsqrtIter(lu
, x
);
280 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
281 lu
= rsqrtIter(lu
, x
);
286 /*! \brief Calculate 1/x for SIMD float, masked version.
288 * \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
289 * GMX_FLOAT_MAX for masked-in entries.
290 * See \ref invsqrt for the discussion about argument restrictions.
292 * \return 1/x for elements where m is true, or 0.0 for masked-out entries.
294 static inline SimdFloat gmx_simdcall
295 maskzInv(SimdFloat x
, SimdFBool m
)
297 SimdFloat lu
= maskzRcp(x
, m
);
298 #if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
301 #if (GMX_SIMD_RCP_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
304 #if (GMX_SIMD_RCP_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
310 /*! \brief Calculate sqrt(x) for SIMD floats
312 * \tparam opt By default, this function checks if the input value is 0.0
313 * and masks this to return the correct result. If you are certain
314 * your argument will never be zero, and you know you need to save
315 * every single cycle you can, you can alternatively call the
316 * function as sqrt<MathOptimization::Unsafe>(x).
318 * \param x Argument that must be in range 0 <=x <= GMX_FLOAT_MAX, since the
319 * lookup step often has to be implemented in single precision.
320 * Arguments smaller than GMX_FLOAT_MIN will always lead to a zero
321 * result, even in double precision. If you are using the unsafe
322 * math optimization parameter, the argument must be in the range
323 * GMX_FLOAT_MIN <= x <= GMX_FLOAT_MAX.
325 * \return sqrt(x). The result is undefined if the input value does not fall
326 * in the allowed range specified for the argument.
328 template <MathOptimization opt
= MathOptimization::Safe
>
329 static inline SimdFloat gmx_simdcall
332 if (opt
== MathOptimization::Safe
)
334 SimdFloat res
= maskzInvsqrt(x
, setZero() < x
);
339 return x
* invsqrt(x
);
343 #if !GMX_SIMD_HAVE_NATIVE_LOG_FLOAT
344 /*! \brief SIMD float log(x). This is the natural logarithm.
346 * \param x Argument, should be >0.
347 * \result The natural logarithm of x. Undefined if argument is invalid.
349 static inline SimdFloat gmx_simdcall
352 const SimdFloat
one(1.0f
);
353 const SimdFloat
two(2.0f
);
354 const SimdFloat
invsqrt2(1.0f
/std::sqrt(2.0f
));
355 const SimdFloat
corr(0.693147180559945286226764f
);
356 const SimdFloat
CL9(0.2371599674224853515625f
);
357 const SimdFloat
CL7(0.285279005765914916992188f
);
358 const SimdFloat
CL5(0.400005519390106201171875f
);
359 const SimdFloat
CL3(0.666666567325592041015625f
);
360 const SimdFloat
CL1(2.0f
);
361 SimdFloat fExp
, x2
, p
;
369 // Adjust to non-IEEE format for x<1/sqrt(2): exponent -= 1, mantissa *= 2.0
370 fExp
= fExp
- selectByMask(one
, m
);
371 x
= x
* blend(one
, two
, m
);
373 x
= (x
-one
) * inv( x
+one
);
376 p
= fma(CL9
, x2
, CL7
);
380 p
= fma(p
, x
, corr
*fExp
);
386 #if !GMX_SIMD_HAVE_NATIVE_EXP2_FLOAT
387 /*! \brief SIMD float 2^x
389 * \tparam opt If this is changed from the default (safe) into the unsafe
390 * option, input values that would otherwise lead to zero-clamped
391 * results are not allowed and will lead to undefined results.
393 * \param x Argument. For the default (safe) function version this can be
394 * arbitrarily small value, but the routine might clamp the result to
395 * zero for arguments that would produce subnormal IEEE754-2008 results.
396 * This corresponds to inputs below -126 in single or -1022 in double,
397 * and it might overflow for arguments reaching 127 (single) or
398 * 1023 (double). If you enable the unsafe math optimization,
399 * very small arguments will not necessarily be zero-clamped, but
400 * can produce undefined results.
402 * \result 2^x. The result is undefined for very large arguments that cause
403 * internal floating-point overflow. If unsafe optimizations are enabled,
404 * this is also true for very small values.
406 * \note The definition range of this function is just-so-slightly smaller
407 * than the allowed IEEE exponents for many architectures. This is due
408 * to the implementation, which will hopefully improve in the future.
410 * \warning You cannot rely on this implementation returning inf for arguments
411 * that cause overflow. If you have some very large
412 * values and need to rely on getting a valid numerical output,
413 * take the minimum of your variable and the largest valid argument
414 * before calling this routine.
416 template <MathOptimization opt
= MathOptimization::Safe
>
417 static inline SimdFloat gmx_simdcall
420 const SimdFloat
CC6(0.0001534581200287996416911311f
);
421 const SimdFloat
CC5(0.001339993121934088894618990f
);
422 const SimdFloat
CC4(0.009618488957115180159497841f
);
423 const SimdFloat
CC3(0.05550328776964726865751735f
);
424 const SimdFloat
CC2(0.2402264689063408646490722f
);
425 const SimdFloat
CC1(0.6931472057372680777553816f
);
426 const SimdFloat
one(1.0f
);
432 // Large negative values are valid arguments to exp2(), so there are two
433 // things we need to account for:
434 // 1. When the exponents reaches -127, the (biased) exponent field will be
435 // zero and we can no longer multiply with it. There are special IEEE
436 // formats to handle this range, but for now we have to accept that
437 // we cannot handle those arguments. If input value becomes even more
438 // negative, it will start to loop and we would end up with invalid
439 // exponents. Thus, we need to limit or mask this.
440 // 2. For VERY large negative values, we will have problems that the
441 // subtraction to get the fractional part loses accuracy, and then we
442 // can end up with overflows in the polynomial.
444 // For now, we handle this by forwarding the math optimization setting to
445 // ldexp, where the routine will return zero for very small arguments.
447 // However, before doing that we need to make sure we do not call cvtR2I
448 // with an argument that is so negative it cannot be converted to an integer.
449 if (opt
== MathOptimization::Safe
)
451 x
= max(x
, SimdFloat(std::numeric_limits
<std::int32_t>::lowest()));
454 fexppart
= ldexp
<opt
>(one
, cvtR2I(x
));
458 p
= fma(CC6
, x
, CC5
);
469 #if !GMX_SIMD_HAVE_NATIVE_EXP_FLOAT
470 /*! \brief SIMD float exp(x).
472 * In addition to scaling the argument for 2^x this routine correctly does
473 * extended precision arithmetics to improve accuracy.
475 * \tparam opt If this is changed from the default (safe) into the unsafe
476 * option, input values that would otherwise lead to zero-clamped
477 * results are not allowed and will lead to undefined results.
479 * \param x Argument. For the default (safe) function version this can be
480 * arbitrarily small value, but the routine might clamp the result to
481 * zero for arguments that would produce subnormal IEEE754-2008 results.
482 * This corresponds to input arguments reaching
483 * -126*ln(2)=-87.3 in single, or -1022*ln(2)=-708.4 (double).
484 * Similarly, it might overflow for arguments reaching
485 * 127*ln(2)=88.0 (single) or 1023*ln(2)=709.1 (double). If the
486 * unsafe math optimizations are enabled, small input values that would
487 * result in zero-clamped output are not allowed.
489 * \result exp(x). Overflowing arguments are likely to either return 0 or inf,
490 * depending on the underlying implementation. If unsafe optimizations
491 * are enabled, this is also true for very small values.
493 * \note The definition range of this function is just-so-slightly smaller
494 * than the allowed IEEE exponents for many architectures. This is due
495 * to the implementation, which will hopefully improve in the future.
497 * \warning You cannot rely on this implementation returning inf for arguments
498 * that cause overflow. If you have some very large
499 * values and need to rely on getting a valid numerical output,
500 * take the minimum of your variable and the largest valid argument
501 * before calling this routine.
503 template <MathOptimization opt
= MathOptimization::Safe
>
504 static inline SimdFloat gmx_simdcall
507 const SimdFloat
argscale(1.44269504088896341f
);
508 const SimdFloat
invargscale0(-0.693145751953125f
);
509 const SimdFloat
invargscale1(-1.428606765330187045e-06f
);
510 const SimdFloat
CC4(0.00136324646882712841033936f
);
511 const SimdFloat
CC3(0.00836596917361021041870117f
);
512 const SimdFloat
CC2(0.0416710823774337768554688f
);
513 const SimdFloat
CC1(0.166665524244308471679688f
);
514 const SimdFloat
CC0(0.499999850988388061523438f
);
515 const SimdFloat
one(1.0f
);
520 // Large negative values are valid arguments to exp2(), so there are two
521 // things we need to account for:
522 // 1. When the exponents reaches -127, the (biased) exponent field will be
523 // zero and we can no longer multiply with it. There are special IEEE
524 // formats to handle this range, but for now we have to accept that
525 // we cannot handle those arguments. If input value becomes even more
526 // negative, it will start to loop and we would end up with invalid
527 // exponents. Thus, we need to limit or mask this.
528 // 2. For VERY large negative values, we will have problems that the
529 // subtraction to get the fractional part loses accuracy, and then we
530 // can end up with overflows in the polynomial.
532 // For now, we handle this by forwarding the math optimization setting to
533 // ldexp, where the routine will return zero for very small arguments.
535 // However, before doing that we need to make sure we do not call cvtR2I
536 // with an argument that is so negative it cannot be converted to an integer
537 // after being multiplied by argscale.
539 if (opt
== MathOptimization::Safe
)
541 x
= max(x
, SimdFloat(std::numeric_limits
<std::int32_t>::lowest())/argscale
);
547 fexppart
= ldexp
<opt
>(one
, cvtR2I(y
));
550 // Extended precision arithmetics
551 x
= fma(invargscale0
, intpart
, x
);
552 x
= fma(invargscale1
, intpart
, x
);
554 p
= fma(CC4
, x
, CC3
);
559 x
= fma(p
, fexppart
, fexppart
);
564 /*! \brief SIMD float erf(x).
566 * \param x The value to calculate erf(x) for.
569 * This routine achieves very close to full precision, but we do not care about
570 * the last bit or the subnormal result range.
572 static inline SimdFloat gmx_simdcall
575 // Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1]
576 const SimdFloat
CA6(7.853861353153693e-5f
);
577 const SimdFloat
CA5(-8.010193625184903e-4f
);
578 const SimdFloat
CA4(5.188327685732524e-3f
);
579 const SimdFloat
CA3(-2.685381193529856e-2f
);
580 const SimdFloat
CA2(1.128358514861418e-1f
);
581 const SimdFloat
CA1(-3.761262582423300e-1f
);
582 const SimdFloat
CA0(1.128379165726710f
);
583 // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2]
584 const SimdFloat
CB9(-0.0018629930017603923f
);
585 const SimdFloat
CB8(0.003909821287598495f
);
586 const SimdFloat
CB7(-0.0052094582210355615f
);
587 const SimdFloat
CB6(0.005685614362160572f
);
588 const SimdFloat
CB5(-0.0025367682853477272f
);
589 const SimdFloat
CB4(-0.010199799682318782f
);
590 const SimdFloat
CB3(0.04369575504816542f
);
591 const SimdFloat
CB2(-0.11884063474674492f
);
592 const SimdFloat
CB1(0.2732120154030589f
);
593 const SimdFloat
CB0(0.42758357702025784f
);
594 // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19]
595 const SimdFloat
CC10(-0.0445555913112064f
);
596 const SimdFloat
CC9(0.21376355144663348f
);
597 const SimdFloat
CC8(-0.3473187200259257f
);
598 const SimdFloat
CC7(0.016690861551248114f
);
599 const SimdFloat
CC6(0.7560973182491192f
);
600 const SimdFloat
CC5(-1.2137903600145787f
);
601 const SimdFloat
CC4(0.8411872321232948f
);
602 const SimdFloat
CC3(-0.08670413896296343f
);
603 const SimdFloat
CC2(-0.27124782687240334f
);
604 const SimdFloat
CC1(-0.0007502488047806069f
);
605 const SimdFloat
CC0(0.5642114853803148f
);
606 const SimdFloat
one(1.0f
);
607 const SimdFloat
two(2.0f
);
610 SimdFloat t
, t2
, w
, w2
;
611 SimdFloat pA0
, pA1
, pB0
, pB1
, pC0
, pC1
;
613 SimdFloat res_erf
, res_erfc
, res
;
614 SimdFBool m
, maskErf
;
620 pA0
= fma(CA6
, x4
, CA4
);
621 pA1
= fma(CA5
, x4
, CA3
);
622 pA0
= fma(pA0
, x4
, CA2
);
623 pA1
= fma(pA1
, x4
, CA1
);
625 pA0
= fma(pA1
, x2
, pA0
);
626 // Constant term must come last for precision reasons
633 maskErf
= SimdFloat(0.75f
) <= y
;
634 t
= maskzInv(y
, maskErf
);
639 // No need for a floating-point sieve here (as in erfc), since erf()
640 // will never return values that are extremely small for large args.
641 expmx2
= exp( -y
*y
);
643 pB1
= fma(CB9
, w2
, CB7
);
644 pB0
= fma(CB8
, w2
, CB6
);
645 pB1
= fma(pB1
, w2
, CB5
);
646 pB0
= fma(pB0
, w2
, CB4
);
647 pB1
= fma(pB1
, w2
, CB3
);
648 pB0
= fma(pB0
, w2
, CB2
);
649 pB1
= fma(pB1
, w2
, CB1
);
650 pB0
= fma(pB0
, w2
, CB0
);
651 pB0
= fma(pB1
, w
, pB0
);
653 pC0
= fma(CC10
, t2
, CC8
);
654 pC1
= fma(CC9
, t2
, CC7
);
655 pC0
= fma(pC0
, t2
, CC6
);
656 pC1
= fma(pC1
, t2
, CC5
);
657 pC0
= fma(pC0
, t2
, CC4
);
658 pC1
= fma(pC1
, t2
, CC3
);
659 pC0
= fma(pC0
, t2
, CC2
);
660 pC1
= fma(pC1
, t2
, CC1
);
662 pC0
= fma(pC0
, t2
, CC0
);
663 pC0
= fma(pC1
, t
, pC0
);
666 // Select pB0 or pC0 for erfc()
668 res_erfc
= blend(pB0
, pC0
, m
);
669 res_erfc
= res_erfc
* expmx2
;
671 // erfc(x<0) = 2-erfc(|x|)
673 res_erfc
= blend(res_erfc
, two
-res_erfc
, m
);
675 // Select erf() or erfc()
676 res
= blend(res_erf
, one
-res_erfc
, maskErf
);
681 /*! \brief SIMD float erfc(x).
683 * \param x The value to calculate erfc(x) for.
686 * This routine achieves full precision (bar the last bit) over most of the
687 * input range, but for large arguments where the result is getting close
688 * to the minimum representable numbers we accept slightly larger errors
689 * (think results that are in the ballpark of 10^-30 for single precision)
690 * since that is not relevant for MD.
692 static inline SimdFloat gmx_simdcall
695 // Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1]
696 const SimdFloat
CA6(7.853861353153693e-5f
);
697 const SimdFloat
CA5(-8.010193625184903e-4f
);
698 const SimdFloat
CA4(5.188327685732524e-3f
);
699 const SimdFloat
CA3(-2.685381193529856e-2f
);
700 const SimdFloat
CA2(1.128358514861418e-1f
);
701 const SimdFloat
CA1(-3.761262582423300e-1f
);
702 const SimdFloat
CA0(1.128379165726710f
);
703 // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2]
704 const SimdFloat
CB9(-0.0018629930017603923f
);
705 const SimdFloat
CB8(0.003909821287598495f
);
706 const SimdFloat
CB7(-0.0052094582210355615f
);
707 const SimdFloat
CB6(0.005685614362160572f
);
708 const SimdFloat
CB5(-0.0025367682853477272f
);
709 const SimdFloat
CB4(-0.010199799682318782f
);
710 const SimdFloat
CB3(0.04369575504816542f
);
711 const SimdFloat
CB2(-0.11884063474674492f
);
712 const SimdFloat
CB1(0.2732120154030589f
);
713 const SimdFloat
CB0(0.42758357702025784f
);
714 // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19]
715 const SimdFloat
CC10(-0.0445555913112064f
);
716 const SimdFloat
CC9(0.21376355144663348f
);
717 const SimdFloat
CC8(-0.3473187200259257f
);
718 const SimdFloat
CC7(0.016690861551248114f
);
719 const SimdFloat
CC6(0.7560973182491192f
);
720 const SimdFloat
CC5(-1.2137903600145787f
);
721 const SimdFloat
CC4(0.8411872321232948f
);
722 const SimdFloat
CC3(-0.08670413896296343f
);
723 const SimdFloat
CC2(-0.27124782687240334f
);
724 const SimdFloat
CC1(-0.0007502488047806069f
);
725 const SimdFloat
CC0(0.5642114853803148f
);
726 // Coefficients for expansion of exp(x) in [0,0.1]
727 // CD0 and CD1 are both 1.0, so no need to declare them separately
728 const SimdFloat
CD2(0.5000066608081202f
);
729 const SimdFloat
CD3(0.1664795422874624f
);
730 const SimdFloat
CD4(0.04379839977652482f
);
731 const SimdFloat
one(1.0f
);
732 const SimdFloat
two(2.0f
);
734 /* We need to use a small trick here, since we cannot assume all SIMD
735 * architectures support integers, and the flag we want (0xfffff000) would
736 * evaluate to NaN (i.e., it cannot be expressed as a floating-point num).
737 * Instead, we represent the flags 0xf0f0f000 and 0x0f0f0000 as valid
738 * fp numbers, and perform a logical or. Since the expression is constant,
739 * we can at least hope it is evaluated at compile-time.
741 #if GMX_SIMD_HAVE_LOGICAL
742 const SimdFloat
sieve(SimdFloat(-5.965323564e+29f
) | SimdFloat(7.05044434e-30f
));
744 const int isieve
= 0xFFFFF000;
745 alignas(GMX_SIMD_ALIGNMENT
) float mem
[GMX_SIMD_FLOAT_WIDTH
];
754 SimdFloat q
, z
, t
, t2
, w
, w2
;
755 SimdFloat pA0
, pA1
, pB0
, pB1
, pC0
, pC1
;
756 SimdFloat expmx2
, corr
;
757 SimdFloat res_erf
, res_erfc
, res
;
758 SimdFBool m
, msk_erf
;
764 pA0
= fma(CA6
, x4
, CA4
);
765 pA1
= fma(CA5
, x4
, CA3
);
766 pA0
= fma(pA0
, x4
, CA2
);
767 pA1
= fma(pA1
, x4
, CA1
);
769 pA0
= fma(pA0
, x4
, pA1
);
770 // Constant term must come last for precision reasons
777 msk_erf
= SimdFloat(0.75f
) <= y
;
778 t
= maskzInv(y
, msk_erf
);
783 * We cannot simply calculate exp(-y2) directly in single precision, since
784 * that will lose a couple of bits of precision due to the multiplication.
785 * Instead, we introduce y=z+w, where the last 12 bits of precision are in w.
786 * Then we get exp(-y2) = exp(-z2)*exp((z-y)*(z+y)).
788 * The only drawback with this is that it requires TWO separate exponential
789 * evaluations, which would be horrible performance-wise. However, the argument
790 * for the second exp() call is always small, so there we simply use a
791 * low-order minimax expansion on [0,0.1].
793 * However, this neat idea requires support for logical ops (and) on
794 * FP numbers, which some vendors decided isn't necessary in their SIMD
795 * instruction sets (Hi, IBM VSX!). In principle we could use some tricks
796 * in double, but we still need memory as a backup when that is not available,
797 * and this case is rare enough that we go directly there...
799 #if GMX_SIMD_HAVE_LOGICAL
803 for (i
= 0; i
< GMX_SIMD_FLOAT_WIDTH
; i
++)
806 conv
.i
= conv
.i
& isieve
;
809 z
= load
<SimdFloat
>(mem
);
812 corr
= fma(CD4
, q
, CD3
);
813 corr
= fma(corr
, q
, CD2
);
814 corr
= fma(corr
, q
, one
);
815 corr
= fma(corr
, q
, one
);
817 expmx2
= exp( -z
*z
);
818 expmx2
= expmx2
* corr
;
820 pB1
= fma(CB9
, w2
, CB7
);
821 pB0
= fma(CB8
, w2
, CB6
);
822 pB1
= fma(pB1
, w2
, CB5
);
823 pB0
= fma(pB0
, w2
, CB4
);
824 pB1
= fma(pB1
, w2
, CB3
);
825 pB0
= fma(pB0
, w2
, CB2
);
826 pB1
= fma(pB1
, w2
, CB1
);
827 pB0
= fma(pB0
, w2
, CB0
);
828 pB0
= fma(pB1
, w
, pB0
);
830 pC0
= fma(CC10
, t2
, CC8
);
831 pC1
= fma(CC9
, t2
, CC7
);
832 pC0
= fma(pC0
, t2
, CC6
);
833 pC1
= fma(pC1
, t2
, CC5
);
834 pC0
= fma(pC0
, t2
, CC4
);
835 pC1
= fma(pC1
, t2
, CC3
);
836 pC0
= fma(pC0
, t2
, CC2
);
837 pC1
= fma(pC1
, t2
, CC1
);
839 pC0
= fma(pC0
, t2
, CC0
);
840 pC0
= fma(pC1
, t
, pC0
);
843 // Select pB0 or pC0 for erfc()
845 res_erfc
= blend(pB0
, pC0
, m
);
846 res_erfc
= res_erfc
* expmx2
;
848 // erfc(x<0) = 2-erfc(|x|)
850 res_erfc
= blend(res_erfc
, two
-res_erfc
, m
);
852 // Select erf() or erfc()
853 res
= blend(one
-res_erf
, res_erfc
, msk_erf
);
858 /*! \brief SIMD float sin \& cos.
860 * \param x The argument to evaluate sin/cos for
861 * \param[out] sinval Sin(x)
862 * \param[out] cosval Cos(x)
864 * This version achieves close to machine precision, but for very large
865 * magnitudes of the argument we inherently begin to lose accuracy due to the
866 * argument reduction, despite using extended precision arithmetics internally.
868 static inline void gmx_simdcall
869 sincos(SimdFloat x
, SimdFloat
*sinval
, SimdFloat
*cosval
)
871 // Constants to subtract Pi/4*x from y while minimizing precision loss
872 const SimdFloat
argred0(-1.5703125);
873 const SimdFloat
argred1(-4.83751296997070312500e-04f
);
874 const SimdFloat
argred2(-7.54953362047672271729e-08f
);
875 const SimdFloat
argred3(-2.56334406825708960298e-12f
);
876 const SimdFloat
two_over_pi(static_cast<float>(2.0f
/M_PI
));
877 const SimdFloat
const_sin2(-1.9515295891e-4f
);
878 const SimdFloat
const_sin1( 8.3321608736e-3f
);
879 const SimdFloat
const_sin0(-1.6666654611e-1f
);
880 const SimdFloat
const_cos2( 2.443315711809948e-5f
);
881 const SimdFloat
const_cos1(-1.388731625493765e-3f
);
882 const SimdFloat
const_cos0( 4.166664568298827e-2f
);
883 const SimdFloat
half(0.5f
);
884 const SimdFloat
one(1.0f
);
885 SimdFloat ssign
, csign
;
886 SimdFloat x2
, y
, z
, psin
, pcos
, sss
, ccc
;
889 #if GMX_SIMD_HAVE_FINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
890 const SimdFInt32
ione(1);
891 const SimdFInt32
itwo(2);
898 m
= cvtIB2B((iy
& ione
) == SimdFInt32(0));
899 ssign
= selectByMask(SimdFloat(GMX_FLOAT_NEGZERO
), cvtIB2B((iy
& itwo
) == itwo
));
900 csign
= selectByMask(SimdFloat(GMX_FLOAT_NEGZERO
), cvtIB2B(((iy
+ione
) & itwo
) == itwo
));
902 const SimdFloat
quarter(0.25f
);
903 const SimdFloat
minusquarter(-0.25f
);
905 SimdFBool m1
, m2
, m3
;
907 /* The most obvious way to find the arguments quadrant in the unit circle
908 * to calculate the sign is to use integer arithmetic, but that is not
909 * present in all SIMD implementations. As an alternative, we have devised a
910 * pure floating-point algorithm that uses truncation for argument reduction
911 * so that we get a new value 0<=q<1 over the unit circle, and then
912 * do floating-point comparisons with fractions. This is likely to be
913 * slightly slower (~10%) due to the longer latencies of floating-point, so
914 * we only use it when integer SIMD arithmetic is not present.
918 // It is critical that half-way cases are rounded down
919 z
= fma(x
, two_over_pi
, half
);
923 /* z now starts at 0.0 for x=-pi/4 (although neg. values cannot occur), and
924 * then increased by 1.0 as x increases by 2*Pi, when it resets to 0.0.
925 * This removes the 2*Pi periodicity without using any integer arithmetic.
926 * First check if y had the value 2 or 3, set csign if true.
929 /* If we have logical operations we can work directly on the signbit, which
930 * saves instructions. Otherwise we need to represent signs as +1.0/-1.0.
931 * Thus, if you are altering defines to debug alternative code paths, the
932 * two GMX_SIMD_HAVE_LOGICAL sections in this routine must either both be
933 * active or inactive - you will get errors if only one is used.
935 # if GMX_SIMD_HAVE_LOGICAL
936 ssign
= ssign
& SimdFloat(GMX_FLOAT_NEGZERO
);
937 csign
= andNot(q
, SimdFloat(GMX_FLOAT_NEGZERO
));
938 ssign
= ssign
^ csign
;
940 ssign
= copysign(SimdFloat(1.0f
), ssign
);
941 csign
= copysign(SimdFloat(1.0f
), q
);
943 ssign
= ssign
* csign
; // swap ssign if csign was set.
945 // Check if y had value 1 or 3 (remember we subtracted 0.5 from q)
946 m1
= (q
< minusquarter
);
947 m2
= (setZero() <= q
);
951 // where mask is FALSE, swap sign.
952 csign
= csign
* blend(SimdFloat(-1.0f
), one
, m
);
954 x
= fma(y
, argred0
, x
);
955 x
= fma(y
, argred1
, x
);
956 x
= fma(y
, argred2
, x
);
957 x
= fma(y
, argred3
, x
);
960 psin
= fma(const_sin2
, x2
, const_sin1
);
961 psin
= fma(psin
, x2
, const_sin0
);
962 psin
= fma(psin
, x
* x2
, x
);
963 pcos
= fma(const_cos2
, x2
, const_cos1
);
964 pcos
= fma(pcos
, x2
, const_cos0
);
965 pcos
= fms(pcos
, x2
, half
);
966 pcos
= fma(pcos
, x2
, one
);
968 sss
= blend(pcos
, psin
, m
);
969 ccc
= blend(psin
, pcos
, m
);
970 // See comment for GMX_SIMD_HAVE_LOGICAL section above.
971 #if GMX_SIMD_HAVE_LOGICAL
972 *sinval
= sss
^ ssign
;
973 *cosval
= ccc
^ csign
;
975 *sinval
= sss
* ssign
;
976 *cosval
= ccc
* csign
;
980 /*! \brief SIMD float sin(x).
982 * \param x The argument to evaluate sin for
985 * \attention Do NOT call both sin & cos if you need both results, since each of them
986 * will then call \ref sincos and waste a factor 2 in performance.
988 static inline SimdFloat gmx_simdcall
996 /*! \brief SIMD float cos(x).
998 * \param x The argument to evaluate cos for
1001 * \attention Do NOT call both sin & cos if you need both results, since each of them
1002 * will then call \ref sincos and waste a factor 2 in performance.
1004 static inline SimdFloat gmx_simdcall
1012 /*! \brief SIMD float tan(x).
1014 * \param x The argument to evaluate tan for
1017 static inline SimdFloat gmx_simdcall
1020 const SimdFloat
argred0(-1.5703125);
1021 const SimdFloat
argred1(-4.83751296997070312500e-04f
);
1022 const SimdFloat
argred2(-7.54953362047672271729e-08f
);
1023 const SimdFloat
argred3(-2.56334406825708960298e-12f
);
1024 const SimdFloat
two_over_pi(static_cast<float>(2.0f
/M_PI
));
1025 const SimdFloat
CT6(0.009498288995810566122993911);
1026 const SimdFloat
CT5(0.002895755790837379295226923);
1027 const SimdFloat
CT4(0.02460087336161924491836265);
1028 const SimdFloat
CT3(0.05334912882656359828045988);
1029 const SimdFloat
CT2(0.1333989091464957704418495);
1030 const SimdFloat
CT1(0.3333307599244198227797507);
1032 SimdFloat x2
, p
, y
, z
;
1035 #if GMX_SIMD_HAVE_FINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
1039 z
= x
* two_over_pi
;
1042 m
= cvtIB2B((iy
& ione
) == ione
);
1044 x
= fma(y
, argred0
, x
);
1045 x
= fma(y
, argred1
, x
);
1046 x
= fma(y
, argred2
, x
);
1047 x
= fma(y
, argred3
, x
);
1048 x
= selectByMask(SimdFloat(GMX_FLOAT_NEGZERO
), m
) ^ x
;
1050 const SimdFloat
quarter(0.25f
);
1051 const SimdFloat
half(0.5f
);
1052 const SimdFloat
threequarter(0.75f
);
1054 SimdFBool m1
, m2
, m3
;
1057 z
= fma(w
, two_over_pi
, half
);
1063 m3
= threequarter
<= q
;
1066 w
= fma(y
, argred0
, w
);
1067 w
= fma(y
, argred1
, w
);
1068 w
= fma(y
, argred2
, w
);
1069 w
= fma(y
, argred3
, w
);
1070 w
= blend(w
, -w
, m
);
1071 x
= w
* copysign( SimdFloat(1.0), x
);
1074 p
= fma(CT6
, x2
, CT5
);
1075 p
= fma(p
, x2
, CT4
);
1076 p
= fma(p
, x2
, CT3
);
1077 p
= fma(p
, x2
, CT2
);
1078 p
= fma(p
, x2
, CT1
);
1079 p
= fma(x2
* p
, x
, x
);
1081 p
= blend( p
, maskzInv(p
, m
), m
);
1085 /*! \brief SIMD float asin(x).
1087 * \param x The argument to evaluate asin for
1090 static inline SimdFloat gmx_simdcall
1093 const SimdFloat
limitlow(1e-4f
);
1094 const SimdFloat
half(0.5f
);
1095 const SimdFloat
one(1.0f
);
1096 const SimdFloat
halfpi(static_cast<float>(M_PI
/2.0f
));
1097 const SimdFloat
CC5(4.2163199048E-2f
);
1098 const SimdFloat
CC4(2.4181311049E-2f
);
1099 const SimdFloat
CC3(4.5470025998E-2f
);
1100 const SimdFloat
CC2(7.4953002686E-2f
);
1101 const SimdFloat
CC1(1.6666752422E-1f
);
1103 SimdFloat z
, z1
, z2
, q
, q1
, q2
;
1109 z1
= half
* (one
-xabs
);
1111 q1
= z1
* maskzInvsqrt(z1
, m2
);
1114 z
= blend(z2
, z1
, m
);
1115 q
= blend(q2
, q1
, m
);
1118 pA
= fma(CC5
, z2
, CC3
);
1119 pB
= fma(CC4
, z2
, CC2
);
1120 pA
= fma(pA
, z2
, CC1
);
1122 z
= fma(pB
, z2
, pA
);
1126 z
= blend(z
, q2
, m
);
1128 m
= limitlow
< xabs
;
1129 z
= blend( xabs
, z
, m
);
1135 /*! \brief SIMD float acos(x).
1137 * \param x The argument to evaluate acos for
1140 static inline SimdFloat gmx_simdcall
1143 const SimdFloat
one(1.0f
);
1144 const SimdFloat
half(0.5f
);
1145 const SimdFloat
pi(static_cast<float>(M_PI
));
1146 const SimdFloat
halfpi(static_cast<float>(M_PI
/2.0f
));
1148 SimdFloat z
, z1
, z2
, z3
;
1149 SimdFBool m1
, m2
, m3
;
1155 z
= fnma(half
, xabs
, half
);
1157 z
= z
* maskzInvsqrt(z
, m3
);
1158 z
= blend(x
, z
, m1
);
1164 z
= blend(z1
, z2
, m2
);
1165 z
= blend(z3
, z
, m1
);
1170 /*! \brief SIMD float asin(x).
1172 * \param x The argument to evaluate atan for
1173 * \result Atan(x), same argument/value range as standard math library.
1175 static inline SimdFloat gmx_simdcall
1178 const SimdFloat
halfpi(static_cast<float>(M_PI
/2.0f
));
1179 const SimdFloat
CA17(0.002823638962581753730774f
);
1180 const SimdFloat
CA15(-0.01595690287649631500244f
);
1181 const SimdFloat
CA13(0.04250498861074447631836f
);
1182 const SimdFloat
CA11(-0.07489009201526641845703f
);
1183 const SimdFloat
CA9 (0.1063479334115982055664f
);
1184 const SimdFloat
CA7 (-0.1420273631811141967773f
);
1185 const SimdFloat
CA5 (0.1999269574880599975585f
);
1186 const SimdFloat
CA3 (-0.3333310186862945556640f
);
1187 const SimdFloat
one (1.0f
);
1188 SimdFloat x2
, x3
, x4
, pA
, pB
;
1194 x
= blend(x
, maskzInv(x
, m2
), m2
);
1199 pA
= fma(CA17
, x4
, CA13
);
1200 pB
= fma(CA15
, x4
, CA11
);
1201 pA
= fma(pA
, x4
, CA9
);
1202 pB
= fma(pB
, x4
, CA7
);
1203 pA
= fma(pA
, x4
, CA5
);
1204 pB
= fma(pB
, x4
, CA3
);
1205 pA
= fma(pA
, x2
, pB
);
1206 pA
= fma(pA
, x3
, x
);
1208 pA
= blend(pA
, halfpi
-pA
, m2
);
1209 pA
= blend(pA
, -pA
, m
);
1214 /*! \brief SIMD float atan2(y,x).
1216 * \param y Y component of vector, any quartile
1217 * \param x X component of vector, any quartile
1218 * \result Atan(y,x), same argument/value range as standard math library.
1220 * \note This routine should provide correct results for all finite
1221 * non-zero or positive-zero arguments. However, negative zero arguments will
1222 * be treated as positive zero, which means the return value will deviate from
1223 * the standard math library atan2(y,x) for those cases. That should not be
1224 * of any concern in Gromacs, and in particular it will not affect calculations
1225 * of angles from vectors.
1227 static inline SimdFloat gmx_simdcall
1228 atan2(SimdFloat y
, SimdFloat x
)
1230 const SimdFloat
pi(static_cast<float>(M_PI
));
1231 const SimdFloat
halfpi(static_cast<float>(M_PI
/2.0));
1232 SimdFloat xinv
, p
, aoffset
;
1233 SimdFBool mask_xnz
, mask_ynz
, mask_xlt0
, mask_ylt0
;
1235 mask_xnz
= x
!= setZero();
1236 mask_ynz
= y
!= setZero();
1237 mask_xlt0
= x
< setZero();
1238 mask_ylt0
= y
< setZero();
1240 aoffset
= selectByNotMask(halfpi
, mask_xnz
);
1241 aoffset
= selectByMask(aoffset
, mask_ynz
);
1243 aoffset
= blend(aoffset
, pi
, mask_xlt0
);
1244 aoffset
= blend(aoffset
, -aoffset
, mask_ylt0
);
1246 xinv
= maskzInv(x
, mask_xnz
);
1254 /*! \brief Calculate the force correction due to PME analytically in SIMD float.
1256 * \param z2 \f$(r \beta)^2\f$ - see below for details.
1257 * \result Correction factor to coulomb force - see below for details.
1259 * This routine is meant to enable analytical evaluation of the
1260 * direct-space PME electrostatic force to avoid tables.
1262 * The direct-space potential should be \f$ \mbox{erfc}(\beta r)/r\f$, but there
1263 * are some problems evaluating that:
1265 * First, the error function is difficult (read: expensive) to
1266 * approxmiate accurately for intermediate to large arguments, and
1267 * this happens already in ranges of \f$(\beta r)\f$ that occur in simulations.
1268 * Second, we now try to avoid calculating potentials in Gromacs but
1269 * use forces directly.
1271 * We can simply things slight by noting that the PME part is really
1272 * a correction to the normal Coulomb force since \f$\mbox{erfc}(z)=1-\mbox{erf}(z)\f$, i.e.
1274 * V = \frac{1}{r} - \frac{\mbox{erf}(\beta r)}{r}
1276 * The first term we already have from the inverse square root, so
1277 * that we can leave out of this routine.
1279 * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
1280 * the argument \f$beta r\f$ will be in the range 0.15 to ~4, which is
1281 * the range used for the minimax fit. Use your favorite plotting program
1282 * to realize how well-behaved \f$\frac{\mbox{erf}(z)}{z}\f$ is in this range!
1284 * We approximate \f$f(z)=\mbox{erf}(z)/z\f$ with a rational minimax polynomial.
1285 * However, it turns out it is more efficient to approximate \f$f(z)/z\f$ and
1286 * then only use even powers. This is another minor optimization, since
1287 * we actually \a want \f$f(z)/z\f$, because it is going to be multiplied by
1288 * the vector between the two atoms to get the vectorial force. The
1289 * fastest flops are the ones we can avoid calculating!
1291 * So, here's how it should be used:
1293 * 1. Calculate \f$r^2\f$.
1294 * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=(\beta r)^2\f$.
1295 * 3. Evaluate this routine with \f$z^2\f$ as the argument.
1296 * 4. The return value is the expression:
1299 * \frac{2 \exp{-z^2}}{\sqrt{\pi} z^2}-\frac{\mbox{erf}(z)}{z^3}
1302 * 5. Multiply the entire expression by \f$\beta^3\f$. This will get you
1305 * \frac{2 \beta^3 \exp(-z^2)}{\sqrt{\pi} z^2} - \frac{\beta^3 \mbox{erf}(z)}{z^3}
1308 * or, switching back to \f$r\f$ (since \f$z=r \beta\f$):
1311 * \frac{2 \beta \exp(-r^2 \beta^2)}{\sqrt{\pi} r^2} - \frac{\mbox{erf}(r \beta)}{r^3}
1314 * With a bit of math exercise you should be able to confirm that
1318 * \frac{\frac{d}{dr}\left( \frac{\mbox{erf}(\beta r)}{r} \right)}{r}
1321 * 6. Add the result to \f$r^{-3}\f$, multiply by the product of the charges,
1322 * and you have your force (divided by \f$r\f$). A final multiplication
1323 * with the vector connecting the two particles and you have your
1324 * vectorial force to add to the particles.
1326 * This approximation achieves an error slightly lower than 1e-6
1327 * in single precision and 1e-11 in double precision
1328 * for arguments smaller than 16 (\f$\beta r \leq 4 \f$);
1329 * when added to \f$1/r\f$ the error will be insignificant.
1330 * For \f$\beta r \geq 7206\f$ the return value can be inf or NaN.
1333 static inline SimdFloat gmx_simdcall
1334 pmeForceCorrection(SimdFloat z2
)
1336 const SimdFloat
FN6(-1.7357322914161492954e-8f
);
1337 const SimdFloat
FN5(1.4703624142580877519e-6f
);
1338 const SimdFloat
FN4(-0.000053401640219807709149f
);
1339 const SimdFloat
FN3(0.0010054721316683106153f
);
1340 const SimdFloat
FN2(-0.019278317264888380590f
);
1341 const SimdFloat
FN1(0.069670166153766424023f
);
1342 const SimdFloat
FN0(-0.75225204789749321333f
);
1344 const SimdFloat
FD4(0.0011193462567257629232f
);
1345 const SimdFloat
FD3(0.014866955030185295499f
);
1346 const SimdFloat
FD2(0.11583842382862377919f
);
1347 const SimdFloat
FD1(0.50736591960530292870f
);
1348 const SimdFloat
FD0(1.0f
);
1351 SimdFloat polyFN0
, polyFN1
, polyFD0
, polyFD1
;
1355 polyFD0
= fma(FD4
, z4
, FD2
);
1356 polyFD1
= fma(FD3
, z4
, FD1
);
1357 polyFD0
= fma(polyFD0
, z4
, FD0
);
1358 polyFD0
= fma(polyFD1
, z2
, polyFD0
);
1360 polyFD0
= inv(polyFD0
);
1362 polyFN0
= fma(FN6
, z4
, FN4
);
1363 polyFN1
= fma(FN5
, z4
, FN3
);
1364 polyFN0
= fma(polyFN0
, z4
, FN2
);
1365 polyFN1
= fma(polyFN1
, z4
, FN1
);
1366 polyFN0
= fma(polyFN0
, z4
, FN0
);
1367 polyFN0
= fma(polyFN1
, z2
, polyFN0
);
1369 return polyFN0
* polyFD0
;
1374 /*! \brief Calculate the potential correction due to PME analytically in SIMD float.
1376 * \param z2 \f$(r \beta)^2\f$ - see below for details.
1377 * \result Correction factor to coulomb potential - see below for details.
1379 * See \ref pmeForceCorrection for details about the approximation.
1381 * This routine calculates \f$\mbox{erf}(z)/z\f$, although you should provide \f$z^2\f$
1382 * as the input argument.
1384 * Here's how it should be used:
1386 * 1. Calculate \f$r^2\f$.
1387 * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=\beta^2*r^2\f$.
1388 * 3. Evaluate this routine with z^2 as the argument.
1389 * 4. The return value is the expression:
1392 * \frac{\mbox{erf}(z)}{z}
1395 * 5. Multiply the entire expression by beta and switching back to \f$r\f$ (since \f$z=r \beta\f$):
1398 * \frac{\mbox{erf}(r \beta)}{r}
1401 * 6. Subtract the result from \f$1/r\f$, multiply by the product of the charges,
1402 * and you have your potential.
1404 * This approximation achieves an error slightly lower than 1e-6
1405 * in single precision and 4e-11 in double precision
1406 * for arguments smaller than 16 (\f$ 0.15 \leq \beta r \leq 4 \f$);
1407 * for \f$ \beta r \leq 0.15\f$ the error can be twice as high;
1408 * when added to \f$1/r\f$ the error will be insignificant.
1409 * For \f$\beta r \geq 7142\f$ the return value can be inf or NaN.
1411 static inline SimdFloat gmx_simdcall
1412 pmePotentialCorrection(SimdFloat z2
)
1414 const SimdFloat
VN6(1.9296833005951166339e-8f
);
1415 const SimdFloat
VN5(-1.4213390571557850962e-6f
);
1416 const SimdFloat
VN4(0.000041603292906656984871f
);
1417 const SimdFloat
VN3(-0.00013134036773265025626f
);
1418 const SimdFloat
VN2(0.038657983986041781264f
);
1419 const SimdFloat
VN1(0.11285044772717598220f
);
1420 const SimdFloat
VN0(1.1283802385263030286f
);
1422 const SimdFloat
VD3(0.0066752224023576045451f
);
1423 const SimdFloat
VD2(0.078647795836373922256f
);
1424 const SimdFloat
VD1(0.43336185284710920150f
);
1425 const SimdFloat
VD0(1.0f
);
1428 SimdFloat polyVN0
, polyVN1
, polyVD0
, polyVD1
;
1432 polyVD1
= fma(VD3
, z4
, VD1
);
1433 polyVD0
= fma(VD2
, z4
, VD0
);
1434 polyVD0
= fma(polyVD1
, z2
, polyVD0
);
1436 polyVD0
= inv(polyVD0
);
1438 polyVN0
= fma(VN6
, z4
, VN4
);
1439 polyVN1
= fma(VN5
, z4
, VN3
);
1440 polyVN0
= fma(polyVN0
, z4
, VN2
);
1441 polyVN1
= fma(polyVN1
, z4
, VN1
);
1442 polyVN0
= fma(polyVN0
, z4
, VN0
);
1443 polyVN0
= fma(polyVN1
, z2
, polyVN0
);
1445 return polyVN0
* polyVD0
;
1451 #if GMX_SIMD_HAVE_DOUBLE
1454 /*! \name Double precision SIMD math functions
1456 * \note In most cases you should use the real-precision functions instead.
1460 /****************************************
1461 * DOUBLE PRECISION SIMD MATH FUNCTIONS *
1462 ****************************************/
1464 #if !GMX_SIMD_HAVE_NATIVE_COPYSIGN_DOUBLE
1465 /*! \brief Composes floating point value with the magnitude of x and the sign of y.
1467 * \param x Values to set sign for
1468 * \param y Values used to set sign
1469 * \return Magnitude of x, sign of y
1471 static inline SimdDouble gmx_simdcall
1472 copysign(SimdDouble x
, SimdDouble y
)
1474 #if GMX_SIMD_HAVE_LOGICAL
1475 return abs(x
) | (SimdDouble(GMX_DOUBLE_NEGZERO
) & y
);
1477 return blend(abs(x
), -abs(x
), (y
< setZero()));
1482 #if !GMX_SIMD_HAVE_NATIVE_RSQRT_ITER_DOUBLE
1483 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD double.
1485 * This is a low-level routine that should only be used by SIMD math routine
1486 * that evaluates the inverse square root.
1488 * \param lu Approximation of 1/sqrt(x), typically obtained from lookup.
1489 * \param x The reference (starting) value x for which we want 1/sqrt(x).
1490 * \return An improved approximation with roughly twice as many bits of accuracy.
1492 static inline SimdDouble gmx_simdcall
1493 rsqrtIter(SimdDouble lu
, SimdDouble x
)
1495 SimdDouble tmp1
= x
*lu
;
1496 SimdDouble tmp2
= SimdDouble(-0.5)*lu
;
1497 tmp1
= fma(tmp1
, lu
, SimdDouble(-3.0));
1502 /*! \brief Calculate 1/sqrt(x) for SIMD double.
1504 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
1505 * GMX_FLOAT_MAX, i.e. within the range of single precision.
1506 * For the single precision implementation this is obviously always
1507 * true for positive values, but for double precision it adds an
1508 * extra restriction since the first lookup step might have to be
1509 * performed in single precision on some architectures. Note that the
1510 * responsibility for checking falls on you - this routine does not
1513 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
1515 static inline SimdDouble gmx_simdcall
1516 invsqrt(SimdDouble x
)
1518 SimdDouble lu
= rsqrt(x
);
1519 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1520 lu
= rsqrtIter(lu
, x
);
1522 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1523 lu
= rsqrtIter(lu
, x
);
1525 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1526 lu
= rsqrtIter(lu
, x
);
1528 #if (GMX_SIMD_RSQRT_BITS*8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1529 lu
= rsqrtIter(lu
, x
);
1534 /*! \brief Calculate 1/sqrt(x) for two SIMD doubles.
1536 * \param x0 First set of arguments, x0 must be in single range (see below).
1537 * \param x1 Second set of arguments, x1 must be in single range (see below).
1538 * \param[out] out0 Result 1/sqrt(x0)
1539 * \param[out] out1 Result 1/sqrt(x1)
1541 * In particular for double precision we can sometimes calculate square root
1542 * pairs slightly faster by using single precision until the very last step.
1544 * \note Both arguments must be larger than GMX_FLOAT_MIN and smaller than
1545 * GMX_FLOAT_MAX, i.e. within the range of single precision.
1546 * For the single precision implementation this is obviously always
1547 * true for positive values, but for double precision it adds an
1548 * extra restriction since the first lookup step might have to be
1549 * performed in single precision on some architectures. Note that the
1550 * responsibility for checking falls on you - this routine does not
1553 static inline void gmx_simdcall
1554 invsqrtPair(SimdDouble x0
, SimdDouble x1
,
1555 SimdDouble
*out0
, SimdDouble
*out1
)
1557 #if GMX_SIMD_HAVE_FLOAT && (GMX_SIMD_FLOAT_WIDTH == 2*GMX_SIMD_DOUBLE_WIDTH) && (GMX_SIMD_RSQRT_BITS < 22)
1558 SimdFloat xf
= cvtDD2F(x0
, x1
);
1559 SimdFloat luf
= rsqrt(xf
);
1560 SimdDouble lu0
, lu1
;
1561 // Intermediate target is single - mantissa+1 bits
1562 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
1563 luf
= rsqrtIter(luf
, xf
);
1565 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
1566 luf
= rsqrtIter(luf
, xf
);
1568 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
1569 luf
= rsqrtIter(luf
, xf
);
1571 cvtF2DD(luf
, &lu0
, &lu1
);
1572 // Last iteration(s) performed in double - if we had 22 bits, this gets us to 44 (~1e-15)
1573 #if (GMX_SIMD_ACCURACY_BITS_SINGLE < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1574 lu0
= rsqrtIter(lu0
, x0
);
1575 lu1
= rsqrtIter(lu1
, x1
);
1577 #if (GMX_SIMD_ACCURACY_BITS_SINGLE*2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1578 lu0
= rsqrtIter(lu0
, x0
);
1579 lu1
= rsqrtIter(lu1
, x1
);
1584 *out0
= invsqrt(x0
);
1585 *out1
= invsqrt(x1
);
1589 #if !GMX_SIMD_HAVE_NATIVE_RCP_ITER_DOUBLE
1590 /*! \brief Perform one Newton-Raphson iteration to improve 1/x for SIMD double.
1592 * This is a low-level routine that should only be used by SIMD math routine
1593 * that evaluates the reciprocal.
1595 * \param lu Approximation of 1/x, typically obtained from lookup.
1596 * \param x The reference (starting) value x for which we want 1/x.
1597 * \return An improved approximation with roughly twice as many bits of accuracy.
1599 static inline SimdDouble gmx_simdcall
1600 rcpIter(SimdDouble lu
, SimdDouble x
)
1602 return lu
*fnma(lu
, x
, SimdDouble(2.0));
1606 /*! \brief Calculate 1/x for SIMD double.
1608 * \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
1609 * GMX_FLOAT_MAX, i.e. within the range of single precision.
1610 * For the single precision implementation this is obviously always
1611 * true for positive values, but for double precision it adds an
1612 * extra restriction since the first lookup step might have to be
1613 * performed in single precision on some architectures. Note that the
1614 * responsibility for checking falls on you - this routine does not
1617 * \return 1/x. Result is undefined if your argument was invalid.
1619 static inline SimdDouble gmx_simdcall
1622 SimdDouble lu
= rcp(x
);
1623 #if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1624 lu
= rcpIter(lu
, x
);
1626 #if (GMX_SIMD_RCP_BITS*2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1627 lu
= rcpIter(lu
, x
);
1629 #if (GMX_SIMD_RCP_BITS*4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1630 lu
= rcpIter(lu
, x
);
1632 #if (GMX_SIMD_RCP_BITS*8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1633 lu
= rcpIter(lu
, x
);
1638 /*! \brief Division for SIMD doubles
1640 * \param nom Nominator
1641 * \param denom Denominator, with magnitude in range (GMX_FLOAT_MIN,GMX_FLOAT_MAX).
1642 * For single precision this is equivalent to a nonzero argument,
1643 * but in double precision it adds an extra restriction since
1644 * the first lookup step might have to be performed in single
1645 * precision on some architectures. Note that the responsibility
1646 * for checking falls on you - this routine does not check arguments.
1650 * \note This function does not use any masking to avoid problems with
1651 * zero values in the denominator.
1653 static inline SimdDouble gmx_simdcall
1654 operator/(SimdDouble nom
, SimdDouble denom
)
1656 return nom
*inv(denom
);
1660 /*! \brief Calculate 1/sqrt(x) for masked entries of SIMD double.
1662 * This routine only evaluates 1/sqrt(x) for elements for which mask is true.
1663 * Illegal values in the masked-out elements will not lead to
1664 * floating-point exceptions.
1666 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
1667 * GMX_FLOAT_MAX for masked-in entries.
1668 * See \ref invsqrt for the discussion about argument restrictions.
1670 * \return 1/sqrt(x). Result is undefined if your argument was invalid or
1671 * entry was not masked, and 0.0 for masked-out entries.
1673 static inline SimdDouble
1674 maskzInvsqrt(SimdDouble x
, SimdDBool m
)
1676 SimdDouble lu
= maskzRsqrt(x
, m
);
1677 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1678 lu
= rsqrtIter(lu
, x
);
1680 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1681 lu
= rsqrtIter(lu
, x
);
1683 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1684 lu
= rsqrtIter(lu
, x
);
1686 #if (GMX_SIMD_RSQRT_BITS*8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1687 lu
= rsqrtIter(lu
, x
);
1692 /*! \brief Calculate 1/x for SIMD double, masked version.
1694 * \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
1695 * GMX_FLOAT_MAX for masked-in entries.
1696 * See \ref invsqrt for the discussion about argument restrictions.
1698 * \return 1/x for elements where m is true, or 0.0 for masked-out entries.
1700 static inline SimdDouble gmx_simdcall
1701 maskzInv(SimdDouble x
, SimdDBool m
)
1703 SimdDouble lu
= maskzRcp(x
, m
);
1704 #if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1705 lu
= rcpIter(lu
, x
);
1707 #if (GMX_SIMD_RCP_BITS*2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1708 lu
= rcpIter(lu
, x
);
1710 #if (GMX_SIMD_RCP_BITS*4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1711 lu
= rcpIter(lu
, x
);
1713 #if (GMX_SIMD_RCP_BITS*8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1714 lu
= rcpIter(lu
, x
);
1720 /*! \brief Calculate sqrt(x) for SIMD doubles.
1722 * \copydetails sqrt(SimdFloat)
1724 template <MathOptimization opt
= MathOptimization::Safe
>
1725 static inline SimdDouble gmx_simdcall
1728 if (opt
== MathOptimization::Safe
)
1730 // As we might use a float version of rsqrt, we mask out small values
1731 SimdDouble res
= maskzInvsqrt(x
, SimdDouble(GMX_FLOAT_MIN
) < x
);
1736 return x
* invsqrt(x
);
1740 #if !GMX_SIMD_HAVE_NATIVE_LOG_DOUBLE
1741 /*! \brief SIMD double log(x). This is the natural logarithm.
1743 * \param x Argument, should be >0.
1744 * \result The natural logarithm of x. Undefined if argument is invalid.
1746 static inline SimdDouble gmx_simdcall
1749 const SimdDouble
one(1.0);
1750 const SimdDouble
two(2.0);
1751 const SimdDouble
invsqrt2(1.0/std::sqrt(2.0));
1752 const SimdDouble
corr(0.693147180559945286226764);
1753 const SimdDouble
CL15(0.148197055177935105296783);
1754 const SimdDouble
CL13(0.153108178020442575739679);
1755 const SimdDouble
CL11(0.181837339521549679055568);
1756 const SimdDouble
CL9(0.22222194152736701733275);
1757 const SimdDouble
CL7(0.285714288030134544449368);
1758 const SimdDouble
CL5(0.399999999989941956712869);
1759 const SimdDouble
CL3(0.666666666666685503450651);
1760 const SimdDouble
CL1(2.0);
1761 SimdDouble fExp
, x2
, p
;
1765 x
= frexp(x
, &iExp
);
1766 fExp
= cvtI2R(iExp
);
1769 // Adjust to non-IEEE format for x<1/sqrt(2): exponent -= 1, mantissa *= 2.0
1770 fExp
= fExp
- selectByMask(one
, m
);
1771 x
= x
* blend(one
, two
, m
);
1773 x
= (x
-one
) * inv( x
+one
);
1776 p
= fma(CL15
, x2
, CL13
);
1777 p
= fma(p
, x2
, CL11
);
1778 p
= fma(p
, x2
, CL9
);
1779 p
= fma(p
, x2
, CL7
);
1780 p
= fma(p
, x2
, CL5
);
1781 p
= fma(p
, x2
, CL3
);
1782 p
= fma(p
, x2
, CL1
);
1783 p
= fma(p
, x
, corr
* fExp
);
1789 #if !GMX_SIMD_HAVE_NATIVE_EXP2_DOUBLE
1790 /*! \brief SIMD double 2^x.
1792 * \copydetails exp2(SimdFloat)
1794 template <MathOptimization opt
= MathOptimization::Safe
>
1795 static inline SimdDouble gmx_simdcall
1798 const SimdDouble
CE11(4.435280790452730022081181e-10);
1799 const SimdDouble
CE10(7.074105630863314448024247e-09);
1800 const SimdDouble
CE9(1.017819803432096698472621e-07);
1801 const SimdDouble
CE8(1.321543308956718799557863e-06);
1802 const SimdDouble
CE7(0.00001525273348995851746990884);
1803 const SimdDouble
CE6(0.0001540353046251466849082632);
1804 const SimdDouble
CE5(0.001333355814678995257307880);
1805 const SimdDouble
CE4(0.009618129107588335039176502);
1806 const SimdDouble
CE3(0.05550410866481992147457793);
1807 const SimdDouble
CE2(0.2402265069591015620470894);
1808 const SimdDouble
CE1(0.6931471805599453304615075);
1809 const SimdDouble
one(1.0);
1812 SimdDouble fexppart
;
1815 // Large negative values are valid arguments to exp2(), so there are two
1816 // things we need to account for:
1817 // 1. When the exponents reaches -1023, the (biased) exponent field will be
1818 // zero and we can no longer multiply with it. There are special IEEE
1819 // formats to handle this range, but for now we have to accept that
1820 // we cannot handle those arguments. If input value becomes even more
1821 // negative, it will start to loop and we would end up with invalid
1822 // exponents. Thus, we need to limit or mask this.
1823 // 2. For VERY large negative values, we will have problems that the
1824 // subtraction to get the fractional part loses accuracy, and then we
1825 // can end up with overflows in the polynomial.
1827 // For now, we handle this by forwarding the math optimization setting to
1828 // ldexp, where the routine will return zero for very small arguments.
1830 // However, before doing that we need to make sure we do not call cvtR2I
1831 // with an argument that is so negative it cannot be converted to an integer.
1832 if (opt
== MathOptimization::Safe
)
1834 x
= max(x
, SimdDouble(std::numeric_limits
<std::int32_t>::lowest()));
1837 fexppart
= ldexp
<opt
>(one
, cvtR2I(x
));
1841 p
= fma(CE11
, x
, CE10
);
1857 #if !GMX_SIMD_HAVE_NATIVE_EXP_DOUBLE
1858 /*! \brief SIMD double exp(x).
1860 * \copydetails exp(SimdFloat)
1862 template <MathOptimization opt
= MathOptimization::Safe
>
1863 static inline SimdDouble gmx_simdcall
1866 const SimdDouble
argscale(1.44269504088896340735992468100);
1867 const SimdDouble
invargscale0(-0.69314718055966295651160180568695068359375);
1868 const SimdDouble
invargscale1(-2.8235290563031577122588448175013436025525412068e-13);
1869 const SimdDouble
CE12(2.078375306791423699350304e-09);
1870 const SimdDouble
CE11(2.518173854179933105218635e-08);
1871 const SimdDouble
CE10(2.755842049600488770111608e-07);
1872 const SimdDouble
CE9(2.755691815216689746619849e-06);
1873 const SimdDouble
CE8(2.480158383706245033920920e-05);
1874 const SimdDouble
CE7(0.0001984127043518048611841321);
1875 const SimdDouble
CE6(0.001388888889360258341755930);
1876 const SimdDouble
CE5(0.008333333332907368102819109);
1877 const SimdDouble
CE4(0.04166666666663836745814631);
1878 const SimdDouble
CE3(0.1666666666666796929434570);
1879 const SimdDouble
CE2(0.5);
1880 const SimdDouble
one(1.0);
1881 SimdDouble fexppart
;
1885 // Large negative values are valid arguments to exp2(), so there are two
1886 // things we need to account for:
1887 // 1. When the exponents reaches -1023, the (biased) exponent field will be
1888 // zero and we can no longer multiply with it. There are special IEEE
1889 // formats to handle this range, but for now we have to accept that
1890 // we cannot handle those arguments. If input value becomes even more
1891 // negative, it will start to loop and we would end up with invalid
1892 // exponents. Thus, we need to limit or mask this.
1893 // 2. For VERY large negative values, we will have problems that the
1894 // subtraction to get the fractional part loses accuracy, and then we
1895 // can end up with overflows in the polynomial.
1897 // For now, we handle this by forwarding the math optimization setting to
1898 // ldexp, where the routine will return zero for very small arguments.
1900 // However, before doing that we need to make sure we do not call cvtR2I
1901 // with an argument that is so negative it cannot be converted to an integer
1902 // after being multiplied by argscale.
1904 if (opt
== MathOptimization::Safe
)
1906 x
= max(x
, SimdDouble(std::numeric_limits
<std::int32_t>::lowest())/argscale
);
1911 fexppart
= ldexp
<opt
>(one
, cvtR2I(y
));
1914 // Extended precision arithmetics
1915 x
= fma(invargscale0
, intpart
, x
);
1916 x
= fma(invargscale1
, intpart
, x
);
1918 p
= fma(CE12
, x
, CE11
);
1919 p
= fma(p
, x
, CE10
);
1928 p
= fma(p
, x
* x
, x
);
1929 x
= fma(p
, fexppart
, fexppart
);
1935 /*! \brief SIMD double erf(x).
1937 * \param x The value to calculate erf(x) for.
1940 * This routine achieves very close to full precision, but we do not care about
1941 * the last bit or the subnormal result range.
1943 static inline SimdDouble gmx_simdcall
1946 // Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75]
1947 const SimdDouble
CAP4(-0.431780540597889301512e-4);
1948 const SimdDouble
CAP3(-0.00578562306260059236059);
1949 const SimdDouble
CAP2(-0.028593586920219752446);
1950 const SimdDouble
CAP1(-0.315924962948621698209);
1951 const SimdDouble
CAP0(0.14952975608477029151);
1953 const SimdDouble
CAQ5(-0.374089300177174709737e-5);
1954 const SimdDouble
CAQ4(0.00015126584532155383535);
1955 const SimdDouble
CAQ3(0.00536692680669480725423);
1956 const SimdDouble
CAQ2(0.0668686825594046122636);
1957 const SimdDouble
CAQ1(0.402604990869284362773);
1959 const SimdDouble
CAoffset(0.9788494110107421875);
1961 // Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5]
1962 const SimdDouble
CBP6(2.49650423685462752497647637088e-10);
1963 const SimdDouble
CBP5(0.00119770193298159629350136085658);
1964 const SimdDouble
CBP4(0.0164944422378370965881008942733);
1965 const SimdDouble
CBP3(0.0984581468691775932063932439252);
1966 const SimdDouble
CBP2(0.317364595806937763843589437418);
1967 const SimdDouble
CBP1(0.554167062641455850932670067075);
1968 const SimdDouble
CBP0(0.427583576155807163756925301060);
1969 const SimdDouble
CBQ7(0.00212288829699830145976198384930);
1970 const SimdDouble
CBQ6(0.0334810979522685300554606393425);
1971 const SimdDouble
CBQ5(0.2361713785181450957579508850717);
1972 const SimdDouble
CBQ4(0.955364736493055670530981883072);
1973 const SimdDouble
CBQ3(2.36815675631420037315349279199);
1974 const SimdDouble
CBQ2(3.55261649184083035537184223542);
1975 const SimdDouble
CBQ1(2.93501136050160872574376997993);
1978 // Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf]
1979 const SimdDouble
CCP6(-2.8175401114513378771);
1980 const SimdDouble
CCP5(-3.22729451764143718517);
1981 const SimdDouble
CCP4(-2.5518551727311523996);
1982 const SimdDouble
CCP3(-0.687717681153649930619);
1983 const SimdDouble
CCP2(-0.212652252872804219852);
1984 const SimdDouble
CCP1(0.0175389834052493308818);
1985 const SimdDouble
CCP0(0.00628057170626964891937);
1987 const SimdDouble
CCQ6(5.48409182238641741584);
1988 const SimdDouble
CCQ5(13.5064170191802889145);
1989 const SimdDouble
CCQ4(22.9367376522880577224);
1990 const SimdDouble
CCQ3(15.930646027911794143);
1991 const SimdDouble
CCQ2(11.0567237927800161565);
1992 const SimdDouble
CCQ1(2.79257750980575282228);
1994 const SimdDouble
CCoffset(0.5579090118408203125);
1996 const SimdDouble
one(1.0);
1997 const SimdDouble
two(2.0);
1999 SimdDouble xabs
, x2
, x4
, t
, t2
, w
, w2
;
2000 SimdDouble PolyAP0
, PolyAP1
, PolyAQ0
, PolyAQ1
;
2001 SimdDouble PolyBP0
, PolyBP1
, PolyBQ0
, PolyBQ1
;
2002 SimdDouble PolyCP0
, PolyCP1
, PolyCQ0
, PolyCQ1
;
2003 SimdDouble res_erf
, res_erfcB
, res_erfcC
, res_erfc
, res
;
2005 SimdDBool mask
, mask_erf
, notmask_erf
;
2009 mask_erf
= (xabs
< one
);
2010 notmask_erf
= (one
<= xabs
);
2014 PolyAP0
= fma(CAP4
, x4
, CAP2
);
2015 PolyAP1
= fma(CAP3
, x4
, CAP1
);
2016 PolyAP0
= fma(PolyAP0
, x4
, CAP0
);
2017 PolyAP0
= fma(PolyAP1
, x2
, PolyAP0
);
2019 PolyAQ1
= fma(CAQ5
, x4
, CAQ3
);
2020 PolyAQ0
= fma(CAQ4
, x4
, CAQ2
);
2021 PolyAQ1
= fma(PolyAQ1
, x4
, CAQ1
);
2022 PolyAQ0
= fma(PolyAQ0
, x4
, one
);
2023 PolyAQ0
= fma(PolyAQ1
, x2
, PolyAQ0
);
2025 res_erf
= PolyAP0
* maskzInv(PolyAQ0
, mask_erf
);
2026 res_erf
= CAoffset
+ res_erf
;
2027 res_erf
= x
* res_erf
;
2029 // Calculate erfc() in range [1,4.5]
2033 PolyBP0
= fma(CBP6
, t2
, CBP4
);
2034 PolyBP1
= fma(CBP5
, t2
, CBP3
);
2035 PolyBP0
= fma(PolyBP0
, t2
, CBP2
);
2036 PolyBP1
= fma(PolyBP1
, t2
, CBP1
);
2037 PolyBP0
= fma(PolyBP0
, t2
, CBP0
);
2038 PolyBP0
= fma(PolyBP1
, t
, PolyBP0
);
2040 PolyBQ1
= fma(CBQ7
, t2
, CBQ5
);
2041 PolyBQ0
= fma(CBQ6
, t2
, CBQ4
);
2042 PolyBQ1
= fma(PolyBQ1
, t2
, CBQ3
);
2043 PolyBQ0
= fma(PolyBQ0
, t2
, CBQ2
);
2044 PolyBQ1
= fma(PolyBQ1
, t2
, CBQ1
);
2045 PolyBQ0
= fma(PolyBQ0
, t2
, one
);
2046 PolyBQ0
= fma(PolyBQ1
, t
, PolyBQ0
);
2048 // The denominator polynomial can be zero outside the range
2049 res_erfcB
= PolyBP0
* maskzInv(PolyBQ0
, notmask_erf
);
2051 res_erfcB
= res_erfcB
* xabs
;
2053 // Calculate erfc() in range [4.5,inf]
2054 w
= maskzInv(xabs
, notmask_erf
);
2057 PolyCP0
= fma(CCP6
, w2
, CCP4
);
2058 PolyCP1
= fma(CCP5
, w2
, CCP3
);
2059 PolyCP0
= fma(PolyCP0
, w2
, CCP2
);
2060 PolyCP1
= fma(PolyCP1
, w2
, CCP1
);
2061 PolyCP0
= fma(PolyCP0
, w2
, CCP0
);
2062 PolyCP0
= fma(PolyCP1
, w
, PolyCP0
);
2064 PolyCQ0
= fma(CCQ6
, w2
, CCQ4
);
2065 PolyCQ1
= fma(CCQ5
, w2
, CCQ3
);
2066 PolyCQ0
= fma(PolyCQ0
, w2
, CCQ2
);
2067 PolyCQ1
= fma(PolyCQ1
, w2
, CCQ1
);
2068 PolyCQ0
= fma(PolyCQ0
, w2
, one
);
2069 PolyCQ0
= fma(PolyCQ1
, w
, PolyCQ0
);
2071 expmx2
= exp( -x2
);
2073 // The denominator polynomial can be zero outside the range
2074 res_erfcC
= PolyCP0
* maskzInv(PolyCQ0
, notmask_erf
);
2075 res_erfcC
= res_erfcC
+ CCoffset
;
2076 res_erfcC
= res_erfcC
* w
;
2078 mask
= (SimdDouble(4.5) < xabs
);
2079 res_erfc
= blend(res_erfcB
, res_erfcC
, mask
);
2081 res_erfc
= res_erfc
* expmx2
;
2083 // erfc(x<0) = 2-erfc(|x|)
2084 mask
= (x
< setZero());
2085 res_erfc
= blend(res_erfc
, two
- res_erfc
, mask
);
2087 // Select erf() or erfc()
2088 res
= blend(one
- res_erfc
, res_erf
, mask_erf
);
2093 /*! \brief SIMD double erfc(x).
2095 * \param x The value to calculate erfc(x) for.
2098 * This routine achieves full precision (bar the last bit) over most of the
2099 * input range, but for large arguments where the result is getting close
2100 * to the minimum representable numbers we accept slightly larger errors
2101 * (think results that are in the ballpark of 10^-200 for double)
2102 * since that is not relevant for MD.
2104 static inline SimdDouble gmx_simdcall
2107 // Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75]
2108 const SimdDouble
CAP4(-0.431780540597889301512e-4);
2109 const SimdDouble
CAP3(-0.00578562306260059236059);
2110 const SimdDouble
CAP2(-0.028593586920219752446);
2111 const SimdDouble
CAP1(-0.315924962948621698209);
2112 const SimdDouble
CAP0(0.14952975608477029151);
2114 const SimdDouble
CAQ5(-0.374089300177174709737e-5);
2115 const SimdDouble
CAQ4(0.00015126584532155383535);
2116 const SimdDouble
CAQ3(0.00536692680669480725423);
2117 const SimdDouble
CAQ2(0.0668686825594046122636);
2118 const SimdDouble
CAQ1(0.402604990869284362773);
2120 const SimdDouble
CAoffset(0.9788494110107421875);
2122 // Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5]
2123 const SimdDouble
CBP6(2.49650423685462752497647637088e-10);
2124 const SimdDouble
CBP5(0.00119770193298159629350136085658);
2125 const SimdDouble
CBP4(0.0164944422378370965881008942733);
2126 const SimdDouble
CBP3(0.0984581468691775932063932439252);
2127 const SimdDouble
CBP2(0.317364595806937763843589437418);
2128 const SimdDouble
CBP1(0.554167062641455850932670067075);
2129 const SimdDouble
CBP0(0.427583576155807163756925301060);
2130 const SimdDouble
CBQ7(0.00212288829699830145976198384930);
2131 const SimdDouble
CBQ6(0.0334810979522685300554606393425);
2132 const SimdDouble
CBQ5(0.2361713785181450957579508850717);
2133 const SimdDouble
CBQ4(0.955364736493055670530981883072);
2134 const SimdDouble
CBQ3(2.36815675631420037315349279199);
2135 const SimdDouble
CBQ2(3.55261649184083035537184223542);
2136 const SimdDouble
CBQ1(2.93501136050160872574376997993);
2139 // Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf]
2140 const SimdDouble
CCP6(-2.8175401114513378771);
2141 const SimdDouble
CCP5(-3.22729451764143718517);
2142 const SimdDouble
CCP4(-2.5518551727311523996);
2143 const SimdDouble
CCP3(-0.687717681153649930619);
2144 const SimdDouble
CCP2(-0.212652252872804219852);
2145 const SimdDouble
CCP1(0.0175389834052493308818);
2146 const SimdDouble
CCP0(0.00628057170626964891937);
2148 const SimdDouble
CCQ6(5.48409182238641741584);
2149 const SimdDouble
CCQ5(13.5064170191802889145);
2150 const SimdDouble
CCQ4(22.9367376522880577224);
2151 const SimdDouble
CCQ3(15.930646027911794143);
2152 const SimdDouble
CCQ2(11.0567237927800161565);
2153 const SimdDouble
CCQ1(2.79257750980575282228);
2155 const SimdDouble
CCoffset(0.5579090118408203125);
2157 const SimdDouble
one(1.0);
2158 const SimdDouble
two(2.0);
2160 SimdDouble xabs
, x2
, x4
, t
, t2
, w
, w2
;
2161 SimdDouble PolyAP0
, PolyAP1
, PolyAQ0
, PolyAQ1
;
2162 SimdDouble PolyBP0
, PolyBP1
, PolyBQ0
, PolyBQ1
;
2163 SimdDouble PolyCP0
, PolyCP1
, PolyCQ0
, PolyCQ1
;
2164 SimdDouble res_erf
, res_erfcB
, res_erfcC
, res_erfc
, res
;
2166 SimdDBool mask
, mask_erf
, notmask_erf
;
2170 mask_erf
= (xabs
< one
);
2171 notmask_erf
= (one
<= xabs
);
2175 PolyAP0
= fma(CAP4
, x4
, CAP2
);
2176 PolyAP1
= fma(CAP3
, x4
, CAP1
);
2177 PolyAP0
= fma(PolyAP0
, x4
, CAP0
);
2178 PolyAP0
= fma(PolyAP1
, x2
, PolyAP0
);
2179 PolyAQ1
= fma(CAQ5
, x4
, CAQ3
);
2180 PolyAQ0
= fma(CAQ4
, x4
, CAQ2
);
2181 PolyAQ1
= fma(PolyAQ1
, x4
, CAQ1
);
2182 PolyAQ0
= fma(PolyAQ0
, x4
, one
);
2183 PolyAQ0
= fma(PolyAQ1
, x2
, PolyAQ0
);
2185 res_erf
= PolyAP0
* maskzInv(PolyAQ0
, mask_erf
);
2186 res_erf
= CAoffset
+ res_erf
;
2187 res_erf
= x
* res_erf
;
2189 // Calculate erfc() in range [1,4.5]
2193 PolyBP0
= fma(CBP6
, t2
, CBP4
);
2194 PolyBP1
= fma(CBP5
, t2
, CBP3
);
2195 PolyBP0
= fma(PolyBP0
, t2
, CBP2
);
2196 PolyBP1
= fma(PolyBP1
, t2
, CBP1
);
2197 PolyBP0
= fma(PolyBP0
, t2
, CBP0
);
2198 PolyBP0
= fma(PolyBP1
, t
, PolyBP0
);
2200 PolyBQ1
= fma(CBQ7
, t2
, CBQ5
);
2201 PolyBQ0
= fma(CBQ6
, t2
, CBQ4
);
2202 PolyBQ1
= fma(PolyBQ1
, t2
, CBQ3
);
2203 PolyBQ0
= fma(PolyBQ0
, t2
, CBQ2
);
2204 PolyBQ1
= fma(PolyBQ1
, t2
, CBQ1
);
2205 PolyBQ0
= fma(PolyBQ0
, t2
, one
);
2206 PolyBQ0
= fma(PolyBQ1
, t
, PolyBQ0
);
2208 // The denominator polynomial can be zero outside the range
2209 res_erfcB
= PolyBP0
* maskzInv(PolyBQ0
, notmask_erf
);
2211 res_erfcB
= res_erfcB
* xabs
;
2213 // Calculate erfc() in range [4.5,inf]
2214 w
= maskzInv(xabs
, xabs
!= setZero());
2217 PolyCP0
= fma(CCP6
, w2
, CCP4
);
2218 PolyCP1
= fma(CCP5
, w2
, CCP3
);
2219 PolyCP0
= fma(PolyCP0
, w2
, CCP2
);
2220 PolyCP1
= fma(PolyCP1
, w2
, CCP1
);
2221 PolyCP0
= fma(PolyCP0
, w2
, CCP0
);
2222 PolyCP0
= fma(PolyCP1
, w
, PolyCP0
);
2224 PolyCQ0
= fma(CCQ6
, w2
, CCQ4
);
2225 PolyCQ1
= fma(CCQ5
, w2
, CCQ3
);
2226 PolyCQ0
= fma(PolyCQ0
, w2
, CCQ2
);
2227 PolyCQ1
= fma(PolyCQ1
, w2
, CCQ1
);
2228 PolyCQ0
= fma(PolyCQ0
, w2
, one
);
2229 PolyCQ0
= fma(PolyCQ1
, w
, PolyCQ0
);
2231 expmx2
= exp( -x2
);
2233 // The denominator polynomial can be zero outside the range
2234 res_erfcC
= PolyCP0
* maskzInv(PolyCQ0
, notmask_erf
);
2235 res_erfcC
= res_erfcC
+ CCoffset
;
2236 res_erfcC
= res_erfcC
* w
;
2238 mask
= (SimdDouble(4.5) < xabs
);
2239 res_erfc
= blend(res_erfcB
, res_erfcC
, mask
);
2241 res_erfc
= res_erfc
* expmx2
;
2243 // erfc(x<0) = 2-erfc(|x|)
2244 mask
= (x
< setZero());
2245 res_erfc
= blend(res_erfc
, two
- res_erfc
, mask
);
2247 // Select erf() or erfc()
2248 res
= blend(res_erfc
, one
- res_erf
, mask_erf
);
2253 /*! \brief SIMD double sin \& cos.
2255 * \param x The argument to evaluate sin/cos for
2256 * \param[out] sinval Sin(x)
2257 * \param[out] cosval Cos(x)
2259 * This version achieves close to machine precision, but for very large
2260 * magnitudes of the argument we inherently begin to lose accuracy due to the
2261 * argument reduction, despite using extended precision arithmetics internally.
2263 static inline void gmx_simdcall
2264 sincos(SimdDouble x
, SimdDouble
*sinval
, SimdDouble
*cosval
)
2266 // Constants to subtract Pi/4*x from y while minimizing precision loss
2267 const SimdDouble
argred0(-2*0.78539816290140151978);
2268 const SimdDouble
argred1(-2*4.9604678871439933374e-10);
2269 const SimdDouble
argred2(-2*1.1258708853173288931e-18);
2270 const SimdDouble
argred3(-2*1.7607799325916000908e-27);
2271 const SimdDouble
two_over_pi(2.0/M_PI
);
2272 const SimdDouble
const_sin5( 1.58938307283228937328511e-10);
2273 const SimdDouble
const_sin4(-2.50506943502539773349318e-08);
2274 const SimdDouble
const_sin3( 2.75573131776846360512547e-06);
2275 const SimdDouble
const_sin2(-0.000198412698278911770864914);
2276 const SimdDouble
const_sin1( 0.0083333333333191845961746);
2277 const SimdDouble
const_sin0(-0.166666666666666130709393);
2279 const SimdDouble
const_cos7(-1.13615350239097429531523e-11);
2280 const SimdDouble
const_cos6( 2.08757471207040055479366e-09);
2281 const SimdDouble
const_cos5(-2.75573144028847567498567e-07);
2282 const SimdDouble
const_cos4( 2.48015872890001867311915e-05);
2283 const SimdDouble
const_cos3(-0.00138888888888714019282329);
2284 const SimdDouble
const_cos2( 0.0416666666666665519592062);
2285 const SimdDouble
half(0.5);
2286 const SimdDouble
one(1.0);
2287 SimdDouble ssign
, csign
;
2288 SimdDouble x2
, y
, z
, psin
, pcos
, sss
, ccc
;
2290 #if GMX_SIMD_HAVE_DINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
2291 const SimdDInt32
ione(1);
2292 const SimdDInt32
itwo(2);
2295 z
= x
* two_over_pi
;
2299 mask
= cvtIB2B((iy
& ione
) == setZero());
2300 ssign
= selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO
), cvtIB2B((iy
& itwo
) == itwo
));
2301 csign
= selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO
), cvtIB2B(((iy
+ ione
) & itwo
) == itwo
));
2303 const SimdDouble
quarter(0.25);
2304 const SimdDouble
minusquarter(-0.25);
2306 SimdDBool m1
, m2
, m3
;
2308 /* The most obvious way to find the arguments quadrant in the unit circle
2309 * to calculate the sign is to use integer arithmetic, but that is not
2310 * present in all SIMD implementations. As an alternative, we have devised a
2311 * pure floating-point algorithm that uses truncation for argument reduction
2312 * so that we get a new value 0<=q<1 over the unit circle, and then
2313 * do floating-point comparisons with fractions. This is likely to be
2314 * slightly slower (~10%) due to the longer latencies of floating-point, so
2315 * we only use it when integer SIMD arithmetic is not present.
2319 // It is critical that half-way cases are rounded down
2320 z
= fma(x
, two_over_pi
, half
);
2324 /* z now starts at 0.0 for x=-pi/4 (although neg. values cannot occur), and
2325 * then increased by 1.0 as x increases by 2*Pi, when it resets to 0.0.
2326 * This removes the 2*Pi periodicity without using any integer arithmetic.
2327 * First check if y had the value 2 or 3, set csign if true.
2330 /* If we have logical operations we can work directly on the signbit, which
2331 * saves instructions. Otherwise we need to represent signs as +1.0/-1.0.
2332 * Thus, if you are altering defines to debug alternative code paths, the
2333 * two GMX_SIMD_HAVE_LOGICAL sections in this routine must either both be
2334 * active or inactive - you will get errors if only one is used.
2336 # if GMX_SIMD_HAVE_LOGICAL
2337 ssign
= ssign
& SimdDouble(GMX_DOUBLE_NEGZERO
);
2338 csign
= andNot(q
, SimdDouble(GMX_DOUBLE_NEGZERO
));
2339 ssign
= ssign
^ csign
;
2341 ssign
= copysign(SimdDouble(1.0), ssign
);
2342 csign
= copysign(SimdDouble(1.0), q
);
2344 ssign
= ssign
* csign
; // swap ssign if csign was set.
2346 // Check if y had value 1 or 3 (remember we subtracted 0.5 from q)
2347 m1
= (q
< minusquarter
);
2348 m2
= (setZero() <= q
);
2352 // where mask is FALSE, swap sign.
2353 csign
= csign
* blend(SimdDouble(-1.0), one
, mask
);
2355 x
= fma(y
, argred0
, x
);
2356 x
= fma(y
, argred1
, x
);
2357 x
= fma(y
, argred2
, x
);
2358 x
= fma(y
, argred3
, x
);
2361 psin
= fma(const_sin5
, x2
, const_sin4
);
2362 psin
= fma(psin
, x2
, const_sin3
);
2363 psin
= fma(psin
, x2
, const_sin2
);
2364 psin
= fma(psin
, x2
, const_sin1
);
2365 psin
= fma(psin
, x2
, const_sin0
);
2366 psin
= fma(psin
, x2
* x
, x
);
2368 pcos
= fma(const_cos7
, x2
, const_cos6
);
2369 pcos
= fma(pcos
, x2
, const_cos5
);
2370 pcos
= fma(pcos
, x2
, const_cos4
);
2371 pcos
= fma(pcos
, x2
, const_cos3
);
2372 pcos
= fma(pcos
, x2
, const_cos2
);
2373 pcos
= fms(pcos
, x2
, half
);
2374 pcos
= fma(pcos
, x2
, one
);
2376 sss
= blend(pcos
, psin
, mask
);
2377 ccc
= blend(psin
, pcos
, mask
);
2378 // See comment for GMX_SIMD_HAVE_LOGICAL section above.
2379 #if GMX_SIMD_HAVE_LOGICAL
2380 *sinval
= sss
^ ssign
;
2381 *cosval
= ccc
^ csign
;
2383 *sinval
= sss
* ssign
;
2384 *cosval
= ccc
* csign
;
2388 /*! \brief SIMD double sin(x).
2390 * \param x The argument to evaluate sin for
2393 * \attention Do NOT call both sin & cos if you need both results, since each of them
2394 * will then call \ref sincos and waste a factor 2 in performance.
2396 static inline SimdDouble gmx_simdcall
2404 /*! \brief SIMD double cos(x).
2406 * \param x The argument to evaluate cos for
2409 * \attention Do NOT call both sin & cos if you need both results, since each of them
2410 * will then call \ref sincos and waste a factor 2 in performance.
2412 static inline SimdDouble gmx_simdcall
2420 /*! \brief SIMD double tan(x).
2422 * \param x The argument to evaluate tan for
2425 static inline SimdDouble gmx_simdcall
2428 const SimdDouble
argred0(-2*0.78539816290140151978);
2429 const SimdDouble
argred1(-2*4.9604678871439933374e-10);
2430 const SimdDouble
argred2(-2*1.1258708853173288931e-18);
2431 const SimdDouble
argred3(-2*1.7607799325916000908e-27);
2432 const SimdDouble
two_over_pi(2.0/M_PI
);
2433 const SimdDouble
CT15(1.01419718511083373224408e-05);
2434 const SimdDouble
CT14(-2.59519791585924697698614e-05);
2435 const SimdDouble
CT13(5.23388081915899855325186e-05);
2436 const SimdDouble
CT12(-3.05033014433946488225616e-05);
2437 const SimdDouble
CT11(7.14707504084242744267497e-05);
2438 const SimdDouble
CT10(8.09674518280159187045078e-05);
2439 const SimdDouble
CT9(0.000244884931879331847054404);
2440 const SimdDouble
CT8(0.000588505168743587154904506);
2441 const SimdDouble
CT7(0.00145612788922812427978848);
2442 const SimdDouble
CT6(0.00359208743836906619142924);
2443 const SimdDouble
CT5(0.00886323944362401618113356);
2444 const SimdDouble
CT4(0.0218694882853846389592078);
2445 const SimdDouble
CT3(0.0539682539781298417636002);
2446 const SimdDouble
CT2(0.133333333333125941821962);
2447 const SimdDouble
CT1(0.333333333333334980164153);
2449 SimdDouble x2
, p
, y
, z
;
2452 #if GMX_SIMD_HAVE_DINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
2456 z
= x
* two_over_pi
;
2459 m
= cvtIB2B((iy
& ione
) == ione
);
2461 x
= fma(y
, argred0
, x
);
2462 x
= fma(y
, argred1
, x
);
2463 x
= fma(y
, argred2
, x
);
2464 x
= fma(y
, argred3
, x
);
2465 x
= selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO
), m
) ^ x
;
2467 const SimdDouble
quarter(0.25);
2468 const SimdDouble
half(0.5);
2469 const SimdDouble
threequarter(0.75);
2471 SimdDBool m1
, m2
, m3
;
2474 z
= fma(w
, two_over_pi
, half
);
2478 m1
= (quarter
<= q
);
2480 m3
= (threequarter
<= q
);
2483 w
= fma(y
, argred0
, w
);
2484 w
= fma(y
, argred1
, w
);
2485 w
= fma(y
, argred2
, w
);
2486 w
= fma(y
, argred3
, w
);
2488 w
= blend(w
, -w
, m
);
2489 x
= w
* copysign( SimdDouble(1.0), x
);
2492 p
= fma(CT15
, x2
, CT14
);
2493 p
= fma(p
, x2
, CT13
);
2494 p
= fma(p
, x2
, CT12
);
2495 p
= fma(p
, x2
, CT11
);
2496 p
= fma(p
, x2
, CT10
);
2497 p
= fma(p
, x2
, CT9
);
2498 p
= fma(p
, x2
, CT8
);
2499 p
= fma(p
, x2
, CT7
);
2500 p
= fma(p
, x2
, CT6
);
2501 p
= fma(p
, x2
, CT5
);
2502 p
= fma(p
, x2
, CT4
);
2503 p
= fma(p
, x2
, CT3
);
2504 p
= fma(p
, x2
, CT2
);
2505 p
= fma(p
, x2
, CT1
);
2506 p
= fma(x2
, p
* x
, x
);
2508 p
= blend( p
, maskzInv(p
, m
), m
);
2512 /*! \brief SIMD double asin(x).
2514 * \param x The argument to evaluate asin for
2517 static inline SimdDouble gmx_simdcall
2520 // Same algorithm as cephes library
2521 const SimdDouble
limit1(0.625);
2522 const SimdDouble
limit2(1e-8);
2523 const SimdDouble
one(1.0);
2524 const SimdDouble
quarterpi(M_PI
/4.0);
2525 const SimdDouble
morebits(6.123233995736765886130e-17);
2527 const SimdDouble
P5(4.253011369004428248960e-3);
2528 const SimdDouble
P4(-6.019598008014123785661e-1);
2529 const SimdDouble
P3(5.444622390564711410273e0
);
2530 const SimdDouble
P2(-1.626247967210700244449e1
);
2531 const SimdDouble
P1(1.956261983317594739197e1
);
2532 const SimdDouble
P0(-8.198089802484824371615e0
);
2534 const SimdDouble
Q4(-1.474091372988853791896e1
);
2535 const SimdDouble
Q3(7.049610280856842141659e1
);
2536 const SimdDouble
Q2(-1.471791292232726029859e2
);
2537 const SimdDouble
Q1(1.395105614657485689735e2
);
2538 const SimdDouble
Q0(-4.918853881490881290097e1
);
2540 const SimdDouble
R4(2.967721961301243206100e-3);
2541 const SimdDouble
R3(-5.634242780008963776856e-1);
2542 const SimdDouble
R2(6.968710824104713396794e0
);
2543 const SimdDouble
R1(-2.556901049652824852289e1
);
2544 const SimdDouble
R0(2.853665548261061424989e1
);
2546 const SimdDouble
S3(-2.194779531642920639778e1
);
2547 const SimdDouble
S2(1.470656354026814941758e2
);
2548 const SimdDouble
S1(-3.838770957603691357202e2
);
2549 const SimdDouble
S0(3.424398657913078477438e2
);
2552 SimdDouble zz
, ww
, z
, q
, w
, zz2
, ww2
;
2557 SimdDouble nom
, denom
;
2558 SimdDBool mask
, mask2
;
2562 mask
= (limit1
< xabs
);
2570 RA
= fma(R4
, zz2
, R2
);
2571 RB
= fma(R3
, zz2
, R1
);
2572 RA
= fma(RA
, zz2
, R0
);
2573 RA
= fma(RB
, zz
, RA
);
2576 SB
= fma(S3
, zz2
, S1
);
2578 SA
= fma(SA
, zz2
, S0
);
2579 SA
= fma(SB
, zz
, SA
);
2582 PA
= fma(P5
, ww2
, P3
);
2583 PB
= fma(P4
, ww2
, P2
);
2584 PA
= fma(PA
, ww2
, P1
);
2585 PB
= fma(PB
, ww2
, P0
);
2586 PA
= fma(PA
, ww
, PB
);
2589 QB
= fma(Q4
, ww2
, Q2
);
2591 QA
= fma(QA
, ww2
, Q1
);
2592 QB
= fma(QB
, ww2
, Q0
);
2593 QA
= fma(QA
, ww
, QB
);
2598 nom
= blend( PA
, RA
, mask
);
2599 denom
= blend( QA
, SA
, mask
);
2601 mask2
= (limit2
< xabs
);
2602 q
= nom
* maskzInv(denom
, mask2
);
2607 zz
= fms(zz
, q
, morebits
);
2614 z
= blend( w
, z
, mask
);
2616 z
= blend( xabs
, z
, mask2
);
2623 /*! \brief SIMD double acos(x).
2625 * \param x The argument to evaluate acos for
2628 static inline SimdDouble gmx_simdcall
2631 const SimdDouble
one(1.0);
2632 const SimdDouble
half(0.5);
2633 const SimdDouble
quarterpi0(7.85398163397448309616e-1);
2634 const SimdDouble
quarterpi1(6.123233995736765886130e-17);
2637 SimdDouble z
, z1
, z2
;
2640 z1
= half
* (one
- x
);
2642 z
= blend( x
, z1
, mask1
);
2648 z2
= quarterpi0
- z
;
2649 z2
= z2
+ quarterpi1
;
2650 z2
= z2
+ quarterpi0
;
2652 z
= blend(z2
, z1
, mask1
);
2657 /*! \brief SIMD double asin(x).
2659 * \param x The argument to evaluate atan for
2660 * \result Atan(x), same argument/value range as standard math library.
2662 static inline SimdDouble gmx_simdcall
2665 // Same algorithm as cephes library
2666 const SimdDouble
limit1(0.66);
2667 const SimdDouble
limit2(2.41421356237309504880);
2668 const SimdDouble
quarterpi(M_PI
/4.0);
2669 const SimdDouble
halfpi(M_PI
/2.0);
2670 const SimdDouble
mone(-1.0);
2671 const SimdDouble
morebits1(0.5*6.123233995736765886130E-17);
2672 const SimdDouble
morebits2(6.123233995736765886130E-17);
2674 const SimdDouble
P4(-8.750608600031904122785E-1);
2675 const SimdDouble
P3(-1.615753718733365076637E1
);
2676 const SimdDouble
P2(-7.500855792314704667340E1
);
2677 const SimdDouble
P1(-1.228866684490136173410E2
);
2678 const SimdDouble
P0(-6.485021904942025371773E1
);
2680 const SimdDouble
Q4(2.485846490142306297962E1
);
2681 const SimdDouble
Q3(1.650270098316988542046E2
);
2682 const SimdDouble
Q2(4.328810604912902668951E2
);
2683 const SimdDouble
Q1(4.853903996359136964868E2
);
2684 const SimdDouble
Q0(1.945506571482613964425E2
);
2686 SimdDouble y
, xabs
, t1
, t2
;
2688 SimdDouble P_A
, P_B
, Q_A
, Q_B
;
2689 SimdDBool mask1
, mask2
;
2693 mask1
= (limit1
< xabs
);
2694 mask2
= (limit2
< xabs
);
2696 t1
= (xabs
+ mone
) * maskzInv(xabs
- mone
, mask1
);
2697 t2
= mone
* maskzInv(xabs
, mask2
);
2699 y
= selectByMask(quarterpi
, mask1
);
2700 y
= blend(y
, halfpi
, mask2
);
2701 xabs
= blend(xabs
, t1
, mask1
);
2702 xabs
= blend(xabs
, t2
, mask2
);
2707 P_A
= fma(P4
, z2
, P2
);
2708 P_B
= fma(P3
, z2
, P1
);
2709 P_A
= fma(P_A
, z2
, P0
);
2710 P_A
= fma(P_B
, z
, P_A
);
2713 Q_B
= fma(Q4
, z2
, Q2
);
2715 Q_A
= fma(Q_A
, z2
, Q1
);
2716 Q_B
= fma(Q_B
, z2
, Q0
);
2717 Q_A
= fma(Q_A
, z
, Q_B
);
2721 z
= fma(z
, xabs
, xabs
);
2723 t1
= selectByMask(morebits1
, mask1
);
2724 t1
= blend(t1
, morebits2
, mask2
);
2734 /*! \brief SIMD double atan2(y,x).
2736 * \param y Y component of vector, any quartile
2737 * \param x X component of vector, any quartile
2738 * \result Atan(y,x), same argument/value range as standard math library.
2740 * \note This routine should provide correct results for all finite
2741 * non-zero or positive-zero arguments. However, negative zero arguments will
2742 * be treated as positive zero, which means the return value will deviate from
2743 * the standard math library atan2(y,x) for those cases. That should not be
2744 * of any concern in Gromacs, and in particular it will not affect calculations
2745 * of angles from vectors.
2747 static inline SimdDouble gmx_simdcall
2748 atan2(SimdDouble y
, SimdDouble x
)
2750 const SimdDouble
pi(M_PI
);
2751 const SimdDouble
halfpi(M_PI
/2.0);
2752 SimdDouble xinv
, p
, aoffset
;
2753 SimdDBool mask_xnz
, mask_ynz
, mask_xlt0
, mask_ylt0
;
2755 mask_xnz
= x
!= setZero();
2756 mask_ynz
= y
!= setZero();
2757 mask_xlt0
= (x
< setZero());
2758 mask_ylt0
= (y
< setZero());
2760 aoffset
= selectByNotMask(halfpi
, mask_xnz
);
2761 aoffset
= selectByMask(aoffset
, mask_ynz
);
2763 aoffset
= blend(aoffset
, pi
, mask_xlt0
);
2764 aoffset
= blend(aoffset
, -aoffset
, mask_ylt0
);
2766 xinv
= maskzInv(x
, mask_xnz
);
2775 /*! \brief Calculate the force correction due to PME analytically in SIMD double.
2777 * \param z2 This should be the value \f$(r \beta)^2\f$, where r is your
2778 * interaction distance and beta the ewald splitting parameters.
2779 * \result Correction factor to coulomb force.
2781 * This routine is meant to enable analytical evaluation of the
2782 * direct-space PME electrostatic force to avoid tables. For details, see the
2783 * single precision function.
2785 static inline SimdDouble gmx_simdcall
2786 pmeForceCorrection(SimdDouble z2
)
2788 const SimdDouble
FN10(-8.0072854618360083154e-14);
2789 const SimdDouble
FN9(1.1859116242260148027e-11);
2790 const SimdDouble
FN8(-8.1490406329798423616e-10);
2791 const SimdDouble
FN7(3.4404793543907847655e-8);
2792 const SimdDouble
FN6(-9.9471420832602741006e-7);
2793 const SimdDouble
FN5(0.000020740315999115847456);
2794 const SimdDouble
FN4(-0.00031991745139313364005);
2795 const SimdDouble
FN3(0.0035074449373659008203);
2796 const SimdDouble
FN2(-0.031750380176100813405);
2797 const SimdDouble
FN1(0.13884101728898463426);
2798 const SimdDouble
FN0(-0.75225277815249618847);
2800 const SimdDouble
FD5(0.000016009278224355026701);
2801 const SimdDouble
FD4(0.00051055686934806966046);
2802 const SimdDouble
FD3(0.0081803507497974289008);
2803 const SimdDouble
FD2(0.077181146026670287235);
2804 const SimdDouble
FD1(0.41543303143712535988);
2805 const SimdDouble
FD0(1.0);
2808 SimdDouble polyFN0
, polyFN1
, polyFD0
, polyFD1
;
2812 polyFD1
= fma(FD5
, z4
, FD3
);
2813 polyFD1
= fma(polyFD1
, z4
, FD1
);
2814 polyFD1
= polyFD1
* z2
;
2815 polyFD0
= fma(FD4
, z4
, FD2
);
2816 polyFD0
= fma(polyFD0
, z4
, FD0
);
2817 polyFD0
= polyFD0
+ polyFD1
;
2819 polyFD0
= inv(polyFD0
);
2821 polyFN0
= fma(FN10
, z4
, FN8
);
2822 polyFN0
= fma(polyFN0
, z4
, FN6
);
2823 polyFN0
= fma(polyFN0
, z4
, FN4
);
2824 polyFN0
= fma(polyFN0
, z4
, FN2
);
2825 polyFN0
= fma(polyFN0
, z4
, FN0
);
2826 polyFN1
= fma(FN9
, z4
, FN7
);
2827 polyFN1
= fma(polyFN1
, z4
, FN5
);
2828 polyFN1
= fma(polyFN1
, z4
, FN3
);
2829 polyFN1
= fma(polyFN1
, z4
, FN1
);
2830 polyFN0
= fma(polyFN1
, z2
, polyFN0
);
2833 return polyFN0
* polyFD0
;
2838 /*! \brief Calculate the potential correction due to PME analytically in SIMD double.
2840 * \param z2 This should be the value \f$(r \beta)^2\f$, where r is your
2841 * interaction distance and beta the ewald splitting parameters.
2842 * \result Correction factor to coulomb force.
2844 * This routine is meant to enable analytical evaluation of the
2845 * direct-space PME electrostatic potential to avoid tables. For details, see the
2846 * single precision function.
2848 static inline SimdDouble gmx_simdcall
2849 pmePotentialCorrection(SimdDouble z2
)
2851 const SimdDouble
VN9(-9.3723776169321855475e-13);
2852 const SimdDouble
VN8(1.2280156762674215741e-10);
2853 const SimdDouble
VN7(-7.3562157912251309487e-9);
2854 const SimdDouble
VN6(2.6215886208032517509e-7);
2855 const SimdDouble
VN5(-4.9532491651265819499e-6);
2856 const SimdDouble
VN4(0.00025907400778966060389);
2857 const SimdDouble
VN3(0.0010585044856156469792);
2858 const SimdDouble
VN2(0.045247661136833092885);
2859 const SimdDouble
VN1(0.11643931522926034421);
2860 const SimdDouble
VN0(1.1283791671726767970);
2862 const SimdDouble
VD5(0.000021784709867336150342);
2863 const SimdDouble
VD4(0.00064293662010911388448);
2864 const SimdDouble
VD3(0.0096311444822588683504);
2865 const SimdDouble
VD2(0.085608012351550627051);
2866 const SimdDouble
VD1(0.43652499166614811084);
2867 const SimdDouble
VD0(1.0);
2870 SimdDouble polyVN0
, polyVN1
, polyVD0
, polyVD1
;
2874 polyVD1
= fma(VD5
, z4
, VD3
);
2875 polyVD0
= fma(VD4
, z4
, VD2
);
2876 polyVD1
= fma(polyVD1
, z4
, VD1
);
2877 polyVD0
= fma(polyVD0
, z4
, VD0
);
2878 polyVD0
= fma(polyVD1
, z2
, polyVD0
);
2880 polyVD0
= inv(polyVD0
);
2882 polyVN1
= fma(VN9
, z4
, VN7
);
2883 polyVN0
= fma(VN8
, z4
, VN6
);
2884 polyVN1
= fma(polyVN1
, z4
, VN5
);
2885 polyVN0
= fma(polyVN0
, z4
, VN4
);
2886 polyVN1
= fma(polyVN1
, z4
, VN3
);
2887 polyVN0
= fma(polyVN0
, z4
, VN2
);
2888 polyVN1
= fma(polyVN1
, z4
, VN1
);
2889 polyVN0
= fma(polyVN0
, z4
, VN0
);
2890 polyVN0
= fma(polyVN1
, z2
, polyVN0
);
2892 return polyVN0
* polyVD0
;
2898 /*! \name SIMD math functions for double prec. data, single prec. accuracy
2900 * \note In some cases we do not need full double accuracy of individual
2901 * SIMD math functions, although the data is stored in double precision
2902 * SIMD registers. This might be the case for special algorithms, or
2903 * if the architecture does not support single precision.
2904 * Since the full double precision evaluation of math functions
2905 * typically require much more expensive polynomial approximations
2906 * these functions implement the algorithms used in the single precision
2907 * SIMD math functions, but they operate on double precision
2913 /*********************************************************************
2914 * SIMD MATH FUNCTIONS WITH DOUBLE PREC. DATA, SINGLE PREC. ACCURACY *
2915 *********************************************************************/
2917 /*! \brief Calculate 1/sqrt(x) for SIMD double, but in single accuracy.
2919 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
2920 * GMX_FLOAT_MAX, i.e. within the range of single precision.
2921 * For the single precision implementation this is obviously always
2922 * true for positive values, but for double precision it adds an
2923 * extra restriction since the first lookup step might have to be
2924 * performed in single precision on some architectures. Note that the
2925 * responsibility for checking falls on you - this routine does not
2928 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
2930 static inline SimdDouble gmx_simdcall
2931 invsqrtSingleAccuracy(SimdDouble x
)
2933 SimdDouble lu
= rsqrt(x
);
2934 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
2935 lu
= rsqrtIter(lu
, x
);
2937 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
2938 lu
= rsqrtIter(lu
, x
);
2940 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
2941 lu
= rsqrtIter(lu
, x
);
2946 /*! \brief 1/sqrt(x) for masked-in entries of SIMD double, but in single accuracy.
2948 * This routine only evaluates 1/sqrt(x) for elements for which mask is true.
2949 * Illegal values in the masked-out elements will not lead to
2950 * floating-point exceptions.
2952 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
2953 * GMX_FLOAT_MAX, i.e. within the range of single precision.
2954 * For the single precision implementation this is obviously always
2955 * true for positive values, but for double precision it adds an
2956 * extra restriction since the first lookup step might have to be
2957 * performed in single precision on some architectures. Note that the
2958 * responsibility for checking falls on you - this routine does not
2962 * \return 1/sqrt(x). Result is undefined if your argument was invalid or
2963 * entry was not masked, and 0.0 for masked-out entries.
2965 static inline SimdDouble
2966 maskzInvsqrtSingleAccuracy(SimdDouble x
, SimdDBool m
)
2968 SimdDouble lu
= maskzRsqrt(x
, m
);
2969 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
2970 lu
= rsqrtIter(lu
, x
);
2972 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
2973 lu
= rsqrtIter(lu
, x
);
2975 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
2976 lu
= rsqrtIter(lu
, x
);
2981 /*! \brief Calculate 1/sqrt(x) for two SIMD doubles, but single accuracy.
2983 * \param x0 First set of arguments, x0 must be in single range (see below).
2984 * \param x1 Second set of arguments, x1 must be in single range (see below).
2985 * \param[out] out0 Result 1/sqrt(x0)
2986 * \param[out] out1 Result 1/sqrt(x1)
2988 * In particular for double precision we can sometimes calculate square root
2989 * pairs slightly faster by using single precision until the very last step.
2991 * \note Both arguments must be larger than GMX_FLOAT_MIN and smaller than
2992 * GMX_FLOAT_MAX, i.e. within the range of single precision.
2993 * For the single precision implementation this is obviously always
2994 * true for positive values, but for double precision it adds an
2995 * extra restriction since the first lookup step might have to be
2996 * performed in single precision on some architectures. Note that the
2997 * responsibility for checking falls on you - this routine does not
3000 static inline void gmx_simdcall
3001 invsqrtPairSingleAccuracy(SimdDouble x0
, SimdDouble x1
,
3002 SimdDouble
*out0
, SimdDouble
*out1
)
3004 #if GMX_SIMD_HAVE_FLOAT && (GMX_SIMD_FLOAT_WIDTH == 2*GMX_SIMD_DOUBLE_WIDTH) && (GMX_SIMD_RSQRT_BITS < 22)
3005 SimdFloat xf
= cvtDD2F(x0
, x1
);
3006 SimdFloat luf
= rsqrt(xf
);
3007 SimdDouble lu0
, lu1
;
3008 // Intermediate target is single - mantissa+1 bits
3009 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
3010 luf
= rsqrtIter(luf
, xf
);
3012 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3013 luf
= rsqrtIter(luf
, xf
);
3015 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3016 luf
= rsqrtIter(luf
, xf
);
3018 cvtF2DD(luf
, &lu0
, &lu1
);
3019 // We now have single-precision accuracy values in lu0/lu1
3023 *out0
= invsqrtSingleAccuracy(x0
);
3024 *out1
= invsqrtSingleAccuracy(x1
);
3028 /*! \brief Calculate 1/x for SIMD double, but in single accuracy.
3030 * \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
3031 * GMX_FLOAT_MAX, i.e. within the range of single precision.
3032 * For the single precision implementation this is obviously always
3033 * true for positive values, but for double precision it adds an
3034 * extra restriction since the first lookup step might have to be
3035 * performed in single precision on some architectures. Note that the
3036 * responsibility for checking falls on you - this routine does not
3039 * \return 1/x. Result is undefined if your argument was invalid.
3041 static inline SimdDouble gmx_simdcall
3042 invSingleAccuracy(SimdDouble x
)
3044 SimdDouble lu
= rcp(x
);
3045 #if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
3046 lu
= rcpIter(lu
, x
);
3048 #if (GMX_SIMD_RCP_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3049 lu
= rcpIter(lu
, x
);
3051 #if (GMX_SIMD_RCP_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3052 lu
= rcpIter(lu
, x
);
3057 /*! \brief 1/x for masked entries of SIMD double, single accuracy.
3059 * \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
3060 * GMX_FLOAT_MAX, i.e. within the range of single precision.
3061 * For the single precision implementation this is obviously always
3062 * true for positive values, but for double precision it adds an
3063 * extra restriction since the first lookup step might have to be
3064 * performed in single precision on some architectures. Note that the
3065 * responsibility for checking falls on you - this routine does not
3069 * \return 1/x for elements where m is true, or 0.0 for masked-out entries.
3071 static inline SimdDouble gmx_simdcall
3072 maskzInvSingleAccuracy(SimdDouble x
, SimdDBool m
)
3074 SimdDouble lu
= maskzRcp(x
, m
);
3075 #if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
3076 lu
= rcpIter(lu
, x
);
3078 #if (GMX_SIMD_RCP_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3079 lu
= rcpIter(lu
, x
);
3081 #if (GMX_SIMD_RCP_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3082 lu
= rcpIter(lu
, x
);
3088 /*! \brief Calculate sqrt(x) (correct for 0.0) for SIMD double, with single accuracy.
3090 * \copydetails sqrt(SimdFloat)
3092 template <MathOptimization opt
= MathOptimization::Safe
>
3093 static inline SimdDouble gmx_simdcall
3094 sqrtSingleAccuracy(SimdDouble x
)
3096 if (opt
== MathOptimization::Safe
)
3098 SimdDouble res
= maskzInvsqrt(x
, SimdDouble(GMX_FLOAT_MIN
) < x
);
3103 return x
* invsqrtSingleAccuracy(x
);
3108 /*! \brief SIMD log(x). Double precision SIMD data, single accuracy.
3110 * \param x Argument, should be >0.
3111 * \result The natural logarithm of x. Undefined if argument is invalid.
3113 static inline SimdDouble gmx_simdcall
3114 logSingleAccuracy(SimdDouble x
)
3116 const SimdDouble
one(1.0);
3117 const SimdDouble
two(2.0);
3118 const SimdDouble
sqrt2(std::sqrt(2.0));
3119 const SimdDouble
corr(0.693147180559945286226764);
3120 const SimdDouble
CL9(0.2371599674224853515625);
3121 const SimdDouble
CL7(0.285279005765914916992188);
3122 const SimdDouble
CL5(0.400005519390106201171875);
3123 const SimdDouble
CL3(0.666666567325592041015625);
3124 const SimdDouble
CL1(2.0);
3125 SimdDouble fexp
, x2
, p
;
3129 x
= frexp(x
, &iexp
);
3130 fexp
= cvtI2R(iexp
);
3133 // Adjust to non-IEEE format for x<sqrt(2): exponent -= 1, mantissa *= 2.0
3134 fexp
= fexp
- selectByMask(one
, mask
);
3135 x
= x
* blend(one
, two
, mask
);
3137 x
= (x
- one
) * invSingleAccuracy( x
+ one
);
3140 p
= fma(CL9
, x2
, CL7
);
3141 p
= fma(p
, x2
, CL5
);
3142 p
= fma(p
, x2
, CL3
);
3143 p
= fma(p
, x2
, CL1
);
3144 p
= fma(p
, x
, corr
* fexp
);
3149 /*! \brief SIMD 2^x. Double precision SIMD, single accuracy.
3151 * \copydetails exp2(SimdFloat)
3153 template <MathOptimization opt
= MathOptimization::Safe
>
3154 static inline SimdDouble gmx_simdcall
3155 exp2SingleAccuracy(SimdDouble x
)
3157 const SimdDouble
CC6(0.0001534581200287996416911311);
3158 const SimdDouble
CC5(0.001339993121934088894618990);
3159 const SimdDouble
CC4(0.009618488957115180159497841);
3160 const SimdDouble
CC3(0.05550328776964726865751735);
3161 const SimdDouble
CC2(0.2402264689063408646490722);
3162 const SimdDouble
CC1(0.6931472057372680777553816);
3163 const SimdDouble
one(1.0);
3169 // Large negative values are valid arguments to exp2(), so there are two
3170 // things we need to account for:
3171 // 1. When the exponents reaches -1023, the (biased) exponent field will be
3172 // zero and we can no longer multiply with it. There are special IEEE
3173 // formats to handle this range, but for now we have to accept that
3174 // we cannot handle those arguments. If input value becomes even more
3175 // negative, it will start to loop and we would end up with invalid
3176 // exponents. Thus, we need to limit or mask this.
3177 // 2. For VERY large negative values, we will have problems that the
3178 // subtraction to get the fractional part loses accuracy, and then we
3179 // can end up with overflows in the polynomial.
3181 // For now, we handle this by forwarding the math optimization setting to
3182 // ldexp, where the routine will return zero for very small arguments.
3184 // However, before doing that we need to make sure we do not call cvtR2I
3185 // with an argument that is so negative it cannot be converted to an integer.
3186 if (opt
== MathOptimization::Safe
)
3188 x
= max(x
, SimdDouble(std::numeric_limits
<std::int32_t>::lowest()));
3195 p
= fma(CC6
, x
, CC5
);
3201 x
= ldexp
<opt
>(p
, ix
);
3208 /*! \brief SIMD exp(x). Double precision SIMD, single accuracy.
3210 * \copydetails exp(SimdFloat)
3212 template <MathOptimization opt
= MathOptimization::Safe
>
3213 static inline SimdDouble gmx_simdcall
3214 expSingleAccuracy(SimdDouble x
)
3216 const SimdDouble
argscale(1.44269504088896341);
3217 // Lower bound: Clamp args that would lead to an IEEE fp exponent below -1023.
3218 const SimdDouble
smallArgLimit(-709.0895657128);
3219 const SimdDouble
invargscale(-0.69314718055994528623);
3220 const SimdDouble
CC4(0.00136324646882712841033936);
3221 const SimdDouble
CC3(0.00836596917361021041870117);
3222 const SimdDouble
CC2(0.0416710823774337768554688);
3223 const SimdDouble
CC1(0.166665524244308471679688);
3224 const SimdDouble
CC0(0.499999850988388061523438);
3225 const SimdDouble
one(1.0);
3230 // Large negative values are valid arguments to exp2(), so there are two
3231 // things we need to account for:
3232 // 1. When the exponents reaches -1023, the (biased) exponent field will be
3233 // zero and we can no longer multiply with it. There are special IEEE
3234 // formats to handle this range, but for now we have to accept that
3235 // we cannot handle those arguments. If input value becomes even more
3236 // negative, it will start to loop and we would end up with invalid
3237 // exponents. Thus, we need to limit or mask this.
3238 // 2. For VERY large negative values, we will have problems that the
3239 // subtraction to get the fractional part loses accuracy, and then we
3240 // can end up with overflows in the polynomial.
3242 // For now, we handle this by forwarding the math optimization setting to
3243 // ldexp, where the routine will return zero for very small arguments.
3245 // However, before doing that we need to make sure we do not call cvtR2I
3246 // with an argument that is so negative it cannot be converted to an integer
3247 // after being multiplied by argscale.
3249 if (opt
== MathOptimization::Safe
)
3251 x
= max(x
, SimdDouble(std::numeric_limits
<std::int32_t>::lowest())/argscale
);
3257 intpart
= round(y
); // use same rounding algorithm here
3259 // Extended precision arithmetics not needed since
3260 // we have double precision and only need single accuracy.
3261 x
= fma(invargscale
, intpart
, x
);
3263 p
= fma(CC4
, x
, CC3
);
3269 x
= ldexp
<opt
>(p
, iy
);
3274 /*! \brief SIMD erf(x). Double precision SIMD data, single accuracy.
3276 * \param x The value to calculate erf(x) for.
3279 * This routine achieves very close to single precision, but we do not care about
3280 * the last bit or the subnormal result range.
3282 static inline SimdDouble gmx_simdcall
3283 erfSingleAccuracy(SimdDouble x
)
3285 // Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1]
3286 const SimdDouble
CA6(7.853861353153693e-5);
3287 const SimdDouble
CA5(-8.010193625184903e-4);
3288 const SimdDouble
CA4(5.188327685732524e-3);
3289 const SimdDouble
CA3(-2.685381193529856e-2);
3290 const SimdDouble
CA2(1.128358514861418e-1);
3291 const SimdDouble
CA1(-3.761262582423300e-1);
3292 const SimdDouble
CA0(1.128379165726710);
3293 // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2]
3294 const SimdDouble
CB9(-0.0018629930017603923);
3295 const SimdDouble
CB8(0.003909821287598495);
3296 const SimdDouble
CB7(-0.0052094582210355615);
3297 const SimdDouble
CB6(0.005685614362160572);
3298 const SimdDouble
CB5(-0.0025367682853477272);
3299 const SimdDouble
CB4(-0.010199799682318782);
3300 const SimdDouble
CB3(0.04369575504816542);
3301 const SimdDouble
CB2(-0.11884063474674492);
3302 const SimdDouble
CB1(0.2732120154030589);
3303 const SimdDouble
CB0(0.42758357702025784);
3304 // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19]
3305 const SimdDouble
CC10(-0.0445555913112064);
3306 const SimdDouble
CC9(0.21376355144663348);
3307 const SimdDouble
CC8(-0.3473187200259257);
3308 const SimdDouble
CC7(0.016690861551248114);
3309 const SimdDouble
CC6(0.7560973182491192);
3310 const SimdDouble
CC5(-1.2137903600145787);
3311 const SimdDouble
CC4(0.8411872321232948);
3312 const SimdDouble
CC3(-0.08670413896296343);
3313 const SimdDouble
CC2(-0.27124782687240334);
3314 const SimdDouble
CC1(-0.0007502488047806069);
3315 const SimdDouble
CC0(0.5642114853803148);
3316 const SimdDouble
one(1.0);
3317 const SimdDouble
two(2.0);
3319 SimdDouble x2
, x4
, y
;
3320 SimdDouble t
, t2
, w
, w2
;
3321 SimdDouble pA0
, pA1
, pB0
, pB1
, pC0
, pC1
;
3323 SimdDouble res_erf
, res_erfc
, res
;
3324 SimdDBool mask
, msk_erf
;
3330 pA0
= fma(CA6
, x4
, CA4
);
3331 pA1
= fma(CA5
, x4
, CA3
);
3332 pA0
= fma(pA0
, x4
, CA2
);
3333 pA1
= fma(pA1
, x4
, CA1
);
3335 pA0
= fma(pA1
, x2
, pA0
);
3336 // Constant term must come last for precision reasons
3343 msk_erf
= (SimdDouble(0.75) <= y
);
3344 t
= maskzInvSingleAccuracy(y
, msk_erf
);
3349 expmx2
= expSingleAccuracy( -y
*y
);
3351 pB1
= fma(CB9
, w2
, CB7
);
3352 pB0
= fma(CB8
, w2
, CB6
);
3353 pB1
= fma(pB1
, w2
, CB5
);
3354 pB0
= fma(pB0
, w2
, CB4
);
3355 pB1
= fma(pB1
, w2
, CB3
);
3356 pB0
= fma(pB0
, w2
, CB2
);
3357 pB1
= fma(pB1
, w2
, CB1
);
3358 pB0
= fma(pB0
, w2
, CB0
);
3359 pB0
= fma(pB1
, w
, pB0
);
3361 pC0
= fma(CC10
, t2
, CC8
);
3362 pC1
= fma(CC9
, t2
, CC7
);
3363 pC0
= fma(pC0
, t2
, CC6
);
3364 pC1
= fma(pC1
, t2
, CC5
);
3365 pC0
= fma(pC0
, t2
, CC4
);
3366 pC1
= fma(pC1
, t2
, CC3
);
3367 pC0
= fma(pC0
, t2
, CC2
);
3368 pC1
= fma(pC1
, t2
, CC1
);
3370 pC0
= fma(pC0
, t2
, CC0
);
3371 pC0
= fma(pC1
, t
, pC0
);
3374 // Select pB0 or pC0 for erfc()
3376 res_erfc
= blend(pB0
, pC0
, mask
);
3377 res_erfc
= res_erfc
* expmx2
;
3379 // erfc(x<0) = 2-erfc(|x|)
3380 mask
= (x
< setZero());
3381 res_erfc
= blend(res_erfc
, two
- res_erfc
, mask
);
3383 // Select erf() or erfc()
3384 mask
= (y
< SimdDouble(0.75));
3385 res
= blend(one
- res_erfc
, res_erf
, mask
);
3390 /*! \brief SIMD erfc(x). Double precision SIMD data, single accuracy.
3392 * \param x The value to calculate erfc(x) for.
3395 * This routine achieves singleprecision (bar the last bit) over most of the
3396 * input range, but for large arguments where the result is getting close
3397 * to the minimum representable numbers we accept slightly larger errors
3398 * (think results that are in the ballpark of 10^-30) since that is not
3401 static inline SimdDouble gmx_simdcall
3402 erfcSingleAccuracy(SimdDouble x
)
3404 // Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1]
3405 const SimdDouble
CA6(7.853861353153693e-5);
3406 const SimdDouble
CA5(-8.010193625184903e-4);
3407 const SimdDouble
CA4(5.188327685732524e-3);
3408 const SimdDouble
CA3(-2.685381193529856e-2);
3409 const SimdDouble
CA2(1.128358514861418e-1);
3410 const SimdDouble
CA1(-3.761262582423300e-1);
3411 const SimdDouble
CA0(1.128379165726710);
3412 // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2]
3413 const SimdDouble
CB9(-0.0018629930017603923);
3414 const SimdDouble
CB8(0.003909821287598495);
3415 const SimdDouble
CB7(-0.0052094582210355615);
3416 const SimdDouble
CB6(0.005685614362160572);
3417 const SimdDouble
CB5(-0.0025367682853477272);
3418 const SimdDouble
CB4(-0.010199799682318782);
3419 const SimdDouble
CB3(0.04369575504816542);
3420 const SimdDouble
CB2(-0.11884063474674492);
3421 const SimdDouble
CB1(0.2732120154030589);
3422 const SimdDouble
CB0(0.42758357702025784);
3423 // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19]
3424 const SimdDouble
CC10(-0.0445555913112064);
3425 const SimdDouble
CC9(0.21376355144663348);
3426 const SimdDouble
CC8(-0.3473187200259257);
3427 const SimdDouble
CC7(0.016690861551248114);
3428 const SimdDouble
CC6(0.7560973182491192);
3429 const SimdDouble
CC5(-1.2137903600145787);
3430 const SimdDouble
CC4(0.8411872321232948);
3431 const SimdDouble
CC3(-0.08670413896296343);
3432 const SimdDouble
CC2(-0.27124782687240334);
3433 const SimdDouble
CC1(-0.0007502488047806069);
3434 const SimdDouble
CC0(0.5642114853803148);
3435 const SimdDouble
one(1.0);
3436 const SimdDouble
two(2.0);
3438 SimdDouble x2
, x4
, y
;
3439 SimdDouble t
, t2
, w
, w2
;
3440 SimdDouble pA0
, pA1
, pB0
, pB1
, pC0
, pC1
;
3442 SimdDouble res_erf
, res_erfc
, res
;
3443 SimdDBool mask
, msk_erf
;
3449 pA0
= fma(CA6
, x4
, CA4
);
3450 pA1
= fma(CA5
, x4
, CA3
);
3451 pA0
= fma(pA0
, x4
, CA2
);
3452 pA1
= fma(pA1
, x4
, CA1
);
3454 pA0
= fma(pA0
, x4
, pA1
);
3455 // Constant term must come last for precision reasons
3462 msk_erf
= (SimdDouble(0.75) <= y
);
3463 t
= maskzInvSingleAccuracy(y
, msk_erf
);
3468 expmx2
= expSingleAccuracy( -y
*y
);
3470 pB1
= fma(CB9
, w2
, CB7
);
3471 pB0
= fma(CB8
, w2
, CB6
);
3472 pB1
= fma(pB1
, w2
, CB5
);
3473 pB0
= fma(pB0
, w2
, CB4
);
3474 pB1
= fma(pB1
, w2
, CB3
);
3475 pB0
= fma(pB0
, w2
, CB2
);
3476 pB1
= fma(pB1
, w2
, CB1
);
3477 pB0
= fma(pB0
, w2
, CB0
);
3478 pB0
= fma(pB1
, w
, pB0
);
3480 pC0
= fma(CC10
, t2
, CC8
);
3481 pC1
= fma(CC9
, t2
, CC7
);
3482 pC0
= fma(pC0
, t2
, CC6
);
3483 pC1
= fma(pC1
, t2
, CC5
);
3484 pC0
= fma(pC0
, t2
, CC4
);
3485 pC1
= fma(pC1
, t2
, CC3
);
3486 pC0
= fma(pC0
, t2
, CC2
);
3487 pC1
= fma(pC1
, t2
, CC1
);
3489 pC0
= fma(pC0
, t2
, CC0
);
3490 pC0
= fma(pC1
, t
, pC0
);
3493 // Select pB0 or pC0 for erfc()
3495 res_erfc
= blend(pB0
, pC0
, mask
);
3496 res_erfc
= res_erfc
* expmx2
;
3498 // erfc(x<0) = 2-erfc(|x|)
3499 mask
= (x
< setZero());
3500 res_erfc
= blend(res_erfc
, two
- res_erfc
, mask
);
3502 // Select erf() or erfc()
3503 mask
= (y
< SimdDouble(0.75));
3504 res
= blend(res_erfc
, one
- res_erf
, mask
);
3509 /*! \brief SIMD sin \& cos. Double precision SIMD data, single accuracy.
3511 * \param x The argument to evaluate sin/cos for
3512 * \param[out] sinval Sin(x)
3513 * \param[out] cosval Cos(x)
3515 static inline void gmx_simdcall
3516 sinCosSingleAccuracy(SimdDouble x
, SimdDouble
*sinval
, SimdDouble
*cosval
)
3518 // Constants to subtract Pi/4*x from y while minimizing precision loss
3519 const SimdDouble
argred0(2*0.78539816290140151978);
3520 const SimdDouble
argred1(2*4.9604678871439933374e-10);
3521 const SimdDouble
argred2(2*1.1258708853173288931e-18);
3522 const SimdDouble
two_over_pi(2.0/M_PI
);
3523 const SimdDouble
const_sin2(-1.9515295891e-4);
3524 const SimdDouble
const_sin1( 8.3321608736e-3);
3525 const SimdDouble
const_sin0(-1.6666654611e-1);
3526 const SimdDouble
const_cos2( 2.443315711809948e-5);
3527 const SimdDouble
const_cos1(-1.388731625493765e-3);
3528 const SimdDouble
const_cos0( 4.166664568298827e-2);
3530 const SimdDouble
half(0.5);
3531 const SimdDouble
one(1.0);
3532 SimdDouble ssign
, csign
;
3533 SimdDouble x2
, y
, z
, psin
, pcos
, sss
, ccc
;
3536 #if GMX_SIMD_HAVE_DINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
3537 const SimdDInt32
ione(1);
3538 const SimdDInt32
itwo(2);
3541 z
= x
* two_over_pi
;
3545 mask
= cvtIB2B((iy
& ione
) == setZero());
3546 ssign
= selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO
), cvtIB2B((iy
& itwo
) == itwo
));
3547 csign
= selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO
), cvtIB2B(((iy
+ ione
) & itwo
) == itwo
));
3549 const SimdDouble
quarter(0.25);
3550 const SimdDouble
minusquarter(-0.25);
3552 SimdDBool m1
, m2
, m3
;
3554 /* The most obvious way to find the arguments quadrant in the unit circle
3555 * to calculate the sign is to use integer arithmetic, but that is not
3556 * present in all SIMD implementations. As an alternative, we have devised a
3557 * pure floating-point algorithm that uses truncation for argument reduction
3558 * so that we get a new value 0<=q<1 over the unit circle, and then
3559 * do floating-point comparisons with fractions. This is likely to be
3560 * slightly slower (~10%) due to the longer latencies of floating-point, so
3561 * we only use it when integer SIMD arithmetic is not present.
3565 // It is critical that half-way cases are rounded down
3566 z
= fma(x
, two_over_pi
, half
);
3570 /* z now starts at 0.0 for x=-pi/4 (although neg. values cannot occur), and
3571 * then increased by 1.0 as x increases by 2*Pi, when it resets to 0.0.
3572 * This removes the 2*Pi periodicity without using any integer arithmetic.
3573 * First check if y had the value 2 or 3, set csign if true.
3576 /* If we have logical operations we can work directly on the signbit, which
3577 * saves instructions. Otherwise we need to represent signs as +1.0/-1.0.
3578 * Thus, if you are altering defines to debug alternative code paths, the
3579 * two GMX_SIMD_HAVE_LOGICAL sections in this routine must either both be
3580 * active or inactive - you will get errors if only one is used.
3582 # if GMX_SIMD_HAVE_LOGICAL
3583 ssign
= ssign
& SimdDouble(GMX_DOUBLE_NEGZERO
);
3584 csign
= andNot(q
, SimdDouble(GMX_DOUBLE_NEGZERO
));
3585 ssign
= ssign
^ csign
;
3587 ssign
= copysign(SimdDouble(1.0), ssign
);
3588 csign
= copysign(SimdDouble(1.0), q
);
3590 ssign
= ssign
* csign
; // swap ssign if csign was set.
3592 // Check if y had value 1 or 3 (remember we subtracted 0.5 from q)
3593 m1
= (q
< minusquarter
);
3594 m2
= (setZero() <= q
);
3598 // where mask is FALSE, swap sign.
3599 csign
= csign
* blend(SimdDouble(-1.0), one
, mask
);
3601 x
= fnma(y
, argred0
, x
);
3602 x
= fnma(y
, argred1
, x
);
3603 x
= fnma(y
, argred2
, x
);
3606 psin
= fma(const_sin2
, x2
, const_sin1
);
3607 psin
= fma(psin
, x2
, const_sin0
);
3608 psin
= fma(psin
, x
* x2
, x
);
3609 pcos
= fma(const_cos2
, x2
, const_cos1
);
3610 pcos
= fma(pcos
, x2
, const_cos0
);
3611 pcos
= fms(pcos
, x2
, half
);
3612 pcos
= fma(pcos
, x2
, one
);
3614 sss
= blend(pcos
, psin
, mask
);
3615 ccc
= blend(psin
, pcos
, mask
);
3616 // See comment for GMX_SIMD_HAVE_LOGICAL section above.
3617 #if GMX_SIMD_HAVE_LOGICAL
3618 *sinval
= sss
^ ssign
;
3619 *cosval
= ccc
^ csign
;
3621 *sinval
= sss
* ssign
;
3622 *cosval
= ccc
* csign
;
3626 /*! \brief SIMD sin(x). Double precision SIMD data, single accuracy.
3628 * \param x The argument to evaluate sin for
3631 * \attention Do NOT call both sin & cos if you need both results, since each of them
3632 * will then call \ref sincos and waste a factor 2 in performance.
3634 static inline SimdDouble gmx_simdcall
3635 sinSingleAccuracy(SimdDouble x
)
3638 sinCosSingleAccuracy(x
, &s
, &c
);
3642 /*! \brief SIMD cos(x). Double precision SIMD data, single accuracy.
3644 * \param x The argument to evaluate cos for
3647 * \attention Do NOT call both sin & cos if you need both results, since each of them
3648 * will then call \ref sincos and waste a factor 2 in performance.
3650 static inline SimdDouble gmx_simdcall
3651 cosSingleAccuracy(SimdDouble x
)
3654 sinCosSingleAccuracy(x
, &s
, &c
);
3658 /*! \brief SIMD tan(x). Double precision SIMD data, single accuracy.
3660 * \param x The argument to evaluate tan for
3663 static inline SimdDouble gmx_simdcall
3664 tanSingleAccuracy(SimdDouble x
)
3666 const SimdDouble
argred0(2*0.78539816290140151978);
3667 const SimdDouble
argred1(2*4.9604678871439933374e-10);
3668 const SimdDouble
argred2(2*1.1258708853173288931e-18);
3669 const SimdDouble
two_over_pi(2.0/M_PI
);
3670 const SimdDouble
CT6(0.009498288995810566122993911);
3671 const SimdDouble
CT5(0.002895755790837379295226923);
3672 const SimdDouble
CT4(0.02460087336161924491836265);
3673 const SimdDouble
CT3(0.05334912882656359828045988);
3674 const SimdDouble
CT2(0.1333989091464957704418495);
3675 const SimdDouble
CT1(0.3333307599244198227797507);
3677 SimdDouble x2
, p
, y
, z
;
3680 #if GMX_SIMD_HAVE_DINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
3684 z
= x
* two_over_pi
;
3687 mask
= cvtIB2B((iy
& ione
) == ione
);
3689 x
= fnma(y
, argred0
, x
);
3690 x
= fnma(y
, argred1
, x
);
3691 x
= fnma(y
, argred2
, x
);
3692 x
= selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO
), mask
) ^ x
;
3694 const SimdDouble
quarter(0.25);
3695 const SimdDouble
half(0.5);
3696 const SimdDouble
threequarter(0.75);
3698 SimdDBool m1
, m2
, m3
;
3701 z
= fma(w
, two_over_pi
, half
);
3705 m1
= (quarter
<= q
);
3707 m3
= (threequarter
<= q
);
3710 w
= fnma(y
, argred0
, w
);
3711 w
= fnma(y
, argred1
, w
);
3712 w
= fnma(y
, argred2
, w
);
3714 w
= blend(w
, -w
, mask
);
3715 x
= w
* copysign( SimdDouble(1.0), x
);
3718 p
= fma(CT6
, x2
, CT5
);
3719 p
= fma(p
, x2
, CT4
);
3720 p
= fma(p
, x2
, CT3
);
3721 p
= fma(p
, x2
, CT2
);
3722 p
= fma(p
, x2
, CT1
);
3723 p
= fma(x2
, p
* x
, x
);
3725 p
= blend( p
, maskzInvSingleAccuracy(p
, mask
), mask
);
3729 /*! \brief SIMD asin(x). Double precision SIMD data, single accuracy.
3731 * \param x The argument to evaluate asin for
3734 static inline SimdDouble gmx_simdcall
3735 asinSingleAccuracy(SimdDouble x
)
3737 const SimdDouble
limitlow(1e-4);
3738 const SimdDouble
half(0.5);
3739 const SimdDouble
one(1.0);
3740 const SimdDouble
halfpi(M_PI
/2.0);
3741 const SimdDouble
CC5(4.2163199048E-2);
3742 const SimdDouble
CC4(2.4181311049E-2);
3743 const SimdDouble
CC3(4.5470025998E-2);
3744 const SimdDouble
CC2(7.4953002686E-2);
3745 const SimdDouble
CC1(1.6666752422E-1);
3747 SimdDouble z
, z1
, z2
, q
, q1
, q2
;
3749 SimdDBool mask
, mask2
;
3752 mask
= (half
< xabs
);
3753 z1
= half
* (one
- xabs
);
3754 mask2
= (xabs
< one
);
3755 q1
= z1
* maskzInvsqrtSingleAccuracy(z1
, mask2
);
3758 z
= blend(z2
, z1
, mask
);
3759 q
= blend(q2
, q1
, mask
);
3762 pA
= fma(CC5
, z2
, CC3
);
3763 pB
= fma(CC4
, z2
, CC2
);
3764 pA
= fma(pA
, z2
, CC1
);
3766 z
= fma(pB
, z2
, pA
);
3770 z
= blend(z
, q2
, mask
);
3772 mask
= (limitlow
< xabs
);
3773 z
= blend( xabs
, z
, mask
);
3779 /*! \brief SIMD acos(x). Double precision SIMD data, single accuracy.
3781 * \param x The argument to evaluate acos for
3784 static inline SimdDouble gmx_simdcall
3785 acosSingleAccuracy(SimdDouble x
)
3787 const SimdDouble
one(1.0);
3788 const SimdDouble
half(0.5);
3789 const SimdDouble
pi(M_PI
);
3790 const SimdDouble
halfpi(M_PI
/2.0);
3792 SimdDouble z
, z1
, z2
, z3
;
3793 SimdDBool mask1
, mask2
, mask3
;
3796 mask1
= (half
< xabs
);
3797 mask2
= (setZero() < x
);
3799 z
= half
* (one
- xabs
);
3800 mask3
= (xabs
< one
);
3801 z
= z
* maskzInvsqrtSingleAccuracy(z
, mask3
);
3802 z
= blend(x
, z
, mask1
);
3803 z
= asinSingleAccuracy(z
);
3808 z
= blend(z1
, z2
, mask2
);
3809 z
= blend(z3
, z
, mask1
);
3814 /*! \brief SIMD asin(x). Double precision SIMD data, single accuracy.
3816 * \param x The argument to evaluate atan for
3817 * \result Atan(x), same argument/value range as standard math library.
3819 static inline SimdDouble gmx_simdcall
3820 atanSingleAccuracy(SimdDouble x
)
3822 const SimdDouble
halfpi(M_PI
/2);
3823 const SimdDouble
CA17(0.002823638962581753730774);
3824 const SimdDouble
CA15(-0.01595690287649631500244);
3825 const SimdDouble
CA13(0.04250498861074447631836);
3826 const SimdDouble
CA11(-0.07489009201526641845703);
3827 const SimdDouble
CA9(0.1063479334115982055664);
3828 const SimdDouble
CA7(-0.1420273631811141967773);
3829 const SimdDouble
CA5(0.1999269574880599975585);
3830 const SimdDouble
CA3(-0.3333310186862945556640);
3831 SimdDouble x2
, x3
, x4
, pA
, pB
;
3832 SimdDBool mask
, mask2
;
3834 mask
= (x
< setZero());
3836 mask2
= (SimdDouble(1.0) < x
);
3837 x
= blend(x
, maskzInvSingleAccuracy(x
, mask2
), mask2
);
3842 pA
= fma(CA17
, x4
, CA13
);
3843 pB
= fma(CA15
, x4
, CA11
);
3844 pA
= fma(pA
, x4
, CA9
);
3845 pB
= fma(pB
, x4
, CA7
);
3846 pA
= fma(pA
, x4
, CA5
);
3847 pB
= fma(pB
, x4
, CA3
);
3848 pA
= fma(pA
, x2
, pB
);
3849 pA
= fma(pA
, x3
, x
);
3851 pA
= blend(pA
, halfpi
- pA
, mask2
);
3852 pA
= blend(pA
, -pA
, mask
);
3857 /*! \brief SIMD atan2(y,x). Double precision SIMD data, single accuracy.
3859 * \param y Y component of vector, any quartile
3860 * \param x X component of vector, any quartile
3861 * \result Atan(y,x), same argument/value range as standard math library.
3863 * \note This routine should provide correct results for all finite
3864 * non-zero or positive-zero arguments. However, negative zero arguments will
3865 * be treated as positive zero, which means the return value will deviate from
3866 * the standard math library atan2(y,x) for those cases. That should not be
3867 * of any concern in Gromacs, and in particular it will not affect calculations
3868 * of angles from vectors.
3870 static inline SimdDouble gmx_simdcall
3871 atan2SingleAccuracy(SimdDouble y
, SimdDouble x
)
3873 const SimdDouble
pi(M_PI
);
3874 const SimdDouble
halfpi(M_PI
/2.0);
3875 SimdDouble xinv
, p
, aoffset
;
3876 SimdDBool mask_xnz
, mask_ynz
, mask_xlt0
, mask_ylt0
;
3878 mask_xnz
= x
!= setZero();
3879 mask_ynz
= y
!= setZero();
3880 mask_xlt0
= (x
< setZero());
3881 mask_ylt0
= (y
< setZero());
3883 aoffset
= selectByNotMask(halfpi
, mask_xnz
);
3884 aoffset
= selectByMask(aoffset
, mask_ynz
);
3886 aoffset
= blend(aoffset
, pi
, mask_xlt0
);
3887 aoffset
= blend(aoffset
, -aoffset
, mask_ylt0
);
3889 xinv
= maskzInvSingleAccuracy(x
, mask_xnz
);
3891 p
= atanSingleAccuracy(p
);
3897 /*! \brief Analytical PME force correction, double SIMD data, single accuracy.
3899 * \param z2 \f$(r \beta)^2\f$ - see below for details.
3900 * \result Correction factor to coulomb force - see below for details.
3902 * This routine is meant to enable analytical evaluation of the
3903 * direct-space PME electrostatic force to avoid tables.
3905 * The direct-space potential should be \f$ \mbox{erfc}(\beta r)/r\f$, but there
3906 * are some problems evaluating that:
3908 * First, the error function is difficult (read: expensive) to
3909 * approxmiate accurately for intermediate to large arguments, and
3910 * this happens already in ranges of \f$(\beta r)\f$ that occur in simulations.
3911 * Second, we now try to avoid calculating potentials in Gromacs but
3912 * use forces directly.
3914 * We can simply things slight by noting that the PME part is really
3915 * a correction to the normal Coulomb force since \f$\mbox{erfc}(z)=1-\mbox{erf}(z)\f$, i.e.
3917 * V = \frac{1}{r} - \frac{\mbox{erf}(\beta r)}{r}
3919 * The first term we already have from the inverse square root, so
3920 * that we can leave out of this routine.
3922 * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
3923 * the argument \f$beta r\f$ will be in the range 0.15 to ~4. Use your
3924 * favorite plotting program to realize how well-behaved \f$\frac{\mbox{erf}(z)}{z}\f$ is
3927 * We approximate \f$f(z)=\mbox{erf}(z)/z\f$ with a rational minimax polynomial.
3928 * However, it turns out it is more efficient to approximate \f$f(z)/z\f$ and
3929 * then only use even powers. This is another minor optimization, since
3930 * we actually \a want \f$f(z)/z\f$, because it is going to be multiplied by
3931 * the vector between the two atoms to get the vectorial force. The
3932 * fastest flops are the ones we can avoid calculating!
3934 * So, here's how it should be used:
3936 * 1. Calculate \f$r^2\f$.
3937 * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=(\beta r)^2\f$.
3938 * 3. Evaluate this routine with \f$z^2\f$ as the argument.
3939 * 4. The return value is the expression:
3942 * \frac{2 \exp{-z^2}}{\sqrt{\pi} z^2}-\frac{\mbox{erf}(z)}{z^3}
3945 * 5. Multiply the entire expression by \f$\beta^3\f$. This will get you
3948 * \frac{2 \beta^3 \exp(-z^2)}{\sqrt{\pi} z^2} - \frac{\beta^3 \mbox{erf}(z)}{z^3}
3951 * or, switching back to \f$r\f$ (since \f$z=r \beta\f$):
3954 * \frac{2 \beta \exp(-r^2 \beta^2)}{\sqrt{\pi} r^2} - \frac{\mbox{erf}(r \beta)}{r^3}
3957 * With a bit of math exercise you should be able to confirm that
3961 * \frac{\frac{d}{dr}\left( \frac{\mbox{erf}(\beta r)}{r} \right)}{r}
3964 * 6. Add the result to \f$r^{-3}\f$, multiply by the product of the charges,
3965 * and you have your force (divided by \f$r\f$). A final multiplication
3966 * with the vector connecting the two particles and you have your
3967 * vectorial force to add to the particles.
3969 * This approximation achieves an accuracy slightly lower than 1e-6; when
3970 * added to \f$1/r\f$ the error will be insignificant.
3973 static inline SimdDouble gmx_simdcall
3974 pmeForceCorrectionSingleAccuracy(SimdDouble z2
)
3976 const SimdDouble
FN6(-1.7357322914161492954e-8);
3977 const SimdDouble
FN5(1.4703624142580877519e-6);
3978 const SimdDouble
FN4(-0.000053401640219807709149);
3979 const SimdDouble
FN3(0.0010054721316683106153);
3980 const SimdDouble
FN2(-0.019278317264888380590);
3981 const SimdDouble
FN1(0.069670166153766424023);
3982 const SimdDouble
FN0(-0.75225204789749321333);
3984 const SimdDouble
FD4(0.0011193462567257629232);
3985 const SimdDouble
FD3(0.014866955030185295499);
3986 const SimdDouble
FD2(0.11583842382862377919);
3987 const SimdDouble
FD1(0.50736591960530292870);
3988 const SimdDouble
FD0(1.0);
3991 SimdDouble polyFN0
, polyFN1
, polyFD0
, polyFD1
;
3995 polyFD0
= fma(FD4
, z4
, FD2
);
3996 polyFD1
= fma(FD3
, z4
, FD1
);
3997 polyFD0
= fma(polyFD0
, z4
, FD0
);
3998 polyFD0
= fma(polyFD1
, z2
, polyFD0
);
4000 polyFD0
= invSingleAccuracy(polyFD0
);
4002 polyFN0
= fma(FN6
, z4
, FN4
);
4003 polyFN1
= fma(FN5
, z4
, FN3
);
4004 polyFN0
= fma(polyFN0
, z4
, FN2
);
4005 polyFN1
= fma(polyFN1
, z4
, FN1
);
4006 polyFN0
= fma(polyFN0
, z4
, FN0
);
4007 polyFN0
= fma(polyFN1
, z2
, polyFN0
);
4009 return polyFN0
* polyFD0
;
4014 /*! \brief Analytical PME potential correction, double SIMD data, single accuracy.
4016 * \param z2 \f$(r \beta)^2\f$ - see below for details.
4017 * \result Correction factor to coulomb potential - see below for details.
4019 * This routine calculates \f$\mbox{erf}(z)/z\f$, although you should provide \f$z^2\f$
4020 * as the input argument.
4022 * Here's how it should be used:
4024 * 1. Calculate \f$r^2\f$.
4025 * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=\beta^2*r^2\f$.
4026 * 3. Evaluate this routine with z^2 as the argument.
4027 * 4. The return value is the expression:
4030 * \frac{\mbox{erf}(z)}{z}
4033 * 5. Multiply the entire expression by beta and switching back to \f$r\f$ (since \f$z=r \beta\f$):
4036 * \frac{\mbox{erf}(r \beta)}{r}
4039 * 6. Subtract the result from \f$1/r\f$, multiply by the product of the charges,
4040 * and you have your potential.
4042 * This approximation achieves an accuracy slightly lower than 1e-6; when
4043 * added to \f$1/r\f$ the error will be insignificant.
4045 static inline SimdDouble gmx_simdcall
4046 pmePotentialCorrectionSingleAccuracy(SimdDouble z2
)
4048 const SimdDouble
VN6(1.9296833005951166339e-8);
4049 const SimdDouble
VN5(-1.4213390571557850962e-6);
4050 const SimdDouble
VN4(0.000041603292906656984871);
4051 const SimdDouble
VN3(-0.00013134036773265025626);
4052 const SimdDouble
VN2(0.038657983986041781264);
4053 const SimdDouble
VN1(0.11285044772717598220);
4054 const SimdDouble
VN0(1.1283802385263030286);
4056 const SimdDouble
VD3(0.0066752224023576045451);
4057 const SimdDouble
VD2(0.078647795836373922256);
4058 const SimdDouble
VD1(0.43336185284710920150);
4059 const SimdDouble
VD0(1.0);
4062 SimdDouble polyVN0
, polyVN1
, polyVD0
, polyVD1
;
4066 polyVD1
= fma(VD3
, z4
, VD1
);
4067 polyVD0
= fma(VD2
, z4
, VD0
);
4068 polyVD0
= fma(polyVD1
, z2
, polyVD0
);
4070 polyVD0
= invSingleAccuracy(polyVD0
);
4072 polyVN0
= fma(VN6
, z4
, VN4
);
4073 polyVN1
= fma(VN5
, z4
, VN3
);
4074 polyVN0
= fma(polyVN0
, z4
, VN2
);
4075 polyVN1
= fma(polyVN1
, z4
, VN1
);
4076 polyVN0
= fma(polyVN0
, z4
, VN0
);
4077 polyVN0
= fma(polyVN1
, z2
, polyVN0
);
4079 return polyVN0
* polyVD0
;
4085 /*! \name SIMD4 math functions
4087 * \note Only a subset of the math functions are implemented for SIMD4.
4092 #if GMX_SIMD4_HAVE_FLOAT
4094 /*************************************************************************
4095 * SINGLE PRECISION SIMD4 MATH FUNCTIONS - JUST A SMALL SUBSET SUPPORTED *
4096 *************************************************************************/
4098 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD4 float.
4100 * This is a low-level routine that should only be used by SIMD math routine
4101 * that evaluates the inverse square root.
4103 * \param lu Approximation of 1/sqrt(x), typically obtained from lookup.
4104 * \param x The reference (starting) value x for which we want 1/sqrt(x).
4105 * \return An improved approximation with roughly twice as many bits of accuracy.
4107 static inline Simd4Float gmx_simdcall
4108 rsqrtIter(Simd4Float lu
, Simd4Float x
)
4110 Simd4Float tmp1
= x
*lu
;
4111 Simd4Float tmp2
= Simd4Float(-0.5f
)*lu
;
4112 tmp1
= fma(tmp1
, lu
, Simd4Float(-3.0f
));
4116 /*! \brief Calculate 1/sqrt(x) for SIMD4 float.
4118 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
4119 * GMX_FLOAT_MAX, i.e. within the range of single precision.
4120 * For the single precision implementation this is obviously always
4121 * true for positive values, but for double precision it adds an
4122 * extra restriction since the first lookup step might have to be
4123 * performed in single precision on some architectures. Note that the
4124 * responsibility for checking falls on you - this routine does not
4126 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
4128 static inline Simd4Float gmx_simdcall
4129 invsqrt(Simd4Float x
)
4131 Simd4Float lu
= rsqrt(x
);
4132 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
4133 lu
= rsqrtIter(lu
, x
);
4135 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
4136 lu
= rsqrtIter(lu
, x
);
4138 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
4139 lu
= rsqrtIter(lu
, x
);
4145 #endif // GMX_SIMD4_HAVE_FLOAT
4149 #if GMX_SIMD4_HAVE_DOUBLE
4150 /*************************************************************************
4151 * DOUBLE PRECISION SIMD4 MATH FUNCTIONS - JUST A SMALL SUBSET SUPPORTED *
4152 *************************************************************************/
4154 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD4 double.
4156 * This is a low-level routine that should only be used by SIMD math routine
4157 * that evaluates the inverse square root.
4159 * \param lu Approximation of 1/sqrt(x), typically obtained from lookup.
4160 * \param x The reference (starting) value x for which we want 1/sqrt(x).
4161 * \return An improved approximation with roughly twice as many bits of accuracy.
4163 static inline Simd4Double gmx_simdcall
4164 rsqrtIter(Simd4Double lu
, Simd4Double x
)
4166 Simd4Double tmp1
= x
*lu
;
4167 Simd4Double tmp2
= Simd4Double(-0.5f
)*lu
;
4168 tmp1
= fma(tmp1
, lu
, Simd4Double(-3.0f
));
4172 /*! \brief Calculate 1/sqrt(x) for SIMD4 double.
4174 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
4175 * GMX_FLOAT_MAX, i.e. within the range of single precision.
4176 * For the single precision implementation this is obviously always
4177 * true for positive values, but for double precision it adds an
4178 * extra restriction since the first lookup step might have to be
4179 * performed in single precision on some architectures. Note that the
4180 * responsibility for checking falls on you - this routine does not
4182 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
4184 static inline Simd4Double gmx_simdcall
4185 invsqrt(Simd4Double x
)
4187 Simd4Double lu
= rsqrt(x
);
4188 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
4189 lu
= rsqrtIter(lu
, x
);
4191 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
4192 lu
= rsqrtIter(lu
, x
);
4194 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
4195 lu
= rsqrtIter(lu
, x
);
4197 #if (GMX_SIMD_RSQRT_BITS*8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
4198 lu
= rsqrtIter(lu
, x
);
4204 /**********************************************************************
4205 * SIMD4 MATH FUNCTIONS WITH DOUBLE PREC. DATA, SINGLE PREC. ACCURACY *
4206 **********************************************************************/
4208 /*! \brief Calculate 1/sqrt(x) for SIMD4 double, but in single accuracy.
4210 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
4211 * GMX_FLOAT_MAX, i.e. within the range of single precision.
4212 * For the single precision implementation this is obviously always
4213 * true for positive values, but for double precision it adds an
4214 * extra restriction since the first lookup step might have to be
4215 * performed in single precision on some architectures. Note that the
4216 * responsibility for checking falls on you - this routine does not
4218 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
4220 static inline Simd4Double gmx_simdcall
4221 invsqrtSingleAccuracy(Simd4Double x
)
4223 Simd4Double lu
= rsqrt(x
);
4224 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
4225 lu
= rsqrtIter(lu
, x
);
4227 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
4228 lu
= rsqrtIter(lu
, x
);
4230 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
4231 lu
= rsqrtIter(lu
, x
);
4238 #endif // GMX_SIMD4_HAVE_DOUBLE
4242 #if GMX_SIMD_HAVE_FLOAT
4243 /*! \brief Calculate 1/sqrt(x) for SIMD float, only targeting single accuracy.
4245 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
4246 * GMX_FLOAT_MAX, i.e. within the range of single precision.
4247 * For the single precision implementation this is obviously always
4248 * true for positive values, but for double precision it adds an
4249 * extra restriction since the first lookup step might have to be
4250 * performed in single precision on some architectures. Note that the
4251 * responsibility for checking falls on you - this routine does not
4253 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
4255 static inline SimdFloat gmx_simdcall
4256 invsqrtSingleAccuracy(SimdFloat x
)
4261 /*! \brief Calculate 1/sqrt(x) for masked SIMD floats, only targeting single accuracy.
4263 * This routine only evaluates 1/sqrt(x) for elements for which mask is true.
4264 * Illegal values in the masked-out elements will not lead to
4265 * floating-point exceptions.
4267 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
4268 * GMX_FLOAT_MAX, i.e. within the range of single precision.
4269 * For the single precision implementation this is obviously always
4270 * true for positive values, but for double precision it adds an
4271 * extra restriction since the first lookup step might have to be
4272 * performed in single precision on some architectures. Note that the
4273 * responsibility for checking falls on you - this routine does not
4276 * \return 1/sqrt(x). Result is undefined if your argument was invalid or
4277 * entry was not masked, and 0.0 for masked-out entries.
4279 static inline SimdFloat
4280 maskzInvsqrtSingleAccuracy(SimdFloat x
, SimdFBool m
)
4282 return maskzInvsqrt(x
, m
);
4285 /*! \brief Calculate 1/sqrt(x) for two SIMD floats, only targeting single accuracy.
4287 * \param x0 First set of arguments, x0 must be in single range (see below).
4288 * \param x1 Second set of arguments, x1 must be in single range (see below).
4289 * \param[out] out0 Result 1/sqrt(x0)
4290 * \param[out] out1 Result 1/sqrt(x1)
4292 * In particular for double precision we can sometimes calculate square root
4293 * pairs slightly faster by using single precision until the very last step.
4295 * \note Both arguments must be larger than GMX_FLOAT_MIN and smaller than
4296 * GMX_FLOAT_MAX, i.e. within the range of single precision.
4297 * For the single precision implementation this is obviously always
4298 * true for positive values, but for double precision it adds an
4299 * extra restriction since the first lookup step might have to be
4300 * performed in single precision on some architectures. Note that the
4301 * responsibility for checking falls on you - this routine does not
4304 static inline void gmx_simdcall
4305 invsqrtPairSingleAccuracy(SimdFloat x0
, SimdFloat x1
,
4306 SimdFloat
*out0
, SimdFloat
*out1
)
4308 return invsqrtPair(x0
, x1
, out0
, out1
);
4311 /*! \brief Calculate 1/x for SIMD float, only targeting single accuracy.
4313 * \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
4314 * GMX_FLOAT_MAX, i.e. within the range of single precision.
4315 * For the single precision implementation this is obviously always
4316 * true for positive values, but for double precision it adds an
4317 * extra restriction since the first lookup step might have to be
4318 * performed in single precision on some architectures. Note that the
4319 * responsibility for checking falls on you - this routine does not
4321 * \return 1/x. Result is undefined if your argument was invalid.
4323 static inline SimdFloat gmx_simdcall
4324 invSingleAccuracy(SimdFloat x
)
4330 /*! \brief Calculate 1/x for masked SIMD floats, only targeting single accuracy.
4332 * \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
4333 * GMX_FLOAT_MAX, i.e. within the range of single precision.
4334 * For the single precision implementation this is obviously always
4335 * true for positive values, but for double precision it adds an
4336 * extra restriction since the first lookup step might have to be
4337 * performed in single precision on some architectures. Note that the
4338 * responsibility for checking falls on you - this routine does not
4341 * \return 1/x for elements where m is true, or 0.0 for masked-out entries.
4343 static inline SimdFloat
4344 maskzInvSingleAccuracy(SimdFloat x
, SimdFBool m
)
4346 return maskzInv(x
, m
);
4349 /*! \brief Calculate sqrt(x) for SIMD float, always targeting single accuracy.
4351 * \copydetails sqrt(SimdFloat)
4353 template <MathOptimization opt
= MathOptimization::Safe
>
4354 static inline SimdFloat gmx_simdcall
4355 sqrtSingleAccuracy(SimdFloat x
)
4357 return sqrt
<opt
>(x
);
4360 /*! \brief SIMD float log(x), only targeting single accuracy. This is the natural logarithm.
4362 * \param x Argument, should be >0.
4363 * \result The natural logarithm of x. Undefined if argument is invalid.
4365 static inline SimdFloat gmx_simdcall
4366 logSingleAccuracy(SimdFloat x
)
4371 /*! \brief SIMD float 2^x, only targeting single accuracy.
4373 * \copydetails exp2(SimdFloat)
4375 template <MathOptimization opt
= MathOptimization::Safe
>
4376 static inline SimdFloat gmx_simdcall
4377 exp2SingleAccuracy(SimdFloat x
)
4379 return exp2
<opt
>(x
);
4382 /*! \brief SIMD float e^x, only targeting single accuracy.
4384 * \copydetails exp(SimdFloat)
4386 template <MathOptimization opt
= MathOptimization::Safe
>
4387 static inline SimdFloat gmx_simdcall
4388 expSingleAccuracy(SimdFloat x
)
4394 /*! \brief SIMD float erf(x), only targeting single accuracy.
4396 * \param x The value to calculate erf(x) for.
4399 * This routine achieves very close to single precision, but we do not care about
4400 * the last bit or the subnormal result range.
4402 static inline SimdFloat gmx_simdcall
4403 erfSingleAccuracy(SimdFloat x
)
4408 /*! \brief SIMD float erfc(x), only targeting single accuracy.
4410 * \param x The value to calculate erfc(x) for.
4413 * This routine achieves singleprecision (bar the last bit) over most of the
4414 * input range, but for large arguments where the result is getting close
4415 * to the minimum representable numbers we accept slightly larger errors
4416 * (think results that are in the ballpark of 10^-30) since that is not
4419 static inline SimdFloat gmx_simdcall
4420 erfcSingleAccuracy(SimdFloat x
)
4425 /*! \brief SIMD float sin \& cos, only targeting single accuracy.
4427 * \param x The argument to evaluate sin/cos for
4428 * \param[out] sinval Sin(x)
4429 * \param[out] cosval Cos(x)
4431 static inline void gmx_simdcall
4432 sinCosSingleAccuracy(SimdFloat x
, SimdFloat
*sinval
, SimdFloat
*cosval
)
4434 sincos(x
, sinval
, cosval
);
4437 /*! \brief SIMD float sin(x), only targeting single accuracy.
4439 * \param x The argument to evaluate sin for
4442 * \attention Do NOT call both sin & cos if you need both results, since each of them
4443 * will then call \ref sincos and waste a factor 2 in performance.
4445 static inline SimdFloat gmx_simdcall
4446 sinSingleAccuracy(SimdFloat x
)
4451 /*! \brief SIMD float cos(x), only targeting single accuracy.
4453 * \param x The argument to evaluate cos for
4456 * \attention Do NOT call both sin & cos if you need both results, since each of them
4457 * will then call \ref sincos and waste a factor 2 in performance.
4459 static inline SimdFloat gmx_simdcall
4460 cosSingleAccuracy(SimdFloat x
)
4465 /*! \brief SIMD float tan(x), only targeting single accuracy.
4467 * \param x The argument to evaluate tan for
4470 static inline SimdFloat gmx_simdcall
4471 tanSingleAccuracy(SimdFloat x
)
4476 /*! \brief SIMD float asin(x), only targeting single accuracy.
4478 * \param x The argument to evaluate asin for
4481 static inline SimdFloat gmx_simdcall
4482 asinSingleAccuracy(SimdFloat x
)
4487 /*! \brief SIMD float acos(x), only targeting single accuracy.
4489 * \param x The argument to evaluate acos for
4492 static inline SimdFloat gmx_simdcall
4493 acosSingleAccuracy(SimdFloat x
)
4498 /*! \brief SIMD float atan(x), only targeting single accuracy.
4500 * \param x The argument to evaluate atan for
4501 * \result Atan(x), same argument/value range as standard math library.
4503 static inline SimdFloat gmx_simdcall
4504 atanSingleAccuracy(SimdFloat x
)
4509 /*! \brief SIMD float atan2(y,x), only targeting single accuracy.
4511 * \param y Y component of vector, any quartile
4512 * \param x X component of vector, any quartile
4513 * \result Atan(y,x), same argument/value range as standard math library.
4515 * \note This routine should provide correct results for all finite
4516 * non-zero or positive-zero arguments. However, negative zero arguments will
4517 * be treated as positive zero, which means the return value will deviate from
4518 * the standard math library atan2(y,x) for those cases. That should not be
4519 * of any concern in Gromacs, and in particular it will not affect calculations
4520 * of angles from vectors.
4522 static inline SimdFloat gmx_simdcall
4523 atan2SingleAccuracy(SimdFloat y
, SimdFloat x
)
4528 /*! \brief SIMD Analytic PME force correction, only targeting single accuracy.
4530 * \param z2 \f$(r \beta)^2\f$ - see default single precision version for details.
4531 * \result Correction factor to coulomb force.
4533 static inline SimdFloat gmx_simdcall
4534 pmeForceCorrectionSingleAccuracy(SimdFloat z2
)
4536 return pmeForceCorrection(z2
);
4539 /*! \brief SIMD Analytic PME potential correction, only targeting single accuracy.
4541 * \param z2 \f$(r \beta)^2\f$ - see default single precision version for details.
4542 * \result Correction factor to coulomb force.
4544 static inline SimdFloat gmx_simdcall
4545 pmePotentialCorrectionSingleAccuracy(SimdFloat z2
)
4547 return pmePotentialCorrection(z2
);
4549 #endif // GMX_SIMD_HAVE_FLOAT
4551 #if GMX_SIMD4_HAVE_FLOAT
4552 /*! \brief Calculate 1/sqrt(x) for SIMD4 float, only targeting single accuracy.
4554 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
4555 * GMX_FLOAT_MAX, i.e. within the range of single precision.
4556 * For the single precision implementation this is obviously always
4557 * true for positive values, but for double precision it adds an
4558 * extra restriction since the first lookup step might have to be
4559 * performed in single precision on some architectures. Note that the
4560 * responsibility for checking falls on you - this routine does not
4562 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
4564 static inline Simd4Float gmx_simdcall
4565 invsqrtSingleAccuracy(Simd4Float x
)
4569 #endif // GMX_SIMD4_HAVE_FLOAT
4571 /*! \} end of addtogroup module_simd */
4572 /*! \endcond end of condition libabl */
4578 #endif // GMX_SIMD_SIMD_MATH_H