src/gromacs/simd/simd_math.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015,2017 by the GROMACS development team.
   5  * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
   6  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   7  * and including many others, as listed in the AUTHORS file in the
   8  * top-level source directory and at http://www.gromacs.org.
   9  *
  10  * GROMACS is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public License
  12  * as published by the Free Software Foundation; either version 2.1
  13  * of the License, or (at your option) any later version.
  14  *
  15  * GROMACS is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with GROMACS; if not, see
  22  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  23  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  24  *
  25  * If you want to redistribute modifications to GROMACS, please
  26  * consider that scientific software is very special. Version
  27  * control is crucial - bugs must be traceable. We will be happy to
  28  * consider code for inclusion in the official distribution, but
  29  * derived work must not be called official GROMACS. Details are found
  30  * in the README & COPYING files - if they are missing, get the
  31  * official version at http://www.gromacs.org.
  32  *
  33  * To help us fund GROMACS development, we humbly ask that you cite
  34  * the research papers on the package. Check out http://www.gromacs.org.
  35  */
  36 #ifndef GMX_SIMD_SIMD_MATH_H
  37 #define GMX_SIMD_SIMD_MATH_H
  38
  39 /*! \libinternal \file
  40  *
  41  * \brief Math functions for SIMD datatypes.
  42  *
  43  * \attention This file is generic for all SIMD architectures, so you cannot
  44  * assume that any of the optional SIMD features (as defined in simd.h) are
  45  * present. In particular, this means you cannot assume support for integers,
  46  * logical operations (neither on floating-point nor integer values), shifts,
  47  * and the architecture might only have SIMD for either float or double.
  48  * Second, to keep this file clean and general, any additions to this file
  49  * must work for all possible SIMD architectures in both single and double
  50  * precision (if they support it), and you cannot make any assumptions about
  51  * SIMD width.
  52  *
  53  * \author Erik Lindahl <erik.lindahl@scilifelab.se>
  54  *
  55  * \inlibraryapi
  56  * \ingroup module_simd
  57  */
  58
  59 #include "config.h"
  60
  61 #include <cmath>
  62
  63 #include <limits>
  64
  65 #include "gromacs/math/utilities.h"
  66 #include "gromacs/simd/simd.h"
  67 #include "gromacs/utility/basedefinitions.h"
  68 #include "gromacs/utility/real.h"
  69
  70 namespace gmx
  71 {
  72
  73 #if GMX_SIMD
  74
  75 /*! \cond libapi */
  76 /*! \addtogroup module_simd */
  77 /*! \{ */
  78
  79 /*! \name Implementation accuracy settings
  80  *  \{
  81  */
  82
  83 /*! \} */
  84
  85 #    if GMX_SIMD_HAVE_FLOAT
  86
  87 /*! \name Single precision SIMD math functions
  88  *
  89  *  \note In most cases you should use the real-precision functions instead.
  90  *  \{
  91  */
  92
  93 /****************************************
  94  * SINGLE PRECISION SIMD MATH FUNCTIONS *
  95  ****************************************/
  96
  97 #        if !GMX_SIMD_HAVE_NATIVE_COPYSIGN_FLOAT
  98 /*! \brief Composes floating point value with the magnitude of x and the sign of y.
  99  *
 100  * \param x Values to set sign for
 101  * \param y Values used to set sign
 102  * \return  Magnitude of x, sign of y
 103  */
 104 static inline SimdFloat gmx_simdcall copysign(SimdFloat x, SimdFloat y)
 105 {
 106 #            if GMX_SIMD_HAVE_LOGICAL
 107     return abs(x) | (SimdFloat(GMX_FLOAT_NEGZERO) & y);
 108 #            else
 109     return blend(abs(x), -abs(x), y < setZero());
 110 #            endif
 111 }
 112 #        endif
 113
 114 #        if !GMX_SIMD_HAVE_NATIVE_RSQRT_ITER_FLOAT
 115 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD float.
 116  *
 117  * This is a low-level routine that should only be used by SIMD math routine
 118  * that evaluates the inverse square root.
 119  *
 120  *  \param lu Approximation of 1/sqrt(x), typically obtained from lookup.
 121  *  \param x  The reference (starting) value x for which we want 1/sqrt(x).
 122  *  \return   An improved approximation with roughly twice as many bits of accuracy.
 123  */
 124 static inline SimdFloat gmx_simdcall rsqrtIter(SimdFloat lu, SimdFloat x)
 125 {
 126     SimdFloat tmp1 = x * lu;
 127     SimdFloat tmp2 = SimdFloat(-0.5F) * lu;
 128     tmp1           = fma(tmp1, lu, SimdFloat(-3.0F));
 129     return tmp1 * tmp2;
 130 }
 131 #        endif
 132
 133 /*! \brief Calculate 1/sqrt(x) for SIMD float.
 134  *
 135  *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
 136  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
 137  *           For the single precision implementation this is obviously always
 138  *           true for positive values, but for double precision it adds an
 139  *           extra restriction since the first lookup step might have to be
 140  *           performed in single precision on some architectures. Note that the
 141  *           responsibility for checking falls on you - this routine does not
 142  *           check arguments.
 143  *
 144  *  \return 1/sqrt(x). Result is undefined if your argument was invalid.
 145  */
 146 static inline SimdFloat gmx_simdcall invsqrt(SimdFloat x)
 147 {
 148     SimdFloat lu = rsqrt(x);
 149 #        if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
 150     lu = rsqrtIter(lu, x);
 151 #        endif
 152 #        if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
 153     lu = rsqrtIter(lu, x);
 154 #        endif
 155 #        if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
 156     lu = rsqrtIter(lu, x);
 157 #        endif
 158     return lu;
 159 }
 160
 161 /*! \brief Calculate 1/sqrt(x) for two SIMD floats.
 162  *
 163  * \param x0  First set of arguments, x0 must be in single range (see below).
 164  * \param x1  Second set of arguments, x1 must be in single range (see below).
 165  * \param[out] out0  Result 1/sqrt(x0)
 166  * \param[out] out1  Result 1/sqrt(x1)
 167  *
 168  *  In particular for double precision we can sometimes calculate square root
 169  *  pairs slightly faster by using single precision until the very last step.
 170  *
 171  * \note Both arguments must be larger than GMX_FLOAT_MIN and smaller than
 172  *       GMX_FLOAT_MAX, i.e. within the range of single precision.
 173  *       For the single precision implementation this is obviously always
 174  *       true for positive values, but for double precision it adds an
 175  *       extra restriction since the first lookup step might have to be
 176  *       performed in single precision on some architectures. Note that the
 177  *       responsibility for checking falls on you - this routine does not
 178  *       check arguments.
 179  */
 180 static inline void gmx_simdcall invsqrtPair(SimdFloat x0, SimdFloat x1, SimdFloat* out0, SimdFloat* out1)
 181 {
 182     *out0 = invsqrt(x0);
 183     *out1 = invsqrt(x1);
 184 }
 185
 186 #        if !GMX_SIMD_HAVE_NATIVE_RCP_ITER_FLOAT
 187 /*! \brief Perform one Newton-Raphson iteration to improve 1/x for SIMD float.
 188  *
 189  * This is a low-level routine that should only be used by SIMD math routine
 190  * that evaluates the reciprocal.
 191  *
 192  *  \param lu Approximation of 1/x, typically obtained from lookup.
 193  *  \param x  The reference (starting) value x for which we want 1/x.
 194  *  \return   An improved approximation with roughly twice as many bits of accuracy.
 195  */
 196 static inline SimdFloat gmx_simdcall rcpIter(SimdFloat lu, SimdFloat x)
 197 {
 198     return lu * fnma(lu, x, SimdFloat(2.0F));
 199 }
 200 #        endif
 201
 202 /*! \brief Calculate 1/x for SIMD float.
 203  *
 204  *  \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
 205  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
 206  *           For the single precision implementation this is obviously always
 207  *           true for positive values, but for double precision it adds an
 208  *           extra restriction since the first lookup step might have to be
 209  *           performed in single precision on some architectures. Note that the
 210  *           responsibility for checking falls on you - this routine does not
 211  *           check arguments.
 212  *
 213  *  \return 1/x. Result is undefined if your argument was invalid.
 214  */
 215 static inline SimdFloat gmx_simdcall inv(SimdFloat x)
 216 {
 217     SimdFloat lu = rcp(x);
 218 #        if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
 219     lu = rcpIter(lu, x);
 220 #        endif
 221 #        if (GMX_SIMD_RCP_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
 222     lu = rcpIter(lu, x);
 223 #        endif
 224 #        if (GMX_SIMD_RCP_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
 225     lu = rcpIter(lu, x);
 226 #        endif
 227     return lu;
 228 }
 229
 230 /*! \brief Division for SIMD floats
 231  *
 232  * \param nom    Nominator
 233  * \param denom  Denominator, with magnitude in range (GMX_FLOAT_MIN,GMX_FLOAT_MAX).
 234  *               For single precision this is equivalent to a nonzero argument,
 235  *               but in double precision it adds an extra restriction since
 236  *               the first lookup step might have to be performed in single
 237  *               precision on some architectures. Note that the responsibility
 238  *               for checking falls on you - this routine does not check arguments.
 239  *
 240  * \return nom/denom
 241  *
 242  * \note This function does not use any masking to avoid problems with
 243  *       zero values in the denominator.
 244  */
 245 static inline SimdFloat gmx_simdcall operator/(SimdFloat nom, SimdFloat denom)
 246 {
 247     return nom * inv(denom);
 248 }
 249
 250 /*! \brief Calculate 1/sqrt(x) for masked entries of SIMD float.
 251  *
 252  *  This routine only evaluates 1/sqrt(x) for elements for which mask is true.
 253  *  Illegal values in the masked-out elements will not lead to
 254  *  floating-point exceptions.
 255  *
 256  *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
 257  *           GMX_FLOAT_MAX for masked-in entries.
 258  *           See \ref invsqrt for the discussion about argument restrictions.
 259  *  \param m Mask
 260  *  \return 1/sqrt(x). Result is undefined if your argument was invalid or
 261  *          entry was not masked, and 0.0 for masked-out entries.
 262  */
 263 static inline SimdFloat maskzInvsqrt(SimdFloat x, SimdFBool m)
 264 {
 265     SimdFloat lu = maskzRsqrt(x, m);
 266 #        if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
 267     lu = rsqrtIter(lu, x);
 268 #        endif
 269 #        if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
 270     lu = rsqrtIter(lu, x);
 271 #        endif
 272 #        if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
 273     lu = rsqrtIter(lu, x);
 274 #        endif
 275     return lu;
 276 }
 277
 278 /*! \brief Calculate 1/x for SIMD float, masked version.
 279  *
 280  *  \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
 281  *           GMX_FLOAT_MAX for masked-in entries.
 282  *           See \ref invsqrt for the discussion about argument restrictions.
 283  *  \param m Mask
 284  *  \return 1/x for elements where m is true, or 0.0 for masked-out entries.
 285  */
 286 static inline SimdFloat gmx_simdcall maskzInv(SimdFloat x, SimdFBool m)
 287 {
 288     SimdFloat lu = maskzRcp(x, m);
 289 #        if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
 290     lu = rcpIter(lu, x);
 291 #        endif
 292 #        if (GMX_SIMD_RCP_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
 293     lu = rcpIter(lu, x);
 294 #        endif
 295 #        if (GMX_SIMD_RCP_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
 296     lu = rcpIter(lu, x);
 297 #        endif
 298     return lu;
 299 }
 300
 301 /*! \brief Calculate sqrt(x) for SIMD floats
 302  *
 303  *  \tparam opt By default, this function checks if the input value is 0.0
 304  *              and masks this to return the correct result. If you are certain
 305  *              your argument will never be zero, and you know you need to save
 306  *              every single cycle you can, you can alternatively call the
 307  *              function as sqrt<MathOptimization::Unsafe>(x).
 308  *
 309  *  \param  x   Argument that must be in range 0 <=x <= GMX_FLOAT_MAX, since the
 310  *              lookup step often has to be implemented in single precision.
 311  *              Arguments smaller than GMX_FLOAT_MIN will always lead to a zero
 312  *              result, even in double precision. If you are using the unsafe
 313  *              math optimization parameter, the argument must be in the range
 314  *              GMX_FLOAT_MIN <= x <= GMX_FLOAT_MAX.
 315  *
 316  *  \return sqrt(x). The result is undefined if the input value does not fall
 317  *          in the allowed range specified for the argument.
 318  */
 319 template<MathOptimization opt = MathOptimization::Safe>
 320 static inline SimdFloat gmx_simdcall sqrt(SimdFloat x)
 321 {
 322     if (opt == MathOptimization::Safe)
 323     {
 324         SimdFloat res = maskzInvsqrt(x, setZero() < x);
 325         return res * x;
 326     }
 327     else
 328     {
 329         return x * invsqrt(x);
 330     }
 331 }
 332
 333 /*! \brief Cube root for SIMD floats
 334  *
 335  * \param x      Argument to calculate cube root of. Can be negative or zero,
 336  *               but NaN or Inf values are not supported. Denormal values will
 337  *               be treated as 0.0.
 338  * \return       Cube root of x.
 339  */
 340 static inline SimdFloat gmx_simdcall cbrt(SimdFloat x)
 341 {
 342     const SimdFloat signBit(GMX_FLOAT_NEGZERO);
 343     const SimdFloat minFloat(std::numeric_limits<float>::min());
 344     // Bias is 128-1 = 127, which is not divisible by 3. Since the largest-magnitude
 345     // negative exponent from frexp() is -126, we can subtract one more unit to get 126
 346     // as offset, which is divisible by 3 (result 42). To avoid clang warnings about fragile integer
 347     // division mixed with FP, we let the divided value (42) be the original constant.
 348     const std::int32_t offsetDiv3(42);
 349     const SimdFloat    c2(-0.191502161678719066F);
 350     const SimdFloat    c1(0.697570460207922770F);
 351     const SimdFloat    c0(0.492659620528969547F);
 352     const SimdFloat    one(1.0F);
 353     const SimdFloat    two(2.0F);
 354     const SimdFloat    three(3.0F);
 355     const SimdFloat    oneThird(1.0F / 3.0F);
 356     const SimdFloat    cbrt2(1.2599210498948731648F);
 357     const SimdFloat    sqrCbrt2(1.5874010519681994748F);
 358
 359     // To calculate cbrt(x) we first take the absolute value of x but save the sign,
 360     // since cbrt(-x) = -cbrt(x). Then we only need to consider positive values for
 361     // the main step.
 362     // A number x is represented in IEEE754 as fraction*2^e. We rewrite this as
 363     // x=fraction*2^(3*n)*2^m, where e=3*n+m, and m is a remainder.
 364     // The cube root can the be evaluated by calculating the cube root of the fraction
 365     // limited to the mantissa range, multiplied by 2^mod (which is either 1, +/-2^(1/3) or
 366     // +/-2^(2/3), and then we load this into a new IEEE754 fp number with the exponent 2^n, where
 367     // n is the integer part of the original exponent divided by 3.
 368
 369     SimdFloat xSignBit = x & signBit; // create bit mask where the sign bit is 1 for x elements < 0
 370     SimdFloat xAbs     = andNot(signBit, x);   // select everthing but the sign bit => abs(x)
 371     SimdFBool xIsNonZero = (minFloat <= xAbs); // treat denormals as 0
 372
 373     SimdFInt32 exponent;
 374     SimdFloat  y = frexp(xAbs, &exponent);
 375     // For the mantissa (y) we will use a limited-range approximation of cbrt(y),
 376     // by first using a polynomial and then evaluating
 377     // Transform y to z = c2*y^2 + c1*y + c0, then w = z^3, and finally
 378     // evaluate the quotient q = z * (w + 2 * y) / (2 * w + y).
 379     SimdFloat z        = fma(fma(y, c2, c1), y, c0);
 380     SimdFloat w        = z * z * z;
 381     SimdFloat nom      = z * fma(two, y, w);
 382     SimdFloat invDenom = inv(fma(two, w, y));
 383
 384     // Handle the exponent. In principle there are beautiful ways to do this with custom 16-bit
 385     // division converted to multiplication... but we can't do that since our SIMD layer cannot
 386     // assume the presence of integer shift operations!
 387     // However, when I first worked with the integer algorithm I still came up with a neat
 388     // optimization, so I'll describe the full algorithm here in case we ever want to use it
 389     // in the future:
 390     //
 391     // Our dividend is signed, which is a complication, but let's consider the unsigned case
 392     // first: Division by 3 corresponds to multiplication by 1010101... Since we also know
 393     // our dividend is less than 16 bits (exponent range) we can accomplish this by
 394     // multiplying with 21845 (which is almost 2^16/3 - 21845.333 would be exact) and then
 395     // right-shifting by 16 bits to divide out the 2^16 part.
 396     // If we add 1 to the dividend to handle the extra 0.333, the integer result will be correct.
 397     // To handle the signed exponent one alternative would be to take absolute values, saving
 398     // signs, etc - but that gets a bit complicated with 2-complement integers.
 399     // Instead, we remember that we don't really want the exact division per se - what we're
 400     // really after is only rewriting e = 3*n+m. That will actually be *easier* to handle if
 401     // we require that m must be positive (fewer cases to handle) instead of having n as the
 402     // strict e/3.
 403     // To handle this we start by adding 127 to the exponent. This value corresponds to the
 404     // exponent bias, minus 1 because frexp() has a different standard for the value it returns,
 405     // but then we add 1 back to handle the extra 0.333 in 21845. So, we have offsetExp = e+127
 406     // and then multiply by 21845 to get a division result offsetExpDiv3.
 407     // A (signed) value for n is then recovered by subtracting 42 (bias-1)/3 from k.
 408     // To calculate a strict remainder we should evaluate offsetExp - 3*offsetExpDiv3 - 1, where
 409     // the extra 1 corrects for the value we added to the exponent to get correct division.
 410     // This remainder would have the value 0,1, or 2, but since we only use it to select
 411     // other numbers we can skip the last step and just handle the cases as 1,2 or 3 instead.
 412     //
 413     // OK; end of long detour. Here's how we actually do it in our implementation by using
 414     // floating-point for the exponent instead to avoid needing integer shifts:
 415     //
 416     // 1) Convert the exponent (obtained from frexp) to a float
 417     // 2) Calculate offsetExp = exp + offset. Note that we should not add the extra 1 here since we
 418     //    do floating-point division instead of our integer hack, so it's the exponent bias-1, or
 419     //    the largest exponent minus 2.
 420     // 3) Divide the float by 3 by multiplying with 1/3
 421     // 4) Truncate it to an integer to get the division result. This is potentially dangerous in
 422     //    combination with floating-point, because many integers cannot be represented exactly in
 423     //    floating point, and if we are just epsilon below the result might be truncated to a lower
 424     //    integer. I have not observed this on x86, but to have a safety margin we can add a small
 425     //    fraction - since we already know the fraction part should be either 0, 0.333..., or 0.666...
 426     //    We can even save this extra floating-point addition by adding a small fraction (0.1) when
 427     //    we introduce the exponent offset - that will correspond to a safety margin of 0.1/3, which is plenty.
 428     // 5) Get the remainder part by subtracting the truncated floating-point part.
 429     //    Here too we will have a plain division, so the remainder is a strict modulus
 430     //    and will have the values 0, 1 or 2.
 431     //
 432     // Before worrying about the few wasted cycles due to longer fp latency, this has the
 433     // additional advantage that we don't use a single integer operation, so the algorithm
 434     // will work just A-OK on all SIMD implementations, which avoids diverging code paths.
 435
 436     // The  0.1 here is the safety margin due to  truncation described in item 4 in the comments above.
 437     SimdFloat offsetExp = cvtI2R(exponent) + SimdFloat(static_cast<float>(3 * offsetDiv3) + 0.1);
 438
 439     SimdFloat offsetExpDiv3 =
 440             trunc(offsetExp * oneThird); // important to truncate here to mimic integer division
 441
 442     SimdFInt32 expDiv3 = cvtR2I(offsetExpDiv3 - SimdFloat(static_cast<float>(offsetDiv3)));
 443
 444     SimdFloat remainder = offsetExp - offsetExpDiv3 * three;
 445
 446     // If remainder is 0 we should just have the factor 1.0,
 447     // so first pick 1.0 if it is below 0.5, and 2^(1/3) if it's above 0.5 (i.e., 1 or 2)
 448     SimdFloat factor = blend(one, cbrt2, SimdFloat(0.5) < remainder);
 449     // Second, we overwrite with 2^(2/3) if rem>1.5 (i.e., 2)
 450     factor = blend(factor, sqrCbrt2, SimdFloat(1.5) < remainder);
 451
 452     // Assemble the non-signed fraction, and add the sign back by xor
 453     SimdFloat fraction = (nom * invDenom * factor) ^ xSignBit;
 454     // Load to IEEE754 number, and set result to 0.0 if x was 0.0 or denormal
 455     SimdFloat result = selectByMask(ldexp(fraction, expDiv3), xIsNonZero);
 456
 457     return result;
 458 }
 459
 460 /*! \brief Inverse cube root for SIMD floats
 461  *
 462  * \param x      Argument to calculate cube root of. Can be positive or
 463  *               negative, but the magnitude cannot be lower than
 464  *               the smallest normal number.
 465  * \return       Cube root of x. Undefined for values that don't
 466  *               fulfill the restriction of abs(x) > minFloat.
 467  */
 468 static inline SimdFloat gmx_simdcall invcbrt(SimdFloat x)
 469 {
 470     const SimdFloat signBit(GMX_FLOAT_NEGZERO);
 471     const SimdFloat minFloat(std::numeric_limits<float>::min());
 472     // Bias is 128-1 = 127, which is not divisible by 3. Since the largest-magnitude
 473     // negative exponent from frexp() is -126, we can subtract one more unit to get 126
 474     // as offset, which is divisible by 3 (result 42). To avoid clang warnings about fragile integer
 475     // division mixed with FP, we let the divided value (42) be the original constant.
 476     const std::int32_t offsetDiv3(42);
 477     const SimdFloat    c2(-0.191502161678719066F);
 478     const SimdFloat    c1(0.697570460207922770F);
 479     const SimdFloat    c0(0.492659620528969547F);
 480     const SimdFloat    one(1.0F);
 481     const SimdFloat    two(2.0F);
 482     const SimdFloat    three(3.0F);
 483     const SimdFloat    oneThird(1.0F / 3.0F);
 484     const SimdFloat    invCbrt2(1.0F / 1.2599210498948731648F);
 485     const SimdFloat    invSqrCbrt2(1.0F / 1.5874010519681994748F);
 486
 487     // We use pretty much exactly the same implementation as for cbrt(x),
 488     // but to compute the inverse we swap the nominator/denominator
 489     // in the quotient, and also swap the sign of the exponent parts.
 490
 491     SimdFloat xSignBit = x & signBit; // create bit mask where the sign bit is 1 for x elements < 0
 492     SimdFloat xAbs     = andNot(signBit, x); // select everthing but the sign bit => abs(x)
 493
 494     SimdFInt32 exponent;
 495     SimdFloat  y = frexp(xAbs, &exponent);
 496     // For the mantissa (y) we will use a limited-range approximation of cbrt(y),
 497     // by first using a polynomial and then evaluating
 498     // Transform y to z = c2*y^2 + c1*y + c0, then w = z^3, and finally
 499     // evaluate the quotient q = z * (w + 2 * y) / (2 * w + y).
 500     SimdFloat z        = fma(fma(y, c2, c1), y, c0);
 501     SimdFloat w        = z * z * z;
 502     SimdFloat nom      = fma(two, w, y);
 503     SimdFloat invDenom = inv(z * fma(two, y, w));
 504
 505     // The  0.1 here is the safety margin due to  truncation described in item 4 in the comments above.
 506     SimdFloat offsetExp = cvtI2R(exponent) + SimdFloat(static_cast<float>(3 * offsetDiv3) + 0.1);
 507     SimdFloat offsetExpDiv3 =
 508             trunc(offsetExp * oneThird); // important to truncate here to mimic integer division
 509
 510     // We should swap the sign here, so we change order of the terms in the subtraction
 511     SimdFInt32 expDiv3 = cvtR2I(SimdFloat(static_cast<float>(offsetDiv3)) - offsetExpDiv3);
 512
 513     // Swap sign here too, so remainder is either 0, -1 or -2
 514     SimdFloat remainder = offsetExpDiv3 * three - offsetExp;
 515
 516     // If remainder is 0 we should just have the factor 1.0,
 517     // so first pick 1.0 if it is above -0.5, and 2^(-1/3) if it's below -0.5 (i.e., -1 or -2)
 518     SimdFloat factor = blend(one, invCbrt2, remainder < SimdFloat(-0.5));
 519     // Second, we overwrite with 2^(-2/3) if rem<-1.5 (i.e., -2)
 520     factor = blend(factor, invSqrCbrt2, remainder < SimdFloat(-1.5));
 521
 522     // Assemble the non-signed fraction, and add the sign back by xor
 523     SimdFloat fraction = (nom * invDenom * factor) ^ xSignBit;
 524     // Load to IEEE754 number, and set result to 0.0 if x was 0.0 or denormal
 525     SimdFloat result = ldexp(fraction, expDiv3);
 526
 527     return result;
 528 }
 529
 530 /*! \brief SIMD float log2(x). This is the base-2 logarithm.
 531  *
 532  * \param x Argument, should be >0.
 533  * \result The base-2 logarithm of x. Undefined if argument is invalid.
 534  */
 535 static inline SimdFloat gmx_simdcall log2(SimdFloat x)
 536 {
 537     // This implementation computes log2 by
 538     // 1) Extracting the exponent and adding it to...
 539     // 2) A 9th-order minimax approximation using only odd
 540     //    terms of (x-1)/(x+1), where x is the mantissa.
 541
 542 #        if GMX_SIMD_HAVE_NATIVE_LOG_FLOAT
 543     // Just rescale if native log2() is not present, but log() is.
 544     return log(x) * SimdFloat(std::log2(std::exp(1.0)));
 545 #        else
 546     const SimdFloat one(1.0F);
 547     const SimdFloat two(2.0F);
 548     const SimdFloat invsqrt2(1.0F / std::sqrt(2.0F));
 549     const SimdFloat CL9(0.342149508897807708152F);
 550     const SimdFloat CL7(0.411570606888219447939F);
 551     const SimdFloat CL5(0.577085979152320294183F);
 552     const SimdFloat CL3(0.961796550607099898222F);
 553     const SimdFloat CL1(2.885390081777926774009F);
 554     SimdFloat       fExp, x2, p;
 555     SimdFBool       m;
 556     SimdFInt32      iExp;
 557
 558     // For the log2() function, the argument can never be 0, so use the faster version of frexp()
 559     x    = frexp<MathOptimization::Unsafe>(x, &iExp);
 560     fExp = cvtI2R(iExp);
 561
 562     m = x < invsqrt2;
 563     // Adjust to non-IEEE format for x<1/sqrt(2): exponent -= 1, mantissa *= 2.0
 564     fExp = fExp - selectByMask(one, m);
 565     x    = x * blend(one, two, m);
 566
 567     x  = (x - one) * inv(x + one);
 568     x2 = x * x;
 569
 570     p = fma(CL9, x2, CL7);
 571     p = fma(p, x2, CL5);
 572     p = fma(p, x2, CL3);
 573     p = fma(p, x2, CL1);
 574     p = fma(p, x, fExp);
 575
 576     return p;
 577 #        endif
 578 }
 579
 580 #        if !GMX_SIMD_HAVE_NATIVE_LOG_FLOAT
 581 /*! \brief SIMD float log(x). This is the natural logarithm.
 582  *
 583  * \param x Argument, should be >0.
 584  * \result The natural logarithm of x. Undefined if argument is invalid.
 585  */
 586 static inline SimdFloat gmx_simdcall log(SimdFloat x)
 587 {
 588     const SimdFloat one(1.0F);
 589     const SimdFloat two(2.0F);
 590     const SimdFloat invsqrt2(1.0F / std::sqrt(2.0F));
 591     const SimdFloat corr(0.693147180559945286226764F);
 592     const SimdFloat CL9(0.2371599674224853515625F);
 593     const SimdFloat CL7(0.285279005765914916992188F);
 594     const SimdFloat CL5(0.400005519390106201171875F);
 595     const SimdFloat CL3(0.666666567325592041015625F);
 596     const SimdFloat CL1(2.0F);
 597     SimdFloat       fExp, x2, p;
 598     SimdFBool       m;
 599     SimdFInt32      iExp;
 600
 601     // For log(), the argument cannot be 0, so use the faster version of frexp()
 602     x    = frexp<MathOptimization::Unsafe>(x, &iExp);
 603     fExp = cvtI2R(iExp);
 604
 605     m = x < invsqrt2;
 606     // Adjust to non-IEEE format for x<1/sqrt(2): exponent -= 1, mantissa *= 2.0
 607     fExp = fExp - selectByMask(one, m);
 608     x    = x * blend(one, two, m);
 609
 610     x  = (x - one) * inv(x + one);
 611     x2 = x * x;
 612
 613     p = fma(CL9, x2, CL7);
 614     p = fma(p, x2, CL5);
 615     p = fma(p, x2, CL3);
 616     p = fma(p, x2, CL1);
 617     p = fma(p, x, corr * fExp);
 618
 619     return p;
 620 }
 621 #        endif
 622
 623 #        if !GMX_SIMD_HAVE_NATIVE_EXP2_FLOAT
 624 /*! \brief SIMD float 2^x
 625  *
 626  * \tparam opt If this is changed from the default (safe) into the unsafe
 627  *             option, input values that would otherwise lead to zero-clamped
 628  *             results are not allowed and will lead to undefined results.
 629  *
 630  * \param x Argument. For the default (safe) function version this can be
 631  *          arbitrarily small value, but the routine might clamp the result to
 632  *          zero for arguments that would produce subnormal IEEE754-2008 results.
 633  *          This corresponds to inputs below -126 in single or -1022 in double,
 634  *          and it might overflow for arguments reaching 127 (single) or
 635  *          1023 (double). If you enable the unsafe math optimization,
 636  *          very small arguments will not necessarily be zero-clamped, but
 637  *          can produce undefined results.
 638  *
 639  * \result 2^x. The result is undefined for very large arguments that cause
 640  *          internal floating-point overflow. If unsafe optimizations are enabled,
 641  *          this is also true for very small values.
 642  *
 643  * \note    The definition range of this function is just-so-slightly smaller
 644  *          than the allowed IEEE exponents for many architectures. This is due
 645  *          to the implementation, which will hopefully improve in the future.
 646  *
 647  * \warning You cannot rely on this implementation returning inf for arguments
 648  *          that cause overflow. If you have some very large
 649  *          values and need to rely on getting a valid numerical output,
 650  *          take the minimum of your variable and the largest valid argument
 651  *          before calling this routine.
 652  */
 653 template<MathOptimization opt = MathOptimization::Safe>
 654 static inline SimdFloat gmx_simdcall exp2(SimdFloat x)
 655 {
 656     const SimdFloat CC6(0.0001534581200287996416911311F);
 657     const SimdFloat CC5(0.001339993121934088894618990F);
 658     const SimdFloat CC4(0.009618488957115180159497841F);
 659     const SimdFloat CC3(0.05550328776964726865751735F);
 660     const SimdFloat CC2(0.2402264689063408646490722F);
 661     const SimdFloat CC1(0.6931472057372680777553816F);
 662     const SimdFloat one(1.0F);
 663
 664     SimdFloat intpart;
 665     SimdFloat fexppart;
 666     SimdFloat p;
 667
 668     // Large negative values are valid arguments to exp2(), so there are two
 669     // things we need to account for:
 670     // 1. When the exponents reaches -127, the (biased) exponent field will be
 671     //    zero and we can no longer multiply with it. There are special IEEE
 672     //    formats to handle this range, but for now we have to accept that
 673     //    we cannot handle those arguments. If input value becomes even more
 674     //    negative, it will start to loop and we would end up with invalid
 675     //    exponents. Thus, we need to limit or mask this.
 676     // 2. For VERY large negative values, we will have problems that the
 677     //    subtraction to get the fractional part loses accuracy, and then we
 678     //    can end up with overflows in the polynomial.
 679     //
 680     // For now, we handle this by forwarding the math optimization setting to
 681     // ldexp, where the routine will return zero for very small arguments.
 682     //
 683     // However, before doing that we need to make sure we do not call cvtR2I
 684     // with an argument that is so negative it cannot be converted to an integer.
 685     if (opt == MathOptimization::Safe)
 686     {
 687         x = max(x, SimdFloat(std::numeric_limits<std::int32_t>::lowest()));
 688     }
 689
 690     fexppart = ldexp<opt>(one, cvtR2I(x));
 691     intpart  = round(x);
 692     x        = x - intpart;
 693
 694     p = fma(CC6, x, CC5);
 695     p = fma(p, x, CC4);
 696     p = fma(p, x, CC3);
 697     p = fma(p, x, CC2);
 698     p = fma(p, x, CC1);
 699     p = fma(p, x, one);
 700     x = p * fexppart;
 701     return x;
 702 }
 703 #        endif
 704
 705 #        if !GMX_SIMD_HAVE_NATIVE_EXP_FLOAT
 706 /*! \brief SIMD float exp(x).
 707  *
 708  * In addition to scaling the argument for 2^x this routine correctly does
 709  * extended precision arithmetics to improve accuracy.
 710  *
 711  * \tparam opt If this is changed from the default (safe) into the unsafe
 712  *             option, input values that would otherwise lead to zero-clamped
 713  *             results are not allowed and will lead to undefined results.
 714  *
 715  * \param x Argument. For the default (safe) function version this can be
 716  *          arbitrarily small value, but the routine might clamp the result to
 717  *          zero for arguments that would produce subnormal IEEE754-2008 results.
 718  *          This corresponds to input arguments reaching
 719  *          -126*ln(2)=-87.3 in single, or -1022*ln(2)=-708.4 (double).
 720  *          Similarly, it might overflow for arguments reaching
 721  *          127*ln(2)=88.0 (single) or 1023*ln(2)=709.1 (double). If the
 722  *          unsafe math optimizations are enabled, small input values that would
 723  *          result in zero-clamped output are not allowed.
 724  *
 725  * \result exp(x). Overflowing arguments are likely to either return 0 or inf,
 726  *         depending on the underlying implementation. If unsafe optimizations
 727  *         are enabled, this is also true for very small values.
 728  *
 729  * \note    The definition range of this function is just-so-slightly smaller
 730  *          than the allowed IEEE exponents for many architectures. This is due
 731  *          to the implementation, which will hopefully improve in the future.
 732  *
 733  * \warning You cannot rely on this implementation returning inf for arguments
 734  *          that cause overflow. If you have some very large
 735  *          values and need to rely on getting a valid numerical output,
 736  *          take the minimum of your variable and the largest valid argument
 737  *          before calling this routine.
 738  */
 739 template<MathOptimization opt = MathOptimization::Safe>
 740 static inline SimdFloat gmx_simdcall exp(SimdFloat x)
 741 {
 742     const SimdFloat argscale(1.44269504088896341F);
 743     const SimdFloat invargscale0(-0.693145751953125F);
 744     const SimdFloat invargscale1(-1.428606765330187045e-06F);
 745     const SimdFloat CC4(0.00136324646882712841033936F);
 746     const SimdFloat CC3(0.00836596917361021041870117F);
 747     const SimdFloat CC2(0.0416710823774337768554688F);
 748     const SimdFloat CC1(0.166665524244308471679688F);
 749     const SimdFloat CC0(0.499999850988388061523438F);
 750     const SimdFloat one(1.0F);
 751     SimdFloat       fexppart;
 752     SimdFloat       intpart;
 753     SimdFloat       y, p;
 754
 755     // Large negative values are valid arguments to exp2(), so there are two
 756     // things we need to account for:
 757     // 1. When the exponents reaches -127, the (biased) exponent field will be
 758     //    zero and we can no longer multiply with it. There are special IEEE
 759     //    formats to handle this range, but for now we have to accept that
 760     //    we cannot handle those arguments. If input value becomes even more
 761     //    negative, it will start to loop and we would end up with invalid
 762     //    exponents. Thus, we need to limit or mask this.
 763     // 2. For VERY large negative values, we will have problems that the
 764     //    subtraction to get the fractional part loses accuracy, and then we
 765     //    can end up with overflows in the polynomial.
 766     //
 767     // For now, we handle this by forwarding the math optimization setting to
 768     // ldexp, where the routine will return zero for very small arguments.
 769     //
 770     // However, before doing that we need to make sure we do not call cvtR2I
 771     // with an argument that is so negative it cannot be converted to an integer
 772     // after being multiplied by argscale.
 773
 774     if (opt == MathOptimization::Safe)
 775     {
 776         x = max(x, SimdFloat(std::numeric_limits<std::int32_t>::lowest()) / argscale);
 777     }
 778
 779     y = x * argscale;
 780
 781
 782     fexppart = ldexp<opt>(one, cvtR2I(y));
 783     intpart  = round(y);
 784
 785     // Extended precision arithmetics
 786     x = fma(invargscale0, intpart, x);
 787     x = fma(invargscale1, intpart, x);
 788
 789     p = fma(CC4, x, CC3);
 790     p = fma(p, x, CC2);
 791     p = fma(p, x, CC1);
 792     p = fma(p, x, CC0);
 793     p = fma(x * x, p, x);
 794 #            if GMX_SIMD_HAVE_FMA
 795     x = fma(p, fexppart, fexppart);
 796 #            else
 797     x = (p + one) * fexppart;
 798 #            endif
 799     return x;
 800 }
 801 #        endif
 802
 803 /*! \brief SIMD float pow(x,y)
 804  *
 805  * This returns x^y for SIMD values.
 806  *
 807  * \tparam opt If this is changed from the default (safe) into the unsafe
 808  *             option, there are no guarantees about correct results for x==0.
 809  *
 810  * \param x Base.
 811  *
 812  * \param y exponent.
 813
 814  * \result x^y. Overflowing arguments are likely to either return 0 or inf,
 815  *         depending on the underlying implementation. If unsafe optimizations
 816  *         are enabled, this is also true for x==0.
 817  *
 818  * \warning You cannot rely on this implementation returning inf for arguments
 819  *          that cause overflow. If you have some very large
 820  *          values and need to rely on getting a valid numerical output,
 821  *          take the minimum of your variable and the largest valid argument
 822  *          before calling this routine.
 823  */
 824 template<MathOptimization opt = MathOptimization::Safe>
 825 static inline SimdFloat gmx_simdcall pow(SimdFloat x, SimdFloat y)
 826 {
 827     SimdFloat xcorr;
 828
 829     if (opt == MathOptimization::Safe)
 830     {
 831         xcorr = max(x, SimdFloat(std::numeric_limits<float>::min()));
 832     }
 833     else
 834     {
 835         xcorr = x;
 836     }
 837
 838     SimdFloat result = exp2<opt>(y * log2(xcorr));
 839
 840     if (opt == MathOptimization::Safe)
 841     {
 842         // if x==0 and y>0 we explicitly set the result to 0.0
 843         // For any x with y==0, the result will already be 1.0 since we multiply by y (0.0) and call exp().
 844         result = blend(result, setZero(), x == setZero() && setZero() < y);
 845     }
 846
 847     return result;
 848 }
 849
 850
 851 /*! \brief SIMD float erf(x).
 852  *
 853  * \param x The value to calculate erf(x) for.
 854  * \result erf(x)
 855  *
 856  * This routine achieves very close to full precision, but we do not care about
 857  * the last bit or the subnormal result range.
 858  */
 859 static inline SimdFloat gmx_simdcall erf(SimdFloat x)
 860 {
 861     // Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1]
 862     const SimdFloat CA6(7.853861353153693e-5F);
 863     const SimdFloat CA5(-8.010193625184903e-4F);
 864     const SimdFloat CA4(5.188327685732524e-3F);
 865     const SimdFloat CA3(-2.685381193529856e-2F);
 866     const SimdFloat CA2(1.128358514861418e-1F);
 867     const SimdFloat CA1(-3.761262582423300e-1F);
 868     const SimdFloat CA0(1.128379165726710F);
 869     // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2]
 870     const SimdFloat CB9(-0.0018629930017603923F);
 871     const SimdFloat CB8(0.003909821287598495F);
 872     const SimdFloat CB7(-0.0052094582210355615F);
 873     const SimdFloat CB6(0.005685614362160572F);
 874     const SimdFloat CB5(-0.0025367682853477272F);
 875     const SimdFloat CB4(-0.010199799682318782F);
 876     const SimdFloat CB3(0.04369575504816542F);
 877     const SimdFloat CB2(-0.11884063474674492F);
 878     const SimdFloat CB1(0.2732120154030589F);
 879     const SimdFloat CB0(0.42758357702025784F);
 880     // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19]
 881     const SimdFloat CC10(-0.0445555913112064F);
 882     const SimdFloat CC9(0.21376355144663348F);
 883     const SimdFloat CC8(-0.3473187200259257F);
 884     const SimdFloat CC7(0.016690861551248114F);
 885     const SimdFloat CC6(0.7560973182491192F);
 886     const SimdFloat CC5(-1.2137903600145787F);
 887     const SimdFloat CC4(0.8411872321232948F);
 888     const SimdFloat CC3(-0.08670413896296343F);
 889     const SimdFloat CC2(-0.27124782687240334F);
 890     const SimdFloat CC1(-0.0007502488047806069F);
 891     const SimdFloat CC0(0.5642114853803148F);
 892     const SimdFloat one(1.0F);
 893     const SimdFloat two(2.0F);
 894
 895     SimdFloat x2, x4, y;
 896     SimdFloat t, t2, w, w2;
 897     SimdFloat pA0, pA1, pB0, pB1, pC0, pC1;
 898     SimdFloat expmx2;
 899     SimdFloat res_erf, res_erfc, res;
 900     SimdFBool m, maskErf;
 901
 902     // Calculate erf()
 903     x2 = x * x;
 904     x4 = x2 * x2;
 905
 906     pA0 = fma(CA6, x4, CA4);
 907     pA1 = fma(CA5, x4, CA3);
 908     pA0 = fma(pA0, x4, CA2);
 909     pA1 = fma(pA1, x4, CA1);
 910     pA0 = pA0 * x4;
 911     pA0 = fma(pA1, x2, pA0);
 912     // Constant term must come last for precision reasons
 913     pA0 = pA0 + CA0;
 914
 915     res_erf = x * pA0;
 916
 917     // Calculate erfc
 918     y       = abs(x);
 919     maskErf = SimdFloat(0.75F) <= y;
 920     t       = maskzInv(y, maskErf);
 921     w       = t - one;
 922     t2      = t * t;
 923     w2      = w * w;
 924
 925     // No need for a floating-point sieve here (as in erfc), since erf()
 926     // will never return values that are extremely small for large args.
 927     expmx2 = exp(-y * y);
 928
 929     pB1 = fma(CB9, w2, CB7);
 930     pB0 = fma(CB8, w2, CB6);
 931     pB1 = fma(pB1, w2, CB5);
 932     pB0 = fma(pB0, w2, CB4);
 933     pB1 = fma(pB1, w2, CB3);
 934     pB0 = fma(pB0, w2, CB2);
 935     pB1 = fma(pB1, w2, CB1);
 936     pB0 = fma(pB0, w2, CB0);
 937     pB0 = fma(pB1, w, pB0);
 938
 939     pC0 = fma(CC10, t2, CC8);
 940     pC1 = fma(CC9, t2, CC7);
 941     pC0 = fma(pC0, t2, CC6);
 942     pC1 = fma(pC1, t2, CC5);
 943     pC0 = fma(pC0, t2, CC4);
 944     pC1 = fma(pC1, t2, CC3);
 945     pC0 = fma(pC0, t2, CC2);
 946     pC1 = fma(pC1, t2, CC1);
 947
 948     pC0 = fma(pC0, t2, CC0);
 949     pC0 = fma(pC1, t, pC0);
 950     pC0 = pC0 * t;
 951
 952     // Select pB0 or pC0 for erfc()
 953     m        = two < y;
 954     res_erfc = blend(pB0, pC0, m);
 955     res_erfc = res_erfc * expmx2;
 956
 957     // erfc(x<0) = 2-erfc(|x|)
 958     m        = x < setZero();
 959     res_erfc = blend(res_erfc, two - res_erfc, m);
 960
 961     // Select erf() or erfc()
 962     res = blend(res_erf, one - res_erfc, maskErf);
 963
 964     return res;
 965 }
 966
 967 /*! \brief SIMD float erfc(x).
 968  *
 969  * \param x The value to calculate erfc(x) for.
 970  * \result erfc(x)
 971  *
 972  * This routine achieves full precision (bar the last bit) over most of the
 973  * input range, but for large arguments where the result is getting close
 974  * to the minimum representable numbers we accept slightly larger errors
 975  * (think results that are in the ballpark of 10^-30 for single precision)
 976  * since that is not relevant for MD.
 977  */
 978 static inline SimdFloat gmx_simdcall erfc(SimdFloat x)
 979 {
 980     // Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1]
 981     const SimdFloat CA6(7.853861353153693e-5F);
 982     const SimdFloat CA5(-8.010193625184903e-4F);
 983     const SimdFloat CA4(5.188327685732524e-3F);
 984     const SimdFloat CA3(-2.685381193529856e-2F);
 985     const SimdFloat CA2(1.128358514861418e-1F);
 986     const SimdFloat CA1(-3.761262582423300e-1F);
 987     const SimdFloat CA0(1.128379165726710F);
 988     // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2]
 989     const SimdFloat CB9(-0.0018629930017603923F);
 990     const SimdFloat CB8(0.003909821287598495F);
 991     const SimdFloat CB7(-0.0052094582210355615F);
 992     const SimdFloat CB6(0.005685614362160572F);
 993     const SimdFloat CB5(-0.0025367682853477272F);
 994     const SimdFloat CB4(-0.010199799682318782F);
 995     const SimdFloat CB3(0.04369575504816542F);
 996     const SimdFloat CB2(-0.11884063474674492F);
 997     const SimdFloat CB1(0.2732120154030589F);
 998     const SimdFloat CB0(0.42758357702025784F);
 999     // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19]
1000     const SimdFloat CC10(-0.0445555913112064F);
1001     const SimdFloat CC9(0.21376355144663348F);
1002     const SimdFloat CC8(-0.3473187200259257F);
1003     const SimdFloat CC7(0.016690861551248114F);
1004     const SimdFloat CC6(0.7560973182491192F);
1005     const SimdFloat CC5(-1.2137903600145787F);
1006     const SimdFloat CC4(0.8411872321232948F);
1007     const SimdFloat CC3(-0.08670413896296343F);
1008     const SimdFloat CC2(-0.27124782687240334F);
1009     const SimdFloat CC1(-0.0007502488047806069F);
1010     const SimdFloat CC0(0.5642114853803148F);
1011     // Coefficients for expansion of exp(x) in [0,0.1]
1012     // CD0 and CD1 are both 1.0, so no need to declare them separately
1013     const SimdFloat CD2(0.5000066608081202F);
1014     const SimdFloat CD3(0.1664795422874624F);
1015     const SimdFloat CD4(0.04379839977652482F);
1016     const SimdFloat one(1.0F);
1017     const SimdFloat two(2.0F);
1018
1019     /* We need to use a small trick here, since we cannot assume all SIMD
1020      * architectures support integers, and the flag we want (0xfffff000) would
1021      * evaluate to NaN (i.e., it cannot be expressed as a floating-point num).
1022      * Instead, we represent the flags 0xf0f0f000 and 0x0f0f0000 as valid
1023      * fp numbers, and perform a logical or. Since the expression is constant,
1024      * we can at least hope it is evaluated at compile-time.
1025      */
1026 #        if GMX_SIMD_HAVE_LOGICAL
1027     const SimdFloat sieve(SimdFloat(-5.965323564e+29F) | SimdFloat(7.05044434e-30F));
1028 #        else
1029     const int                         isieve = 0xFFFFF000;
1030     alignas(GMX_SIMD_ALIGNMENT) float mem[GMX_SIMD_FLOAT_WIDTH];
1031
1032     union {
1033         float f;
1034         int   i;
1035     } conv;
1036     int i;
1037 #        endif
1038
1039     SimdFloat x2, x4, y;
1040     SimdFloat q, z, t, t2, w, w2;
1041     SimdFloat pA0, pA1, pB0, pB1, pC0, pC1;
1042     SimdFloat expmx2, corr;
1043     SimdFloat res_erf, res_erfc, res;
1044     SimdFBool m, msk_erf;
1045
1046     // Calculate erf()
1047     x2 = x * x;
1048     x4 = x2 * x2;
1049
1050     pA0 = fma(CA6, x4, CA4);
1051     pA1 = fma(CA5, x4, CA3);
1052     pA0 = fma(pA0, x4, CA2);
1053     pA1 = fma(pA1, x4, CA1);
1054     pA1 = pA1 * x2;
1055     pA0 = fma(pA0, x4, pA1);
1056     // Constant term must come last for precision reasons
1057     pA0 = pA0 + CA0;
1058
1059     res_erf = x * pA0;
1060
1061     // Calculate erfc
1062     y       = abs(x);
1063     msk_erf = SimdFloat(0.75F) <= y;
1064     t       = maskzInv(y, msk_erf);
1065     w       = t - one;
1066     t2      = t * t;
1067     w2      = w * w;
1068     /*
1069      * We cannot simply calculate exp(-y2) directly in single precision, since
1070      * that will lose a couple of bits of precision due to the multiplication.
1071      * Instead, we introduce y=z+w, where the last 12 bits of precision are in w.
1072      * Then we get exp(-y2) = exp(-z2)*exp((z-y)*(z+y)).
1073      *
1074      * The only drawback with this is that it requires TWO separate exponential
1075      * evaluations, which would be horrible performance-wise. However, the argument
1076      * for the second exp() call is always small, so there we simply use a
1077      * low-order minimax expansion on [0,0.1].
1078      *
1079      * However, this neat idea requires support for logical ops (and) on
1080      * FP numbers, which some vendors decided isn't necessary in their SIMD
1081      * instruction sets (Hi, IBM VSX!). In principle we could use some tricks
1082      * in double, but we still need memory as a backup when that is not available,
1083      * and this case is rare enough that we go directly there...
1084      */
1085 #        if GMX_SIMD_HAVE_LOGICAL
1086     z = y & sieve;
1087 #        else
1088     store(mem, y);
1089     for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
1090     {
1091         conv.f = mem[i];
1092         conv.i = conv.i & isieve;
1093         mem[i] = conv.f;
1094     }
1095     z = load<SimdFloat>(mem);
1096 #        endif
1097     q    = (z - y) * (z + y);
1098     corr = fma(CD4, q, CD3);
1099     corr = fma(corr, q, CD2);
1100     corr = fma(corr, q, one);
1101     corr = fma(corr, q, one);
1102
1103     expmx2 = exp(-z * z);
1104     expmx2 = expmx2 * corr;
1105
1106     pB1 = fma(CB9, w2, CB7);
1107     pB0 = fma(CB8, w2, CB6);
1108     pB1 = fma(pB1, w2, CB5);
1109     pB0 = fma(pB0, w2, CB4);
1110     pB1 = fma(pB1, w2, CB3);
1111     pB0 = fma(pB0, w2, CB2);
1112     pB1 = fma(pB1, w2, CB1);
1113     pB0 = fma(pB0, w2, CB0);
1114     pB0 = fma(pB1, w, pB0);
1115
1116     pC0 = fma(CC10, t2, CC8);
1117     pC1 = fma(CC9, t2, CC7);
1118     pC0 = fma(pC0, t2, CC6);
1119     pC1 = fma(pC1, t2, CC5);
1120     pC0 = fma(pC0, t2, CC4);
1121     pC1 = fma(pC1, t2, CC3);
1122     pC0 = fma(pC0, t2, CC2);
1123     pC1 = fma(pC1, t2, CC1);
1124
1125     pC0 = fma(pC0, t2, CC0);
1126     pC0 = fma(pC1, t, pC0);
1127     pC0 = pC0 * t;
1128
1129     // Select pB0 or pC0 for erfc()
1130     m        = two < y;
1131     res_erfc = blend(pB0, pC0, m);
1132     res_erfc = res_erfc * expmx2;
1133
1134     // erfc(x<0) = 2-erfc(|x|)
1135     m        = x < setZero();
1136     res_erfc = blend(res_erfc, two - res_erfc, m);
1137
1138     // Select erf() or erfc()
1139     res = blend(one - res_erf, res_erfc, msk_erf);
1140
1141     return res;
1142 }
1143
1144 /*! \brief SIMD float sin \& cos.
1145  *
1146  * \param x The argument to evaluate sin/cos for
1147  * \param[out] sinval Sin(x)
1148  * \param[out] cosval Cos(x)
1149  *
1150  * This version achieves close to machine precision, but for very large
1151  * magnitudes of the argument we inherently begin to lose accuracy due to the
1152  * argument reduction, despite using extended precision arithmetics internally.
1153  */
1154 static inline void gmx_simdcall sincos(SimdFloat x, SimdFloat* sinval, SimdFloat* cosval)
1155 {
1156     // Constants to subtract Pi/4*x from y while minimizing precision loss
1157     const SimdFloat argred0(-1.5703125);
1158     const SimdFloat argred1(-4.83751296997070312500e-04F);
1159     const SimdFloat argred2(-7.54953362047672271729e-08F);
1160     const SimdFloat argred3(-2.56334406825708960298e-12F);
1161     const SimdFloat two_over_pi(static_cast<float>(2.0F / M_PI));
1162     const SimdFloat const_sin2(-1.9515295891e-4F);
1163     const SimdFloat const_sin1(8.3321608736e-3F);
1164     const SimdFloat const_sin0(-1.6666654611e-1F);
1165     const SimdFloat const_cos2(2.443315711809948e-5F);
1166     const SimdFloat const_cos1(-1.388731625493765e-3F);
1167     const SimdFloat const_cos0(4.166664568298827e-2F);
1168     const SimdFloat half(0.5F);
1169     const SimdFloat one(1.0F);
1170     SimdFloat       ssign, csign;
1171     SimdFloat       x2, y, z, psin, pcos, sss, ccc;
1172     SimdFBool       m;
1173
1174 #        if GMX_SIMD_HAVE_FINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
1175     const SimdFInt32 ione(1);
1176     const SimdFInt32 itwo(2);
1177     SimdFInt32       iy;
1178
1179     z  = x * two_over_pi;
1180     iy = cvtR2I(z);
1181     y  = round(z);
1182
1183     m     = cvtIB2B((iy & ione) == SimdFInt32(0));
1184     ssign = selectByMask(SimdFloat(GMX_FLOAT_NEGZERO), cvtIB2B((iy & itwo) == itwo));
1185     csign = selectByMask(SimdFloat(GMX_FLOAT_NEGZERO), cvtIB2B(((iy + ione) & itwo) == itwo));
1186 #        else
1187     const SimdFloat quarter(0.25f);
1188     const SimdFloat minusquarter(-0.25f);
1189     SimdFloat       q;
1190     SimdFBool       m1, m2, m3;
1191
1192     /* The most obvious way to find the arguments quadrant in the unit circle
1193      * to calculate the sign is to use integer arithmetic, but that is not
1194      * present in all SIMD implementations. As an alternative, we have devised a
1195      * pure floating-point algorithm that uses truncation for argument reduction
1196      * so that we get a new value 0<=q<1 over the unit circle, and then
1197      * do floating-point comparisons with fractions. This is likely to be
1198      * slightly slower (~10%) due to the longer latencies of floating-point, so
1199      * we only use it when integer SIMD arithmetic is not present.
1200      */
1201     ssign = x;
1202     x     = abs(x);
1203     // It is critical that half-way cases are rounded down
1204     z = fma(x, two_over_pi, half);
1205     y = trunc(z);
1206     q = z * quarter;
1207     q = q - trunc(q);
1208     /* z now starts at 0.0 for x=-pi/4 (although neg. values cannot occur), and
1209      * then increased by 1.0 as x increases by 2*Pi, when it resets to 0.0.
1210      * This removes the 2*Pi periodicity without using any integer arithmetic.
1211      * First check if y had the value 2 or 3, set csign if true.
1212      */
1213     q = q - half;
1214     /* If we have logical operations we can work directly on the signbit, which
1215      * saves instructions. Otherwise we need to represent signs as +1.0/-1.0.
1216      * Thus, if you are altering defines to debug alternative code paths, the
1217      * two GMX_SIMD_HAVE_LOGICAL sections in this routine must either both be
1218      * active or inactive - you will get errors if only one is used.
1219      */
1220 #            if GMX_SIMD_HAVE_LOGICAL
1221     ssign = ssign & SimdFloat(GMX_FLOAT_NEGZERO);
1222     csign = andNot(q, SimdFloat(GMX_FLOAT_NEGZERO));
1223     ssign = ssign ^ csign;
1224 #            else
1225     ssign = copysign(SimdFloat(1.0f), ssign);
1226     csign = copysign(SimdFloat(1.0f), q);
1227     csign = -csign;
1228     ssign = ssign * csign; // swap ssign if csign was set.
1229 #            endif
1230     // Check if y had value 1 or 3 (remember we subtracted 0.5 from q)
1231     m1 = (q < minusquarter);
1232     m2 = (setZero() <= q);
1233     m3 = (q < quarter);
1234     m2 = m2 && m3;
1235     m  = m1 || m2;
1236     // where mask is FALSE, swap sign.
1237     csign   = csign * blend(SimdFloat(-1.0f), one, m);
1238 #        endif
1239     x  = fma(y, argred0, x);
1240     x  = fma(y, argred1, x);
1241     x  = fma(y, argred2, x);
1242     x  = fma(y, argred3, x);
1243     x2 = x * x;
1244
1245     psin = fma(const_sin2, x2, const_sin1);
1246     psin = fma(psin, x2, const_sin0);
1247     psin = fma(psin, x * x2, x);
1248     pcos = fma(const_cos2, x2, const_cos1);
1249     pcos = fma(pcos, x2, const_cos0);
1250     pcos = fms(pcos, x2, half);
1251     pcos = fma(pcos, x2, one);
1252
1253     sss = blend(pcos, psin, m);
1254     ccc = blend(psin, pcos, m);
1255     // See comment for GMX_SIMD_HAVE_LOGICAL section above.
1256 #        if GMX_SIMD_HAVE_LOGICAL
1257     *sinval = sss ^ ssign;
1258     *cosval = ccc ^ csign;
1259 #        else
1260     *sinval = sss * ssign;
1261     *cosval = ccc * csign;
1262 #        endif
1263 }
1264
1265 /*! \brief SIMD float sin(x).
1266  *
1267  * \param x The argument to evaluate sin for
1268  * \result Sin(x)
1269  *
1270  * \attention Do NOT call both sin & cos if you need both results, since each of them
1271  * will then call \ref sincos and waste a factor 2 in performance.
1272  */
1273 static inline SimdFloat gmx_simdcall sin(SimdFloat x)
1274 {
1275     SimdFloat s, c;
1276     sincos(x, &s, &c);
1277     return s;
1278 }
1279
1280 /*! \brief SIMD float cos(x).
1281  *
1282  * \param x The argument to evaluate cos for
1283  * \result Cos(x)
1284  *
1285  * \attention Do NOT call both sin & cos if you need both results, since each of them
1286  * will then call \ref sincos and waste a factor 2 in performance.
1287  */
1288 static inline SimdFloat gmx_simdcall cos(SimdFloat x)
1289 {
1290     SimdFloat s, c;
1291     sincos(x, &s, &c);
1292     return c;
1293 }
1294
1295 /*! \brief SIMD float tan(x).
1296  *
1297  * \param x The argument to evaluate tan for
1298  * \result Tan(x)
1299  */
1300 static inline SimdFloat gmx_simdcall tan(SimdFloat x)
1301 {
1302     const SimdFloat argred0(-1.5703125);
1303     const SimdFloat argred1(-4.83751296997070312500e-04F);
1304     const SimdFloat argred2(-7.54953362047672271729e-08F);
1305     const SimdFloat argred3(-2.56334406825708960298e-12F);
1306     const SimdFloat two_over_pi(static_cast<float>(2.0F / M_PI));
1307     const SimdFloat CT6(0.009498288995810566122993911);
1308     const SimdFloat CT5(0.002895755790837379295226923);
1309     const SimdFloat CT4(0.02460087336161924491836265);
1310     const SimdFloat CT3(0.05334912882656359828045988);
1311     const SimdFloat CT2(0.1333989091464957704418495);
1312     const SimdFloat CT1(0.3333307599244198227797507);
1313
1314     SimdFloat x2, p, y, z;
1315     SimdFBool m;
1316
1317 #        if GMX_SIMD_HAVE_FINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
1318     SimdFInt32 iy;
1319     SimdFInt32 ione(1);
1320
1321     z  = x * two_over_pi;
1322     iy = cvtR2I(z);
1323     y  = round(z);
1324     m  = cvtIB2B((iy & ione) == ione);
1325
1326     x = fma(y, argred0, x);
1327     x = fma(y, argred1, x);
1328     x = fma(y, argred2, x);
1329     x = fma(y, argred3, x);
1330     x = selectByMask(SimdFloat(GMX_FLOAT_NEGZERO), m) ^ x;
1331 #        else
1332     const SimdFloat quarter(0.25f);
1333     const SimdFloat half(0.5f);
1334     const SimdFloat threequarter(0.75f);
1335     SimdFloat       w, q;
1336     SimdFBool       m1, m2, m3;
1337
1338     w     = abs(x);
1339     z     = fma(w, two_over_pi, half);
1340     y     = trunc(z);
1341     q     = z * quarter;
1342     q     = q - trunc(q);
1343     m1    = quarter <= q;
1344     m2    = q < half;
1345     m3    = threequarter <= q;
1346     m1    = m1 && m2;
1347     m     = m1 || m3;
1348     w     = fma(y, argred0, w);
1349     w     = fma(y, argred1, w);
1350     w     = fma(y, argred2, w);
1351     w     = fma(y, argred3, w);
1352     w     = blend(w, -w, m);
1353     x     = w * copysign(SimdFloat(1.0), x);
1354 #        endif
1355     x2 = x * x;
1356     p  = fma(CT6, x2, CT5);
1357     p  = fma(p, x2, CT4);
1358     p  = fma(p, x2, CT3);
1359     p  = fma(p, x2, CT2);
1360     p  = fma(p, x2, CT1);
1361     p  = fma(x2 * p, x, x);
1362
1363     p = blend(p, maskzInv(p, m), m);
1364     return p;
1365 }
1366
1367 /*! \brief SIMD float asin(x).
1368  *
1369  * \param x The argument to evaluate asin for
1370  * \result Asin(x)
1371  */
1372 static inline SimdFloat gmx_simdcall asin(SimdFloat x)
1373 {
1374     const SimdFloat limitlow(1e-4F);
1375     const SimdFloat half(0.5F);
1376     const SimdFloat one(1.0F);
1377     const SimdFloat halfpi(static_cast<float>(M_PI / 2.0F));
1378     const SimdFloat CC5(4.2163199048E-2F);
1379     const SimdFloat CC4(2.4181311049E-2F);
1380     const SimdFloat CC3(4.5470025998E-2F);
1381     const SimdFloat CC2(7.4953002686E-2F);
1382     const SimdFloat CC1(1.6666752422E-1F);
1383     SimdFloat       xabs;
1384     SimdFloat       z, z1, z2, q, q1, q2;
1385     SimdFloat       pA, pB;
1386     SimdFBool       m, m2;
1387
1388     xabs = abs(x);
1389     m    = half < xabs;
1390     z1   = half * (one - xabs);
1391     m2   = xabs < one;
1392     q1   = z1 * maskzInvsqrt(z1, m2);
1393     q2   = xabs;
1394     z2   = q2 * q2;
1395     z    = blend(z2, z1, m);
1396     q    = blend(q2, q1, m);
1397
1398     z2 = z * z;
1399     pA = fma(CC5, z2, CC3);
1400     pB = fma(CC4, z2, CC2);
1401     pA = fma(pA, z2, CC1);
1402     pA = pA * z;
1403     z  = fma(pB, z2, pA);
1404     z  = fma(z, q, q);
1405     q2 = halfpi - z;
1406     q2 = q2 - z;
1407     z  = blend(z, q2, m);
1408
1409     m = limitlow < xabs;
1410     z = blend(xabs, z, m);
1411     z = copysign(z, x);
1412
1413     return z;
1414 }
1415
1416 /*! \brief SIMD float acos(x).
1417  *
1418  * \param x The argument to evaluate acos for
1419  * \result Acos(x)
1420  */
1421 static inline SimdFloat gmx_simdcall acos(SimdFloat x)
1422 {
1423     const SimdFloat one(1.0F);
1424     const SimdFloat half(0.5F);
1425     const SimdFloat pi(static_cast<float>(M_PI));
1426     const SimdFloat halfpi(static_cast<float>(M_PI / 2.0F));
1427     SimdFloat       xabs;
1428     SimdFloat       z, z1, z2, z3;
1429     SimdFBool       m1, m2, m3;
1430
1431     xabs = abs(x);
1432     m1   = half < xabs;
1433     m2   = setZero() < x;
1434
1435     z  = fnma(half, xabs, half);
1436     m3 = xabs < one;
1437     z  = z * maskzInvsqrt(z, m3);
1438     z  = blend(x, z, m1);
1439     z  = asin(z);
1440
1441     z2 = z + z;
1442     z1 = pi - z2;
1443     z3 = halfpi - z;
1444     z  = blend(z1, z2, m2);
1445     z  = blend(z3, z, m1);
1446
1447     return z;
1448 }
1449
1450 /*! \brief SIMD float asin(x).
1451  *
1452  * \param x The argument to evaluate atan for
1453  * \result Atan(x), same argument/value range as standard math library.
1454  */
1455 static inline SimdFloat gmx_simdcall atan(SimdFloat x)
1456 {
1457     const SimdFloat halfpi(static_cast<float>(M_PI / 2.0F));
1458     const SimdFloat CA17(0.002823638962581753730774F);
1459     const SimdFloat CA15(-0.01595690287649631500244F);
1460     const SimdFloat CA13(0.04250498861074447631836F);
1461     const SimdFloat CA11(-0.07489009201526641845703F);
1462     const SimdFloat CA9(0.1063479334115982055664F);
1463     const SimdFloat CA7(-0.1420273631811141967773F);
1464     const SimdFloat CA5(0.1999269574880599975585F);
1465     const SimdFloat CA3(-0.3333310186862945556640F);
1466     const SimdFloat one(1.0F);
1467     SimdFloat       x2, x3, x4, pA, pB;
1468     SimdFBool       m, m2;
1469
1470     m  = x < setZero();
1471     x  = abs(x);
1472     m2 = one < x;
1473     x  = blend(x, maskzInv(x, m2), m2);
1474
1475     x2 = x * x;
1476     x3 = x2 * x;
1477     x4 = x2 * x2;
1478     pA = fma(CA17, x4, CA13);
1479     pB = fma(CA15, x4, CA11);
1480     pA = fma(pA, x4, CA9);
1481     pB = fma(pB, x4, CA7);
1482     pA = fma(pA, x4, CA5);
1483     pB = fma(pB, x4, CA3);
1484     pA = fma(pA, x2, pB);
1485     pA = fma(pA, x3, x);
1486
1487     pA = blend(pA, halfpi - pA, m2);
1488     pA = blend(pA, -pA, m);
1489
1490     return pA;
1491 }
1492
1493 /*! \brief SIMD float atan2(y,x).
1494  *
1495  * \param y Y component of vector, any quartile
1496  * \param x X component of vector, any quartile
1497  * \result Atan(y,x), same argument/value range as standard math library.
1498  *
1499  * \note This routine should provide correct results for all finite
1500  * non-zero or positive-zero arguments. However, negative zero arguments will
1501  * be treated as positive zero, which means the return value will deviate from
1502  * the standard math library atan2(y,x) for those cases. That should not be
1503  * of any concern in Gromacs, and in particular it will not affect calculations
1504  * of angles from vectors.
1505  */
1506 static inline SimdFloat gmx_simdcall atan2(SimdFloat y, SimdFloat x)
1507 {
1508     const SimdFloat pi(static_cast<float>(M_PI));
1509     const SimdFloat halfpi(static_cast<float>(M_PI / 2.0));
1510     SimdFloat       xinv, p, aoffset;
1511     SimdFBool       mask_xnz, mask_ynz, mask_xlt0, mask_ylt0;
1512
1513     mask_xnz  = x != setZero();
1514     mask_ynz  = y != setZero();
1515     mask_xlt0 = x < setZero();
1516     mask_ylt0 = y < setZero();
1517
1518     aoffset = selectByNotMask(halfpi, mask_xnz);
1519     aoffset = selectByMask(aoffset, mask_ynz);
1520
1521     aoffset = blend(aoffset, pi, mask_xlt0);
1522     aoffset = blend(aoffset, -aoffset, mask_ylt0);
1523
1524     xinv = maskzInv(x, mask_xnz);
1525     p    = y * xinv;
1526     p    = atan(p);
1527     p    = p + aoffset;
1528
1529     return p;
1530 }
1531
1532 /*! \brief Calculate the force correction due to PME analytically in SIMD float.
1533  *
1534  * \param z2 \f$(r \beta)^2\f$ - see below for details.
1535  * \result Correction factor to coulomb force - see below for details.
1536  *
1537  * This routine is meant to enable analytical evaluation of the
1538  * direct-space PME electrostatic force to avoid tables.
1539  *
1540  * The direct-space potential should be \f$ \mbox{erfc}(\beta r)/r\f$, but there
1541  * are some problems evaluating that:
1542  *
1543  * First, the error function is difficult (read: expensive) to
1544  * approxmiate accurately for intermediate to large arguments, and
1545  * this happens already in ranges of \f$(\beta r)\f$ that occur in simulations.
1546  * Second, we now try to avoid calculating potentials in Gromacs but
1547  * use forces directly.
1548  *
1549  * We can simply things slight by noting that the PME part is really
1550  * a correction to the normal Coulomb force since \f$\mbox{erfc}(z)=1-\mbox{erf}(z)\f$, i.e.
1551  * \f[
1552  * V = \frac{1}{r} - \frac{\mbox{erf}(\beta r)}{r}
1553  * \f]
1554  * The first term we already have from the inverse square root, so
1555  * that we can leave out of this routine.
1556  *
1557  * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
1558  * the argument \f$beta r\f$ will be in the range 0.15 to ~4, which is
1559  * the range used for the minimax fit. Use your favorite plotting program
1560  * to realize how well-behaved \f$\frac{\mbox{erf}(z)}{z}\f$ is in this range!
1561  *
1562  * We approximate \f$f(z)=\mbox{erf}(z)/z\f$ with a rational minimax polynomial.
1563  * However, it turns out it is more efficient to approximate \f$f(z)/z\f$ and
1564  * then only use even powers. This is another minor optimization, since
1565  * we actually \a want \f$f(z)/z\f$, because it is going to be multiplied by
1566  * the vector between the two atoms to get the vectorial force. The
1567  * fastest flops are the ones we can avoid calculating!
1568  *
1569  * So, here's how it should be used:
1570  *
1571  * 1. Calculate \f$r^2\f$.
1572  * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=(\beta r)^2\f$.
1573  * 3. Evaluate this routine with \f$z^2\f$ as the argument.
1574  * 4. The return value is the expression:
1575  *
1576  * \f[
1577  *    \frac{2 \exp{-z^2}}{\sqrt{\pi} z^2}-\frac{\mbox{erf}(z)}{z^3}
1578  * \f]
1579  *
1580  * 5. Multiply the entire expression by \f$\beta^3\f$. This will get you
1581  *
1582  *  \f[
1583  *    \frac{2 \beta^3 \exp(-z^2)}{\sqrt{\pi} z^2} - \frac{\beta^3 \mbox{erf}(z)}{z^3}
1584  *  \f]
1585  *
1586  *    or, switching back to \f$r\f$ (since \f$z=r \beta\f$):
1587  *
1588  *  \f[
1589  *    \frac{2 \beta \exp(-r^2 \beta^2)}{\sqrt{\pi} r^2} - \frac{\mbox{erf}(r \beta)}{r^3}
1590  *  \f]
1591  *
1592  *    With a bit of math exercise you should be able to confirm that
1593  *    this is exactly
1594  *
1595  *  \f[
1596  *   \frac{\frac{d}{dr}\left( \frac{\mbox{erf}(\beta r)}{r} \right)}{r}
1597  *  \f]
1598  *
1599  * 6. Add the result to \f$r^{-3}\f$, multiply by the product of the charges,
1600  *    and you have your force (divided by \f$r\f$). A final multiplication
1601  *    with the vector connecting the two particles and you have your
1602  *    vectorial force to add to the particles.
1603  *
1604  * This approximation achieves an error slightly lower than 1e-6
1605  * in single precision and 1e-11 in double precision
1606  * for arguments smaller than 16 (\f$\beta r \leq 4 \f$);
1607  * when added to \f$1/r\f$ the error will be insignificant.
1608  * For \f$\beta r \geq 7206\f$ the return value can be inf or NaN.
1609  *
1610  */
1611 static inline SimdFloat gmx_simdcall pmeForceCorrection(SimdFloat z2)
1612 {
1613     const SimdFloat FN6(-1.7357322914161492954e-8F);
1614     const SimdFloat FN5(1.4703624142580877519e-6F);
1615     const SimdFloat FN4(-0.000053401640219807709149F);
1616     const SimdFloat FN3(0.0010054721316683106153F);
1617     const SimdFloat FN2(-0.019278317264888380590F);
1618     const SimdFloat FN1(0.069670166153766424023F);
1619     const SimdFloat FN0(-0.75225204789749321333F);
1620
1621     const SimdFloat FD4(0.0011193462567257629232F);
1622     const SimdFloat FD3(0.014866955030185295499F);
1623     const SimdFloat FD2(0.11583842382862377919F);
1624     const SimdFloat FD1(0.50736591960530292870F);
1625     const SimdFloat FD0(1.0F);
1626
1627     SimdFloat z4;
1628     SimdFloat polyFN0, polyFN1, polyFD0, polyFD1;
1629
1630     z4 = z2 * z2;
1631
1632     polyFD0 = fma(FD4, z4, FD2);
1633     polyFD1 = fma(FD3, z4, FD1);
1634     polyFD0 = fma(polyFD0, z4, FD0);
1635     polyFD0 = fma(polyFD1, z2, polyFD0);
1636
1637     polyFD0 = inv(polyFD0);
1638
1639     polyFN0 = fma(FN6, z4, FN4);
1640     polyFN1 = fma(FN5, z4, FN3);
1641     polyFN0 = fma(polyFN0, z4, FN2);
1642     polyFN1 = fma(polyFN1, z4, FN1);
1643     polyFN0 = fma(polyFN0, z4, FN0);
1644     polyFN0 = fma(polyFN1, z2, polyFN0);
1645
1646     return polyFN0 * polyFD0;
1647 }
1648
1649
1650 /*! \brief Calculate the potential correction due to PME analytically in SIMD float.
1651  *
1652  * \param z2 \f$(r \beta)^2\f$ - see below for details.
1653  * \result Correction factor to coulomb potential - see below for details.
1654  *
1655  * See \ref pmeForceCorrection for details about the approximation.
1656  *
1657  * This routine calculates \f$\mbox{erf}(z)/z\f$, although you should provide \f$z^2\f$
1658  * as the input argument.
1659  *
1660  * Here's how it should be used:
1661  *
1662  * 1. Calculate \f$r^2\f$.
1663  * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=\beta^2*r^2\f$.
1664  * 3. Evaluate this routine with z^2 as the argument.
1665  * 4. The return value is the expression:
1666  *
1667  *  \f[
1668  *   \frac{\mbox{erf}(z)}{z}
1669  *  \f]
1670  *
1671  * 5. Multiply the entire expression by beta and switching back to \f$r\f$ (since \f$z=r \beta\f$):
1672  *
1673  *  \f[
1674  *    \frac{\mbox{erf}(r \beta)}{r}
1675  *  \f]
1676  *
1677  * 6. Subtract the result from \f$1/r\f$, multiply by the product of the charges,
1678  *    and you have your potential.
1679  *
1680  * This approximation achieves an error slightly lower than 1e-6
1681  * in single precision and 4e-11 in double precision
1682  * for arguments smaller than 16 (\f$ 0.15 \leq \beta r \leq 4 \f$);
1683  * for \f$ \beta r \leq 0.15\f$ the error can be twice as high;
1684  * when added to \f$1/r\f$ the error will be insignificant.
1685  * For \f$\beta r \geq 7142\f$ the return value can be inf or NaN.
1686  */
1687 static inline SimdFloat gmx_simdcall pmePotentialCorrection(SimdFloat z2)
1688 {
1689     const SimdFloat VN6(1.9296833005951166339e-8F);
1690     const SimdFloat VN5(-1.4213390571557850962e-6F);
1691     const SimdFloat VN4(0.000041603292906656984871F);
1692     const SimdFloat VN3(-0.00013134036773265025626F);
1693     const SimdFloat VN2(0.038657983986041781264F);
1694     const SimdFloat VN1(0.11285044772717598220F);
1695     const SimdFloat VN0(1.1283802385263030286F);
1696
1697     const SimdFloat VD3(0.0066752224023576045451F);
1698     const SimdFloat VD2(0.078647795836373922256F);
1699     const SimdFloat VD1(0.43336185284710920150F);
1700     const SimdFloat VD0(1.0F);
1701
1702     SimdFloat z4;
1703     SimdFloat polyVN0, polyVN1, polyVD0, polyVD1;
1704
1705     z4 = z2 * z2;
1706
1707     polyVD1 = fma(VD3, z4, VD1);
1708     polyVD0 = fma(VD2, z4, VD0);
1709     polyVD0 = fma(polyVD1, z2, polyVD0);
1710
1711     polyVD0 = inv(polyVD0);
1712
1713     polyVN0 = fma(VN6, z4, VN4);
1714     polyVN1 = fma(VN5, z4, VN3);
1715     polyVN0 = fma(polyVN0, z4, VN2);
1716     polyVN1 = fma(polyVN1, z4, VN1);
1717     polyVN0 = fma(polyVN0, z4, VN0);
1718     polyVN0 = fma(polyVN1, z2, polyVN0);
1719
1720     return polyVN0 * polyVD0;
1721 }
1722 #    endif
1723
1724 /*! \} */
1725
1726 #    if GMX_SIMD_HAVE_DOUBLE
1727
1728
1729 /*! \name Double precision SIMD math functions
1730  *
1731  *  \note In most cases you should use the real-precision functions instead.
1732  *  \{
1733  */
1734
1735 /****************************************
1736  * DOUBLE PRECISION SIMD MATH FUNCTIONS *
1737  ****************************************/
1738
1739 #        if !GMX_SIMD_HAVE_NATIVE_COPYSIGN_DOUBLE
1740 /*! \brief Composes floating point value with the magnitude of x and the sign of y.
1741  *
1742  * \param x Values to set sign for
1743  * \param y Values used to set sign
1744  * \return  Magnitude of x, sign of y
1745  */
1746 static inline SimdDouble gmx_simdcall copysign(SimdDouble x, SimdDouble y)
1747 {
1748 #            if GMX_SIMD_HAVE_LOGICAL
1749     return abs(x) | (SimdDouble(GMX_DOUBLE_NEGZERO) & y);
1750 #            else
1751     return blend(abs(x), -abs(x), (y < setZero()));
1752 #            endif
1753 }
1754 #        endif
1755
1756 #        if !GMX_SIMD_HAVE_NATIVE_RSQRT_ITER_DOUBLE
1757 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD double.
1758  *
1759  * This is a low-level routine that should only be used by SIMD math routine
1760  * that evaluates the inverse square root.
1761  *
1762  *  \param lu Approximation of 1/sqrt(x), typically obtained from lookup.
1763  *  \param x  The reference (starting) value x for which we want 1/sqrt(x).
1764  *  \return   An improved approximation with roughly twice as many bits of accuracy.
1765  */
1766 static inline SimdDouble gmx_simdcall rsqrtIter(SimdDouble lu, SimdDouble x)
1767 {
1768     SimdDouble tmp1 = x * lu;
1769     SimdDouble tmp2 = SimdDouble(-0.5) * lu;
1770     tmp1            = fma(tmp1, lu, SimdDouble(-3.0));
1771     return tmp1 * tmp2;
1772 }
1773 #        endif
1774
1775 /*! \brief Calculate 1/sqrt(x) for SIMD double.
1776  *
1777  *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
1778  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
1779  *           For the single precision implementation this is obviously always
1780  *           true for positive values, but for double precision it adds an
1781  *           extra restriction since the first lookup step might have to be
1782  *           performed in single precision on some architectures. Note that the
1783  *           responsibility for checking falls on you - this routine does not
1784  *           check arguments.
1785  *
1786  *  \return 1/sqrt(x). Result is undefined if your argument was invalid.
1787  */
1788 static inline SimdDouble gmx_simdcall invsqrt(SimdDouble x)
1789 {
1790     SimdDouble lu = rsqrt(x);
1791 #        if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1792     lu = rsqrtIter(lu, x);
1793 #        endif
1794 #        if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1795     lu = rsqrtIter(lu, x);
1796 #        endif
1797 #        if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1798     lu = rsqrtIter(lu, x);
1799 #        endif
1800 #        if (GMX_SIMD_RSQRT_BITS * 8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1801     lu = rsqrtIter(lu, x);
1802 #        endif
1803     return lu;
1804 }
1805
1806 /*! \brief Calculate 1/sqrt(x) for two SIMD doubles.
1807  *
1808  * \param x0  First set of arguments, x0 must be in single range (see below).
1809  * \param x1  Second set of arguments, x1 must be in single range (see below).
1810  * \param[out] out0  Result 1/sqrt(x0)
1811  * \param[out] out1  Result 1/sqrt(x1)
1812  *
1813  *  In particular for double precision we can sometimes calculate square root
1814  *  pairs slightly faster by using single precision until the very last step.
1815  *
1816  * \note Both arguments must be larger than GMX_FLOAT_MIN and smaller than
1817  *       GMX_FLOAT_MAX, i.e. within the range of single precision.
1818  *       For the single precision implementation this is obviously always
1819  *       true for positive values, but for double precision it adds an
1820  *       extra restriction since the first lookup step might have to be
1821  *       performed in single precision on some architectures. Note that the
1822  *       responsibility for checking falls on you - this routine does not
1823  *       check arguments.
1824  */
1825 static inline void gmx_simdcall invsqrtPair(SimdDouble x0, SimdDouble x1, SimdDouble* out0, SimdDouble* out1)
1826 {
1827 #        if GMX_SIMD_HAVE_FLOAT && (GMX_SIMD_FLOAT_WIDTH == 2 * GMX_SIMD_DOUBLE_WIDTH) \
1828                 && (GMX_SIMD_RSQRT_BITS < 22)
1829     SimdFloat  xf  = cvtDD2F(x0, x1);
1830     SimdFloat  luf = rsqrt(xf);
1831     SimdDouble lu0, lu1;
1832     // Intermediate target is single - mantissa+1 bits
1833 #            if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
1834     luf = rsqrtIter(luf, xf);
1835 #            endif
1836 #            if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
1837     luf = rsqrtIter(luf, xf);
1838 #            endif
1839 #            if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
1840     luf = rsqrtIter(luf, xf);
1841 #            endif
1842     cvtF2DD(luf, &lu0, &lu1);
1843     // Last iteration(s) performed in double - if we had 22 bits, this gets us to 44 (~1e-15)
1844 #            if (GMX_SIMD_ACCURACY_BITS_SINGLE < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1845     lu0 = rsqrtIter(lu0, x0);
1846     lu1 = rsqrtIter(lu1, x1);
1847 #            endif
1848 #            if (GMX_SIMD_ACCURACY_BITS_SINGLE * 2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1849     lu0 = rsqrtIter(lu0, x0);
1850     lu1 = rsqrtIter(lu1, x1);
1851 #            endif
1852     *out0 = lu0;
1853     *out1 = lu1;
1854 #        else
1855     *out0 = invsqrt(x0);
1856     *out1 = invsqrt(x1);
1857 #        endif
1858 }
1859
1860 #        if !GMX_SIMD_HAVE_NATIVE_RCP_ITER_DOUBLE
1861 /*! \brief Perform one Newton-Raphson iteration to improve 1/x for SIMD double.
1862  *
1863  * This is a low-level routine that should only be used by SIMD math routine
1864  * that evaluates the reciprocal.
1865  *
1866  *  \param lu Approximation of 1/x, typically obtained from lookup.
1867  *  \param x  The reference (starting) value x for which we want 1/x.
1868  *  \return   An improved approximation with roughly twice as many bits of accuracy.
1869  */
1870 static inline SimdDouble gmx_simdcall rcpIter(SimdDouble lu, SimdDouble x)
1871 {
1872     return lu * fnma(lu, x, SimdDouble(2.0));
1873 }
1874 #        endif
1875
1876 /*! \brief Calculate 1/x for SIMD double.
1877  *
1878  *  \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
1879  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
1880  *           For the single precision implementation this is obviously always
1881  *           true for positive values, but for double precision it adds an
1882  *           extra restriction since the first lookup step might have to be
1883  *           performed in single precision on some architectures. Note that the
1884  *           responsibility for checking falls on you - this routine does not
1885  *           check arguments.
1886  *
1887  *  \return 1/x. Result is undefined if your argument was invalid.
1888  */
1889 static inline SimdDouble gmx_simdcall inv(SimdDouble x)
1890 {
1891     SimdDouble lu = rcp(x);
1892 #        if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1893     lu = rcpIter(lu, x);
1894 #        endif
1895 #        if (GMX_SIMD_RCP_BITS * 2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1896     lu = rcpIter(lu, x);
1897 #        endif
1898 #        if (GMX_SIMD_RCP_BITS * 4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1899     lu = rcpIter(lu, x);
1900 #        endif
1901 #        if (GMX_SIMD_RCP_BITS * 8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1902     lu = rcpIter(lu, x);
1903 #        endif
1904     return lu;
1905 }
1906
1907 /*! \brief Division for SIMD doubles
1908  *
1909  * \param nom    Nominator
1910  * \param denom  Denominator, with magnitude in range (GMX_FLOAT_MIN,GMX_FLOAT_MAX).
1911  *               For single precision this is equivalent to a nonzero argument,
1912  *               but in double precision it adds an extra restriction since
1913  *               the first lookup step might have to be performed in single
1914  *               precision on some architectures. Note that the responsibility
1915  *               for checking falls on you - this routine does not check arguments.
1916  *
1917  * \return nom/denom
1918  *
1919  * \note This function does not use any masking to avoid problems with
1920  *       zero values in the denominator.
1921  */
1922 static inline SimdDouble gmx_simdcall operator/(SimdDouble nom, SimdDouble denom)
1923 {
1924     return nom * inv(denom);
1925 }
1926
1927
1928 /*! \brief Calculate 1/sqrt(x) for masked entries of SIMD double.
1929  *
1930  *  This routine only evaluates 1/sqrt(x) for elements for which mask is true.
1931  *  Illegal values in the masked-out elements will not lead to
1932  *  floating-point exceptions.
1933  *
1934  *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
1935  *           GMX_FLOAT_MAX for masked-in entries.
1936  *           See \ref invsqrt for the discussion about argument restrictions.
1937  *  \param m Mask
1938  *  \return 1/sqrt(x). Result is undefined if your argument was invalid or
1939  *          entry was not masked, and 0.0 for masked-out entries.
1940  */
1941 static inline SimdDouble maskzInvsqrt(SimdDouble x, SimdDBool m)
1942 {
1943     SimdDouble lu = maskzRsqrt(x, m);
1944 #        if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1945     lu = rsqrtIter(lu, x);
1946 #        endif
1947 #        if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1948     lu = rsqrtIter(lu, x);
1949 #        endif
1950 #        if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1951     lu = rsqrtIter(lu, x);
1952 #        endif
1953 #        if (GMX_SIMD_RSQRT_BITS * 8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1954     lu = rsqrtIter(lu, x);
1955 #        endif
1956     return lu;
1957 }
1958
1959 /*! \brief Calculate 1/x for SIMD double, masked version.
1960  *
1961  *  \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
1962  *           GMX_FLOAT_MAX for masked-in entries.
1963  *           See \ref invsqrt for the discussion about argument restrictions.
1964  *  \param m Mask
1965  *  \return 1/x for elements where m is true, or 0.0 for masked-out entries.
1966  */
1967 static inline SimdDouble gmx_simdcall maskzInv(SimdDouble x, SimdDBool m)
1968 {
1969     SimdDouble lu = maskzRcp(x, m);
1970 #        if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1971     lu = rcpIter(lu, x);
1972 #        endif
1973 #        if (GMX_SIMD_RCP_BITS * 2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1974     lu = rcpIter(lu, x);
1975 #        endif
1976 #        if (GMX_SIMD_RCP_BITS * 4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1977     lu = rcpIter(lu, x);
1978 #        endif
1979 #        if (GMX_SIMD_RCP_BITS * 8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1980     lu = rcpIter(lu, x);
1981 #        endif
1982     return lu;
1983 }
1984
1985
1986 /*! \brief Calculate sqrt(x) for SIMD doubles.
1987  *
1988  *  \copydetails sqrt(SimdFloat)
1989  */
1990 template<MathOptimization opt = MathOptimization::Safe>
1991 static inline SimdDouble gmx_simdcall sqrt(SimdDouble x)
1992 {
1993     if (opt == MathOptimization::Safe)
1994     {
1995         // As we might use a float version of rsqrt, we mask out small values
1996         SimdDouble res = maskzInvsqrt(x, SimdDouble(GMX_FLOAT_MIN) < x);
1997         return res * x;
1998     }
1999     else
2000     {
2001         return x * invsqrt(x);
2002     }
2003 }
2004
2005 /*! \brief Cube root for SIMD doubles
2006  *
2007  * \param x      Argument to calculate cube root of. Can be negative or zero,
2008  *               but NaN or Inf values are not supported. Denormal values will
2009  *               be treated as 0.0.
2010  * \return       Cube root of x.
2011  */
2012 static inline SimdDouble gmx_simdcall cbrt(SimdDouble x)
2013 {
2014     const SimdDouble signBit(GMX_DOUBLE_NEGZERO);
2015     const SimdDouble minDouble(std::numeric_limits<double>::min());
2016     // Bias is 1024-1 = 1023, which is divisible by 3, so no need to change it more.
2017     // To avoid clang warnings about fragile integer division mixed with FP, we let
2018     // the divided value (1023/3=341) be the original constant.
2019     const std::int32_t offsetDiv3(341);
2020     const SimdDouble   c6(-0.145263899385486377);
2021     const SimdDouble   c5(0.784932344976639262);
2022     const SimdDouble   c4(-1.83469277483613086);
2023     const SimdDouble   c3(2.44693122563534430);
2024     const SimdDouble   c2(-2.11499494167371287);
2025     const SimdDouble   c1(1.50819193781584896);
2026     const SimdDouble   c0(0.354895765043919860);
2027     const SimdDouble   one(1.0);
2028     const SimdDouble   two(2.0);
2029     const SimdDouble   three(3.0);
2030     const SimdDouble   oneThird(1.0 / 3.0);
2031     const SimdDouble   cbrt2(1.2599210498948731648);
2032     const SimdDouble   sqrCbrt2(1.5874010519681994748);
2033
2034     // See the single precision routines for documentation of the algorithm
2035
2036     SimdDouble xSignBit = x & signBit; // create bit mask where the sign bit is 1 for x elements < 0
2037     SimdDouble xAbs     = andNot(signBit, x);    // select everthing but the sign bit => abs(x)
2038     SimdDBool  xIsNonZero = (minDouble <= xAbs); // treat denormals as 0
2039
2040     SimdDInt32 exponent;
2041     SimdDouble y        = frexp(xAbs, &exponent);
2042     SimdDouble z        = fma(y, c6, c5);
2043     z                   = fma(z, y, c4);
2044     z                   = fma(z, y, c3);
2045     z                   = fma(z, y, c2);
2046     z                   = fma(z, y, c1);
2047     z                   = fma(z, y, c0);
2048     SimdDouble w        = z * z * z;
2049     SimdDouble nom      = z * fma(two, y, w);
2050     SimdDouble invDenom = inv(fma(two, w, y));
2051
2052     SimdDouble offsetExp = cvtI2R(exponent) + SimdDouble(static_cast<double>(3 * offsetDiv3) + 0.1);
2053     SimdDouble offsetExpDiv3 =
2054             trunc(offsetExp * oneThird); // important to truncate here to mimic integer division
2055     SimdDInt32 expDiv3   = cvtR2I(offsetExpDiv3 - SimdDouble(static_cast<double>(offsetDiv3)));
2056     SimdDouble remainder = offsetExp - offsetExpDiv3 * three;
2057     SimdDouble factor    = blend(one, cbrt2, SimdDouble(0.5) < remainder);
2058     factor               = blend(factor, sqrCbrt2, SimdDouble(1.5) < remainder);
2059     SimdDouble fraction  = (nom * invDenom * factor) ^ xSignBit;
2060     SimdDouble result    = selectByMask(ldexp(fraction, expDiv3), xIsNonZero);
2061     return result;
2062 }
2063
2064 /*! \brief Inverse cube root for SIMD doubles.
2065  *
2066  * \param x      Argument to calculate cube root of. Can be positive or
2067  *               negative, but the magnitude cannot be lower than
2068  *               the smallest normal number.
2069  * \return       Cube root of x. Undefined for values that don't
2070  *               fulfill the restriction of abs(x) > minDouble.
2071  */
2072 static inline SimdDouble gmx_simdcall invcbrt(SimdDouble x)
2073 {
2074     const SimdDouble signBit(GMX_DOUBLE_NEGZERO);
2075     // Bias is 1024-1 = 1023, which is divisible by 3, so no need to change it more.
2076     // To avoid clang warnings about fragile integer division mixed with FP, we let
2077     // the divided value (1023/3=341) be the original constant.
2078     const std::int32_t offsetDiv3(341);
2079     const SimdDouble   c6(-0.145263899385486377);
2080     const SimdDouble   c5(0.784932344976639262);
2081     const SimdDouble   c4(-1.83469277483613086);
2082     const SimdDouble   c3(2.44693122563534430);
2083     const SimdDouble   c2(-2.11499494167371287);
2084     const SimdDouble   c1(1.50819193781584896);
2085     const SimdDouble   c0(0.354895765043919860);
2086     const SimdDouble   one(1.0);
2087     const SimdDouble   two(2.0);
2088     const SimdDouble   three(3.0);
2089     const SimdDouble   oneThird(1.0 / 3.0);
2090     const SimdDouble   invCbrt2(1.0 / 1.2599210498948731648);
2091     const SimdDouble   invSqrCbrt2(1.0F / 1.5874010519681994748);
2092
2093     // See the single precision routines for documentation of the algorithm
2094
2095     SimdDouble xSignBit = x & signBit; // create bit mask where the sign bit is 1 for x elements < 0
2096     SimdDouble xAbs     = andNot(signBit, x); // select everthing but the sign bit => abs(x)
2097
2098     SimdDInt32 exponent;
2099     SimdDouble y         = frexp(xAbs, &exponent);
2100     SimdDouble z         = fma(y, c6, c5);
2101     z                    = fma(z, y, c4);
2102     z                    = fma(z, y, c3);
2103     z                    = fma(z, y, c2);
2104     z                    = fma(z, y, c1);
2105     z                    = fma(z, y, c0);
2106     SimdDouble w         = z * z * z;
2107     SimdDouble nom       = fma(two, w, y);
2108     SimdDouble invDenom  = inv(z * fma(two, y, w));
2109     SimdDouble offsetExp = cvtI2R(exponent) + SimdDouble(static_cast<double>(3 * offsetDiv3) + 0.1);
2110     SimdDouble offsetExpDiv3 =
2111             trunc(offsetExp * oneThird); // important to truncate here to mimic integer division
2112     SimdDInt32 expDiv3   = cvtR2I(SimdDouble(static_cast<double>(offsetDiv3)) - offsetExpDiv3);
2113     SimdDouble remainder = offsetExpDiv3 * three - offsetExp;
2114     SimdDouble factor    = blend(one, invCbrt2, remainder < SimdDouble(-0.5));
2115     factor               = blend(factor, invSqrCbrt2, remainder < SimdDouble(-1.5));
2116     SimdDouble fraction  = (nom * invDenom * factor) ^ xSignBit;
2117     SimdDouble result    = ldexp(fraction, expDiv3);
2118     return result;
2119 }
2120
2121 /*! \brief SIMD double log2(x). This is the base-2 logarithm.
2122  *
2123  * \param x Argument, should be >0.
2124  * \result The base-2 logarithm of x. Undefined if argument is invalid.
2125  */
2126 static inline SimdDouble gmx_simdcall log2(SimdDouble x)
2127 {
2128 #        if GMX_SIMD_HAVE_NATIVE_LOG_DOUBLE
2129     // Just rescale if native log2() is not present, but log is.
2130     return log(x) * SimdDouble(std::log2(std::exp(1.0)));
2131 #        else
2132     const SimdDouble one(1.0);
2133     const SimdDouble two(2.0);
2134     const SimdDouble invsqrt2(1.0 / std::sqrt(2.0));
2135     const SimdDouble CL15(0.2138031565795550370534528);
2136     const SimdDouble CL13(0.2208884091496370882801159);
2137     const SimdDouble CL11(0.2623358279761824340958754);
2138     const SimdDouble CL9(0.3205984930182496084327681);
2139     const SimdDouble CL7(0.4121985864521960363227038);
2140     const SimdDouble CL5(0.5770780163410746954610886);
2141     const SimdDouble CL3(0.9617966939260027547931031);
2142     const SimdDouble CL1(2.885390081777926774009302);
2143     SimdDouble       fExp, x2, p;
2144     SimdDBool        m;
2145     SimdDInt32       iExp;
2146
2147     // For log2(), the argument cannot be 0, so use the faster version of frexp()
2148     x    = frexp<MathOptimization::Unsafe>(x, &iExp);
2149     fExp = cvtI2R(iExp);
2150
2151     m = x < invsqrt2;
2152     // Adjust to non-IEEE format for x<1/sqrt(2): exponent -= 1, mantissa *= 2.0
2153     fExp = fExp - selectByMask(one, m);
2154     x    = x * blend(one, two, m);
2155
2156     x  = (x - one) * inv(x + one);
2157     x2 = x * x;
2158
2159     p = fma(CL15, x2, CL13);
2160     p = fma(p, x2, CL11);
2161     p = fma(p, x2, CL9);
2162     p = fma(p, x2, CL7);
2163     p = fma(p, x2, CL5);
2164     p = fma(p, x2, CL3);
2165     p = fma(p, x2, CL1);
2166     p = fma(p, x, fExp);
2167
2168     return p;
2169 #        endif
2170 }
2171
2172 #        if !GMX_SIMD_HAVE_NATIVE_LOG_DOUBLE
2173 /*! \brief SIMD double log(x). This is the natural logarithm.
2174  *
2175  * \param x Argument, should be >0.
2176  * \result The natural logarithm of x. Undefined if argument is invalid.
2177  */
2178 static inline SimdDouble gmx_simdcall log(SimdDouble x)
2179 {
2180     const SimdDouble one(1.0);
2181     const SimdDouble two(2.0);
2182     const SimdDouble invsqrt2(1.0 / std::sqrt(2.0));
2183     const SimdDouble corr(0.693147180559945286226764);
2184     const SimdDouble CL15(0.148197055177935105296783);
2185     const SimdDouble CL13(0.153108178020442575739679);
2186     const SimdDouble CL11(0.181837339521549679055568);
2187     const SimdDouble CL9(0.22222194152736701733275);
2188     const SimdDouble CL7(0.285714288030134544449368);
2189     const SimdDouble CL5(0.399999999989941956712869);
2190     const SimdDouble CL3(0.666666666666685503450651);
2191     const SimdDouble CL1(2.0);
2192     SimdDouble       fExp, x2, p;
2193     SimdDBool        m;
2194     SimdDInt32       iExp;
2195
2196     // For log(), the argument cannot be 0, so use the faster version of frexp()
2197     x    = frexp<MathOptimization::Unsafe>(x, &iExp);
2198     fExp = cvtI2R(iExp);
2199
2200     m = x < invsqrt2;
2201     // Adjust to non-IEEE format for x<1/sqrt(2): exponent -= 1, mantissa *= 2.0
2202     fExp = fExp - selectByMask(one, m);
2203     x    = x * blend(one, two, m);
2204
2205     x  = (x - one) * inv(x + one);
2206     x2 = x * x;
2207
2208     p = fma(CL15, x2, CL13);
2209     p = fma(p, x2, CL11);
2210     p = fma(p, x2, CL9);
2211     p = fma(p, x2, CL7);
2212     p = fma(p, x2, CL5);
2213     p = fma(p, x2, CL3);
2214     p = fma(p, x2, CL1);
2215     p = fma(p, x, corr * fExp);
2216
2217     return p;
2218 }
2219 #        endif
2220
2221 #        if !GMX_SIMD_HAVE_NATIVE_EXP2_DOUBLE
2222 /*! \brief SIMD double 2^x.
2223  *
2224  * \copydetails exp2(SimdFloat)
2225  */
2226 template<MathOptimization opt = MathOptimization::Safe>
2227 static inline SimdDouble gmx_simdcall exp2(SimdDouble x)
2228 {
2229     const SimdDouble CE11(4.435280790452730022081181e-10);
2230     const SimdDouble CE10(7.074105630863314448024247e-09);
2231     const SimdDouble CE9(1.017819803432096698472621e-07);
2232     const SimdDouble CE8(1.321543308956718799557863e-06);
2233     const SimdDouble CE7(0.00001525273348995851746990884);
2234     const SimdDouble CE6(0.0001540353046251466849082632);
2235     const SimdDouble CE5(0.001333355814678995257307880);
2236     const SimdDouble CE4(0.009618129107588335039176502);
2237     const SimdDouble CE3(0.05550410866481992147457793);
2238     const SimdDouble CE2(0.2402265069591015620470894);
2239     const SimdDouble CE1(0.6931471805599453304615075);
2240     const SimdDouble one(1.0);
2241
2242     SimdDouble intpart;
2243     SimdDouble fexppart;
2244     SimdDouble p;
2245
2246     // Large negative values are valid arguments to exp2(), so there are two
2247     // things we need to account for:
2248     // 1. When the exponents reaches -1023, the (biased) exponent field will be
2249     //    zero and we can no longer multiply with it. There are special IEEE
2250     //    formats to handle this range, but for now we have to accept that
2251     //    we cannot handle those arguments. If input value becomes even more
2252     //    negative, it will start to loop and we would end up with invalid
2253     //    exponents. Thus, we need to limit or mask this.
2254     // 2. For VERY large negative values, we will have problems that the
2255     //    subtraction to get the fractional part loses accuracy, and then we
2256     //    can end up with overflows in the polynomial.
2257     //
2258     // For now, we handle this by forwarding the math optimization setting to
2259     // ldexp, where the routine will return zero for very small arguments.
2260     //
2261     // However, before doing that we need to make sure we do not call cvtR2I
2262     // with an argument that is so negative it cannot be converted to an integer.
2263     if (opt == MathOptimization::Safe)
2264     {
2265         x = max(x, SimdDouble(std::numeric_limits<std::int32_t>::lowest()));
2266     }
2267
2268     fexppart = ldexp<opt>(one, cvtR2I(x));
2269     intpart  = round(x);
2270     x        = x - intpart;
2271
2272     p = fma(CE11, x, CE10);
2273     p = fma(p, x, CE9);
2274     p = fma(p, x, CE8);
2275     p = fma(p, x, CE7);
2276     p = fma(p, x, CE6);
2277     p = fma(p, x, CE5);
2278     p = fma(p, x, CE4);
2279     p = fma(p, x, CE3);
2280     p = fma(p, x, CE2);
2281     p = fma(p, x, CE1);
2282     p = fma(p, x, one);
2283     x = p * fexppart;
2284     return x;
2285 }
2286 #        endif
2287
2288 #        if !GMX_SIMD_HAVE_NATIVE_EXP_DOUBLE
2289 /*! \brief SIMD double exp(x).
2290  *
2291  * \copydetails exp(SimdFloat)
2292  */
2293 template<MathOptimization opt = MathOptimization::Safe>
2294 static inline SimdDouble gmx_simdcall exp(SimdDouble x)
2295 {
2296     const SimdDouble argscale(1.44269504088896340735992468100);
2297     const SimdDouble invargscale0(-0.69314718055966295651160180568695068359375);
2298     const SimdDouble invargscale1(-2.8235290563031577122588448175013436025525412068e-13);
2299     const SimdDouble CE12(2.078375306791423699350304e-09);
2300     const SimdDouble CE11(2.518173854179933105218635e-08);
2301     const SimdDouble CE10(2.755842049600488770111608e-07);
2302     const SimdDouble CE9(2.755691815216689746619849e-06);
2303     const SimdDouble CE8(2.480158383706245033920920e-05);
2304     const SimdDouble CE7(0.0001984127043518048611841321);
2305     const SimdDouble CE6(0.001388888889360258341755930);
2306     const SimdDouble CE5(0.008333333332907368102819109);
2307     const SimdDouble CE4(0.04166666666663836745814631);
2308     const SimdDouble CE3(0.1666666666666796929434570);
2309     const SimdDouble CE2(0.5);
2310     const SimdDouble one(1.0);
2311     SimdDouble       fexppart;
2312     SimdDouble       intpart;
2313     SimdDouble       y, p;
2314
2315     // Large negative values are valid arguments to exp2(), so there are two
2316     // things we need to account for:
2317     // 1. When the exponents reaches -1023, the (biased) exponent field will be
2318     //    zero and we can no longer multiply with it. There are special IEEE
2319     //    formats to handle this range, but for now we have to accept that
2320     //    we cannot handle those arguments. If input value becomes even more
2321     //    negative, it will start to loop and we would end up with invalid
2322     //    exponents. Thus, we need to limit or mask this.
2323     // 2. For VERY large negative values, we will have problems that the
2324     //    subtraction to get the fractional part loses accuracy, and then we
2325     //    can end up with overflows in the polynomial.
2326     //
2327     // For now, we handle this by forwarding the math optimization setting to
2328     // ldexp, where the routine will return zero for very small arguments.
2329     //
2330     // However, before doing that we need to make sure we do not call cvtR2I
2331     // with an argument that is so negative it cannot be converted to an integer
2332     // after being multiplied by argscale.
2333
2334     if (opt == MathOptimization::Safe)
2335     {
2336         x = max(x, SimdDouble(std::numeric_limits<std::int32_t>::lowest()) / argscale);
2337     }
2338
2339     y = x * argscale;
2340
2341     fexppart = ldexp<opt>(one, cvtR2I(y));
2342     intpart  = round(y);
2343
2344     // Extended precision arithmetics
2345     x = fma(invargscale0, intpart, x);
2346     x = fma(invargscale1, intpart, x);
2347
2348     p = fma(CE12, x, CE11);
2349     p = fma(p, x, CE10);
2350     p = fma(p, x, CE9);
2351     p = fma(p, x, CE8);
2352     p = fma(p, x, CE7);
2353     p = fma(p, x, CE6);
2354     p = fma(p, x, CE5);
2355     p = fma(p, x, CE4);
2356     p = fma(p, x, CE3);
2357     p = fma(p, x, CE2);
2358     p = fma(p, x * x, x);
2359 #            if GMX_SIMD_HAVE_FMA
2360     x = fma(p, fexppart, fexppart);
2361 #            else
2362     x = (p + one) * fexppart;
2363 #            endif
2364
2365     return x;
2366 }
2367 #        endif
2368
2369 /*! \brief SIMD double pow(x,y)
2370  *
2371  * This returns x^y for SIMD values.
2372  *
2373  * \tparam opt If this is changed from the default (safe) into the unsafe
2374  *             option, there are no guarantees about correct results for x==0.
2375  *
2376  * \param x Base.
2377  *
2378  * \param y exponent.
2379  *
2380  * \result x^y. Overflowing arguments are likely to either return 0 or inf,
2381  *         depending on the underlying implementation. If unsafe optimizations
2382  *         are enabled, this is also true for x==0.
2383  *
2384  * \warning You cannot rely on this implementation returning inf for arguments
2385  *          that cause overflow. If you have some very large
2386  *          values and need to rely on getting a valid numerical output,
2387  *          take the minimum of your variable and the largest valid argument
2388  *          before calling this routine.
2389  */
2390 template<MathOptimization opt = MathOptimization::Safe>
2391 static inline SimdDouble gmx_simdcall pow(SimdDouble x, SimdDouble y)
2392 {
2393     SimdDouble xcorr;
2394
2395     if (opt == MathOptimization::Safe)
2396     {
2397         xcorr = max(x, SimdDouble(std::numeric_limits<double>::min()));
2398     }
2399     else
2400     {
2401         xcorr = x;
2402     }
2403
2404     SimdDouble result = exp2<opt>(y * log2(xcorr));
2405
2406     if (opt == MathOptimization::Safe)
2407     {
2408         // if x==0 and y>0 we explicitly set the result to 0.0
2409         // For any x with y==0, the result will already be 1.0 since we multiply by y (0.0) and call exp().
2410         result = blend(result, setZero(), x == setZero() && setZero() < y);
2411     }
2412
2413     return result;
2414 }
2415
2416
2417 /*! \brief SIMD double erf(x).
2418  *
2419  * \param x The value to calculate erf(x) for.
2420  * \result erf(x)
2421  *
2422  * This routine achieves very close to full precision, but we do not care about
2423  * the last bit or the subnormal result range.
2424  */
2425 static inline SimdDouble gmx_simdcall erf(SimdDouble x)
2426 {
2427     // Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75]
2428     const SimdDouble CAP4(-0.431780540597889301512e-4);
2429     const SimdDouble CAP3(-0.00578562306260059236059);
2430     const SimdDouble CAP2(-0.028593586920219752446);
2431     const SimdDouble CAP1(-0.315924962948621698209);
2432     const SimdDouble CAP0(0.14952975608477029151);
2433
2434     const SimdDouble CAQ5(-0.374089300177174709737e-5);
2435     const SimdDouble CAQ4(0.00015126584532155383535);
2436     const SimdDouble CAQ3(0.00536692680669480725423);
2437     const SimdDouble CAQ2(0.0668686825594046122636);
2438     const SimdDouble CAQ1(0.402604990869284362773);
2439     // CAQ0 == 1.0
2440     const SimdDouble CAoffset(0.9788494110107421875);
2441
2442     // Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5]
2443     const SimdDouble CBP6(2.49650423685462752497647637088e-10);
2444     const SimdDouble CBP5(0.00119770193298159629350136085658);
2445     const SimdDouble CBP4(0.0164944422378370965881008942733);
2446     const SimdDouble CBP3(0.0984581468691775932063932439252);
2447     const SimdDouble CBP2(0.317364595806937763843589437418);
2448     const SimdDouble CBP1(0.554167062641455850932670067075);
2449     const SimdDouble CBP0(0.427583576155807163756925301060);
2450     const SimdDouble CBQ7(0.00212288829699830145976198384930);
2451     const SimdDouble CBQ6(0.0334810979522685300554606393425);
2452     const SimdDouble CBQ5(0.2361713785181450957579508850717);
2453     const SimdDouble CBQ4(0.955364736493055670530981883072);
2454     const SimdDouble CBQ3(2.36815675631420037315349279199);
2455     const SimdDouble CBQ2(3.55261649184083035537184223542);
2456     const SimdDouble CBQ1(2.93501136050160872574376997993);
2457     // CBQ0 == 1.0
2458
2459     // Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf]
2460     const SimdDouble CCP6(-2.8175401114513378771);
2461     const SimdDouble CCP5(-3.22729451764143718517);
2462     const SimdDouble CCP4(-2.5518551727311523996);
2463     const SimdDouble CCP3(-0.687717681153649930619);
2464     const SimdDouble CCP2(-0.212652252872804219852);
2465     const SimdDouble CCP1(0.0175389834052493308818);
2466     const SimdDouble CCP0(0.00628057170626964891937);
2467
2468     const SimdDouble CCQ6(5.48409182238641741584);
2469     const SimdDouble CCQ5(13.5064170191802889145);
2470     const SimdDouble CCQ4(22.9367376522880577224);
2471     const SimdDouble CCQ3(15.930646027911794143);
2472     const SimdDouble CCQ2(11.0567237927800161565);
2473     const SimdDouble CCQ1(2.79257750980575282228);
2474     // CCQ0 == 1.0
2475     const SimdDouble CCoffset(0.5579090118408203125);
2476
2477     const SimdDouble one(1.0);
2478     const SimdDouble two(2.0);
2479     const SimdDouble minFloat(std::numeric_limits<float>::min());
2480
2481     SimdDouble xabs, x2, x4, t, t2, w, w2;
2482     SimdDouble PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
2483     SimdDouble PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
2484     SimdDouble PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
2485     SimdDouble res_erf, res_erfcB, res_erfcC, res_erfc, res;
2486     SimdDouble expmx2;
2487     SimdDBool  mask, mask_erf, notmask_erf;
2488
2489     // Calculate erf()
2490     xabs        = abs(x);
2491     mask_erf    = (xabs < one);
2492     notmask_erf = (one <= xabs);
2493     x2          = x * x;
2494     x4          = x2 * x2;
2495
2496     PolyAP0 = fma(CAP4, x4, CAP2);
2497     PolyAP1 = fma(CAP3, x4, CAP1);
2498     PolyAP0 = fma(PolyAP0, x4, CAP0);
2499     PolyAP0 = fma(PolyAP1, x2, PolyAP0);
2500
2501     PolyAQ1 = fma(CAQ5, x4, CAQ3);
2502     PolyAQ0 = fma(CAQ4, x4, CAQ2);
2503     PolyAQ1 = fma(PolyAQ1, x4, CAQ1);
2504     PolyAQ0 = fma(PolyAQ0, x4, one);
2505     PolyAQ0 = fma(PolyAQ1, x2, PolyAQ0);
2506
2507     res_erf = PolyAP0 * maskzInv(PolyAQ0, mask_erf && (minFloat <= abs(PolyAQ0)));
2508     res_erf = CAoffset + res_erf;
2509     res_erf = x * res_erf;
2510
2511     // Calculate erfc() in range [1,4.5]
2512     t  = xabs - one;
2513     t2 = t * t;
2514
2515     PolyBP0 = fma(CBP6, t2, CBP4);
2516     PolyBP1 = fma(CBP5, t2, CBP3);
2517     PolyBP0 = fma(PolyBP0, t2, CBP2);
2518     PolyBP1 = fma(PolyBP1, t2, CBP1);
2519     PolyBP0 = fma(PolyBP0, t2, CBP0);
2520     PolyBP0 = fma(PolyBP1, t, PolyBP0);
2521
2522     PolyBQ1 = fma(CBQ7, t2, CBQ5);
2523     PolyBQ0 = fma(CBQ6, t2, CBQ4);
2524     PolyBQ1 = fma(PolyBQ1, t2, CBQ3);
2525     PolyBQ0 = fma(PolyBQ0, t2, CBQ2);
2526     PolyBQ1 = fma(PolyBQ1, t2, CBQ1);
2527     PolyBQ0 = fma(PolyBQ0, t2, one);
2528     PolyBQ0 = fma(PolyBQ1, t, PolyBQ0);
2529
2530     // The denominator polynomial can be zero outside the range
2531     res_erfcB = PolyBP0 * maskzInv(PolyBQ0, notmask_erf && (minFloat <= abs(PolyBQ0)));
2532
2533     res_erfcB = res_erfcB * xabs;
2534
2535     // Calculate erfc() in range [4.5,inf]
2536     w  = maskzInv(xabs, notmask_erf && (minFloat <= xabs));
2537     w2 = w * w;
2538
2539     PolyCP0 = fma(CCP6, w2, CCP4);
2540     PolyCP1 = fma(CCP5, w2, CCP3);
2541     PolyCP0 = fma(PolyCP0, w2, CCP2);
2542     PolyCP1 = fma(PolyCP1, w2, CCP1);
2543     PolyCP0 = fma(PolyCP0, w2, CCP0);
2544     PolyCP0 = fma(PolyCP1, w, PolyCP0);
2545
2546     PolyCQ0 = fma(CCQ6, w2, CCQ4);
2547     PolyCQ1 = fma(CCQ5, w2, CCQ3);
2548     PolyCQ0 = fma(PolyCQ0, w2, CCQ2);
2549     PolyCQ1 = fma(PolyCQ1, w2, CCQ1);
2550     PolyCQ0 = fma(PolyCQ0, w2, one);
2551     PolyCQ0 = fma(PolyCQ1, w, PolyCQ0);
2552
2553     expmx2 = exp(-x2);
2554
2555     // The denominator polynomial can be zero outside the range
2556     res_erfcC = PolyCP0 * maskzInv(PolyCQ0, notmask_erf && (minFloat <= abs(PolyCQ0)));
2557     res_erfcC = res_erfcC + CCoffset;
2558     res_erfcC = res_erfcC * w;
2559
2560     mask     = (SimdDouble(4.5) < xabs);
2561     res_erfc = blend(res_erfcB, res_erfcC, mask);
2562
2563     res_erfc = res_erfc * expmx2;
2564
2565     // erfc(x<0) = 2-erfc(|x|)
2566     mask     = (x < setZero());
2567     res_erfc = blend(res_erfc, two - res_erfc, mask);
2568
2569     // Select erf() or erfc()
2570     res = blend(one - res_erfc, res_erf, mask_erf);
2571
2572     return res;
2573 }
2574
2575 /*! \brief SIMD double erfc(x).
2576  *
2577  * \param x The value to calculate erfc(x) for.
2578  * \result erfc(x)
2579  *
2580  * This routine achieves full precision (bar the last bit) over most of the
2581  * input range, but for large arguments where the result is getting close
2582  * to the minimum representable numbers we accept slightly larger errors
2583  * (think results that are in the ballpark of 10^-200 for double)
2584  * since that is not relevant for MD.
2585  */
2586 static inline SimdDouble gmx_simdcall erfc(SimdDouble x)
2587 {
2588     // Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75]
2589     const SimdDouble CAP4(-0.431780540597889301512e-4);
2590     const SimdDouble CAP3(-0.00578562306260059236059);
2591     const SimdDouble CAP2(-0.028593586920219752446);
2592     const SimdDouble CAP1(-0.315924962948621698209);
2593     const SimdDouble CAP0(0.14952975608477029151);
2594
2595     const SimdDouble CAQ5(-0.374089300177174709737e-5);
2596     const SimdDouble CAQ4(0.00015126584532155383535);
2597     const SimdDouble CAQ3(0.00536692680669480725423);
2598     const SimdDouble CAQ2(0.0668686825594046122636);
2599     const SimdDouble CAQ1(0.402604990869284362773);
2600     // CAQ0 == 1.0
2601     const SimdDouble CAoffset(0.9788494110107421875);
2602
2603     // Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5]
2604     const SimdDouble CBP6(2.49650423685462752497647637088e-10);
2605     const SimdDouble CBP5(0.00119770193298159629350136085658);
2606     const SimdDouble CBP4(0.0164944422378370965881008942733);
2607     const SimdDouble CBP3(0.0984581468691775932063932439252);
2608     const SimdDouble CBP2(0.317364595806937763843589437418);
2609     const SimdDouble CBP1(0.554167062641455850932670067075);
2610     const SimdDouble CBP0(0.427583576155807163756925301060);
2611     const SimdDouble CBQ7(0.00212288829699830145976198384930);
2612     const SimdDouble CBQ6(0.0334810979522685300554606393425);
2613     const SimdDouble CBQ5(0.2361713785181450957579508850717);
2614     const SimdDouble CBQ4(0.955364736493055670530981883072);
2615     const SimdDouble CBQ3(2.36815675631420037315349279199);
2616     const SimdDouble CBQ2(3.55261649184083035537184223542);
2617     const SimdDouble CBQ1(2.93501136050160872574376997993);
2618     // CBQ0 == 1.0
2619
2620     // Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf]
2621     const SimdDouble CCP6(-2.8175401114513378771);
2622     const SimdDouble CCP5(-3.22729451764143718517);
2623     const SimdDouble CCP4(-2.5518551727311523996);
2624     const SimdDouble CCP3(-0.687717681153649930619);
2625     const SimdDouble CCP2(-0.212652252872804219852);
2626     const SimdDouble CCP1(0.0175389834052493308818);
2627     const SimdDouble CCP0(0.00628057170626964891937);
2628
2629     const SimdDouble CCQ6(5.48409182238641741584);
2630     const SimdDouble CCQ5(13.5064170191802889145);
2631     const SimdDouble CCQ4(22.9367376522880577224);
2632     const SimdDouble CCQ3(15.930646027911794143);
2633     const SimdDouble CCQ2(11.0567237927800161565);
2634     const SimdDouble CCQ1(2.79257750980575282228);
2635     // CCQ0 == 1.0
2636     const SimdDouble CCoffset(0.5579090118408203125);
2637
2638     const SimdDouble one(1.0);
2639     const SimdDouble two(2.0);
2640     const SimdDouble minFloat(std::numeric_limits<float>::min());
2641
2642     SimdDouble xabs, x2, x4, t, t2, w, w2;
2643     SimdDouble PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
2644     SimdDouble PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
2645     SimdDouble PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
2646     SimdDouble res_erf, res_erfcB, res_erfcC, res_erfc, res;
2647     SimdDouble expmx2;
2648     SimdDBool  mask, mask_erf, notmask_erf;
2649
2650     // Calculate erf()
2651     xabs        = abs(x);
2652     mask_erf    = (xabs < one);
2653     notmask_erf = (one <= xabs);
2654     x2          = x * x;
2655     x4          = x2 * x2;
2656
2657     PolyAP0 = fma(CAP4, x4, CAP2);
2658     PolyAP1 = fma(CAP3, x4, CAP1);
2659     PolyAP0 = fma(PolyAP0, x4, CAP0);
2660     PolyAP0 = fma(PolyAP1, x2, PolyAP0);
2661     PolyAQ1 = fma(CAQ5, x4, CAQ3);
2662     PolyAQ0 = fma(CAQ4, x4, CAQ2);
2663     PolyAQ1 = fma(PolyAQ1, x4, CAQ1);
2664     PolyAQ0 = fma(PolyAQ0, x4, one);
2665     PolyAQ0 = fma(PolyAQ1, x2, PolyAQ0);
2666
2667     res_erf = PolyAP0 * maskzInv(PolyAQ0, mask_erf && (minFloat <= abs(PolyAQ0)));
2668     res_erf = CAoffset + res_erf;
2669     res_erf = x * res_erf;
2670
2671     // Calculate erfc() in range [1,4.5]
2672     t  = xabs - one;
2673     t2 = t * t;
2674
2675     PolyBP0 = fma(CBP6, t2, CBP4);
2676     PolyBP1 = fma(CBP5, t2, CBP3);
2677     PolyBP0 = fma(PolyBP0, t2, CBP2);
2678     PolyBP1 = fma(PolyBP1, t2, CBP1);
2679     PolyBP0 = fma(PolyBP0, t2, CBP0);
2680     PolyBP0 = fma(PolyBP1, t, PolyBP0);
2681
2682     PolyBQ1 = fma(CBQ7, t2, CBQ5);
2683     PolyBQ0 = fma(CBQ6, t2, CBQ4);
2684     PolyBQ1 = fma(PolyBQ1, t2, CBQ3);
2685     PolyBQ0 = fma(PolyBQ0, t2, CBQ2);
2686     PolyBQ1 = fma(PolyBQ1, t2, CBQ1);
2687     PolyBQ0 = fma(PolyBQ0, t2, one);
2688     PolyBQ0 = fma(PolyBQ1, t, PolyBQ0);
2689
2690     // The denominator polynomial can be zero outside the range
2691     res_erfcB = PolyBP0 * maskzInv(PolyBQ0, notmask_erf && (minFloat <= abs(PolyBQ0)));
2692
2693     res_erfcB = res_erfcB * xabs;
2694
2695     // Calculate erfc() in range [4.5,inf]
2696     // Note that 1/x can only handle single precision!
2697     w  = maskzInv(xabs, minFloat <= xabs);
2698     w2 = w * w;
2699
2700     PolyCP0 = fma(CCP6, w2, CCP4);
2701     PolyCP1 = fma(CCP5, w2, CCP3);
2702     PolyCP0 = fma(PolyCP0, w2, CCP2);
2703     PolyCP1 = fma(PolyCP1, w2, CCP1);
2704     PolyCP0 = fma(PolyCP0, w2, CCP0);
2705     PolyCP0 = fma(PolyCP1, w, PolyCP0);
2706
2707     PolyCQ0 = fma(CCQ6, w2, CCQ4);
2708     PolyCQ1 = fma(CCQ5, w2, CCQ3);
2709     PolyCQ0 = fma(PolyCQ0, w2, CCQ2);
2710     PolyCQ1 = fma(PolyCQ1, w2, CCQ1);
2711     PolyCQ0 = fma(PolyCQ0, w2, one);
2712     PolyCQ0 = fma(PolyCQ1, w, PolyCQ0);
2713
2714     expmx2 = exp(-x2);
2715
2716     // The denominator polynomial can be zero outside the range
2717     res_erfcC = PolyCP0 * maskzInv(PolyCQ0, notmask_erf && (minFloat <= abs(PolyCQ0)));
2718     res_erfcC = res_erfcC + CCoffset;
2719     res_erfcC = res_erfcC * w;
2720
2721     mask     = (SimdDouble(4.5) < xabs);
2722     res_erfc = blend(res_erfcB, res_erfcC, mask);
2723
2724     res_erfc = res_erfc * expmx2;
2725
2726     // erfc(x<0) = 2-erfc(|x|)
2727     mask     = (x < setZero());
2728     res_erfc = blend(res_erfc, two - res_erfc, mask);
2729
2730     // Select erf() or erfc()
2731     res = blend(res_erfc, one - res_erf, mask_erf);
2732
2733     return res;
2734 }
2735
2736 /*! \brief SIMD double sin \& cos.
2737  *
2738  * \param x The argument to evaluate sin/cos for
2739  * \param[out] sinval Sin(x)
2740  * \param[out] cosval Cos(x)
2741  *
2742  * This version achieves close to machine precision, but for very large
2743  * magnitudes of the argument we inherently begin to lose accuracy due to the
2744  * argument reduction, despite using extended precision arithmetics internally.
2745  */
2746 static inline void gmx_simdcall sincos(SimdDouble x, SimdDouble* sinval, SimdDouble* cosval)
2747 {
2748     // Constants to subtract Pi/4*x from y while minimizing precision loss
2749     const SimdDouble argred0(-2 * 0.78539816290140151978);
2750     const SimdDouble argred1(-2 * 4.9604678871439933374e-10);
2751     const SimdDouble argred2(-2 * 1.1258708853173288931e-18);
2752     const SimdDouble argred3(-2 * 1.7607799325916000908e-27);
2753     const SimdDouble two_over_pi(2.0 / M_PI);
2754     const SimdDouble const_sin5(1.58938307283228937328511e-10);
2755     const SimdDouble const_sin4(-2.50506943502539773349318e-08);
2756     const SimdDouble const_sin3(2.75573131776846360512547e-06);
2757     const SimdDouble const_sin2(-0.000198412698278911770864914);
2758     const SimdDouble const_sin1(0.0083333333333191845961746);
2759     const SimdDouble const_sin0(-0.166666666666666130709393);
2760
2761     const SimdDouble const_cos7(-1.13615350239097429531523e-11);
2762     const SimdDouble const_cos6(2.08757471207040055479366e-09);
2763     const SimdDouble const_cos5(-2.75573144028847567498567e-07);
2764     const SimdDouble const_cos4(2.48015872890001867311915e-05);
2765     const SimdDouble const_cos3(-0.00138888888888714019282329);
2766     const SimdDouble const_cos2(0.0416666666666665519592062);
2767     const SimdDouble half(0.5);
2768     const SimdDouble one(1.0);
2769     SimdDouble       ssign, csign;
2770     SimdDouble       x2, y, z, psin, pcos, sss, ccc;
2771     SimdDBool        mask;
2772 #        if GMX_SIMD_HAVE_DINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
2773     const SimdDInt32 ione(1);
2774     const SimdDInt32 itwo(2);
2775     SimdDInt32       iy;
2776
2777     z  = x * two_over_pi;
2778     iy = cvtR2I(z);
2779     y  = round(z);
2780
2781     mask  = cvtIB2B((iy & ione) == setZero());
2782     ssign = selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO), cvtIB2B((iy & itwo) == itwo));
2783     csign = selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO), cvtIB2B(((iy + ione) & itwo) == itwo));
2784 #        else
2785     const SimdDouble quarter(0.25);
2786     const SimdDouble minusquarter(-0.25);
2787     SimdDouble       q;
2788     SimdDBool        m1, m2, m3;
2789
2790     /* The most obvious way to find the arguments quadrant in the unit circle
2791      * to calculate the sign is to use integer arithmetic, but that is not
2792      * present in all SIMD implementations. As an alternative, we have devised a
2793      * pure floating-point algorithm that uses truncation for argument reduction
2794      * so that we get a new value 0<=q<1 over the unit circle, and then
2795      * do floating-point comparisons with fractions. This is likely to be
2796      * slightly slower (~10%) due to the longer latencies of floating-point, so
2797      * we only use it when integer SIMD arithmetic is not present.
2798      */
2799     ssign = x;
2800     x     = abs(x);
2801     // It is critical that half-way cases are rounded down
2802     z = fma(x, two_over_pi, half);
2803     y = trunc(z);
2804     q = z * quarter;
2805     q = q - trunc(q);
2806     /* z now starts at 0.0 for x=-pi/4 (although neg. values cannot occur), and
2807      * then increased by 1.0 as x increases by 2*Pi, when it resets to 0.0.
2808      * This removes the 2*Pi periodicity without using any integer arithmetic.
2809      * First check if y had the value 2 or 3, set csign if true.
2810      */
2811     q = q - half;
2812     /* If we have logical operations we can work directly on the signbit, which
2813      * saves instructions. Otherwise we need to represent signs as +1.0/-1.0.
2814      * Thus, if you are altering defines to debug alternative code paths, the
2815      * two GMX_SIMD_HAVE_LOGICAL sections in this routine must either both be
2816      * active or inactive - you will get errors if only one is used.
2817      */
2818 #            if GMX_SIMD_HAVE_LOGICAL
2819     ssign = ssign & SimdDouble(GMX_DOUBLE_NEGZERO);
2820     csign = andNot(q, SimdDouble(GMX_DOUBLE_NEGZERO));
2821     ssign = ssign ^ csign;
2822 #            else
2823     ssign = copysign(SimdDouble(1.0), ssign);
2824     csign = copysign(SimdDouble(1.0), q);
2825     csign = -csign;
2826     ssign = ssign * csign; // swap ssign if csign was set.
2827 #            endif
2828     // Check if y had value 1 or 3 (remember we subtracted 0.5 from q)
2829     m1   = (q < minusquarter);
2830     m2   = (setZero() <= q);
2831     m3   = (q < quarter);
2832     m2   = m2 && m3;
2833     mask = m1 || m2;
2834     // where mask is FALSE, swap sign.
2835     csign   = csign * blend(SimdDouble(-1.0), one, mask);
2836 #        endif
2837     x  = fma(y, argred0, x);
2838     x  = fma(y, argred1, x);
2839     x  = fma(y, argred2, x);
2840     x  = fma(y, argred3, x);
2841     x2 = x * x;
2842
2843     psin = fma(const_sin5, x2, const_sin4);
2844     psin = fma(psin, x2, const_sin3);
2845     psin = fma(psin, x2, const_sin2);
2846     psin = fma(psin, x2, const_sin1);
2847     psin = fma(psin, x2, const_sin0);
2848     psin = fma(psin, x2 * x, x);
2849
2850     pcos = fma(const_cos7, x2, const_cos6);
2851     pcos = fma(pcos, x2, const_cos5);
2852     pcos = fma(pcos, x2, const_cos4);
2853     pcos = fma(pcos, x2, const_cos3);
2854     pcos = fma(pcos, x2, const_cos2);
2855     pcos = fms(pcos, x2, half);
2856     pcos = fma(pcos, x2, one);
2857
2858     sss = blend(pcos, psin, mask);
2859     ccc = blend(psin, pcos, mask);
2860     // See comment for GMX_SIMD_HAVE_LOGICAL section above.
2861 #        if GMX_SIMD_HAVE_LOGICAL
2862     *sinval = sss ^ ssign;
2863     *cosval = ccc ^ csign;
2864 #        else
2865     *sinval = sss * ssign;
2866     *cosval = ccc * csign;
2867 #        endif
2868 }
2869
2870 /*! \brief SIMD double sin(x).
2871  *
2872  * \param x The argument to evaluate sin for
2873  * \result Sin(x)
2874  *
2875  * \attention Do NOT call both sin & cos if you need both results, since each of them
2876  * will then call \ref sincos and waste a factor 2 in performance.
2877  */
2878 static inline SimdDouble gmx_simdcall sin(SimdDouble x)
2879 {
2880     SimdDouble s, c;
2881     sincos(x, &s, &c);
2882     return s;
2883 }
2884
2885 /*! \brief SIMD double cos(x).
2886  *
2887  * \param x The argument to evaluate cos for
2888  * \result Cos(x)
2889  *
2890  * \attention Do NOT call both sin & cos if you need both results, since each of them
2891  * will then call \ref sincos and waste a factor 2 in performance.
2892  */
2893 static inline SimdDouble gmx_simdcall cos(SimdDouble x)
2894 {
2895     SimdDouble s, c;
2896     sincos(x, &s, &c);
2897     return c;
2898 }
2899
2900 /*! \brief SIMD double tan(x).
2901  *
2902  * \param x The argument to evaluate tan for
2903  * \result Tan(x)
2904  */
2905 static inline SimdDouble gmx_simdcall tan(SimdDouble x)
2906 {
2907     const SimdDouble argred0(-2 * 0.78539816290140151978);
2908     const SimdDouble argred1(-2 * 4.9604678871439933374e-10);
2909     const SimdDouble argred2(-2 * 1.1258708853173288931e-18);
2910     const SimdDouble argred3(-2 * 1.7607799325916000908e-27);
2911     const SimdDouble two_over_pi(2.0 / M_PI);
2912     const SimdDouble CT15(1.01419718511083373224408e-05);
2913     const SimdDouble CT14(-2.59519791585924697698614e-05);
2914     const SimdDouble CT13(5.23388081915899855325186e-05);
2915     const SimdDouble CT12(-3.05033014433946488225616e-05);
2916     const SimdDouble CT11(7.14707504084242744267497e-05);
2917     const SimdDouble CT10(8.09674518280159187045078e-05);
2918     const SimdDouble CT9(0.000244884931879331847054404);
2919     const SimdDouble CT8(0.000588505168743587154904506);
2920     const SimdDouble CT7(0.00145612788922812427978848);
2921     const SimdDouble CT6(0.00359208743836906619142924);
2922     const SimdDouble CT5(0.00886323944362401618113356);
2923     const SimdDouble CT4(0.0218694882853846389592078);
2924     const SimdDouble CT3(0.0539682539781298417636002);
2925     const SimdDouble CT2(0.133333333333125941821962);
2926     const SimdDouble CT1(0.333333333333334980164153);
2927     const SimdDouble minFloat(std::numeric_limits<float>::min());
2928
2929     SimdDouble x2, p, y, z;
2930     SimdDBool  m;
2931
2932 #        if GMX_SIMD_HAVE_DINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
2933     SimdDInt32 iy;
2934     SimdDInt32 ione(1);
2935
2936     z  = x * two_over_pi;
2937     iy = cvtR2I(z);
2938     y  = round(z);
2939     m  = cvtIB2B((iy & ione) == ione);
2940
2941     x = fma(y, argred0, x);
2942     x = fma(y, argred1, x);
2943     x = fma(y, argred2, x);
2944     x = fma(y, argred3, x);
2945     x = selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO), m) ^ x;
2946 #        else
2947     const SimdDouble quarter(0.25);
2948     const SimdDouble half(0.5);
2949     const SimdDouble threequarter(0.75);
2950     const SimdDouble minFloat(std::numeric_limits<float>::min());
2951     SimdDouble       w, q;
2952     SimdDBool        m1, m2, m3;
2953
2954     w  = abs(x);
2955     z  = fma(w, two_over_pi, half);
2956     y  = trunc(z);
2957     q  = z * quarter;
2958     q  = q - trunc(q);
2959     m1 = (quarter <= q);
2960     m2 = (q < half);
2961     m3 = (threequarter <= q);
2962     m1 = m1 && m2;
2963     m  = m1 || m3;
2964     w  = fma(y, argred0, w);
2965     w  = fma(y, argred1, w);
2966     w  = fma(y, argred2, w);
2967     w  = fma(y, argred3, w);
2968
2969     w     = blend(w, -w, m);
2970     x     = w * copysign(SimdDouble(1.0), x);
2971 #        endif
2972     x2 = x * x;
2973     p  = fma(CT15, x2, CT14);
2974     p  = fma(p, x2, CT13);
2975     p  = fma(p, x2, CT12);
2976     p  = fma(p, x2, CT11);
2977     p  = fma(p, x2, CT10);
2978     p  = fma(p, x2, CT9);
2979     p  = fma(p, x2, CT8);
2980     p  = fma(p, x2, CT7);
2981     p  = fma(p, x2, CT6);
2982     p  = fma(p, x2, CT5);
2983     p  = fma(p, x2, CT4);
2984     p  = fma(p, x2, CT3);
2985     p  = fma(p, x2, CT2);
2986     p  = fma(p, x2, CT1);
2987     p  = fma(x2, p * x, x);
2988
2989     p = blend(p, maskzInv(p, m && (minFloat < abs(p))), m);
2990     return p;
2991 }
2992
2993 /*! \brief SIMD double asin(x).
2994  *
2995  * \param x The argument to evaluate asin for
2996  * \result Asin(x)
2997  */
2998 static inline SimdDouble gmx_simdcall asin(SimdDouble x)
2999 {
3000     // Same algorithm as cephes library
3001     const SimdDouble limit1(0.625);
3002     const SimdDouble limit2(1e-8);
3003     const SimdDouble one(1.0);
3004     const SimdDouble quarterpi(M_PI / 4.0);
3005     const SimdDouble morebits(6.123233995736765886130e-17);
3006
3007     const SimdDouble P5(4.253011369004428248960e-3);
3008     const SimdDouble P4(-6.019598008014123785661e-1);
3009     const SimdDouble P3(5.444622390564711410273e0);
3010     const SimdDouble P2(-1.626247967210700244449e1);
3011     const SimdDouble P1(1.956261983317594739197e1);
3012     const SimdDouble P0(-8.198089802484824371615e0);
3013
3014     const SimdDouble Q4(-1.474091372988853791896e1);
3015     const SimdDouble Q3(7.049610280856842141659e1);
3016     const SimdDouble Q2(-1.471791292232726029859e2);
3017     const SimdDouble Q1(1.395105614657485689735e2);
3018     const SimdDouble Q0(-4.918853881490881290097e1);
3019
3020     const SimdDouble R4(2.967721961301243206100e-3);
3021     const SimdDouble R3(-5.634242780008963776856e-1);
3022     const SimdDouble R2(6.968710824104713396794e0);
3023     const SimdDouble R1(-2.556901049652824852289e1);
3024     const SimdDouble R0(2.853665548261061424989e1);
3025
3026     const SimdDouble S3(-2.194779531642920639778e1);
3027     const SimdDouble S2(1.470656354026814941758e2);
3028     const SimdDouble S1(-3.838770957603691357202e2);
3029     const SimdDouble S0(3.424398657913078477438e2);
3030
3031     SimdDouble xabs;
3032     SimdDouble zz, ww, z, q, w, zz2, ww2;
3033     SimdDouble PA, PB;
3034     SimdDouble QA, QB;
3035     SimdDouble RA, RB;
3036     SimdDouble SA, SB;
3037     SimdDouble nom, denom;
3038     SimdDBool  mask, mask2;
3039
3040     xabs = abs(x);
3041
3042     mask = (limit1 < xabs);
3043
3044     zz  = one - xabs;
3045     ww  = xabs * xabs;
3046     zz2 = zz * zz;
3047     ww2 = ww * ww;
3048
3049     // R
3050     RA = fma(R4, zz2, R2);
3051     RB = fma(R3, zz2, R1);
3052     RA = fma(RA, zz2, R0);
3053     RA = fma(RB, zz, RA);
3054
3055     // S, SA = zz2
3056     SB = fma(S3, zz2, S1);
3057     SA = zz2 + S2;
3058     SA = fma(SA, zz2, S0);
3059     SA = fma(SB, zz, SA);
3060
3061     // P
3062     PA = fma(P5, ww2, P3);
3063     PB = fma(P4, ww2, P2);
3064     PA = fma(PA, ww2, P1);
3065     PB = fma(PB, ww2, P0);
3066     PA = fma(PA, ww, PB);
3067
3068     // Q, QA = ww2
3069     QB = fma(Q4, ww2, Q2);
3070     QA = ww2 + Q3;
3071     QA = fma(QA, ww2, Q1);
3072     QB = fma(QB, ww2, Q0);
3073     QA = fma(QA, ww, QB);
3074
3075     RA = RA * zz;
3076     PA = PA * ww;
3077
3078     nom   = blend(PA, RA, mask);
3079     denom = blend(QA, SA, mask);
3080
3081     mask2 = (limit2 < xabs);
3082     q     = nom * maskzInv(denom, mask2);
3083
3084     zz = zz + zz;
3085     zz = sqrt(zz);
3086     z  = quarterpi - zz;
3087     zz = fms(zz, q, morebits);
3088     z  = z - zz;
3089     z  = z + quarterpi;
3090
3091     w = xabs * q;
3092     w = w + xabs;
3093
3094     z = blend(w, z, mask);
3095
3096     z = blend(xabs, z, mask2);
3097
3098     z = copysign(z, x);
3099
3100     return z;
3101 }
3102
3103 /*! \brief SIMD double acos(x).
3104  *
3105  * \param x The argument to evaluate acos for
3106  * \result Acos(x)
3107  */
3108 static inline SimdDouble gmx_simdcall acos(SimdDouble x)
3109 {
3110     const SimdDouble one(1.0);
3111     const SimdDouble half(0.5);
3112     const SimdDouble quarterpi0(7.85398163397448309616e-1);
3113     const SimdDouble quarterpi1(6.123233995736765886130e-17);
3114
3115     SimdDBool  mask1;
3116     SimdDouble z, z1, z2;
3117
3118     mask1 = (half < x);
3119     z1    = half * (one - x);
3120     z1    = sqrt(z1);
3121     z     = blend(x, z1, mask1);
3122
3123     z = asin(z);
3124
3125     z1 = z + z;
3126
3127     z2 = quarterpi0 - z;
3128     z2 = z2 + quarterpi1;
3129     z2 = z2 + quarterpi0;
3130
3131     z = blend(z2, z1, mask1);
3132
3133     return z;
3134 }
3135
3136 /*! \brief SIMD double asin(x).
3137  *
3138  * \param x The argument to evaluate atan for
3139  * \result Atan(x), same argument/value range as standard math library.
3140  */
3141 static inline SimdDouble gmx_simdcall atan(SimdDouble x)
3142 {
3143     // Same algorithm as cephes library
3144     const SimdDouble limit1(0.66);
3145     const SimdDouble limit2(2.41421356237309504880);
3146     const SimdDouble quarterpi(M_PI / 4.0);
3147     const SimdDouble halfpi(M_PI / 2.0);
3148     const SimdDouble mone(-1.0);
3149     const SimdDouble morebits1(0.5 * 6.123233995736765886130E-17);
3150     const SimdDouble morebits2(6.123233995736765886130E-17);
3151
3152     const SimdDouble P4(-8.750608600031904122785E-1);
3153     const SimdDouble P3(-1.615753718733365076637E1);
3154     const SimdDouble P2(-7.500855792314704667340E1);
3155     const SimdDouble P1(-1.228866684490136173410E2);
3156     const SimdDouble P0(-6.485021904942025371773E1);
3157
3158     const SimdDouble Q4(2.485846490142306297962E1);
3159     const SimdDouble Q3(1.650270098316988542046E2);
3160     const SimdDouble Q2(4.328810604912902668951E2);
3161     const SimdDouble Q1(4.853903996359136964868E2);
3162     const SimdDouble Q0(1.945506571482613964425E2);
3163
3164     SimdDouble y, xabs, t1, t2;
3165     SimdDouble z, z2;
3166     SimdDouble P_A, P_B, Q_A, Q_B;
3167     SimdDBool  mask1, mask2;
3168
3169     xabs = abs(x);
3170
3171     mask1 = (limit1 < xabs);
3172     mask2 = (limit2 < xabs);
3173
3174     t1 = (xabs + mone) * maskzInv(xabs - mone, mask1);
3175     t2 = mone * maskzInv(xabs, mask2);
3176
3177     y    = selectByMask(quarterpi, mask1);
3178     y    = blend(y, halfpi, mask2);
3179     xabs = blend(xabs, t1, mask1);
3180     xabs = blend(xabs, t2, mask2);
3181
3182     z  = xabs * xabs;
3183     z2 = z * z;
3184
3185     P_A = fma(P4, z2, P2);
3186     P_B = fma(P3, z2, P1);
3187     P_A = fma(P_A, z2, P0);
3188     P_A = fma(P_B, z, P_A);
3189
3190     // Q_A = z2
3191     Q_B = fma(Q4, z2, Q2);
3192     Q_A = z2 + Q3;
3193     Q_A = fma(Q_A, z2, Q1);
3194     Q_B = fma(Q_B, z2, Q0);
3195     Q_A = fma(Q_A, z, Q_B);
3196
3197     z = z * P_A;
3198     z = z * inv(Q_A);
3199     z = fma(z, xabs, xabs);
3200
3201     t1 = selectByMask(morebits1, mask1);
3202     t1 = blend(t1, morebits2, mask2);
3203
3204     z = z + t1;
3205     y = y + z;
3206
3207     y = copysign(y, x);
3208
3209     return y;
3210 }
3211
3212 /*! \brief SIMD double atan2(y,x).
3213  *
3214  * \param y Y component of vector, any quartile
3215  * \param x X component of vector, any quartile
3216  * \result Atan(y,x), same argument/value range as standard math library.
3217  *
3218  * \note This routine should provide correct results for all finite
3219  * non-zero or positive-zero arguments. However, negative zero arguments will
3220  * be treated as positive zero, which means the return value will deviate from
3221  * the standard math library atan2(y,x) for those cases. That should not be
3222  * of any concern in Gromacs, and in particular it will not affect calculations
3223  * of angles from vectors.
3224  */
3225 static inline SimdDouble gmx_simdcall atan2(SimdDouble y, SimdDouble x)
3226 {
3227     const SimdDouble pi(M_PI);
3228     const SimdDouble halfpi(M_PI / 2.0);
3229     const SimdDouble minFloat(std::numeric_limits<float>::min());
3230     SimdDouble       xinv, p, aoffset;
3231     SimdDBool        mask_xnz, mask_ynz, mask_xlt0, mask_ylt0;
3232
3233     mask_xnz  = x != setZero();
3234     mask_ynz  = y != setZero();
3235     mask_xlt0 = (x < setZero());
3236     mask_ylt0 = (y < setZero());
3237
3238     aoffset = selectByNotMask(halfpi, mask_xnz);
3239     aoffset = selectByMask(aoffset, mask_ynz);
3240
3241     aoffset = blend(aoffset, pi, mask_xlt0);
3242     aoffset = blend(aoffset, -aoffset, mask_ylt0);
3243
3244     xinv = maskzInv(x, mask_xnz && (minFloat <= abs(x)));
3245     p    = y * xinv;
3246     p    = atan(p);
3247     p    = p + aoffset;
3248
3249     return p;
3250 }
3251
3252
3253 /*! \brief Calculate the force correction due to PME analytically in SIMD double.
3254  *
3255  * \param z2 This should be the value \f$(r \beta)^2\f$, where r is your
3256  *           interaction distance and beta the ewald splitting parameters.
3257  * \result Correction factor to coulomb force.
3258  *
3259  * This routine is meant to enable analytical evaluation of the
3260  * direct-space PME electrostatic force to avoid tables. For details, see the
3261  * single precision function.
3262  */
3263 static inline SimdDouble gmx_simdcall pmeForceCorrection(SimdDouble z2)
3264 {
3265     const SimdDouble FN10(-8.0072854618360083154e-14);
3266     const SimdDouble FN9(1.1859116242260148027e-11);
3267     const SimdDouble FN8(-8.1490406329798423616e-10);
3268     const SimdDouble FN7(3.4404793543907847655e-8);
3269     const SimdDouble FN6(-9.9471420832602741006e-7);
3270     const SimdDouble FN5(0.000020740315999115847456);
3271     const SimdDouble FN4(-0.00031991745139313364005);
3272     const SimdDouble FN3(0.0035074449373659008203);
3273     const SimdDouble FN2(-0.031750380176100813405);
3274     const SimdDouble FN1(0.13884101728898463426);
3275     const SimdDouble FN0(-0.75225277815249618847);
3276
3277     const SimdDouble FD5(0.000016009278224355026701);
3278     const SimdDouble FD4(0.00051055686934806966046);
3279     const SimdDouble FD3(0.0081803507497974289008);
3280     const SimdDouble FD2(0.077181146026670287235);
3281     const SimdDouble FD1(0.41543303143712535988);
3282     const SimdDouble FD0(1.0);
3283
3284     SimdDouble z4;
3285     SimdDouble polyFN0, polyFN1, polyFD0, polyFD1;
3286
3287     z4 = z2 * z2;
3288
3289     polyFD1 = fma(FD5, z4, FD3);
3290     polyFD1 = fma(polyFD1, z4, FD1);
3291     polyFD1 = polyFD1 * z2;
3292     polyFD0 = fma(FD4, z4, FD2);
3293     polyFD0 = fma(polyFD0, z4, FD0);
3294     polyFD0 = polyFD0 + polyFD1;
3295
3296     polyFD0 = inv(polyFD0);
3297
3298     polyFN0 = fma(FN10, z4, FN8);
3299     polyFN0 = fma(polyFN0, z4, FN6);
3300     polyFN0 = fma(polyFN0, z4, FN4);
3301     polyFN0 = fma(polyFN0, z4, FN2);
3302     polyFN0 = fma(polyFN0, z4, FN0);
3303     polyFN1 = fma(FN9, z4, FN7);
3304     polyFN1 = fma(polyFN1, z4, FN5);
3305     polyFN1 = fma(polyFN1, z4, FN3);
3306     polyFN1 = fma(polyFN1, z4, FN1);
3307     polyFN0 = fma(polyFN1, z2, polyFN0);
3308
3309
3310     return polyFN0 * polyFD0;
3311 }
3312
3313
3314 /*! \brief Calculate the potential correction due to PME analytically in SIMD double.
3315  *
3316  * \param z2 This should be the value \f$(r \beta)^2\f$, where r is your
3317  *           interaction distance and beta the ewald splitting parameters.
3318  * \result Correction factor to coulomb force.
3319  *
3320  * This routine is meant to enable analytical evaluation of the
3321  * direct-space PME electrostatic potential to avoid tables. For details, see the
3322  * single precision function.
3323  */
3324 static inline SimdDouble gmx_simdcall pmePotentialCorrection(SimdDouble z2)
3325 {
3326     const SimdDouble VN9(-9.3723776169321855475e-13);
3327     const SimdDouble VN8(1.2280156762674215741e-10);
3328     const SimdDouble VN7(-7.3562157912251309487e-9);
3329     const SimdDouble VN6(2.6215886208032517509e-7);
3330     const SimdDouble VN5(-4.9532491651265819499e-6);
3331     const SimdDouble VN4(0.00025907400778966060389);
3332     const SimdDouble VN3(0.0010585044856156469792);
3333     const SimdDouble VN2(0.045247661136833092885);
3334     const SimdDouble VN1(0.11643931522926034421);
3335     const SimdDouble VN0(1.1283791671726767970);
3336
3337     const SimdDouble VD5(0.000021784709867336150342);
3338     const SimdDouble VD4(0.00064293662010911388448);
3339     const SimdDouble VD3(0.0096311444822588683504);
3340     const SimdDouble VD2(0.085608012351550627051);
3341     const SimdDouble VD1(0.43652499166614811084);
3342     const SimdDouble VD0(1.0);
3343
3344     SimdDouble z4;
3345     SimdDouble polyVN0, polyVN1, polyVD0, polyVD1;
3346
3347     z4 = z2 * z2;
3348
3349     polyVD1 = fma(VD5, z4, VD3);
3350     polyVD0 = fma(VD4, z4, VD2);
3351     polyVD1 = fma(polyVD1, z4, VD1);
3352     polyVD0 = fma(polyVD0, z4, VD0);
3353     polyVD0 = fma(polyVD1, z2, polyVD0);
3354
3355     polyVD0 = inv(polyVD0);
3356
3357     polyVN1 = fma(VN9, z4, VN7);
3358     polyVN0 = fma(VN8, z4, VN6);
3359     polyVN1 = fma(polyVN1, z4, VN5);
3360     polyVN0 = fma(polyVN0, z4, VN4);
3361     polyVN1 = fma(polyVN1, z4, VN3);
3362     polyVN0 = fma(polyVN0, z4, VN2);
3363     polyVN1 = fma(polyVN1, z4, VN1);
3364     polyVN0 = fma(polyVN0, z4, VN0);
3365     polyVN0 = fma(polyVN1, z2, polyVN0);
3366
3367     return polyVN0 * polyVD0;
3368 }
3369
3370 /*! \} */
3371
3372
3373 /*! \name SIMD math functions for double prec. data, single prec. accuracy
3374  *
3375  *  \note In some cases we do not need full double accuracy of individual
3376  *        SIMD math functions, although the data is stored in double precision
3377  *        SIMD registers. This might be the case for special algorithms, or
3378  *        if the architecture does not support single precision.
3379  *        Since the full double precision evaluation of math functions
3380  *        typically require much more expensive polynomial approximations
3381  *        these functions implement the algorithms used in the single precision
3382  *        SIMD math functions, but they operate on double precision
3383  *        SIMD variables.
3384  *
3385  *  \{
3386  */
3387
3388 /*********************************************************************
3389  * SIMD MATH FUNCTIONS WITH DOUBLE PREC. DATA, SINGLE PREC. ACCURACY *
3390  *********************************************************************/
3391
3392 /*! \brief Calculate 1/sqrt(x) for SIMD double, but in single accuracy.
3393  *
3394  *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
3395  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
3396  *           For the single precision implementation this is obviously always
3397  *           true for positive values, but for double precision it adds an
3398  *           extra restriction since the first lookup step might have to be
3399  *           performed in single precision on some architectures. Note that the
3400  *           responsibility for checking falls on you - this routine does not
3401  *           check arguments.
3402  *
3403  *  \return 1/sqrt(x). Result is undefined if your argument was invalid.
3404  */
3405 static inline SimdDouble gmx_simdcall invsqrtSingleAccuracy(SimdDouble x)
3406 {
3407     SimdDouble lu = rsqrt(x);
3408 #        if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
3409     lu = rsqrtIter(lu, x);
3410 #        endif
3411 #        if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3412     lu = rsqrtIter(lu, x);
3413 #        endif
3414 #        if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3415     lu = rsqrtIter(lu, x);
3416 #        endif
3417     return lu;
3418 }
3419
3420 /*! \brief 1/sqrt(x) for masked-in entries of SIMD double, but in single accuracy.
3421  *
3422  *  This routine only evaluates 1/sqrt(x) for elements for which mask is true.
3423  *  Illegal values in the masked-out elements will not lead to
3424  *  floating-point exceptions.
3425  *
3426  *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
3427  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
3428  *           For the single precision implementation this is obviously always
3429  *           true for positive values, but for double precision it adds an
3430  *           extra restriction since the first lookup step might have to be
3431  *           performed in single precision on some architectures. Note that the
3432  *           responsibility for checking falls on you - this routine does not
3433  *           check arguments.
3434  *
3435  *  \param m Mask
3436  *  \return 1/sqrt(x). Result is undefined if your argument was invalid or
3437  *          entry was not masked, and 0.0 for masked-out entries.
3438  */
3439 static inline SimdDouble maskzInvsqrtSingleAccuracy(SimdDouble x, SimdDBool m)
3440 {
3441     SimdDouble lu = maskzRsqrt(x, m);
3442 #        if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
3443     lu = rsqrtIter(lu, x);
3444 #        endif
3445 #        if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3446     lu = rsqrtIter(lu, x);
3447 #        endif
3448 #        if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3449     lu = rsqrtIter(lu, x);
3450 #        endif
3451     return lu;
3452 }
3453
3454 /*! \brief Calculate 1/sqrt(x) for two SIMD doubles, but single accuracy.
3455  *
3456  * \param x0  First set of arguments, x0 must be in single range (see below).
3457  * \param x1  Second set of arguments, x1 must be in single range (see below).
3458  * \param[out] out0  Result 1/sqrt(x0)
3459  * \param[out] out1  Result 1/sqrt(x1)
3460  *
3461  *  In particular for double precision we can sometimes calculate square root
3462  *  pairs slightly faster by using single precision until the very last step.
3463  *
3464  * \note Both arguments must be larger than GMX_FLOAT_MIN and smaller than
3465  *       GMX_FLOAT_MAX, i.e. within the range of single precision.
3466  *       For the single precision implementation this is obviously always
3467  *       true for positive values, but for double precision it adds an
3468  *       extra restriction since the first lookup step might have to be
3469  *       performed in single precision on some architectures. Note that the
3470  *       responsibility for checking falls on you - this routine does not
3471  *       check arguments.
3472  */
3473 static inline void gmx_simdcall invsqrtPairSingleAccuracy(SimdDouble  x0,
3474                                                           SimdDouble  x1,
3475                                                           SimdDouble* out0,
3476                                                           SimdDouble* out1)
3477 {
3478 #        if GMX_SIMD_HAVE_FLOAT && (GMX_SIMD_FLOAT_WIDTH == 2 * GMX_SIMD_DOUBLE_WIDTH) \
3479                 && (GMX_SIMD_RSQRT_BITS < 22)
3480     SimdFloat  xf  = cvtDD2F(x0, x1);
3481     SimdFloat  luf = rsqrt(xf);
3482     SimdDouble lu0, lu1;
3483     // Intermediate target is single - mantissa+1 bits
3484 #            if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
3485     luf = rsqrtIter(luf, xf);
3486 #            endif
3487 #            if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3488     luf = rsqrtIter(luf, xf);
3489 #            endif
3490 #            if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3491     luf = rsqrtIter(luf, xf);
3492 #            endif
3493     cvtF2DD(luf, &lu0, &lu1);
3494     // We now have single-precision accuracy values in lu0/lu1
3495     *out0 = lu0;
3496     *out1 = lu1;
3497 #        else
3498     *out0 = invsqrtSingleAccuracy(x0);
3499     *out1 = invsqrtSingleAccuracy(x1);
3500 #        endif
3501 }
3502
3503 /*! \brief Calculate 1/x for SIMD double, but in single accuracy.
3504  *
3505  *  \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
3506  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
3507  *           For the single precision implementation this is obviously always
3508  *           true for positive values, but for double precision it adds an
3509  *           extra restriction since the first lookup step might have to be
3510  *           performed in single precision on some architectures. Note that the
3511  *           responsibility for checking falls on you - this routine does not
3512  *           check arguments.
3513  *
3514  *  \return 1/x. Result is undefined if your argument was invalid.
3515  */
3516 static inline SimdDouble gmx_simdcall invSingleAccuracy(SimdDouble x)
3517 {
3518     SimdDouble lu = rcp(x);
3519 #        if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
3520     lu = rcpIter(lu, x);
3521 #        endif
3522 #        if (GMX_SIMD_RCP_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3523     lu = rcpIter(lu, x);
3524 #        endif
3525 #        if (GMX_SIMD_RCP_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3526     lu = rcpIter(lu, x);
3527 #        endif
3528     return lu;
3529 }
3530
3531 /*! \brief 1/x for masked entries of SIMD double, single accuracy.
3532  *
3533  *  \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
3534  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
3535  *           For the single precision implementation this is obviously always
3536  *           true for positive values, but for double precision it adds an
3537  *           extra restriction since the first lookup step might have to be
3538  *           performed in single precision on some architectures. Note that the
3539  *           responsibility for checking falls on you - this routine does not
3540  *           check arguments.
3541  *
3542  *  \param m Mask
3543  *  \return 1/x for elements where m is true, or 0.0 for masked-out entries.
3544  */
3545 static inline SimdDouble gmx_simdcall maskzInvSingleAccuracy(SimdDouble x, SimdDBool m)
3546 {
3547     SimdDouble lu = maskzRcp(x, m);
3548 #        if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
3549     lu = rcpIter(lu, x);
3550 #        endif
3551 #        if (GMX_SIMD_RCP_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3552     lu = rcpIter(lu, x);
3553 #        endif
3554 #        if (GMX_SIMD_RCP_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3555     lu = rcpIter(lu, x);
3556 #        endif
3557     return lu;
3558 }
3559
3560
3561 /*! \brief Calculate sqrt(x) (correct for 0.0) for SIMD double, with single accuracy.
3562  *
3563  *  \copydetails sqrt(SimdFloat)
3564  */
3565 template<MathOptimization opt = MathOptimization::Safe>
3566 static inline SimdDouble gmx_simdcall sqrtSingleAccuracy(SimdDouble x)
3567 {
3568     if (opt == MathOptimization::Safe)
3569     {
3570         SimdDouble res = maskzInvsqrt(x, SimdDouble(GMX_FLOAT_MIN) < x);
3571         return res * x;
3572     }
3573     else
3574     {
3575         return x * invsqrtSingleAccuracy(x);
3576     }
3577 }
3578
3579 /*! \brief Cube root for SIMD doubles, single accuracy.
3580  *
3581  * \param x      Argument to calculate cube root of. Can be negative or zero,
3582  *               but NaN or Inf values are not supported. Denormal values will
3583  *               be treated as 0.0.
3584  * \return       Cube root of x.
3585  */
3586 static inline SimdDouble gmx_simdcall cbrtSingleAccuracy(SimdDouble x)
3587 {
3588     const SimdDouble signBit(GMX_DOUBLE_NEGZERO);
3589     const SimdDouble minDouble(std::numeric_limits<double>::min());
3590     // Bias is 1024-1 = 1023, which is divisible by 3, so no need to change it more.
3591     // Use the divided value as original constant to avoid division warnings.
3592     const std::int32_t offsetDiv3(341);
3593     const SimdDouble   c2(-0.191502161678719066);
3594     const SimdDouble   c1(0.697570460207922770);
3595     const SimdDouble   c0(0.492659620528969547);
3596     const SimdDouble   one(1.0);
3597     const SimdDouble   two(2.0);
3598     const SimdDouble   three(3.0);
3599     const SimdDouble   oneThird(1.0 / 3.0);
3600     const SimdDouble   cbrt2(1.2599210498948731648);
3601     const SimdDouble   sqrCbrt2(1.5874010519681994748);
3602
3603     // See the single precision routines for documentation of the algorithm
3604
3605     SimdDouble xSignBit = x & signBit; // create bit mask where the sign bit is 1 for x elements < 0
3606     SimdDouble xAbs     = andNot(signBit, x);    // select everthing but the sign bit => abs(x)
3607     SimdDBool  xIsNonZero = (minDouble <= xAbs); // treat denormals as 0
3608
3609     SimdDInt32 exponent;
3610     SimdDouble y        = frexp(xAbs, &exponent);
3611     SimdDouble z        = fma(fma(y, c2, c1), y, c0);
3612     SimdDouble w        = z * z * z;
3613     SimdDouble nom      = z * fma(two, y, w);
3614     SimdDouble invDenom = inv(fma(two, w, y));
3615
3616     SimdDouble offsetExp = cvtI2R(exponent) + SimdDouble(static_cast<double>(3 * offsetDiv3) + 0.1);
3617     SimdDouble offsetExpDiv3 =
3618             trunc(offsetExp * oneThird); // important to truncate here to mimic integer division
3619     SimdDInt32 expDiv3   = cvtR2I(offsetExpDiv3 - SimdDouble(static_cast<double>(offsetDiv3)));
3620     SimdDouble remainder = offsetExp - offsetExpDiv3 * three;
3621     SimdDouble factor    = blend(one, cbrt2, SimdDouble(0.5) < remainder);
3622     factor               = blend(factor, sqrCbrt2, SimdDouble(1.5) < remainder);
3623     SimdDouble fraction  = (nom * invDenom * factor) ^ xSignBit;
3624     SimdDouble result    = selectByMask(ldexp(fraction, expDiv3), xIsNonZero);
3625     return result;
3626 }
3627
3628 /*! \brief Inverse cube root for SIMD doubles, single accuracy.
3629  *
3630  * \param x      Argument to calculate cube root of. Can be positive or
3631  *               negative, but the magnitude cannot be lower than
3632  *               the smallest normal number.
3633  * \return       Cube root of x. Undefined for values that don't
3634  *               fulfill the restriction of abs(x) > minDouble.
3635  */
3636 static inline SimdDouble gmx_simdcall invcbrtSingleAccuracy(SimdDouble x)
3637 {
3638     const SimdDouble signBit(GMX_DOUBLE_NEGZERO);
3639     // Bias is 1024-1 = 1023, which is divisible by 3, so no need to change it more.
3640     // Use the divided value as original constant to avoid division warnings.
3641     const std::int32_t offsetDiv3(341);
3642     const SimdDouble   c2(-0.191502161678719066);
3643     const SimdDouble   c1(0.697570460207922770);
3644     const SimdDouble   c0(0.492659620528969547);
3645     const SimdDouble   one(1.0);
3646     const SimdDouble   two(2.0);
3647     const SimdDouble   three(3.0);
3648     const SimdDouble   oneThird(1.0 / 3.0);
3649     const SimdDouble   invCbrt2(1.0 / 1.2599210498948731648);
3650     const SimdDouble   invSqrCbrt2(1.0F / 1.5874010519681994748);
3651
3652     // See the single precision routines for documentation of the algorithm
3653
3654     SimdDouble xSignBit = x & signBit; // create bit mask where the sign bit is 1 for x elements < 0
3655     SimdDouble xAbs     = andNot(signBit, x); // select everthing but the sign bit => abs(x)
3656
3657     SimdDInt32 exponent;
3658     SimdDouble y         = frexp(xAbs, &exponent);
3659     SimdDouble z         = fma(fma(y, c2, c1), y, c0);
3660     SimdDouble w         = z * z * z;
3661     SimdDouble nom       = fma(two, w, y);
3662     SimdDouble invDenom  = inv(z * fma(two, y, w));
3663     SimdDouble offsetExp = cvtI2R(exponent) + SimdDouble(static_cast<double>(3 * offsetDiv3) + 0.1);
3664     SimdDouble offsetExpDiv3 =
3665             trunc(offsetExp * oneThird); // important to truncate here to mimic integer division
3666     SimdDInt32 expDiv3   = cvtR2I(SimdDouble(static_cast<double>(offsetDiv3)) - offsetExpDiv3);
3667     SimdDouble remainder = offsetExpDiv3 * three - offsetExp;
3668     SimdDouble factor    = blend(one, invCbrt2, remainder < SimdDouble(-0.5));
3669     factor               = blend(factor, invSqrCbrt2, remainder < SimdDouble(-1.5));
3670     SimdDouble fraction  = (nom * invDenom * factor) ^ xSignBit;
3671     SimdDouble result    = ldexp(fraction, expDiv3);
3672     return result;
3673 }
3674
3675 /*! \brief SIMD log2(x). Double precision SIMD data, single accuracy.
3676  *
3677  * \param x Argument, should be >0.
3678  * \result The base 2 logarithm of x. Undefined if argument is invalid.
3679  */
3680 static inline SimdDouble gmx_simdcall log2SingleAccuracy(SimdDouble x)
3681 {
3682 #        if GMX_SIMD_HAVE_NATIVE_LOG_DOUBLE
3683     return log(x) * SimdDouble(std::log2(std::exp(1.0)));
3684 #        else
3685     const SimdDouble one(1.0);
3686     const SimdDouble two(2.0);
3687     const SimdDouble sqrt2(std::sqrt(2.0));
3688     const SimdDouble CL9(0.342149508897807708152F);
3689     const SimdDouble CL7(0.411570606888219447939F);
3690     const SimdDouble CL5(0.577085979152320294183F);
3691     const SimdDouble CL3(0.961796550607099898222F);
3692     const SimdDouble CL1(2.885390081777926774009F);
3693     SimdDouble       fexp, x2, p;
3694     SimdDInt32       iexp;
3695     SimdDBool        mask;
3696
3697     // For log2(), the argument cannot be 0, so use the faster version of frexp
3698     x    = frexp<MathOptimization::Unsafe>(x, &iexp);
3699     fexp = cvtI2R(iexp);
3700
3701     mask = (x < sqrt2);
3702     // Adjust to non-IEEE format for x<sqrt(2): exponent -= 1, mantissa *= 2.0
3703     fexp = fexp - selectByMask(one, mask);
3704     x    = x * blend(one, two, mask);
3705
3706     x  = (x - one) * invSingleAccuracy(x + one);
3707     x2 = x * x;
3708
3709     p = fma(CL9, x2, CL7);
3710     p = fma(p, x2, CL5);
3711     p = fma(p, x2, CL3);
3712     p = fma(p, x2, CL1);
3713     p = fma(p, x, fexp);
3714
3715     return p;
3716 #        endif
3717 }
3718
3719 /*! \brief SIMD log(x). Double precision SIMD data, single accuracy.
3720  *
3721  * \param x Argument, should be >0.
3722  * \result The natural logarithm of x. Undefined if argument is invalid.
3723  */
3724 static inline SimdDouble gmx_simdcall logSingleAccuracy(SimdDouble x)
3725 {
3726 #        if GMX_SIMD_HAVE_NATIVE_LOG_DOUBLE
3727     return log(x);
3728 #        else
3729     const SimdDouble one(1.0);
3730     const SimdDouble two(2.0);
3731     const SimdDouble invsqrt2(1.0 / std::sqrt(2.0));
3732     const SimdDouble corr(0.693147180559945286226764);
3733     const SimdDouble CL9(0.2371599674224853515625);
3734     const SimdDouble CL7(0.285279005765914916992188);
3735     const SimdDouble CL5(0.400005519390106201171875);
3736     const SimdDouble CL3(0.666666567325592041015625);
3737     const SimdDouble CL1(2.0);
3738     SimdDouble       fexp, x2, p;
3739     SimdDInt32       iexp;
3740     SimdDBool        mask;
3741
3742     // For log(), the argument cannot be 0, so use the faster version of frexp
3743     x    = frexp<MathOptimization::Unsafe>(x, &iexp);
3744     fexp = cvtI2R(iexp);
3745
3746     mask = x < invsqrt2;
3747     // Adjust to non-IEEE format for x<1/sqrt(2): exponent -= 1, mantissa *= 2.0
3748     fexp = fexp - selectByMask(one, mask);
3749     x    = x * blend(one, two, mask);
3750
3751     x  = (x - one) * invSingleAccuracy(x + one);
3752     x2 = x * x;
3753
3754     p = fma(CL9, x2, CL7);
3755     p = fma(p, x2, CL5);
3756     p = fma(p, x2, CL3);
3757     p = fma(p, x2, CL1);
3758     p = fma(p, x, corr * fexp);
3759
3760     return p;
3761 #        endif
3762 }
3763
3764 /*! \brief SIMD 2^x. Double precision SIMD, single accuracy.
3765  *
3766  * \copydetails exp2(SimdFloat)
3767  */
3768 template<MathOptimization opt = MathOptimization::Safe>
3769 static inline SimdDouble gmx_simdcall exp2SingleAccuracy(SimdDouble x)
3770 {
3771 #        if GMX_SIMD_HAVE_NATIVE_EXP2_DOUBLE
3772     return exp2(x);
3773 #        else
3774     const SimdDouble CC6(0.0001534581200287996416911311);
3775     const SimdDouble CC5(0.001339993121934088894618990);
3776     const SimdDouble CC4(0.009618488957115180159497841);
3777     const SimdDouble CC3(0.05550328776964726865751735);
3778     const SimdDouble CC2(0.2402264689063408646490722);
3779     const SimdDouble CC1(0.6931472057372680777553816);
3780     const SimdDouble one(1.0);
3781
3782     SimdDouble intpart;
3783     SimdDouble p;
3784     SimdDInt32 ix;
3785
3786     // Large negative values are valid arguments to exp2(), so there are two
3787     // things we need to account for:
3788     // 1. When the exponents reaches -1023, the (biased) exponent field will be
3789     //    zero and we can no longer multiply with it. There are special IEEE
3790     //    formats to handle this range, but for now we have to accept that
3791     //    we cannot handle those arguments. If input value becomes even more
3792     //    negative, it will start to loop and we would end up with invalid
3793     //    exponents. Thus, we need to limit or mask this.
3794     // 2. For VERY large negative values, we will have problems that the
3795     //    subtraction to get the fractional part loses accuracy, and then we
3796     //    can end up with overflows in the polynomial.
3797     //
3798     // For now, we handle this by forwarding the math optimization setting to
3799     // ldexp, where the routine will return zero for very small arguments.
3800     //
3801     // However, before doing that we need to make sure we do not call cvtR2I
3802     // with an argument that is so negative it cannot be converted to an integer.
3803     if (opt == MathOptimization::Safe)
3804     {
3805         x = max(x, SimdDouble(std::numeric_limits<std::int32_t>::lowest()));
3806     }
3807
3808     ix      = cvtR2I(x);
3809     intpart = round(x);
3810     x       = x - intpart;
3811
3812     p = fma(CC6, x, CC5);
3813     p = fma(p, x, CC4);
3814     p = fma(p, x, CC3);
3815     p = fma(p, x, CC2);
3816     p = fma(p, x, CC1);
3817     p = fma(p, x, one);
3818     x = ldexp<opt>(p, ix);
3819
3820     return x;
3821 #        endif
3822 }
3823
3824
3825 /*! \brief SIMD exp(x). Double precision SIMD, single accuracy.
3826  *
3827  * \copydetails exp(SimdFloat)
3828  */
3829 template<MathOptimization opt = MathOptimization::Safe>
3830 static inline SimdDouble gmx_simdcall expSingleAccuracy(SimdDouble x)
3831 {
3832 #        if GMX_SIMD_HAVE_NATIVE_EXP_DOUBLE
3833     return exp(x);
3834 #        else
3835     const SimdDouble argscale(1.44269504088896341);
3836     // Lower bound: Clamp args that would lead to an IEEE fp exponent below -1023.
3837     const SimdDouble smallArgLimit(-709.0895657128);
3838     const SimdDouble invargscale(-0.69314718055994528623);
3839     const SimdDouble CC4(0.00136324646882712841033936);
3840     const SimdDouble CC3(0.00836596917361021041870117);
3841     const SimdDouble CC2(0.0416710823774337768554688);
3842     const SimdDouble CC1(0.166665524244308471679688);
3843     const SimdDouble CC0(0.499999850988388061523438);
3844     const SimdDouble one(1.0);
3845     SimdDouble       intpart;
3846     SimdDouble       y, p;
3847     SimdDInt32       iy;
3848
3849     // Large negative values are valid arguments to exp2(), so there are two
3850     // things we need to account for:
3851     // 1. When the exponents reaches -1023, the (biased) exponent field will be
3852     //    zero and we can no longer multiply with it. There are special IEEE
3853     //    formats to handle this range, but for now we have to accept that
3854     //    we cannot handle those arguments. If input value becomes even more
3855     //    negative, it will start to loop and we would end up with invalid
3856     //    exponents. Thus, we need to limit or mask this.
3857     // 2. For VERY large negative values, we will have problems that the
3858     //    subtraction to get the fractional part loses accuracy, and then we
3859     //    can end up with overflows in the polynomial.
3860     //
3861     // For now, we handle this by forwarding the math optimization setting to
3862     // ldexp, where the routine will return zero for very small arguments.
3863     //
3864     // However, before doing that we need to make sure we do not call cvtR2I
3865     // with an argument that is so negative it cannot be converted to an integer
3866     // after being multiplied by argscale.
3867
3868     if (opt == MathOptimization::Safe)
3869     {
3870         x = max(x, SimdDouble(std::numeric_limits<std::int32_t>::lowest()) / argscale);
3871     }
3872
3873     y = x * argscale;
3874
3875     iy      = cvtR2I(y);
3876     intpart = round(y); // use same rounding algorithm here
3877
3878     // Extended precision arithmetics not needed since
3879     // we have double precision and only need single accuracy.
3880     x = fma(invargscale, intpart, x);
3881
3882     p = fma(CC4, x, CC3);
3883     p = fma(p, x, CC2);
3884     p = fma(p, x, CC1);
3885     p = fma(p, x, CC0);
3886     p = fma(x * x, p, x);
3887     p = p + one;
3888     x = ldexp<opt>(p, iy);
3889     return x;
3890 #        endif
3891 }
3892
3893 /*! \brief SIMD pow(x,y). Double precision SIMD data, single accuracy.
3894  *
3895  * This returns x^y for SIMD values.
3896  *
3897  * \tparam opt If this is changed from the default (safe) into the unsafe
3898  *             option, there are no guarantees about correct results for x==0.
3899  *
3900  * \param x Base.
3901  *
3902  * \param y exponent.
3903
3904  * \result x^y. Overflowing arguments are likely to either return 0 or inf,
3905  *         depending on the underlying implementation. If unsafe optimizations
3906  *         are enabled, this is also true for x==0.
3907  *
3908  * \warning You cannot rely on this implementation returning inf for arguments
3909  *          that cause overflow. If you have some very large
3910  *          values and need to rely on getting a valid numerical output,
3911  *          take the minimum of your variable and the largest valid argument
3912  *          before calling this routine.
3913  */
3914 template<MathOptimization opt = MathOptimization::Safe>
3915 static inline SimdDouble gmx_simdcall powSingleAccuracy(SimdDouble x, SimdDouble y)
3916 {
3917     SimdDouble xcorr;
3918
3919     if (opt == MathOptimization::Safe)
3920     {
3921         xcorr = max(x, SimdDouble(std::numeric_limits<double>::min()));
3922     }
3923     else
3924     {
3925         xcorr = x;
3926     }
3927
3928     SimdDouble result = exp2SingleAccuracy<opt>(y * log2SingleAccuracy(xcorr));
3929
3930     if (opt == MathOptimization::Safe)
3931     {
3932         // if x==0 and y>0 we explicitly set the result to 0.0
3933         // For any x with y==0, the result will already be 1.0 since we multiply by y (0.0) and call exp().
3934         result = blend(result, setZero(), x == setZero() && setZero() < y);
3935     }
3936
3937     return result;
3938 }
3939
3940 /*! \brief SIMD erf(x). Double precision SIMD data, single accuracy.
3941  *
3942  * \param x The value to calculate erf(x) for.
3943  * \result erf(x)
3944  *
3945  * This routine achieves very close to single precision, but we do not care about
3946  * the last bit or the subnormal result range.
3947  */
3948 static inline SimdDouble gmx_simdcall erfSingleAccuracy(SimdDouble x)
3949 {
3950     // Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1]
3951     const SimdDouble CA6(7.853861353153693e-5);
3952     const SimdDouble CA5(-8.010193625184903e-4);
3953     const SimdDouble CA4(5.188327685732524e-3);
3954     const SimdDouble CA3(-2.685381193529856e-2);
3955     const SimdDouble CA2(1.128358514861418e-1);
3956     const SimdDouble CA1(-3.761262582423300e-1);
3957     const SimdDouble CA0(1.128379165726710);
3958     // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2]
3959     const SimdDouble CB9(-0.0018629930017603923);
3960     const SimdDouble CB8(0.003909821287598495);
3961     const SimdDouble CB7(-0.0052094582210355615);
3962     const SimdDouble CB6(0.005685614362160572);
3963     const SimdDouble CB5(-0.0025367682853477272);
3964     const SimdDouble CB4(-0.010199799682318782);
3965     const SimdDouble CB3(0.04369575504816542);
3966     const SimdDouble CB2(-0.11884063474674492);
3967     const SimdDouble CB1(0.2732120154030589);
3968     const SimdDouble CB0(0.42758357702025784);
3969     // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19]
3970     const SimdDouble CC10(-0.0445555913112064);
3971     const SimdDouble CC9(0.21376355144663348);
3972     const SimdDouble CC8(-0.3473187200259257);
3973     const SimdDouble CC7(0.016690861551248114);
3974     const SimdDouble CC6(0.7560973182491192);
3975     const SimdDouble CC5(-1.2137903600145787);
3976     const SimdDouble CC4(0.8411872321232948);
3977     const SimdDouble CC3(-0.08670413896296343);
3978     const SimdDouble CC2(-0.27124782687240334);
3979     const SimdDouble CC1(-0.0007502488047806069);
3980     const SimdDouble CC0(0.5642114853803148);
3981     const SimdDouble one(1.0);
3982     const SimdDouble two(2.0);
3983
3984     SimdDouble x2, x4, y;
3985     SimdDouble t, t2, w, w2;
3986     SimdDouble pA0, pA1, pB0, pB1, pC0, pC1;
3987     SimdDouble expmx2;
3988     SimdDouble res_erf, res_erfc, res;
3989     SimdDBool  mask, msk_erf;
3990
3991     // Calculate erf()
3992     x2 = x * x;
3993     x4 = x2 * x2;
3994
3995     pA0 = fma(CA6, x4, CA4);
3996     pA1 = fma(CA5, x4, CA3);
3997     pA0 = fma(pA0, x4, CA2);
3998     pA1 = fma(pA1, x4, CA1);
3999     pA0 = pA0 * x4;
4000     pA0 = fma(pA1, x2, pA0);
4001     // Constant term must come last for precision reasons
4002     pA0 = pA0 + CA0;
4003
4004     res_erf = x * pA0;
4005
4006     // Calculate erfc
4007     y       = abs(x);
4008     msk_erf = (SimdDouble(0.75) <= y);
4009     t       = maskzInvSingleAccuracy(y, msk_erf);
4010     w       = t - one;
4011     t2      = t * t;
4012     w2      = w * w;
4013
4014     expmx2 = expSingleAccuracy(-y * y);
4015
4016     pB1 = fma(CB9, w2, CB7);
4017     pB0 = fma(CB8, w2, CB6);
4018     pB1 = fma(pB1, w2, CB5);
4019     pB0 = fma(pB0, w2, CB4);
4020     pB1 = fma(pB1, w2, CB3);
4021     pB0 = fma(pB0, w2, CB2);
4022     pB1 = fma(pB1, w2, CB1);
4023     pB0 = fma(pB0, w2, CB0);
4024     pB0 = fma(pB1, w, pB0);
4025
4026     pC0 = fma(CC10, t2, CC8);
4027     pC1 = fma(CC9, t2, CC7);
4028     pC0 = fma(pC0, t2, CC6);
4029     pC1 = fma(pC1, t2, CC5);
4030     pC0 = fma(pC0, t2, CC4);
4031     pC1 = fma(pC1, t2, CC3);
4032     pC0 = fma(pC0, t2, CC2);
4033     pC1 = fma(pC1, t2, CC1);
4034
4035     pC0 = fma(pC0, t2, CC0);
4036     pC0 = fma(pC1, t, pC0);
4037     pC0 = pC0 * t;
4038
4039     // Select pB0 or pC0 for erfc()
4040     mask     = (two < y);
4041     res_erfc = blend(pB0, pC0, mask);
4042     res_erfc = res_erfc * expmx2;
4043
4044     // erfc(x<0) = 2-erfc(|x|)
4045     mask     = (x < setZero());
4046     res_erfc = blend(res_erfc, two - res_erfc, mask);
4047
4048     // Select erf() or erfc()
4049     mask = (y < SimdDouble(0.75));
4050     res  = blend(one - res_erfc, res_erf, mask);
4051
4052     return res;
4053 }
4054
4055 /*! \brief SIMD erfc(x). Double precision SIMD data, single accuracy.
4056  *
4057  * \param x The value to calculate erfc(x) for.
4058  * \result erfc(x)
4059  *
4060  * This routine achieves singleprecision (bar the last bit) over most of the
4061  * input range, but for large arguments where the result is getting close
4062  * to the minimum representable numbers we accept slightly larger errors
4063  * (think results that are in the ballpark of 10^-30) since that is not
4064  * relevant for MD.
4065  */
4066 static inline SimdDouble gmx_simdcall erfcSingleAccuracy(SimdDouble x)
4067 {
4068     // Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1]
4069     const SimdDouble CA6(7.853861353153693e-5);
4070     const SimdDouble CA5(-8.010193625184903e-4);
4071     const SimdDouble CA4(5.188327685732524e-3);
4072     const SimdDouble CA3(-2.685381193529856e-2);
4073     const SimdDouble CA2(1.128358514861418e-1);
4074     const SimdDouble CA1(-3.761262582423300e-1);
4075     const SimdDouble CA0(1.128379165726710);
4076     // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2]
4077     const SimdDouble CB9(-0.0018629930017603923);
4078     const SimdDouble CB8(0.003909821287598495);
4079     const SimdDouble CB7(-0.0052094582210355615);
4080     const SimdDouble CB6(0.005685614362160572);
4081     const SimdDouble CB5(-0.0025367682853477272);
4082     const SimdDouble CB4(-0.010199799682318782);
4083     const SimdDouble CB3(0.04369575504816542);
4084     const SimdDouble CB2(-0.11884063474674492);
4085     const SimdDouble CB1(0.2732120154030589);
4086     const SimdDouble CB0(0.42758357702025784);
4087     // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19]
4088     const SimdDouble CC10(-0.0445555913112064);
4089     const SimdDouble CC9(0.21376355144663348);
4090     const SimdDouble CC8(-0.3473187200259257);
4091     const SimdDouble CC7(0.016690861551248114);
4092     const SimdDouble CC6(0.7560973182491192);
4093     const SimdDouble CC5(-1.2137903600145787);
4094     const SimdDouble CC4(0.8411872321232948);
4095     const SimdDouble CC3(-0.08670413896296343);
4096     const SimdDouble CC2(-0.27124782687240334);
4097     const SimdDouble CC1(-0.0007502488047806069);
4098     const SimdDouble CC0(0.5642114853803148);
4099     const SimdDouble one(1.0);
4100     const SimdDouble two(2.0);
4101
4102     SimdDouble x2, x4, y;
4103     SimdDouble t, t2, w, w2;
4104     SimdDouble pA0, pA1, pB0, pB1, pC0, pC1;
4105     SimdDouble expmx2;
4106     SimdDouble res_erf, res_erfc, res;
4107     SimdDBool  mask, msk_erf;
4108
4109     // Calculate erf()
4110     x2 = x * x;
4111     x4 = x2 * x2;
4112
4113     pA0 = fma(CA6, x4, CA4);
4114     pA1 = fma(CA5, x4, CA3);
4115     pA0 = fma(pA0, x4, CA2);
4116     pA1 = fma(pA1, x4, CA1);
4117     pA1 = pA1 * x2;
4118     pA0 = fma(pA0, x4, pA1);
4119     // Constant term must come last for precision reasons
4120     pA0 = pA0 + CA0;
4121
4122     res_erf = x * pA0;
4123
4124     // Calculate erfc
4125     y       = abs(x);
4126     msk_erf = (SimdDouble(0.75) <= y);
4127     t       = maskzInvSingleAccuracy(y, msk_erf);
4128     w       = t - one;
4129     t2      = t * t;
4130     w2      = w * w;
4131
4132     expmx2 = expSingleAccuracy(-y * y);
4133
4134     pB1 = fma(CB9, w2, CB7);
4135     pB0 = fma(CB8, w2, CB6);
4136     pB1 = fma(pB1, w2, CB5);
4137     pB0 = fma(pB0, w2, CB4);
4138     pB1 = fma(pB1, w2, CB3);
4139     pB0 = fma(pB0, w2, CB2);
4140     pB1 = fma(pB1, w2, CB1);
4141     pB0 = fma(pB0, w2, CB0);
4142     pB0 = fma(pB1, w, pB0);
4143
4144     pC0 = fma(CC10, t2, CC8);
4145     pC1 = fma(CC9, t2, CC7);
4146     pC0 = fma(pC0, t2, CC6);
4147     pC1 = fma(pC1, t2, CC5);
4148     pC0 = fma(pC0, t2, CC4);
4149     pC1 = fma(pC1, t2, CC3);
4150     pC0 = fma(pC0, t2, CC2);
4151     pC1 = fma(pC1, t2, CC1);
4152
4153     pC0 = fma(pC0, t2, CC0);
4154     pC0 = fma(pC1, t, pC0);
4155     pC0 = pC0 * t;
4156
4157     // Select pB0 or pC0 for erfc()
4158     mask     = (two < y);
4159     res_erfc = blend(pB0, pC0, mask);
4160     res_erfc = res_erfc * expmx2;
4161
4162     // erfc(x<0) = 2-erfc(|x|)
4163     mask     = (x < setZero());
4164     res_erfc = blend(res_erfc, two - res_erfc, mask);
4165
4166     // Select erf() or erfc()
4167     mask = (y < SimdDouble(0.75));
4168     res  = blend(res_erfc, one - res_erf, mask);
4169
4170     return res;
4171 }
4172
4173 /*! \brief SIMD sin \& cos. Double precision SIMD data, single accuracy.
4174  *
4175  * \param x The argument to evaluate sin/cos for
4176  * \param[out] sinval Sin(x)
4177  * \param[out] cosval Cos(x)
4178  */
4179 static inline void gmx_simdcall sinCosSingleAccuracy(SimdDouble x, SimdDouble* sinval, SimdDouble* cosval)
4180 {
4181     // Constants to subtract Pi/4*x from y while minimizing precision loss
4182     const SimdDouble argred0(2 * 0.78539816290140151978);
4183     const SimdDouble argred1(2 * 4.9604678871439933374e-10);
4184     const SimdDouble argred2(2 * 1.1258708853173288931e-18);
4185     const SimdDouble two_over_pi(2.0 / M_PI);
4186     const SimdDouble const_sin2(-1.9515295891e-4);
4187     const SimdDouble const_sin1(8.3321608736e-3);
4188     const SimdDouble const_sin0(-1.6666654611e-1);
4189     const SimdDouble const_cos2(2.443315711809948e-5);
4190     const SimdDouble const_cos1(-1.388731625493765e-3);
4191     const SimdDouble const_cos0(4.166664568298827e-2);
4192
4193     const SimdDouble half(0.5);
4194     const SimdDouble one(1.0);
4195     SimdDouble       ssign, csign;
4196     SimdDouble       x2, y, z, psin, pcos, sss, ccc;
4197     SimdDBool        mask;
4198
4199 #        if GMX_SIMD_HAVE_DINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
4200     const SimdDInt32 ione(1);
4201     const SimdDInt32 itwo(2);
4202     SimdDInt32       iy;
4203
4204     z  = x * two_over_pi;
4205     iy = cvtR2I(z);
4206     y  = round(z);
4207
4208     mask  = cvtIB2B((iy & ione) == setZero());
4209     ssign = selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO), cvtIB2B((iy & itwo) == itwo));
4210     csign = selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO), cvtIB2B(((iy + ione) & itwo) == itwo));
4211 #        else
4212     const SimdDouble quarter(0.25);
4213     const SimdDouble minusquarter(-0.25);
4214     SimdDouble       q;
4215     SimdDBool        m1, m2, m3;
4216
4217     /* The most obvious way to find the arguments quadrant in the unit circle
4218      * to calculate the sign is to use integer arithmetic, but that is not
4219      * present in all SIMD implementations. As an alternative, we have devised a
4220      * pure floating-point algorithm that uses truncation for argument reduction
4221      * so that we get a new value 0<=q<1 over the unit circle, and then
4222      * do floating-point comparisons with fractions. This is likely to be
4223      * slightly slower (~10%) due to the longer latencies of floating-point, so
4224      * we only use it when integer SIMD arithmetic is not present.
4225      */
4226     ssign = x;
4227     x     = abs(x);
4228     // It is critical that half-way cases are rounded down
4229     z = fma(x, two_over_pi, half);
4230     y = trunc(z);
4231     q = z * quarter;
4232     q = q - trunc(q);
4233     /* z now starts at 0.0 for x=-pi/4 (although neg. values cannot occur), and
4234      * then increased by 1.0 as x increases by 2*Pi, when it resets to 0.0.
4235      * This removes the 2*Pi periodicity without using any integer arithmetic.
4236      * First check if y had the value 2 or 3, set csign if true.
4237      */
4238     q = q - half;
4239     /* If we have logical operations we can work directly on the signbit, which
4240      * saves instructions. Otherwise we need to represent signs as +1.0/-1.0.
4241      * Thus, if you are altering defines to debug alternative code paths, the
4242      * two GMX_SIMD_HAVE_LOGICAL sections in this routine must either both be
4243      * active or inactive - you will get errors if only one is used.
4244      */
4245 #            if GMX_SIMD_HAVE_LOGICAL
4246     ssign = ssign & SimdDouble(GMX_DOUBLE_NEGZERO);
4247     csign = andNot(q, SimdDouble(GMX_DOUBLE_NEGZERO));
4248     ssign = ssign ^ csign;
4249 #            else
4250     ssign = copysign(SimdDouble(1.0), ssign);
4251     csign = copysign(SimdDouble(1.0), q);
4252     csign = -csign;
4253     ssign = ssign * csign; // swap ssign if csign was set.
4254 #            endif
4255     // Check if y had value 1 or 3 (remember we subtracted 0.5 from q)
4256     m1   = (q < minusquarter);
4257     m2   = (setZero() <= q);
4258     m3   = (q < quarter);
4259     m2   = m2 && m3;
4260     mask = m1 || m2;
4261     // where mask is FALSE, swap sign.
4262     csign   = csign * blend(SimdDouble(-1.0), one, mask);
4263 #        endif
4264     x  = fnma(y, argred0, x);
4265     x  = fnma(y, argred1, x);
4266     x  = fnma(y, argred2, x);
4267     x2 = x * x;
4268
4269     psin = fma(const_sin2, x2, const_sin1);
4270     psin = fma(psin, x2, const_sin0);
4271     psin = fma(psin, x * x2, x);
4272     pcos = fma(const_cos2, x2, const_cos1);
4273     pcos = fma(pcos, x2, const_cos0);
4274     pcos = fms(pcos, x2, half);
4275     pcos = fma(pcos, x2, one);
4276
4277     sss = blend(pcos, psin, mask);
4278     ccc = blend(psin, pcos, mask);
4279     // See comment for GMX_SIMD_HAVE_LOGICAL section above.
4280 #        if GMX_SIMD_HAVE_LOGICAL
4281     *sinval = sss ^ ssign;
4282     *cosval = ccc ^ csign;
4283 #        else
4284     *sinval = sss * ssign;
4285     *cosval = ccc * csign;
4286 #        endif
4287 }
4288
4289 /*! \brief SIMD sin(x). Double precision SIMD data, single accuracy.
4290  *
4291  * \param x The argument to evaluate sin for
4292  * \result Sin(x)
4293  *
4294  * \attention Do NOT call both sin & cos if you need both results, since each of them
4295  * will then call \ref sincos and waste a factor 2 in performance.
4296  */
4297 static inline SimdDouble gmx_simdcall sinSingleAccuracy(SimdDouble x)
4298 {
4299     SimdDouble s, c;
4300     sinCosSingleAccuracy(x, &s, &c);
4301     return s;
4302 }
4303
4304 /*! \brief SIMD cos(x). Double precision SIMD data, single accuracy.
4305  *
4306  * \param x The argument to evaluate cos for
4307  * \result Cos(x)
4308  *
4309  * \attention Do NOT call both sin & cos if you need both results, since each of them
4310  * will then call \ref sincos and waste a factor 2 in performance.
4311  */
4312 static inline SimdDouble gmx_simdcall cosSingleAccuracy(SimdDouble x)
4313 {
4314     SimdDouble s, c;
4315     sinCosSingleAccuracy(x, &s, &c);
4316     return c;
4317 }
4318
4319 /*! \brief SIMD tan(x). Double precision SIMD data, single accuracy.
4320  *
4321  * \param x The argument to evaluate tan for
4322  * \result Tan(x)
4323  */
4324 static inline SimdDouble gmx_simdcall tanSingleAccuracy(SimdDouble x)
4325 {
4326     const SimdDouble argred0(2 * 0.78539816290140151978);
4327     const SimdDouble argred1(2 * 4.9604678871439933374e-10);
4328     const SimdDouble argred2(2 * 1.1258708853173288931e-18);
4329     const SimdDouble two_over_pi(2.0 / M_PI);
4330     const SimdDouble CT6(0.009498288995810566122993911);
4331     const SimdDouble CT5(0.002895755790837379295226923);
4332     const SimdDouble CT4(0.02460087336161924491836265);
4333     const SimdDouble CT3(0.05334912882656359828045988);
4334     const SimdDouble CT2(0.1333989091464957704418495);
4335     const SimdDouble CT1(0.3333307599244198227797507);
4336
4337     SimdDouble x2, p, y, z;
4338     SimdDBool  mask;
4339
4340 #        if GMX_SIMD_HAVE_DINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
4341     SimdDInt32 iy;
4342     SimdDInt32 ione(1);
4343
4344     z    = x * two_over_pi;
4345     iy   = cvtR2I(z);
4346     y    = round(z);
4347     mask = cvtIB2B((iy & ione) == ione);
4348
4349     x = fnma(y, argred0, x);
4350     x = fnma(y, argred1, x);
4351     x = fnma(y, argred2, x);
4352     x = selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO), mask) ^ x;
4353 #        else
4354     const SimdDouble quarter(0.25);
4355     const SimdDouble half(0.5);
4356     const SimdDouble threequarter(0.75);
4357     SimdDouble       w, q;
4358     SimdDBool        m1, m2, m3;
4359
4360     w    = abs(x);
4361     z    = fma(w, two_over_pi, half);
4362     y    = trunc(z);
4363     q    = z * quarter;
4364     q    = q - trunc(q);
4365     m1   = (quarter <= q);
4366     m2   = (q < half);
4367     m3   = (threequarter <= q);
4368     m1   = m1 && m2;
4369     mask = m1 || m3;
4370     w    = fnma(y, argred0, w);
4371     w    = fnma(y, argred1, w);
4372     w    = fnma(y, argred2, w);
4373
4374     w = blend(w, -w, mask);
4375     x = w * copysign(SimdDouble(1.0), x);
4376 #        endif
4377     x2 = x * x;
4378     p  = fma(CT6, x2, CT5);
4379     p  = fma(p, x2, CT4);
4380     p  = fma(p, x2, CT3);
4381     p  = fma(p, x2, CT2);
4382     p  = fma(p, x2, CT1);
4383     p  = fma(x2, p * x, x);
4384
4385     p = blend(p, maskzInvSingleAccuracy(p, mask), mask);
4386     return p;
4387 }
4388
4389 /*! \brief SIMD asin(x). Double precision SIMD data, single accuracy.
4390  *
4391  * \param x The argument to evaluate asin for
4392  * \result Asin(x)
4393  */
4394 static inline SimdDouble gmx_simdcall asinSingleAccuracy(SimdDouble x)
4395 {
4396     const SimdDouble limitlow(1e-4);
4397     const SimdDouble half(0.5);
4398     const SimdDouble one(1.0);
4399     const SimdDouble halfpi(M_PI / 2.0);
4400     const SimdDouble CC5(4.2163199048E-2);
4401     const SimdDouble CC4(2.4181311049E-2);
4402     const SimdDouble CC3(4.5470025998E-2);
4403     const SimdDouble CC2(7.4953002686E-2);
4404     const SimdDouble CC1(1.6666752422E-1);
4405     SimdDouble       xabs;
4406     SimdDouble       z, z1, z2, q, q1, q2;
4407     SimdDouble       pA, pB;
4408     SimdDBool        mask, mask2;
4409
4410     xabs  = abs(x);
4411     mask  = (half < xabs);
4412     z1    = half * (one - xabs);
4413     mask2 = (xabs < one);
4414     q1    = z1 * maskzInvsqrtSingleAccuracy(z1, mask2);
4415     q2    = xabs;
4416     z2    = q2 * q2;
4417     z     = blend(z2, z1, mask);
4418     q     = blend(q2, q1, mask);
4419
4420     z2 = z * z;
4421     pA = fma(CC5, z2, CC3);
4422     pB = fma(CC4, z2, CC2);
4423     pA = fma(pA, z2, CC1);
4424     pA = pA * z;
4425     z  = fma(pB, z2, pA);
4426     z  = fma(z, q, q);
4427     q2 = halfpi - z;
4428     q2 = q2 - z;
4429     z  = blend(z, q2, mask);
4430
4431     mask = (limitlow < xabs);
4432     z    = blend(xabs, z, mask);
4433     z    = copysign(z, x);
4434
4435     return z;
4436 }
4437
4438 /*! \brief SIMD acos(x). Double precision SIMD data, single accuracy.
4439  *
4440  * \param x The argument to evaluate acos for
4441  * \result Acos(x)
4442  */
4443 static inline SimdDouble gmx_simdcall acosSingleAccuracy(SimdDouble x)
4444 {
4445     const SimdDouble one(1.0);
4446     const SimdDouble half(0.5);
4447     const SimdDouble pi(M_PI);
4448     const SimdDouble halfpi(M_PI / 2.0);
4449     SimdDouble       xabs;
4450     SimdDouble       z, z1, z2, z3;
4451     SimdDBool        mask1, mask2, mask3;
4452
4453     xabs  = abs(x);
4454     mask1 = (half < xabs);
4455     mask2 = (setZero() < x);
4456
4457     z     = half * (one - xabs);
4458     mask3 = (xabs < one);
4459     z     = z * maskzInvsqrtSingleAccuracy(z, mask3);
4460     z     = blend(x, z, mask1);
4461     z     = asinSingleAccuracy(z);
4462
4463     z2 = z + z;
4464     z1 = pi - z2;
4465     z3 = halfpi - z;
4466     z  = blend(z1, z2, mask2);
4467     z  = blend(z3, z, mask1);
4468
4469     return z;
4470 }
4471
4472 /*! \brief SIMD asin(x). Double precision SIMD data, single accuracy.
4473  *
4474  * \param x The argument to evaluate atan for
4475  * \result Atan(x), same argument/value range as standard math library.
4476  */
4477 static inline SimdDouble gmx_simdcall atanSingleAccuracy(SimdDouble x)
4478 {
4479     const SimdDouble halfpi(M_PI / 2);
4480     const SimdDouble CA17(0.002823638962581753730774);
4481     const SimdDouble CA15(-0.01595690287649631500244);
4482     const SimdDouble CA13(0.04250498861074447631836);
4483     const SimdDouble CA11(-0.07489009201526641845703);
4484     const SimdDouble CA9(0.1063479334115982055664);
4485     const SimdDouble CA7(-0.1420273631811141967773);
4486     const SimdDouble CA5(0.1999269574880599975585);
4487     const SimdDouble CA3(-0.3333310186862945556640);
4488     SimdDouble       x2, x3, x4, pA, pB;
4489     SimdDBool        mask, mask2;
4490
4491     mask  = (x < setZero());
4492     x     = abs(x);
4493     mask2 = (SimdDouble(1.0) < x);
4494     x     = blend(x, maskzInvSingleAccuracy(x, mask2), mask2);
4495
4496     x2 = x * x;
4497     x3 = x2 * x;
4498     x4 = x2 * x2;
4499     pA = fma(CA17, x4, CA13);
4500     pB = fma(CA15, x4, CA11);
4501     pA = fma(pA, x4, CA9);
4502     pB = fma(pB, x4, CA7);
4503     pA = fma(pA, x4, CA5);
4504     pB = fma(pB, x4, CA3);
4505     pA = fma(pA, x2, pB);
4506     pA = fma(pA, x3, x);
4507
4508     pA = blend(pA, halfpi - pA, mask2);
4509     pA = blend(pA, -pA, mask);
4510
4511     return pA;
4512 }
4513
4514 /*! \brief SIMD atan2(y,x). Double precision SIMD data, single accuracy.
4515  *
4516  * \param y Y component of vector, any quartile
4517  * \param x X component of vector, any quartile
4518  * \result Atan(y,x), same argument/value range as standard math library.
4519  *
4520  * \note This routine should provide correct results for all finite
4521  * non-zero or positive-zero arguments. However, negative zero arguments will
4522  * be treated as positive zero, which means the return value will deviate from
4523  * the standard math library atan2(y,x) for those cases. That should not be
4524  * of any concern in Gromacs, and in particular it will not affect calculations
4525  * of angles from vectors.
4526  */
4527 static inline SimdDouble gmx_simdcall atan2SingleAccuracy(SimdDouble y, SimdDouble x)
4528 {
4529     const SimdDouble pi(M_PI);
4530     const SimdDouble halfpi(M_PI / 2.0);
4531     SimdDouble       xinv, p, aoffset;
4532     SimdDBool        mask_xnz, mask_ynz, mask_xlt0, mask_ylt0;
4533
4534     mask_xnz  = x != setZero();
4535     mask_ynz  = y != setZero();
4536     mask_xlt0 = (x < setZero());
4537     mask_ylt0 = (y < setZero());
4538
4539     aoffset = selectByNotMask(halfpi, mask_xnz);
4540     aoffset = selectByMask(aoffset, mask_ynz);
4541
4542     aoffset = blend(aoffset, pi, mask_xlt0);
4543     aoffset = blend(aoffset, -aoffset, mask_ylt0);
4544
4545     xinv = maskzInvSingleAccuracy(x, mask_xnz);
4546     p    = y * xinv;
4547     p    = atanSingleAccuracy(p);
4548     p    = p + aoffset;
4549
4550     return p;
4551 }
4552
4553 /*! \brief Analytical PME force correction, double SIMD data, single accuracy.
4554  *
4555  * \param z2 \f$(r \beta)^2\f$ - see below for details.
4556  * \result Correction factor to coulomb force - see below for details.
4557  *
4558  * This routine is meant to enable analytical evaluation of the
4559  * direct-space PME electrostatic force to avoid tables.
4560  *
4561  * The direct-space potential should be \f$ \mbox{erfc}(\beta r)/r\f$, but there
4562  * are some problems evaluating that:
4563  *
4564  * First, the error function is difficult (read: expensive) to
4565  * approxmiate accurately for intermediate to large arguments, and
4566  * this happens already in ranges of \f$(\beta r)\f$ that occur in simulations.
4567  * Second, we now try to avoid calculating potentials in Gromacs but
4568  * use forces directly.
4569  *
4570  * We can simply things slight by noting that the PME part is really
4571  * a correction to the normal Coulomb force since \f$\mbox{erfc}(z)=1-\mbox{erf}(z)\f$, i.e.
4572  * \f[
4573  * V = \frac{1}{r} - \frac{\mbox{erf}(\beta r)}{r}
4574  * \f]
4575  * The first term we already have from the inverse square root, so
4576  * that we can leave out of this routine.
4577  *
4578  * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
4579  * the argument \f$beta r\f$ will be in the range 0.15 to ~4. Use your
4580  * favorite plotting program to realize how well-behaved \f$\frac{\mbox{erf}(z)}{z}\f$ is
4581  * in this range!
4582  *
4583  * We approximate \f$f(z)=\mbox{erf}(z)/z\f$ with a rational minimax polynomial.
4584  * However, it turns out it is more efficient to approximate \f$f(z)/z\f$ and
4585  * then only use even powers. This is another minor optimization, since
4586  * we actually \a want \f$f(z)/z\f$, because it is going to be multiplied by
4587  * the vector between the two atoms to get the vectorial force. The
4588  * fastest flops are the ones we can avoid calculating!
4589  *
4590  * So, here's how it should be used:
4591  *
4592  * 1. Calculate \f$r^2\f$.
4593  * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=(\beta r)^2\f$.
4594  * 3. Evaluate this routine with \f$z^2\f$ as the argument.
4595  * 4. The return value is the expression:
4596  *
4597  * \f[
4598  *    \frac{2 \exp{-z^2}}{\sqrt{\pi} z^2}-\frac{\mbox{erf}(z)}{z^3}
4599  * \f]
4600  *
4601  * 5. Multiply the entire expression by \f$\beta^3\f$. This will get you
4602  *
4603  *  \f[
4604  *    \frac{2 \beta^3 \exp(-z^2)}{\sqrt{\pi} z^2} - \frac{\beta^3 \mbox{erf}(z)}{z^3}
4605  *  \f]
4606  *
4607  *    or, switching back to \f$r\f$ (since \f$z=r \beta\f$):
4608  *
4609  *  \f[
4610  *    \frac{2 \beta \exp(-r^2 \beta^2)}{\sqrt{\pi} r^2} - \frac{\mbox{erf}(r \beta)}{r^3}
4611  *  \f]
4612  *
4613  *    With a bit of math exercise you should be able to confirm that
4614  *    this is exactly
4615  *
4616  *  \f[
4617  *   \frac{\frac{d}{dr}\left( \frac{\mbox{erf}(\beta r)}{r} \right)}{r}
4618  *  \f]
4619  *
4620  * 6. Add the result to \f$r^{-3}\f$, multiply by the product of the charges,
4621  *    and you have your force (divided by \f$r\f$). A final multiplication
4622  *    with the vector connecting the two particles and you have your
4623  *    vectorial force to add to the particles.
4624  *
4625  * This approximation achieves an accuracy slightly lower than 1e-6; when
4626  * added to \f$1/r\f$ the error will be insignificant.
4627  *
4628  */
4629 static inline SimdDouble gmx_simdcall pmeForceCorrectionSingleAccuracy(SimdDouble z2)
4630 {
4631     const SimdDouble FN6(-1.7357322914161492954e-8);
4632     const SimdDouble FN5(1.4703624142580877519e-6);
4633     const SimdDouble FN4(-0.000053401640219807709149);
4634     const SimdDouble FN3(0.0010054721316683106153);
4635     const SimdDouble FN2(-0.019278317264888380590);
4636     const SimdDouble FN1(0.069670166153766424023);
4637     const SimdDouble FN0(-0.75225204789749321333);
4638
4639     const SimdDouble FD4(0.0011193462567257629232);
4640     const SimdDouble FD3(0.014866955030185295499);
4641     const SimdDouble FD2(0.11583842382862377919);
4642     const SimdDouble FD1(0.50736591960530292870);
4643     const SimdDouble FD0(1.0);
4644
4645     SimdDouble z4;
4646     SimdDouble polyFN0, polyFN1, polyFD0, polyFD1;
4647
4648     z4 = z2 * z2;
4649
4650     polyFD0 = fma(FD4, z4, FD2);
4651     polyFD1 = fma(FD3, z4, FD1);
4652     polyFD0 = fma(polyFD0, z4, FD0);
4653     polyFD0 = fma(polyFD1, z2, polyFD0);
4654
4655     polyFD0 = invSingleAccuracy(polyFD0);
4656
4657     polyFN0 = fma(FN6, z4, FN4);
4658     polyFN1 = fma(FN5, z4, FN3);
4659     polyFN0 = fma(polyFN0, z4, FN2);
4660     polyFN1 = fma(polyFN1, z4, FN1);
4661     polyFN0 = fma(polyFN0, z4, FN0);
4662     polyFN0 = fma(polyFN1, z2, polyFN0);
4663
4664     return polyFN0 * polyFD0;
4665 }
4666
4667
4668 /*! \brief Analytical PME potential correction, double SIMD data, single accuracy.
4669  *
4670  * \param z2 \f$(r \beta)^2\f$ - see below for details.
4671  * \result Correction factor to coulomb potential - see below for details.
4672  *
4673  * This routine calculates \f$\mbox{erf}(z)/z\f$, although you should provide \f$z^2\f$
4674  * as the input argument.
4675  *
4676  * Here's how it should be used:
4677  *
4678  * 1. Calculate \f$r^2\f$.
4679  * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=\beta^2*r^2\f$.
4680  * 3. Evaluate this routine with z^2 as the argument.
4681  * 4. The return value is the expression:
4682  *
4683  *  \f[
4684  *   \frac{\mbox{erf}(z)}{z}
4685  *  \f]
4686  *
4687  * 5. Multiply the entire expression by beta and switching back to \f$r\f$ (since \f$z=r \beta\f$):
4688  *
4689  *  \f[
4690  *    \frac{\mbox{erf}(r \beta)}{r}
4691  *  \f]
4692  *
4693  * 6. Subtract the result from \f$1/r\f$, multiply by the product of the charges,
4694  *    and you have your potential.
4695  *
4696  * This approximation achieves an accuracy slightly lower than 1e-6; when
4697  * added to \f$1/r\f$ the error will be insignificant.
4698  */
4699 static inline SimdDouble gmx_simdcall pmePotentialCorrectionSingleAccuracy(SimdDouble z2)
4700 {
4701     const SimdDouble VN6(1.9296833005951166339e-8);
4702     const SimdDouble VN5(-1.4213390571557850962e-6);
4703     const SimdDouble VN4(0.000041603292906656984871);
4704     const SimdDouble VN3(-0.00013134036773265025626);
4705     const SimdDouble VN2(0.038657983986041781264);
4706     const SimdDouble VN1(0.11285044772717598220);
4707     const SimdDouble VN0(1.1283802385263030286);
4708
4709     const SimdDouble VD3(0.0066752224023576045451);
4710     const SimdDouble VD2(0.078647795836373922256);
4711     const SimdDouble VD1(0.43336185284710920150);
4712     const SimdDouble VD0(1.0);
4713
4714     SimdDouble z4;
4715     SimdDouble polyVN0, polyVN1, polyVD0, polyVD1;
4716
4717     z4 = z2 * z2;
4718
4719     polyVD1 = fma(VD3, z4, VD1);
4720     polyVD0 = fma(VD2, z4, VD0);
4721     polyVD0 = fma(polyVD1, z2, polyVD0);
4722
4723     polyVD0 = invSingleAccuracy(polyVD0);
4724
4725     polyVN0 = fma(VN6, z4, VN4);
4726     polyVN1 = fma(VN5, z4, VN3);
4727     polyVN0 = fma(polyVN0, z4, VN2);
4728     polyVN1 = fma(polyVN1, z4, VN1);
4729     polyVN0 = fma(polyVN0, z4, VN0);
4730     polyVN0 = fma(polyVN1, z2, polyVN0);
4731
4732     return polyVN0 * polyVD0;
4733 }
4734
4735 #    endif
4736
4737
4738 /*! \name SIMD4 math functions
4739  *
4740  * \note Only a subset of the math functions are implemented for SIMD4.
4741  *  \{
4742  */
4743
4744
4745 #    if GMX_SIMD4_HAVE_FLOAT
4746
4747 /*************************************************************************
4748  * SINGLE PRECISION SIMD4 MATH FUNCTIONS - JUST A SMALL SUBSET SUPPORTED *
4749  *************************************************************************/
4750
4751 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD4 float.
4752  *
4753  * This is a low-level routine that should only be used by SIMD math routine
4754  * that evaluates the inverse square root.
4755  *
4756  *  \param lu Approximation of 1/sqrt(x), typically obtained from lookup.
4757  *  \param x  The reference (starting) value x for which we want 1/sqrt(x).
4758  *  \return   An improved approximation with roughly twice as many bits of accuracy.
4759  */
4760 static inline Simd4Float gmx_simdcall rsqrtIter(Simd4Float lu, Simd4Float x)
4761 {
4762     Simd4Float tmp1 = x * lu;
4763     Simd4Float tmp2 = Simd4Float(-0.5F) * lu;
4764     tmp1            = fma(tmp1, lu, Simd4Float(-3.0F));
4765     return tmp1 * tmp2;
4766 }
4767
4768 /*! \brief Calculate 1/sqrt(x) for SIMD4 float.
4769  *
4770  *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
4771  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
4772  *           For the single precision implementation this is obviously always
4773  *           true for positive values, but for double precision it adds an
4774  *           extra restriction since the first lookup step might have to be
4775  *           performed in single precision on some architectures. Note that the
4776  *           responsibility for checking falls on you - this routine does not
4777  *           check arguments.
4778  *  \return  1/sqrt(x). Result is undefined if your argument was invalid.
4779  */
4780 static inline Simd4Float gmx_simdcall invsqrt(Simd4Float x)
4781 {
4782     Simd4Float lu = rsqrt(x);
4783 #        if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
4784     lu = rsqrtIter(lu, x);
4785 #        endif
4786 #        if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
4787     lu = rsqrtIter(lu, x);
4788 #        endif
4789 #        if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
4790     lu = rsqrtIter(lu, x);
4791 #        endif
4792     return lu;
4793 }
4794
4795
4796 #    endif // GMX_SIMD4_HAVE_FLOAT
4797
4798
4799 #    if GMX_SIMD4_HAVE_DOUBLE
4800 /*************************************************************************
4801  * DOUBLE PRECISION SIMD4 MATH FUNCTIONS - JUST A SMALL SUBSET SUPPORTED *
4802  *************************************************************************/
4803
4804 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD4 double.
4805  *
4806  * This is a low-level routine that should only be used by SIMD math routine
4807  * that evaluates the inverse square root.
4808  *
4809  *  \param lu Approximation of 1/sqrt(x), typically obtained from lookup.
4810  *  \param x  The reference (starting) value x for which we want 1/sqrt(x).
4811  *  \return   An improved approximation with roughly twice as many bits of accuracy.
4812  */
4813 static inline Simd4Double gmx_simdcall rsqrtIter(Simd4Double lu, Simd4Double x)
4814 {
4815     Simd4Double tmp1 = x * lu;
4816     Simd4Double tmp2 = Simd4Double(-0.5F) * lu;
4817     tmp1             = fma(tmp1, lu, Simd4Double(-3.0F));
4818     return tmp1 * tmp2;
4819 }
4820
4821 /*! \brief Calculate 1/sqrt(x) for SIMD4 double.
4822  *
4823  *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
4824  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
4825  *           For the single precision implementation this is obviously always
4826  *           true for positive values, but for double precision it adds an
4827  *           extra restriction since the first lookup step might have to be
4828  *           performed in single precision on some architectures. Note that the
4829  *           responsibility for checking falls on you - this routine does not
4830  *           check arguments.
4831  *  \return  1/sqrt(x). Result is undefined if your argument was invalid.
4832  */
4833 static inline Simd4Double gmx_simdcall invsqrt(Simd4Double x)
4834 {
4835     Simd4Double lu = rsqrt(x);
4836 #        if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
4837     lu = rsqrtIter(lu, x);
4838 #        endif
4839 #        if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
4840     lu = rsqrtIter(lu, x);
4841 #        endif
4842 #        if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
4843     lu = rsqrtIter(lu, x);
4844 #        endif
4845 #        if (GMX_SIMD_RSQRT_BITS * 8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
4846     lu = rsqrtIter(lu, x);
4847 #        endif
4848     return lu;
4849 }
4850
4851
4852 /**********************************************************************
4853  * SIMD4 MATH FUNCTIONS WITH DOUBLE PREC. DATA, SINGLE PREC. ACCURACY *
4854  **********************************************************************/
4855
4856 /*! \brief Calculate 1/sqrt(x) for SIMD4 double, but in single accuracy.
4857  *
4858  *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
4859  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
4860  *           For the single precision implementation this is obviously always
4861  *           true for positive values, but for double precision it adds an
4862  *           extra restriction since the first lookup step might have to be
4863  *           performed in single precision on some architectures. Note that the
4864  *           responsibility for checking falls on you - this routine does not
4865  *           check arguments.
4866  *  \return  1/sqrt(x). Result is undefined if your argument was invalid.
4867  */
4868 static inline Simd4Double gmx_simdcall invsqrtSingleAccuracy(Simd4Double x)
4869 {
4870     Simd4Double lu = rsqrt(x);
4871 #        if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
4872     lu = rsqrtIter(lu, x);
4873 #        endif
4874 #        if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
4875     lu = rsqrtIter(lu, x);
4876 #        endif
4877 #        if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
4878     lu = rsqrtIter(lu, x);
4879 #        endif
4880     return lu;
4881 }
4882
4883
4884 #    endif // GMX_SIMD4_HAVE_DOUBLE
4885
4886 /*! \} */
4887
4888 #    if GMX_SIMD_HAVE_FLOAT
4889 /*! \brief Calculate 1/sqrt(x) for SIMD float, only targeting single accuracy.
4890  *
4891  *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
4892  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
4893  *           For the single precision implementation this is obviously always
4894  *           true for positive values, but for double precision it adds an
4895  *           extra restriction since the first lookup step might have to be
4896  *           performed in single precision on some architectures. Note that the
4897  *           responsibility for checking falls on you - this routine does not
4898  *           check arguments.
4899  *  \return  1/sqrt(x). Result is undefined if your argument was invalid.
4900  */
4901 static inline SimdFloat gmx_simdcall invsqrtSingleAccuracy(SimdFloat x)
4902 {
4903     return invsqrt(x);
4904 }
4905
4906 /*! \brief Calculate 1/sqrt(x) for masked SIMD floats, only targeting single accuracy.
4907  *
4908  *  This routine only evaluates 1/sqrt(x) for elements for which mask is true.
4909  *  Illegal values in the masked-out elements will not lead to
4910  *  floating-point exceptions.
4911  *
4912  *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
4913  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
4914  *           For the single precision implementation this is obviously always
4915  *           true for positive values, but for double precision it adds an
4916  *           extra restriction since the first lookup step might have to be
4917  *           performed in single precision on some architectures. Note that the
4918  *           responsibility for checking falls on you - this routine does not
4919  *           check arguments.
4920  *  \param m Mask
4921  *  \return  1/sqrt(x). Result is undefined if your argument was invalid or
4922  *           entry was not masked, and 0.0 for masked-out entries.
4923  */
4924 static inline SimdFloat maskzInvsqrtSingleAccuracy(SimdFloat x, SimdFBool m)
4925 {
4926     return maskzInvsqrt(x, m);
4927 }
4928
4929 /*! \brief Calculate 1/sqrt(x) for two SIMD floats, only targeting single accuracy.
4930  *
4931  * \param x0  First set of arguments, x0 must be in single range (see below).
4932  * \param x1  Second set of arguments, x1 must be in single range (see below).
4933  * \param[out] out0  Result 1/sqrt(x0)
4934  * \param[out] out1  Result 1/sqrt(x1)
4935  *
4936  *  In particular for double precision we can sometimes calculate square root
4937  *  pairs slightly faster by using single precision until the very last step.
4938  *
4939  * \note Both arguments must be larger than GMX_FLOAT_MIN and smaller than
4940  *       GMX_FLOAT_MAX, i.e. within the range of single precision.
4941  *       For the single precision implementation this is obviously always
4942  *       true for positive values, but for double precision it adds an
4943  *       extra restriction since the first lookup step might have to be
4944  *       performed in single precision on some architectures. Note that the
4945  *       responsibility for checking falls on you - this routine does not
4946  *       check arguments.
4947  */
4948 static inline void gmx_simdcall invsqrtPairSingleAccuracy(SimdFloat x0, SimdFloat x1, SimdFloat* out0, SimdFloat* out1)
4949 {
4950     return invsqrtPair(x0, x1, out0, out1);
4951 }
4952
4953 /*! \brief Calculate 1/x for SIMD float, only targeting single accuracy.
4954  *
4955  *  \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
4956  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
4957  *           For the single precision implementation this is obviously always
4958  *           true for positive values, but for double precision it adds an
4959  *           extra restriction since the first lookup step might have to be
4960  *           performed in single precision on some architectures. Note that the
4961  *           responsibility for checking falls on you - this routine does not
4962  *           check arguments.
4963  *  \return  1/x. Result is undefined if your argument was invalid.
4964  */
4965 static inline SimdFloat gmx_simdcall invSingleAccuracy(SimdFloat x)
4966 {
4967     return inv(x);
4968 }
4969
4970
4971 /*! \brief Calculate 1/x for masked SIMD floats, only targeting single accuracy.
4972  *
4973  *  \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
4974  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
4975  *           For the single precision implementation this is obviously always
4976  *           true for positive values, but for double precision it adds an
4977  *           extra restriction since the first lookup step might have to be
4978  *           performed in single precision on some architectures. Note that the
4979  *           responsibility for checking falls on you - this routine does not
4980  *           check arguments.
4981  *  \param m Mask
4982  *  \return  1/x for elements where m is true, or 0.0 for masked-out entries.
4983  */
4984 static inline SimdFloat maskzInvSingleAccuracy(SimdFloat x, SimdFBool m)
4985 {
4986     return maskzInv(x, m);
4987 }
4988
4989 /*! \brief Calculate sqrt(x) for SIMD float, always targeting single accuracy.
4990  *
4991  * \copydetails sqrt(SimdFloat)
4992  */
4993 template<MathOptimization opt = MathOptimization::Safe>
4994 static inline SimdFloat gmx_simdcall sqrtSingleAccuracy(SimdFloat x)
4995 {
4996     return sqrt<opt>(x);
4997 }
4998
4999 /*! \brief Calculate cbrt(x) for SIMD float, always targeting single accuracy.
5000  *
5001  * \copydetails cbrt(SimdFloat)
5002  */
5003 static inline SimdFloat gmx_simdcall cbrtSingleAccuracy(SimdFloat x)
5004 {
5005     return cbrt(x);
5006 }
5007
5008 /*! \brief Calculate 1/cbrt(x) for SIMD float, always targeting single accuracy.
5009  *
5010  * \copydetails cbrt(SimdFloat)
5011  */
5012 static inline SimdFloat gmx_simdcall invcbrtSingleAccuracy(SimdFloat x)
5013 {
5014     return invcbrt(x);
5015 }
5016
5017 /*! \brief SIMD float log2(x), only targeting single accuracy. This is the base-2 logarithm.
5018  *
5019  * \param x Argument, should be >0.
5020  * \result The base-2 logarithm of x. Undefined if argument is invalid.
5021  */
5022 static inline SimdFloat gmx_simdcall log2SingleAccuracy(SimdFloat x)
5023 {
5024     return log2(x);
5025 }
5026
5027 /*! \brief SIMD float log(x), only targeting single accuracy. This is the natural logarithm.
5028  *
5029  * \param x Argument, should be >0.
5030  * \result The natural logarithm of x. Undefined if argument is invalid.
5031  */
5032 static inline SimdFloat gmx_simdcall logSingleAccuracy(SimdFloat x)
5033 {
5034     return log(x);
5035 }
5036
5037 /*! \brief SIMD float 2^x, only targeting single accuracy.
5038  *
5039  * \copydetails exp2(SimdFloat)
5040  */
5041 template<MathOptimization opt = MathOptimization::Safe>
5042 static inline SimdFloat gmx_simdcall exp2SingleAccuracy(SimdFloat x)
5043 {
5044     return exp2<opt>(x);
5045 }
5046
5047 /*! \brief SIMD float e^x, only targeting single accuracy.
5048  *
5049  * \copydetails exp(SimdFloat)
5050  */
5051 template<MathOptimization opt = MathOptimization::Safe>
5052 static inline SimdFloat gmx_simdcall expSingleAccuracy(SimdFloat x)
5053 {
5054     return exp<opt>(x);
5055 }
5056
5057 /*! \brief SIMD pow(x,y), only targeting single accuracy.
5058  *
5059  * \copydetails pow(SimdFloat,SimdFloat)
5060  */
5061 template<MathOptimization opt = MathOptimization::Safe>
5062 static inline SimdFloat gmx_simdcall powSingleAccuracy(SimdFloat x, SimdFloat y)
5063 {
5064     return pow<opt>(x, y);
5065 }
5066
5067 /*! \brief SIMD float erf(x), only targeting single accuracy.
5068  *
5069  * \param x The value to calculate erf(x) for.
5070  * \result erf(x)
5071  *
5072  * This routine achieves very close to single precision, but we do not care about
5073  * the last bit or the subnormal result range.
5074  */
5075 static inline SimdFloat gmx_simdcall erfSingleAccuracy(SimdFloat x)
5076 {
5077     return erf(x);
5078 }
5079
5080 /*! \brief SIMD float erfc(x), only targeting single accuracy.
5081  *
5082  * \param x The value to calculate erfc(x) for.
5083  * \result erfc(x)
5084  *
5085  * This routine achieves singleprecision (bar the last bit) over most of the
5086  * input range, but for large arguments where the result is getting close
5087  * to the minimum representable numbers we accept slightly larger errors
5088  * (think results that are in the ballpark of 10^-30) since that is not
5089  * relevant for MD.
5090  */
5091 static inline SimdFloat gmx_simdcall erfcSingleAccuracy(SimdFloat x)
5092 {
5093     return erfc(x);
5094 }
5095
5096 /*! \brief SIMD float sin \& cos, only targeting single accuracy.
5097  *
5098  * \param x The argument to evaluate sin/cos for
5099  * \param[out] sinval Sin(x)
5100  * \param[out] cosval Cos(x)
5101  */
5102 static inline void gmx_simdcall sinCosSingleAccuracy(SimdFloat x, SimdFloat* sinval, SimdFloat* cosval)
5103 {
5104     sincos(x, sinval, cosval);
5105 }
5106
5107 /*! \brief SIMD float sin(x), only targeting single accuracy.
5108  *
5109  * \param x The argument to evaluate sin for
5110  * \result Sin(x)
5111  *
5112  * \attention Do NOT call both sin & cos if you need both results, since each of them
5113  * will then call \ref sincos and waste a factor 2 in performance.
5114  */
5115 static inline SimdFloat gmx_simdcall sinSingleAccuracy(SimdFloat x)
5116 {
5117     return sin(x);
5118 }
5119
5120 /*! \brief SIMD float cos(x), only targeting single accuracy.
5121  *
5122  * \param x The argument to evaluate cos for
5123  * \result Cos(x)
5124  *
5125  * \attention Do NOT call both sin & cos if you need both results, since each of them
5126  * will then call \ref sincos and waste a factor 2 in performance.
5127  */
5128 static inline SimdFloat gmx_simdcall cosSingleAccuracy(SimdFloat x)
5129 {
5130     return cos(x);
5131 }
5132
5133 /*! \brief SIMD float tan(x), only targeting single accuracy.
5134  *
5135  * \param x The argument to evaluate tan for
5136  * \result Tan(x)
5137  */
5138 static inline SimdFloat gmx_simdcall tanSingleAccuracy(SimdFloat x)
5139 {
5140     return tan(x);
5141 }
5142
5143 /*! \brief SIMD float asin(x), only targeting single accuracy.
5144  *
5145  * \param x The argument to evaluate asin for
5146  * \result Asin(x)
5147  */
5148 static inline SimdFloat gmx_simdcall asinSingleAccuracy(SimdFloat x)
5149 {
5150     return asin(x);
5151 }
5152
5153 /*! \brief SIMD float acos(x), only targeting single accuracy.
5154  *
5155  * \param x The argument to evaluate acos for
5156  * \result Acos(x)
5157  */
5158 static inline SimdFloat gmx_simdcall acosSingleAccuracy(SimdFloat x)
5159 {
5160     return acos(x);
5161 }
5162
5163 /*! \brief SIMD float atan(x), only targeting single accuracy.
5164  *
5165  * \param x The argument to evaluate atan for
5166  * \result Atan(x), same argument/value range as standard math library.
5167  */
5168 static inline SimdFloat gmx_simdcall atanSingleAccuracy(SimdFloat x)
5169 {
5170     return atan(x);
5171 }
5172
5173 /*! \brief SIMD float atan2(y,x), only targeting single accuracy.
5174  *
5175  * \param y Y component of vector, any quartile
5176  * \param x X component of vector, any quartile
5177  * \result Atan(y,x), same argument/value range as standard math library.
5178  *
5179  * \note This routine should provide correct results for all finite
5180  * non-zero or positive-zero arguments. However, negative zero arguments will
5181  * be treated as positive zero, which means the return value will deviate from
5182  * the standard math library atan2(y,x) for those cases. That should not be
5183  * of any concern in Gromacs, and in particular it will not affect calculations
5184  * of angles from vectors.
5185  */
5186 static inline SimdFloat gmx_simdcall atan2SingleAccuracy(SimdFloat y, SimdFloat x)
5187 {
5188     return atan2(y, x);
5189 }
5190
5191 /*! \brief SIMD Analytic PME force correction, only targeting single accuracy.
5192  *
5193  * \param z2 \f$(r \beta)^2\f$ - see default single precision version for details.
5194  * \result Correction factor to coulomb force.
5195  */
5196 static inline SimdFloat gmx_simdcall pmeForceCorrectionSingleAccuracy(SimdFloat z2)
5197 {
5198     return pmeForceCorrection(z2);
5199 }
5200
5201 /*! \brief SIMD Analytic PME potential correction, only targeting single accuracy.
5202  *
5203  * \param z2 \f$(r \beta)^2\f$ - see default single precision version for details.
5204  * \result Correction factor to coulomb force.
5205  */
5206 static inline SimdFloat gmx_simdcall pmePotentialCorrectionSingleAccuracy(SimdFloat z2)
5207 {
5208     return pmePotentialCorrection(z2);
5209 }
5210 #    endif // GMX_SIMD_HAVE_FLOAT
5211
5212 #    if GMX_SIMD4_HAVE_FLOAT
5213 /*! \brief Calculate 1/sqrt(x) for SIMD4 float, only targeting single accuracy.
5214  *
5215  *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
5216  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
5217  *           For the single precision implementation this is obviously always
5218  *           true for positive values, but for double precision it adds an
5219  *           extra restriction since the first lookup step might have to be
5220  *           performed in single precision on some architectures. Note that the
5221  *           responsibility for checking falls on you - this routine does not
5222  *           check arguments.
5223  *  \return 1/sqrt(x). Result is undefined if your argument was invalid.
5224  */
5225 static inline Simd4Float gmx_simdcall invsqrtSingleAccuracy(Simd4Float x)
5226 {
5227     return invsqrt(x);
5228 }
5229 #    endif // GMX_SIMD4_HAVE_FLOAT
5230
5231 /*! \}   end of addtogroup module_simd */
5232 /*! \endcond  end of condition libabl */
5233
5234 #endif // GMX_SIMD
5235
5236 } // namespace gmx
5237
5238 #endif // GMX_SIMD_SIMD_MATH_H