2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
35 #ifndef GMX_SIMD_SIMD_MATH_H_
36 #define GMX_SIMD_SIMD_MATH_H_
38 /*! \libinternal \file
40 * \brief Math functions for SIMD datatypes.
42 * \attention This file is generic for all SIMD architectures, so you cannot
43 * assume that any of the optional SIMD features (as defined in simd.h) are
44 * present. In particular, this means you cannot assume support for integers,
45 * logical operations (neither on floating-point nor integer values), shifts,
46 * and the architecture might only have SIMD for either float or double.
47 * Second, to keep this file clean and general, any additions to this file
48 * must work for all possible SIMD architectures in both single and double
49 * precision (if they support it), and you cannot make any assumptions about
52 * \author Erik Lindahl <erik.lindahl@scilifelab.se>
55 * \ingroup module_simd
62 #include "gromacs/math/utilities.h"
63 #include "gromacs/simd/simd.h"
66 /*! \addtogroup module_simd */
69 /*! \name Implementation accuracy settings
75 #ifdef GMX_SIMD_HAVE_FLOAT
77 /*! \name Single precision SIMD math functions
79 * \note In most cases you should use the real-precision functions instead.
83 /****************************************
84 * SINGLE PRECISION SIMD MATH FUNCTIONS *
85 ****************************************/
87 /*! \brief SIMD float utility to sum a+b+c+d.
89 * You should normally call the real-precision routine \ref gmx_simd_sum4_r.
91 * \param a term 1 (multiple values)
92 * \param b term 2 (multiple values)
93 * \param c term 3 (multiple values)
94 * \param d term 4 (multiple values)
95 * \return sum of terms 1-4 (multiple values)
97 static gmx_inline gmx_simd_float_t gmx_simdcall
98 gmx_simd_sum4_f(gmx_simd_float_t a
, gmx_simd_float_t b
,
99 gmx_simd_float_t c
, gmx_simd_float_t d
)
101 return gmx_simd_add_f(gmx_simd_add_f(a
, b
), gmx_simd_add_f(c
, d
));
104 /*! \brief Return -a if b is negative, SIMD float.
106 * You should normally call the real-precision routine \ref gmx_simd_xor_sign_r.
108 * \param a Values to set sign for
109 * \param b Values used to set sign
110 * \return if b is negative, the sign of a will be changed.
112 * This is equivalent to doing an xor operation on a with the sign bit of b,
113 * with the exception that negative zero is not considered to be negative
114 * on architectures where \ref GMX_SIMD_HAVE_LOGICAL is not set.
116 static gmx_inline gmx_simd_float_t gmx_simdcall
117 gmx_simd_xor_sign_f(gmx_simd_float_t a
, gmx_simd_float_t b
)
119 #ifdef GMX_SIMD_HAVE_LOGICAL
120 return gmx_simd_xor_f(a
, gmx_simd_and_f(gmx_simd_set1_f(GMX_FLOAT_NEGZERO
), b
));
122 return gmx_simd_blendv_f(a
, gmx_simd_fneg_f(a
), gmx_simd_cmplt_f(b
, gmx_simd_setzero_f()));
126 #ifndef gmx_simd_rsqrt_iter_f
127 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD float.
129 * This is a low-level routine that should only be used by SIMD math routine
130 * that evaluates the inverse square root.
132 * \param lu Approximation of 1/sqrt(x), typically obtained from lookup.
133 * \param x The reference (starting) value x for which we want 1/sqrt(x).
134 * \return An improved approximation with roughly twice as many bits of accuracy.
136 static gmx_inline gmx_simd_float_t gmx_simdcall
137 gmx_simd_rsqrt_iter_f(gmx_simd_float_t lu
, gmx_simd_float_t x
)
139 # ifdef GMX_SIMD_HAVE_FMA
140 return gmx_simd_fmadd_f(gmx_simd_fnmadd_f(x
, gmx_simd_mul_f(lu
, lu
), gmx_simd_set1_f(1.0f
)), gmx_simd_mul_f(lu
, gmx_simd_set1_f(0.5f
)), lu
);
142 return gmx_simd_mul_f(gmx_simd_set1_f(0.5f
), gmx_simd_mul_f(gmx_simd_sub_f(gmx_simd_set1_f(3.0f
), gmx_simd_mul_f(gmx_simd_mul_f(lu
, lu
), x
)), lu
));
147 /*! \brief Calculate 1/sqrt(x) for SIMD float.
149 * You should normally call the real-precision routine \ref gmx_simd_invsqrt_r.
151 * \param x Argument that must be >0. This routine does not check arguments.
152 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
154 static gmx_inline gmx_simd_float_t gmx_simdcall
155 gmx_simd_invsqrt_f(gmx_simd_float_t x
)
157 gmx_simd_float_t lu
= gmx_simd_rsqrt_f(x
);
158 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
159 lu
= gmx_simd_rsqrt_iter_f(lu
, x
);
161 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
162 lu
= gmx_simd_rsqrt_iter_f(lu
, x
);
164 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
165 lu
= gmx_simd_rsqrt_iter_f(lu
, x
);
170 /*! \brief Calculate 1/sqrt(x) for masked entries of SIMD float.
172 * Identical to gmx_simd_invsqrt_f but avoids fp-exception for non-masked entries.
173 * The result for the non-masked entries is undefined and the user has to use blend
174 * with the same mask to obtain a defined result.
176 * \param x Argument that must be >0 for masked entries
177 * \param m Masked entries
178 * \return 1/sqrt(x). Result is undefined if your argument was invalid or entry was not masked.
180 static gmx_inline gmx_simd_float_t
181 gmx_simd_invsqrt_maskfpe_f(gmx_simd_float_t x
, gmx_simd_fbool_t gmx_unused m
)
184 return gmx_simd_invsqrt_f(x
);
186 return gmx_simd_invsqrt_f(gmx_simd_blendv_f(gmx_simd_set1_f(1.0f
), x
, m
));
190 /*! \brief Calculate 1/sqrt(x) for non-masked entries of SIMD float.
192 * Identical to gmx_simd_invsqrt_f but avoids fp-exception for masked entries.
193 * The result for the non-masked entries is undefined and the user has to use blend
194 * with the same mask to obtain a defined result.
196 * \param x Argument that must be >0 for non-masked entries
197 * \param m Masked entries
198 * \return 1/sqrt(x). Result is undefined if your argument was invalid or entry was masked.
200 static gmx_inline gmx_simd_float_t
201 gmx_simd_invsqrt_notmaskfpe_f(gmx_simd_float_t x
, gmx_simd_fbool_t gmx_unused m
)
204 return gmx_simd_invsqrt_f(x
);
206 return gmx_simd_invsqrt_f(gmx_simd_blendv_f(x
, gmx_simd_set1_f(1.0f
), m
));
210 /*! \brief Calculate 1/sqrt(x) for two SIMD floats.
212 * You should normally call the real-precision routine \ref gmx_simd_invsqrt_pair_r.
214 * \param x0 First set of arguments, x0 must be positive - no argument checking.
215 * \param x1 Second set of arguments, x1 must be positive - no argument checking.
216 * \param[out] out0 Result 1/sqrt(x0)
217 * \param[out] out1 Result 1/sqrt(x1)
219 * In particular for double precision we can sometimes calculate square root
220 * pairs slightly faster by using single precision until the very last step.
222 static gmx_inline
void gmx_simdcall
223 gmx_simd_invsqrt_pair_f(gmx_simd_float_t x0
, gmx_simd_float_t x1
,
224 gmx_simd_float_t
*out0
, gmx_simd_float_t
*out1
)
226 *out0
= gmx_simd_invsqrt_f(x0
);
227 *out1
= gmx_simd_invsqrt_f(x1
);
230 #ifndef gmx_simd_rcp_iter_f
231 /*! \brief Perform one Newton-Raphson iteration to improve 1/x for SIMD float.
233 * This is a low-level routine that should only be used by SIMD math routine
234 * that evaluates the reciprocal.
236 * \param lu Approximation of 1/x, typically obtained from lookup.
237 * \param x The reference (starting) value x for which we want 1/x.
238 * \return An improved approximation with roughly twice as many bits of accuracy.
240 static gmx_inline gmx_simd_float_t gmx_simdcall
241 gmx_simd_rcp_iter_f(gmx_simd_float_t lu
, gmx_simd_float_t x
)
243 return gmx_simd_mul_f(lu
, gmx_simd_fnmadd_f(lu
, x
, gmx_simd_set1_f(2.0f
)));
247 /*! \brief Calculate 1/x for SIMD float.
249 * You should normally call the real-precision routine \ref gmx_simd_inv_r.
251 * \param x Argument that must be nonzero. This routine does not check arguments.
252 * \return 1/x. Result is undefined if your argument was invalid.
254 static gmx_inline gmx_simd_float_t gmx_simdcall
255 gmx_simd_inv_f(gmx_simd_float_t x
)
257 gmx_simd_float_t lu
= gmx_simd_rcp_f(x
);
258 #if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
259 lu
= gmx_simd_rcp_iter_f(lu
, x
);
261 #if (GMX_SIMD_RCP_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
262 lu
= gmx_simd_rcp_iter_f(lu
, x
);
264 #if (GMX_SIMD_RCP_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
265 lu
= gmx_simd_rcp_iter_f(lu
, x
);
270 /*! \brief Calculate 1/x for masked entries of SIMD float.
272 * Identical to gmx_simd_inv_f but avoids fp-exception for non-masked entries.
273 * The result for the non-masked entries is undefined and the user has to use blend
274 * with the same mask to obtain a defined result.
276 * \param x Argument that must be nonzero for masked entries
277 * \param m Masked entries
278 * \return 1/x. Result is undefined if your argument was invalid or entry was not masked.
280 static gmx_inline gmx_simd_float_t
281 gmx_simd_inv_maskfpe_f(gmx_simd_float_t x
, gmx_simd_fbool_t gmx_unused m
)
284 return gmx_simd_inv_f(x
);
286 return gmx_simd_inv_f(gmx_simd_blendv_f(gmx_simd_set1_f(1.0f
), x
, m
));
291 /*! \brief Calculate 1/x for non-masked entries of SIMD float.
293 * Identical to gmx_simd_inv_f but avoids fp-exception for masked entries.
294 * The result for the non-masked entries is undefined and the user has to use blend
295 * with the same mask to obtain a defined result.
297 * \param x Argument that must be nonzero for non-masked entries
298 * \param m Masked entries
299 * \return 1/x. Result is undefined if your argument was invalid or entry was masked.
301 static gmx_inline gmx_simd_float_t
302 gmx_simd_inv_notmaskfpe_f(gmx_simd_float_t x
, gmx_simd_fbool_t gmx_unused m
)
305 return gmx_simd_inv_f(x
);
307 return gmx_simd_inv_f(gmx_simd_blendv_f(x
, gmx_simd_set1_f(1.0f
), m
));
311 /*! \brief Calculate sqrt(x) correctly for SIMD floats, including argument 0.0.
313 * You should normally call the real-precision routine \ref gmx_simd_sqrt_r.
315 * \param x Argument that must be >=0.
316 * \return sqrt(x). If x=0, the result will correctly be set to 0.
317 * The result is undefined if the input value is negative.
319 static gmx_inline gmx_simd_float_t gmx_simdcall
320 gmx_simd_sqrt_f(gmx_simd_float_t x
)
322 gmx_simd_fbool_t mask
;
323 gmx_simd_float_t res
;
325 mask
= gmx_simd_cmpeq_f(x
, gmx_simd_setzero_f());
326 res
= gmx_simd_blendnotzero_f(gmx_simd_invsqrt_notmaskfpe_f(x
, mask
), mask
);
327 return gmx_simd_mul_f(res
, x
);
330 /*! \brief SIMD float log(x). This is the natural logarithm.
332 * You should normally call the real-precision routine \ref gmx_simd_log_r.
334 * \param x Argument, should be >0.
335 * \result The natural logarithm of x. Undefined if argument is invalid.
337 #ifndef gmx_simd_log_f
338 static gmx_inline gmx_simd_float_t gmx_simdcall
339 gmx_simd_log_f(gmx_simd_float_t x
)
341 const gmx_simd_float_t half
= gmx_simd_set1_f(0.5f
);
342 const gmx_simd_float_t one
= gmx_simd_set1_f(1.0f
);
343 const gmx_simd_float_t sqrt2
= gmx_simd_set1_f(sqrt(2.0f
));
344 const gmx_simd_float_t corr
= gmx_simd_set1_f(0.693147180559945286226764f
);
345 const gmx_simd_float_t CL9
= gmx_simd_set1_f(0.2371599674224853515625f
);
346 const gmx_simd_float_t CL7
= gmx_simd_set1_f(0.285279005765914916992188f
);
347 const gmx_simd_float_t CL5
= gmx_simd_set1_f(0.400005519390106201171875f
);
348 const gmx_simd_float_t CL3
= gmx_simd_set1_f(0.666666567325592041015625f
);
349 const gmx_simd_float_t CL1
= gmx_simd_set1_f(2.0f
);
350 gmx_simd_float_t fexp
, x2
, p
;
351 gmx_simd_fbool_t mask
;
353 fexp
= gmx_simd_get_exponent_f(x
);
354 x
= gmx_simd_get_mantissa_f(x
);
356 mask
= gmx_simd_cmplt_f(sqrt2
, x
);
357 /* Adjust to non-IEEE format for x>sqrt(2): exponent += 1, mantissa *= 0.5 */
358 fexp
= gmx_simd_add_f(fexp
, gmx_simd_blendzero_f(one
, mask
));
359 x
= gmx_simd_mul_f(x
, gmx_simd_blendv_f(one
, half
, mask
));
361 x
= gmx_simd_mul_f( gmx_simd_sub_f(x
, one
), gmx_simd_inv_f( gmx_simd_add_f(x
, one
) ) );
362 x2
= gmx_simd_mul_f(x
, x
);
364 p
= gmx_simd_fmadd_f(CL9
, x2
, CL7
);
365 p
= gmx_simd_fmadd_f(p
, x2
, CL5
);
366 p
= gmx_simd_fmadd_f(p
, x2
, CL3
);
367 p
= gmx_simd_fmadd_f(p
, x2
, CL1
);
368 p
= gmx_simd_fmadd_f(p
, x
, gmx_simd_mul_f(corr
, fexp
));
374 #ifndef gmx_simd_exp2_f
375 /*! \brief SIMD float 2^x.
377 * You should normally call the real-precision routine \ref gmx_simd_exp2_r.
380 * \result 2^x. Undefined if input argument caused overflow.
382 static gmx_inline gmx_simd_float_t gmx_simdcall
383 gmx_simd_exp2_f(gmx_simd_float_t x
)
385 /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
386 const gmx_simd_float_t arglimit
= gmx_simd_set1_f(126.0f
);
387 const gmx_simd_float_t CC6
= gmx_simd_set1_f(0.0001534581200287996416911311);
388 const gmx_simd_float_t CC5
= gmx_simd_set1_f(0.001339993121934088894618990);
389 const gmx_simd_float_t CC4
= gmx_simd_set1_f(0.009618488957115180159497841);
390 const gmx_simd_float_t CC3
= gmx_simd_set1_f(0.05550328776964726865751735);
391 const gmx_simd_float_t CC2
= gmx_simd_set1_f(0.2402264689063408646490722);
392 const gmx_simd_float_t CC1
= gmx_simd_set1_f(0.6931472057372680777553816);
393 const gmx_simd_float_t one
= gmx_simd_set1_f(1.0f
);
395 gmx_simd_float_t fexppart
;
396 gmx_simd_float_t intpart
;
398 gmx_simd_fbool_t valuemask
;
400 fexppart
= gmx_simd_set_exponent_f(x
);
401 intpart
= gmx_simd_round_f(x
);
402 valuemask
= gmx_simd_cmple_f(gmx_simd_fabs_f(x
), arglimit
);
403 fexppart
= gmx_simd_blendzero_f(fexppart
, valuemask
);
404 x
= gmx_simd_sub_f(x
, intpart
);
406 p
= gmx_simd_fmadd_f(CC6
, x
, CC5
);
407 p
= gmx_simd_fmadd_f(p
, x
, CC4
);
408 p
= gmx_simd_fmadd_f(p
, x
, CC3
);
409 p
= gmx_simd_fmadd_f(p
, x
, CC2
);
410 p
= gmx_simd_fmadd_f(p
, x
, CC1
);
411 p
= gmx_simd_fmadd_f(p
, x
, one
);
412 x
= gmx_simd_mul_f(p
, fexppart
);
417 #ifndef gmx_simd_exp_f
418 /*! \brief SIMD float exp(x).
420 * You should normally call the real-precision routine \ref gmx_simd_exp_r.
422 * In addition to scaling the argument for 2^x this routine correctly does
423 * extended precision arithmetics to improve accuracy.
426 * \result exp(x). Undefined if input argument caused overflow,
427 * which can happen if abs(x) \> 7e13.
429 static gmx_inline gmx_simd_float_t gmx_simdcall
430 gmx_simd_exp_f(gmx_simd_float_t x
)
432 const gmx_simd_float_t argscale
= gmx_simd_set1_f(1.44269504088896341f
);
433 /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
434 const gmx_simd_float_t arglimit
= gmx_simd_set1_f(126.0f
);
435 const gmx_simd_float_t invargscale0
= gmx_simd_set1_f(-0.693145751953125f
);
436 const gmx_simd_float_t invargscale1
= gmx_simd_set1_f(-1.428606765330187045e-06f
);
437 const gmx_simd_float_t CC4
= gmx_simd_set1_f(0.00136324646882712841033936f
);
438 const gmx_simd_float_t CC3
= gmx_simd_set1_f(0.00836596917361021041870117f
);
439 const gmx_simd_float_t CC2
= gmx_simd_set1_f(0.0416710823774337768554688f
);
440 const gmx_simd_float_t CC1
= gmx_simd_set1_f(0.166665524244308471679688f
);
441 const gmx_simd_float_t CC0
= gmx_simd_set1_f(0.499999850988388061523438f
);
442 const gmx_simd_float_t one
= gmx_simd_set1_f(1.0f
);
443 gmx_simd_float_t fexppart
;
444 gmx_simd_float_t intpart
;
445 gmx_simd_float_t y
, p
;
446 gmx_simd_fbool_t valuemask
;
448 y
= gmx_simd_mul_f(x
, argscale
);
449 fexppart
= gmx_simd_set_exponent_f(y
); /* rounds to nearest int internally */
450 intpart
= gmx_simd_round_f(y
); /* use same rounding algorithm here */
451 valuemask
= gmx_simd_cmple_f(gmx_simd_fabs_f(y
), arglimit
);
452 fexppart
= gmx_simd_blendzero_f(fexppart
, valuemask
);
454 /* Extended precision arithmetics */
455 x
= gmx_simd_fmadd_f(invargscale0
, intpart
, x
);
456 x
= gmx_simd_fmadd_f(invargscale1
, intpart
, x
);
458 p
= gmx_simd_fmadd_f(CC4
, x
, CC3
);
459 p
= gmx_simd_fmadd_f(p
, x
, CC2
);
460 p
= gmx_simd_fmadd_f(p
, x
, CC1
);
461 p
= gmx_simd_fmadd_f(p
, x
, CC0
);
462 p
= gmx_simd_fmadd_f(gmx_simd_mul_f(x
, x
), p
, x
);
463 p
= gmx_simd_add_f(p
, one
);
464 x
= gmx_simd_mul_f(p
, fexppart
);
469 /*! \brief SIMD float erf(x).
471 * You should normally call the real-precision routine \ref gmx_simd_erf_r.
473 * \param x The value to calculate erf(x) for.
476 * This routine achieves very close to full precision, but we do not care about
477 * the last bit or the subnormal result range.
479 static gmx_inline gmx_simd_float_t gmx_simdcall
480 gmx_simd_erf_f(gmx_simd_float_t x
)
482 /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
483 const gmx_simd_float_t CA6
= gmx_simd_set1_f(7.853861353153693e-5f
);
484 const gmx_simd_float_t CA5
= gmx_simd_set1_f(-8.010193625184903e-4f
);
485 const gmx_simd_float_t CA4
= gmx_simd_set1_f(5.188327685732524e-3f
);
486 const gmx_simd_float_t CA3
= gmx_simd_set1_f(-2.685381193529856e-2f
);
487 const gmx_simd_float_t CA2
= gmx_simd_set1_f(1.128358514861418e-1f
);
488 const gmx_simd_float_t CA1
= gmx_simd_set1_f(-3.761262582423300e-1f
);
489 const gmx_simd_float_t CA0
= gmx_simd_set1_f(1.128379165726710f
);
490 /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
491 const gmx_simd_float_t CB9
= gmx_simd_set1_f(-0.0018629930017603923f
);
492 const gmx_simd_float_t CB8
= gmx_simd_set1_f(0.003909821287598495f
);
493 const gmx_simd_float_t CB7
= gmx_simd_set1_f(-0.0052094582210355615f
);
494 const gmx_simd_float_t CB6
= gmx_simd_set1_f(0.005685614362160572f
);
495 const gmx_simd_float_t CB5
= gmx_simd_set1_f(-0.0025367682853477272f
);
496 const gmx_simd_float_t CB4
= gmx_simd_set1_f(-0.010199799682318782f
);
497 const gmx_simd_float_t CB3
= gmx_simd_set1_f(0.04369575504816542f
);
498 const gmx_simd_float_t CB2
= gmx_simd_set1_f(-0.11884063474674492f
);
499 const gmx_simd_float_t CB1
= gmx_simd_set1_f(0.2732120154030589f
);
500 const gmx_simd_float_t CB0
= gmx_simd_set1_f(0.42758357702025784f
);
501 /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
502 const gmx_simd_float_t CC10
= gmx_simd_set1_f(-0.0445555913112064f
);
503 const gmx_simd_float_t CC9
= gmx_simd_set1_f(0.21376355144663348f
);
504 const gmx_simd_float_t CC8
= gmx_simd_set1_f(-0.3473187200259257f
);
505 const gmx_simd_float_t CC7
= gmx_simd_set1_f(0.016690861551248114f
);
506 const gmx_simd_float_t CC6
= gmx_simd_set1_f(0.7560973182491192f
);
507 const gmx_simd_float_t CC5
= gmx_simd_set1_f(-1.2137903600145787f
);
508 const gmx_simd_float_t CC4
= gmx_simd_set1_f(0.8411872321232948f
);
509 const gmx_simd_float_t CC3
= gmx_simd_set1_f(-0.08670413896296343f
);
510 const gmx_simd_float_t CC2
= gmx_simd_set1_f(-0.27124782687240334f
);
511 const gmx_simd_float_t CC1
= gmx_simd_set1_f(-0.0007502488047806069f
);
512 const gmx_simd_float_t CC0
= gmx_simd_set1_f(0.5642114853803148f
);
513 const gmx_simd_float_t one
= gmx_simd_set1_f(1.0f
);
514 const gmx_simd_float_t two
= gmx_simd_set1_f(2.0f
);
516 gmx_simd_float_t x2
, x4
, y
;
517 gmx_simd_float_t t
, t2
, w
, w2
;
518 gmx_simd_float_t pA0
, pA1
, pB0
, pB1
, pC0
, pC1
;
519 gmx_simd_float_t expmx2
;
520 gmx_simd_float_t res_erf
, res_erfc
, res
;
521 gmx_simd_fbool_t mask
, msk_erf
;
523 /* Calculate erf() */
524 x2
= gmx_simd_mul_f(x
, x
);
525 x4
= gmx_simd_mul_f(x2
, x2
);
527 pA0
= gmx_simd_fmadd_f(CA6
, x4
, CA4
);
528 pA1
= gmx_simd_fmadd_f(CA5
, x4
, CA3
);
529 pA0
= gmx_simd_fmadd_f(pA0
, x4
, CA2
);
530 pA1
= gmx_simd_fmadd_f(pA1
, x4
, CA1
);
531 pA0
= gmx_simd_mul_f(pA0
, x4
);
532 pA0
= gmx_simd_fmadd_f(pA1
, x2
, pA0
);
533 /* Constant term must come last for precision reasons */
534 pA0
= gmx_simd_add_f(pA0
, CA0
);
536 res_erf
= gmx_simd_mul_f(x
, pA0
);
539 y
= gmx_simd_fabs_f(x
);
540 msk_erf
= gmx_simd_cmplt_f(y
, gmx_simd_set1_f(0.75f
));
541 t
= gmx_simd_inv_notmaskfpe_f(y
, msk_erf
);
542 w
= gmx_simd_sub_f(t
, one
);
543 t2
= gmx_simd_mul_f(t
, t
);
544 w2
= gmx_simd_mul_f(w
, w
);
546 /* No need for a floating-point sieve here (as in erfc), since erf()
547 * will never return values that are extremely small for large args.
549 expmx2
= gmx_simd_exp_f( gmx_simd_fneg_f( gmx_simd_mul_f(y
, y
)));
551 pB1
= gmx_simd_fmadd_f(CB9
, w2
, CB7
);
552 pB0
= gmx_simd_fmadd_f(CB8
, w2
, CB6
);
553 pB1
= gmx_simd_fmadd_f(pB1
, w2
, CB5
);
554 pB0
= gmx_simd_fmadd_f(pB0
, w2
, CB4
);
555 pB1
= gmx_simd_fmadd_f(pB1
, w2
, CB3
);
556 pB0
= gmx_simd_fmadd_f(pB0
, w2
, CB2
);
557 pB1
= gmx_simd_fmadd_f(pB1
, w2
, CB1
);
558 pB0
= gmx_simd_fmadd_f(pB0
, w2
, CB0
);
559 pB0
= gmx_simd_fmadd_f(pB1
, w
, pB0
);
561 pC0
= gmx_simd_fmadd_f(CC10
, t2
, CC8
);
562 pC1
= gmx_simd_fmadd_f(CC9
, t2
, CC7
);
563 pC0
= gmx_simd_fmadd_f(pC0
, t2
, CC6
);
564 pC1
= gmx_simd_fmadd_f(pC1
, t2
, CC5
);
565 pC0
= gmx_simd_fmadd_f(pC0
, t2
, CC4
);
566 pC1
= gmx_simd_fmadd_f(pC1
, t2
, CC3
);
567 pC0
= gmx_simd_fmadd_f(pC0
, t2
, CC2
);
568 pC1
= gmx_simd_fmadd_f(pC1
, t2
, CC1
);
570 pC0
= gmx_simd_fmadd_f(pC0
, t2
, CC0
);
571 pC0
= gmx_simd_fmadd_f(pC1
, t
, pC0
);
572 pC0
= gmx_simd_mul_f(pC0
, t
);
574 /* SELECT pB0 or pC0 for erfc() */
575 mask
= gmx_simd_cmplt_f(two
, y
);
576 res_erfc
= gmx_simd_blendv_f(pB0
, pC0
, mask
);
577 res_erfc
= gmx_simd_mul_f(res_erfc
, expmx2
);
579 /* erfc(x<0) = 2-erfc(|x|) */
580 mask
= gmx_simd_cmplt_f(x
, gmx_simd_setzero_f());
581 res_erfc
= gmx_simd_blendv_f(res_erfc
, gmx_simd_sub_f(two
, res_erfc
), mask
);
583 /* Select erf() or erfc() */
584 res
= gmx_simd_blendv_f(gmx_simd_sub_f(one
, res_erfc
), res_erf
, msk_erf
);
589 /*! \brief SIMD float erfc(x).
591 * You should normally call the real-precision routine \ref gmx_simd_erfc_r.
593 * \param x The value to calculate erfc(x) for.
596 * This routine achieves full precision (bar the last bit) over most of the
597 * input range, but for large arguments where the result is getting close
598 * to the minimum representable numbers we accept slightly larger errors
599 * (think results that are in the ballpark of 10^-30 for single precision,
600 * or 10^-200 for double) since that is not relevant for MD.
602 static gmx_inline gmx_simd_float_t gmx_simdcall
603 gmx_simd_erfc_f(gmx_simd_float_t x
)
605 /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
606 const gmx_simd_float_t CA6
= gmx_simd_set1_f(7.853861353153693e-5f
);
607 const gmx_simd_float_t CA5
= gmx_simd_set1_f(-8.010193625184903e-4f
);
608 const gmx_simd_float_t CA4
= gmx_simd_set1_f(5.188327685732524e-3f
);
609 const gmx_simd_float_t CA3
= gmx_simd_set1_f(-2.685381193529856e-2f
);
610 const gmx_simd_float_t CA2
= gmx_simd_set1_f(1.128358514861418e-1f
);
611 const gmx_simd_float_t CA1
= gmx_simd_set1_f(-3.761262582423300e-1f
);
612 const gmx_simd_float_t CA0
= gmx_simd_set1_f(1.128379165726710f
);
613 /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
614 const gmx_simd_float_t CB9
= gmx_simd_set1_f(-0.0018629930017603923f
);
615 const gmx_simd_float_t CB8
= gmx_simd_set1_f(0.003909821287598495f
);
616 const gmx_simd_float_t CB7
= gmx_simd_set1_f(-0.0052094582210355615f
);
617 const gmx_simd_float_t CB6
= gmx_simd_set1_f(0.005685614362160572f
);
618 const gmx_simd_float_t CB5
= gmx_simd_set1_f(-0.0025367682853477272f
);
619 const gmx_simd_float_t CB4
= gmx_simd_set1_f(-0.010199799682318782f
);
620 const gmx_simd_float_t CB3
= gmx_simd_set1_f(0.04369575504816542f
);
621 const gmx_simd_float_t CB2
= gmx_simd_set1_f(-0.11884063474674492f
);
622 const gmx_simd_float_t CB1
= gmx_simd_set1_f(0.2732120154030589f
);
623 const gmx_simd_float_t CB0
= gmx_simd_set1_f(0.42758357702025784f
);
624 /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
625 const gmx_simd_float_t CC10
= gmx_simd_set1_f(-0.0445555913112064f
);
626 const gmx_simd_float_t CC9
= gmx_simd_set1_f(0.21376355144663348f
);
627 const gmx_simd_float_t CC8
= gmx_simd_set1_f(-0.3473187200259257f
);
628 const gmx_simd_float_t CC7
= gmx_simd_set1_f(0.016690861551248114f
);
629 const gmx_simd_float_t CC6
= gmx_simd_set1_f(0.7560973182491192f
);
630 const gmx_simd_float_t CC5
= gmx_simd_set1_f(-1.2137903600145787f
);
631 const gmx_simd_float_t CC4
= gmx_simd_set1_f(0.8411872321232948f
);
632 const gmx_simd_float_t CC3
= gmx_simd_set1_f(-0.08670413896296343f
);
633 const gmx_simd_float_t CC2
= gmx_simd_set1_f(-0.27124782687240334f
);
634 const gmx_simd_float_t CC1
= gmx_simd_set1_f(-0.0007502488047806069f
);
635 const gmx_simd_float_t CC0
= gmx_simd_set1_f(0.5642114853803148f
);
636 /* Coefficients for expansion of exp(x) in [0,0.1] */
637 /* CD0 and CD1 are both 1.0, so no need to declare them separately */
638 const gmx_simd_float_t CD2
= gmx_simd_set1_f(0.5000066608081202f
);
639 const gmx_simd_float_t CD3
= gmx_simd_set1_f(0.1664795422874624f
);
640 const gmx_simd_float_t CD4
= gmx_simd_set1_f(0.04379839977652482f
);
641 const gmx_simd_float_t one
= gmx_simd_set1_f(1.0f
);
642 const gmx_simd_float_t two
= gmx_simd_set1_f(2.0f
);
644 /* We need to use a small trick here, since we cannot assume all SIMD
645 * architectures support integers, and the flag we want (0xfffff000) would
646 * evaluate to NaN (i.e., it cannot be expressed as a floating-point num).
647 * Instead, we represent the flags 0xf0f0f000 and 0x0f0f0000 as valid
648 * fp numbers, and perform a logical or. Since the expression is constant,
649 * we can at least hope it is evaluated at compile-time.
651 #ifdef GMX_SIMD_HAVE_LOGICAL
652 const gmx_simd_float_t sieve
= gmx_simd_or_f(gmx_simd_set1_f(-5.965323564e+29f
), gmx_simd_set1_f(7.05044434e-30f
));
654 const int isieve
= 0xFFFFF000;
655 float mem
[GMX_SIMD_REAL_WIDTH
*2];
656 float * pmem
= gmx_simd_align_f(mem
);
663 gmx_simd_float_t x2
, x4
, y
;
664 gmx_simd_float_t q
, z
, t
, t2
, w
, w2
;
665 gmx_simd_float_t pA0
, pA1
, pB0
, pB1
, pC0
, pC1
;
666 gmx_simd_float_t expmx2
, corr
;
667 gmx_simd_float_t res_erf
, res_erfc
, res
;
668 gmx_simd_fbool_t mask
, msk_erf
;
670 /* Calculate erf() */
671 x2
= gmx_simd_mul_f(x
, x
);
672 x4
= gmx_simd_mul_f(x2
, x2
);
674 pA0
= gmx_simd_fmadd_f(CA6
, x4
, CA4
);
675 pA1
= gmx_simd_fmadd_f(CA5
, x4
, CA3
);
676 pA0
= gmx_simd_fmadd_f(pA0
, x4
, CA2
);
677 pA1
= gmx_simd_fmadd_f(pA1
, x4
, CA1
);
678 pA1
= gmx_simd_mul_f(pA1
, x2
);
679 pA0
= gmx_simd_fmadd_f(pA0
, x4
, pA1
);
680 /* Constant term must come last for precision reasons */
681 pA0
= gmx_simd_add_f(pA0
, CA0
);
683 res_erf
= gmx_simd_mul_f(x
, pA0
);
686 y
= gmx_simd_fabs_f(x
);
687 msk_erf
= gmx_simd_cmplt_f(y
, gmx_simd_set1_f(0.75f
));
688 t
= gmx_simd_inv_notmaskfpe_f(y
, msk_erf
);
689 w
= gmx_simd_sub_f(t
, one
);
690 t2
= gmx_simd_mul_f(t
, t
);
691 w2
= gmx_simd_mul_f(w
, w
);
693 * We cannot simply calculate exp(-y2) directly in single precision, since
694 * that will lose a couple of bits of precision due to the multiplication.
695 * Instead, we introduce y=z+w, where the last 12 bits of precision are in w.
696 * Then we get exp(-y2) = exp(-z2)*exp((z-y)*(z+y)).
698 * The only drawback with this is that it requires TWO separate exponential
699 * evaluations, which would be horrible performance-wise. However, the argument
700 * for the second exp() call is always small, so there we simply use a
701 * low-order minimax expansion on [0,0.1].
703 * However, this neat idea requires support for logical ops (and) on
704 * FP numbers, which some vendors decided isn't necessary in their SIMD
705 * instruction sets (Hi, IBM VSX!). In principle we could use some tricks
706 * in double, but we still need memory as a backup when that is not available,
707 * and this case is rare enough that we go directly there...
709 #ifdef GMX_SIMD_HAVE_LOGICAL
710 z
= gmx_simd_and_f(y
, sieve
);
712 gmx_simd_store_f(pmem
, y
);
713 for (i
= 0; i
< GMX_SIMD_FLOAT_WIDTH
; i
++)
716 conv
.i
= conv
.i
& isieve
;
719 z
= gmx_simd_load_f(pmem
);
721 q
= gmx_simd_mul_f( gmx_simd_sub_f(z
, y
), gmx_simd_add_f(z
, y
) );
722 corr
= gmx_simd_fmadd_f(CD4
, q
, CD3
);
723 corr
= gmx_simd_fmadd_f(corr
, q
, CD2
);
724 corr
= gmx_simd_fmadd_f(corr
, q
, one
);
725 corr
= gmx_simd_fmadd_f(corr
, q
, one
);
727 expmx2
= gmx_simd_exp_f( gmx_simd_fneg_f( gmx_simd_mul_f(z
, z
) ) );
728 expmx2
= gmx_simd_mul_f(expmx2
, corr
);
730 pB1
= gmx_simd_fmadd_f(CB9
, w2
, CB7
);
731 pB0
= gmx_simd_fmadd_f(CB8
, w2
, CB6
);
732 pB1
= gmx_simd_fmadd_f(pB1
, w2
, CB5
);
733 pB0
= gmx_simd_fmadd_f(pB0
, w2
, CB4
);
734 pB1
= gmx_simd_fmadd_f(pB1
, w2
, CB3
);
735 pB0
= gmx_simd_fmadd_f(pB0
, w2
, CB2
);
736 pB1
= gmx_simd_fmadd_f(pB1
, w2
, CB1
);
737 pB0
= gmx_simd_fmadd_f(pB0
, w2
, CB0
);
738 pB0
= gmx_simd_fmadd_f(pB1
, w
, pB0
);
740 pC0
= gmx_simd_fmadd_f(CC10
, t2
, CC8
);
741 pC1
= gmx_simd_fmadd_f(CC9
, t2
, CC7
);
742 pC0
= gmx_simd_fmadd_f(pC0
, t2
, CC6
);
743 pC1
= gmx_simd_fmadd_f(pC1
, t2
, CC5
);
744 pC0
= gmx_simd_fmadd_f(pC0
, t2
, CC4
);
745 pC1
= gmx_simd_fmadd_f(pC1
, t2
, CC3
);
746 pC0
= gmx_simd_fmadd_f(pC0
, t2
, CC2
);
747 pC1
= gmx_simd_fmadd_f(pC1
, t2
, CC1
);
749 pC0
= gmx_simd_fmadd_f(pC0
, t2
, CC0
);
750 pC0
= gmx_simd_fmadd_f(pC1
, t
, pC0
);
751 pC0
= gmx_simd_mul_f(pC0
, t
);
753 /* SELECT pB0 or pC0 for erfc() */
754 mask
= gmx_simd_cmplt_f(two
, y
);
755 res_erfc
= gmx_simd_blendv_f(pB0
, pC0
, mask
);
756 res_erfc
= gmx_simd_mul_f(res_erfc
, expmx2
);
758 /* erfc(x<0) = 2-erfc(|x|) */
759 mask
= gmx_simd_cmplt_f(x
, gmx_simd_setzero_f());
760 res_erfc
= gmx_simd_blendv_f(res_erfc
, gmx_simd_sub_f(two
, res_erfc
), mask
);
762 /* Select erf() or erfc() */
763 res
= gmx_simd_blendv_f(res_erfc
, gmx_simd_sub_f(one
, res_erf
), msk_erf
);
768 /*! \brief SIMD float sin \& cos.
770 * You should normally call the real-precision routine \ref gmx_simd_sincos_r.
772 * \param x The argument to evaluate sin/cos for
773 * \param[out] sinval Sin(x)
774 * \param[out] cosval Cos(x)
776 * This version achieves close to machine precision, but for very large
777 * magnitudes of the argument we inherently begin to lose accuracy due to the
778 * argument reduction, despite using extended precision arithmetics internally.
780 static gmx_inline
void gmx_simdcall
781 gmx_simd_sincos_f(gmx_simd_float_t x
, gmx_simd_float_t
*sinval
, gmx_simd_float_t
*cosval
)
783 /* Constants to subtract Pi/4*x from y while minimizing precision loss */
784 const gmx_simd_float_t argred0
= gmx_simd_set1_f(-1.5703125);
785 const gmx_simd_float_t argred1
= gmx_simd_set1_f(-4.83751296997070312500e-04f
);
786 const gmx_simd_float_t argred2
= gmx_simd_set1_f(-7.54953362047672271729e-08f
);
787 const gmx_simd_float_t argred3
= gmx_simd_set1_f(-2.56334406825708960298e-12f
);
788 const gmx_simd_float_t two_over_pi
= gmx_simd_set1_f(2.0f
/M_PI
);
789 const gmx_simd_float_t const_sin2
= gmx_simd_set1_f(-1.9515295891e-4f
);
790 const gmx_simd_float_t const_sin1
= gmx_simd_set1_f( 8.3321608736e-3f
);
791 const gmx_simd_float_t const_sin0
= gmx_simd_set1_f(-1.6666654611e-1f
);
792 const gmx_simd_float_t const_cos2
= gmx_simd_set1_f( 2.443315711809948e-5f
);
793 const gmx_simd_float_t const_cos1
= gmx_simd_set1_f(-1.388731625493765e-3f
);
794 const gmx_simd_float_t const_cos0
= gmx_simd_set1_f( 4.166664568298827e-2f
);
795 const gmx_simd_float_t half
= gmx_simd_set1_f(0.5f
);
796 const gmx_simd_float_t one
= gmx_simd_set1_f(1.0f
);
797 gmx_simd_float_t ssign
, csign
;
798 gmx_simd_float_t x2
, y
, z
, psin
, pcos
, sss
, ccc
;
799 gmx_simd_fbool_t mask
;
800 #if (defined GMX_SIMD_HAVE_FINT32) && (defined GMX_SIMD_HAVE_FINT32_ARITHMETICS) && (defined GMX_SIMD_HAVE_LOGICAL)
801 const gmx_simd_fint32_t ione
= gmx_simd_set1_fi(1);
802 const gmx_simd_fint32_t itwo
= gmx_simd_set1_fi(2);
803 gmx_simd_fint32_t iy
;
805 z
= gmx_simd_mul_f(x
, two_over_pi
);
806 iy
= gmx_simd_cvt_f2i(z
);
807 y
= gmx_simd_round_f(z
);
809 mask
= gmx_simd_cvt_fib2fb(gmx_simd_cmpeq_fi(gmx_simd_and_fi(iy
, ione
), gmx_simd_setzero_fi()));
810 ssign
= gmx_simd_blendzero_f(gmx_simd_set1_f(GMX_FLOAT_NEGZERO
), gmx_simd_cvt_fib2fb(gmx_simd_cmpeq_fi(gmx_simd_and_fi(iy
, itwo
), itwo
)));
811 csign
= gmx_simd_blendzero_f(gmx_simd_set1_f(GMX_FLOAT_NEGZERO
), gmx_simd_cvt_fib2fb(gmx_simd_cmpeq_fi(gmx_simd_and_fi(gmx_simd_add_fi(iy
, ione
), itwo
), itwo
)));
813 const gmx_simd_float_t quarter
= gmx_simd_set1_f(0.25f
);
814 const gmx_simd_float_t minusquarter
= gmx_simd_set1_f(-0.25f
);
816 gmx_simd_fbool_t m1
, m2
, m3
;
818 /* The most obvious way to find the arguments quadrant in the unit circle
819 * to calculate the sign is to use integer arithmetic, but that is not
820 * present in all SIMD implementations. As an alternative, we have devised a
821 * pure floating-point algorithm that uses truncation for argument reduction
822 * so that we get a new value 0<=q<1 over the unit circle, and then
823 * do floating-point comparisons with fractions. This is likely to be
824 * slightly slower (~10%) due to the longer latencies of floating-point, so
825 * we only use it when integer SIMD arithmetic is not present.
828 x
= gmx_simd_fabs_f(x
);
829 /* It is critical that half-way cases are rounded down */
830 z
= gmx_simd_fmadd_f(x
, two_over_pi
, half
);
831 y
= gmx_simd_trunc_f(z
);
832 q
= gmx_simd_mul_f(z
, quarter
);
833 q
= gmx_simd_sub_f(q
, gmx_simd_trunc_f(q
));
834 /* z now starts at 0.0 for x=-pi/4 (although neg. values cannot occur), and
835 * then increased by 1.0 as x increases by 2*Pi, when it resets to 0.0.
836 * This removes the 2*Pi periodicity without using any integer arithmetic.
837 * First check if y had the value 2 or 3, set csign if true.
839 q
= gmx_simd_sub_f(q
, half
);
840 /* If we have logical operations we can work directly on the signbit, which
841 * saves instructions. Otherwise we need to represent signs as +1.0/-1.0.
842 * Thus, if you are altering defines to debug alternative code paths, the
843 * two GMX_SIMD_HAVE_LOGICAL sections in this routine must either both be
844 * active or inactive - you will get errors if only one is used.
846 # ifdef GMX_SIMD_HAVE_LOGICAL
847 ssign
= gmx_simd_and_f(ssign
, gmx_simd_set1_f(GMX_FLOAT_NEGZERO
));
848 csign
= gmx_simd_andnot_f(q
, gmx_simd_set1_f(GMX_FLOAT_NEGZERO
));
849 ssign
= gmx_simd_xor_f(ssign
, csign
);
851 csign
= gmx_simd_xor_sign_f(gmx_simd_set1_f(-1.0f
), q
);
852 // ALT: csign = gmx_simd_fneg_f(gmx_simd_copysign(gmx_simd_set1_f(1.0),q));
854 ssign
= gmx_simd_xor_sign_f(ssign
, csign
); /* swap ssign if csign was set. */
856 /* Check if y had value 1 or 3 (remember we subtracted 0.5 from q) */
857 m1
= gmx_simd_cmplt_f(q
, minusquarter
);
858 m2
= gmx_simd_cmple_f(gmx_simd_setzero_f(), q
);
859 m3
= gmx_simd_cmplt_f(q
, quarter
);
860 m2
= gmx_simd_and_fb(m2
, m3
);
861 mask
= gmx_simd_or_fb(m1
, m2
);
862 /* where mask is FALSE, set sign. */
863 csign
= gmx_simd_xor_sign_f(csign
, gmx_simd_blendv_f(gmx_simd_set1_f(-1.0f
), one
, mask
));
865 x
= gmx_simd_fmadd_f(y
, argred0
, x
);
866 x
= gmx_simd_fmadd_f(y
, argred1
, x
);
867 x
= gmx_simd_fmadd_f(y
, argred2
, x
);
868 x
= gmx_simd_fmadd_f(y
, argred3
, x
);
869 x2
= gmx_simd_mul_f(x
, x
);
871 psin
= gmx_simd_fmadd_f(const_sin2
, x2
, const_sin1
);
872 psin
= gmx_simd_fmadd_f(psin
, x2
, const_sin0
);
873 psin
= gmx_simd_fmadd_f(psin
, gmx_simd_mul_f(x
, x2
), x
);
874 pcos
= gmx_simd_fmadd_f(const_cos2
, x2
, const_cos1
);
875 pcos
= gmx_simd_fmadd_f(pcos
, x2
, const_cos0
);
876 pcos
= gmx_simd_fmsub_f(pcos
, x2
, half
);
877 pcos
= gmx_simd_fmadd_f(pcos
, x2
, one
);
879 sss
= gmx_simd_blendv_f(pcos
, psin
, mask
);
880 ccc
= gmx_simd_blendv_f(psin
, pcos
, mask
);
881 /* See comment for GMX_SIMD_HAVE_LOGICAL section above. */
882 #ifdef GMX_SIMD_HAVE_LOGICAL
883 *sinval
= gmx_simd_xor_f(sss
, ssign
);
884 *cosval
= gmx_simd_xor_f(ccc
, csign
);
886 *sinval
= gmx_simd_xor_sign_f(sss
, ssign
);
887 *cosval
= gmx_simd_xor_sign_f(ccc
, csign
);
891 /*! \brief SIMD float sin(x).
893 * You should normally call the real-precision routine \ref gmx_simd_sin_r.
895 * \param x The argument to evaluate sin for
898 * \attention Do NOT call both sin & cos if you need both results, since each of them
899 * will then call \ref gmx_simd_sincos_r and waste a factor 2 in performance.
901 static gmx_inline gmx_simd_float_t gmx_simdcall
902 gmx_simd_sin_f(gmx_simd_float_t x
)
904 gmx_simd_float_t s
, c
;
905 gmx_simd_sincos_f(x
, &s
, &c
);
909 /*! \brief SIMD float cos(x).
911 * You should normally call the real-precision routine \ref gmx_simd_cos_r.
913 * \param x The argument to evaluate cos for
916 * \attention Do NOT call both sin & cos if you need both results, since each of them
917 * will then call \ref gmx_simd_sincos_r and waste a factor 2 in performance.
919 static gmx_inline gmx_simd_float_t gmx_simdcall
920 gmx_simd_cos_f(gmx_simd_float_t x
)
922 gmx_simd_float_t s
, c
;
923 gmx_simd_sincos_f(x
, &s
, &c
);
927 /*! \brief SIMD float tan(x).
929 * You should normally call the real-precision routine \ref gmx_simd_tan_r.
931 * \param x The argument to evaluate tan for
934 static gmx_inline gmx_simd_float_t gmx_simdcall
935 gmx_simd_tan_f(gmx_simd_float_t x
)
937 const gmx_simd_float_t argred0
= gmx_simd_set1_f(-1.5703125);
938 const gmx_simd_float_t argred1
= gmx_simd_set1_f(-4.83751296997070312500e-04f
);
939 const gmx_simd_float_t argred2
= gmx_simd_set1_f(-7.54953362047672271729e-08f
);
940 const gmx_simd_float_t argred3
= gmx_simd_set1_f(-2.56334406825708960298e-12f
);
941 const gmx_simd_float_t two_over_pi
= gmx_simd_set1_f(2.0f
/M_PI
);
942 const gmx_simd_float_t CT6
= gmx_simd_set1_f(0.009498288995810566122993911);
943 const gmx_simd_float_t CT5
= gmx_simd_set1_f(0.002895755790837379295226923);
944 const gmx_simd_float_t CT4
= gmx_simd_set1_f(0.02460087336161924491836265);
945 const gmx_simd_float_t CT3
= gmx_simd_set1_f(0.05334912882656359828045988);
946 const gmx_simd_float_t CT2
= gmx_simd_set1_f(0.1333989091464957704418495);
947 const gmx_simd_float_t CT1
= gmx_simd_set1_f(0.3333307599244198227797507);
949 gmx_simd_float_t x2
, p
, y
, z
;
950 gmx_simd_fbool_t mask
;
952 #if (defined GMX_SIMD_HAVE_FINT32) && (defined GMX_SIMD_HAVE_FINT32_ARITHMETICS) && (defined GMX_SIMD_HAVE_LOGICAL)
953 gmx_simd_fint32_t iy
;
954 gmx_simd_fint32_t ione
= gmx_simd_set1_fi(1);
956 z
= gmx_simd_mul_f(x
, two_over_pi
);
957 iy
= gmx_simd_cvt_f2i(z
);
958 y
= gmx_simd_round_f(z
);
959 mask
= gmx_simd_cvt_fib2fb(gmx_simd_cmpeq_fi(gmx_simd_and_fi(iy
, ione
), ione
));
961 x
= gmx_simd_fmadd_f(y
, argred0
, x
);
962 x
= gmx_simd_fmadd_f(y
, argred1
, x
);
963 x
= gmx_simd_fmadd_f(y
, argred2
, x
);
964 x
= gmx_simd_fmadd_f(y
, argred3
, x
);
965 x
= gmx_simd_xor_f(gmx_simd_blendzero_f(gmx_simd_set1_f(GMX_FLOAT_NEGZERO
), mask
), x
);
967 const gmx_simd_float_t quarter
= gmx_simd_set1_f(0.25f
);
968 const gmx_simd_float_t half
= gmx_simd_set1_f(0.5f
);
969 const gmx_simd_float_t threequarter
= gmx_simd_set1_f(0.75f
);
970 gmx_simd_float_t w
, q
;
971 gmx_simd_fbool_t m1
, m2
, m3
;
973 w
= gmx_simd_fabs_f(x
);
974 z
= gmx_simd_fmadd_f(w
, two_over_pi
, half
);
975 y
= gmx_simd_trunc_f(z
);
976 q
= gmx_simd_mul_f(z
, quarter
);
977 q
= gmx_simd_sub_f(q
, gmx_simd_trunc_f(q
));
978 m1
= gmx_simd_cmple_f(quarter
, q
);
979 m2
= gmx_simd_cmplt_f(q
, half
);
980 m3
= gmx_simd_cmple_f(threequarter
, q
);
981 m1
= gmx_simd_and_fb(m1
, m2
);
982 mask
= gmx_simd_or_fb(m1
, m3
);
983 w
= gmx_simd_fmadd_f(y
, argred0
, w
);
984 w
= gmx_simd_fmadd_f(y
, argred1
, w
);
985 w
= gmx_simd_fmadd_f(y
, argred2
, w
);
986 w
= gmx_simd_fmadd_f(y
, argred3
, w
);
988 w
= gmx_simd_blendv_f(w
, gmx_simd_fneg_f(w
), mask
);
989 x
= gmx_simd_xor_sign_f(w
, x
);
991 x2
= gmx_simd_mul_f(x
, x
);
992 p
= gmx_simd_fmadd_f(CT6
, x2
, CT5
);
993 p
= gmx_simd_fmadd_f(p
, x2
, CT4
);
994 p
= gmx_simd_fmadd_f(p
, x2
, CT3
);
995 p
= gmx_simd_fmadd_f(p
, x2
, CT2
);
996 p
= gmx_simd_fmadd_f(p
, x2
, CT1
);
997 p
= gmx_simd_fmadd_f(x2
, gmx_simd_mul_f(p
, x
), x
);
999 p
= gmx_simd_blendv_f( p
, gmx_simd_inv_maskfpe_f(p
, mask
), mask
);
1003 /*! \brief SIMD float asin(x).
1005 * You should normally call the real-precision routine \ref gmx_simd_asin_r.
1007 * \param x The argument to evaluate asin for
1010 static gmx_inline gmx_simd_float_t gmx_simdcall
1011 gmx_simd_asin_f(gmx_simd_float_t x
)
1013 const gmx_simd_float_t limitlow
= gmx_simd_set1_f(1e-4f
);
1014 const gmx_simd_float_t half
= gmx_simd_set1_f(0.5f
);
1015 const gmx_simd_float_t one
= gmx_simd_set1_f(1.0f
);
1016 const gmx_simd_float_t halfpi
= gmx_simd_set1_f((float)M_PI
/2.0f
);
1017 const gmx_simd_float_t CC5
= gmx_simd_set1_f(4.2163199048E-2f
);
1018 const gmx_simd_float_t CC4
= gmx_simd_set1_f(2.4181311049E-2f
);
1019 const gmx_simd_float_t CC3
= gmx_simd_set1_f(4.5470025998E-2f
);
1020 const gmx_simd_float_t CC2
= gmx_simd_set1_f(7.4953002686E-2f
);
1021 const gmx_simd_float_t CC1
= gmx_simd_set1_f(1.6666752422E-1f
);
1022 gmx_simd_float_t xabs
;
1023 gmx_simd_float_t z
, z1
, z2
, q
, q1
, q2
;
1024 gmx_simd_float_t pA
, pB
;
1025 gmx_simd_fbool_t mask
, mask2
;
1027 xabs
= gmx_simd_fabs_f(x
);
1028 mask
= gmx_simd_cmplt_f(half
, xabs
);
1029 z1
= gmx_simd_mul_f(half
, gmx_simd_sub_f(one
, xabs
));
1030 mask2
= gmx_simd_cmpeq_f(xabs
, one
);
1031 q1
= gmx_simd_mul_f(z1
, gmx_simd_invsqrt_notmaskfpe_f(z1
, mask2
));
1032 q1
= gmx_simd_blendnotzero_f(q1
, mask2
);
1034 z2
= gmx_simd_mul_f(q2
, q2
);
1035 z
= gmx_simd_blendv_f(z2
, z1
, mask
);
1036 q
= gmx_simd_blendv_f(q2
, q1
, mask
);
1038 z2
= gmx_simd_mul_f(z
, z
);
1039 pA
= gmx_simd_fmadd_f(CC5
, z2
, CC3
);
1040 pB
= gmx_simd_fmadd_f(CC4
, z2
, CC2
);
1041 pA
= gmx_simd_fmadd_f(pA
, z2
, CC1
);
1042 pA
= gmx_simd_mul_f(pA
, z
);
1043 z
= gmx_simd_fmadd_f(pB
, z2
, pA
);
1044 z
= gmx_simd_fmadd_f(z
, q
, q
);
1045 q2
= gmx_simd_sub_f(halfpi
, z
);
1046 q2
= gmx_simd_sub_f(q2
, z
);
1047 z
= gmx_simd_blendv_f(z
, q2
, mask
);
1049 mask
= gmx_simd_cmplt_f(limitlow
, xabs
);
1050 z
= gmx_simd_blendv_f( xabs
, z
, mask
);
1051 z
= gmx_simd_xor_sign_f(z
, x
);
1056 /*! \brief SIMD float acos(x).
1058 * You should normally call the real-precision routine \ref gmx_simd_acos_r.
1060 * \param x The argument to evaluate acos for
1063 static gmx_inline gmx_simd_float_t gmx_simdcall
1064 gmx_simd_acos_f(gmx_simd_float_t x
)
1066 const gmx_simd_float_t one
= gmx_simd_set1_f(1.0f
);
1067 const gmx_simd_float_t half
= gmx_simd_set1_f(0.5f
);
1068 const gmx_simd_float_t pi
= gmx_simd_set1_f((float)M_PI
);
1069 const gmx_simd_float_t halfpi
= gmx_simd_set1_f((float)M_PI
/2.0f
);
1070 gmx_simd_float_t xabs
;
1071 gmx_simd_float_t z
, z1
, z2
, z3
;
1072 gmx_simd_fbool_t mask1
, mask2
, mask3
;
1074 xabs
= gmx_simd_fabs_f(x
);
1075 mask1
= gmx_simd_cmplt_f(half
, xabs
);
1076 mask2
= gmx_simd_cmplt_f(gmx_simd_setzero_f(), x
);
1078 z
= gmx_simd_mul_f(half
, gmx_simd_sub_f(one
, xabs
));
1079 mask3
= gmx_simd_cmpeq_f(xabs
, one
);
1080 z
= gmx_simd_mul_f(z
, gmx_simd_invsqrt_notmaskfpe_f(z
, mask3
));
1081 z
= gmx_simd_blendnotzero_f(z
, mask3
);
1082 z
= gmx_simd_blendv_f(x
, z
, mask1
);
1083 z
= gmx_simd_asin_f(z
);
1085 z2
= gmx_simd_add_f(z
, z
);
1086 z1
= gmx_simd_sub_f(pi
, z2
);
1087 z3
= gmx_simd_sub_f(halfpi
, z
);
1088 z
= gmx_simd_blendv_f(z1
, z2
, mask2
);
1089 z
= gmx_simd_blendv_f(z3
, z
, mask1
);
1094 /*! \brief SIMD float asin(x).
1096 * You should normally call the real-precision routine \ref gmx_simd_atan_r.
1098 * \param x The argument to evaluate atan for
1099 * \result Atan(x), same argument/value range as standard math library.
1101 static gmx_inline gmx_simd_float_t gmx_simdcall
1102 gmx_simd_atan_f(gmx_simd_float_t x
)
1104 const gmx_simd_float_t halfpi
= gmx_simd_set1_f(M_PI
/2);
1105 const gmx_simd_float_t CA17
= gmx_simd_set1_f(0.002823638962581753730774f
);
1106 const gmx_simd_float_t CA15
= gmx_simd_set1_f(-0.01595690287649631500244f
);
1107 const gmx_simd_float_t CA13
= gmx_simd_set1_f(0.04250498861074447631836f
);
1108 const gmx_simd_float_t CA11
= gmx_simd_set1_f(-0.07489009201526641845703f
);
1109 const gmx_simd_float_t CA9
= gmx_simd_set1_f(0.1063479334115982055664f
);
1110 const gmx_simd_float_t CA7
= gmx_simd_set1_f(-0.1420273631811141967773f
);
1111 const gmx_simd_float_t CA5
= gmx_simd_set1_f(0.1999269574880599975585f
);
1112 const gmx_simd_float_t CA3
= gmx_simd_set1_f(-0.3333310186862945556640f
);
1113 const gmx_simd_float_t one
= gmx_simd_set1_f(1.0f
);
1114 gmx_simd_float_t x2
, x3
, x4
, pA
, pB
;
1115 gmx_simd_fbool_t mask
, mask2
;
1117 mask
= gmx_simd_cmplt_f(x
, gmx_simd_setzero_f());
1118 x
= gmx_simd_fabs_f(x
);
1119 mask2
= gmx_simd_cmplt_f(one
, x
);
1120 x
= gmx_simd_blendv_f(x
, gmx_simd_inv_maskfpe_f(x
, mask2
), mask2
);
1122 x2
= gmx_simd_mul_f(x
, x
);
1123 x3
= gmx_simd_mul_f(x2
, x
);
1124 x4
= gmx_simd_mul_f(x2
, x2
);
1125 pA
= gmx_simd_fmadd_f(CA17
, x4
, CA13
);
1126 pB
= gmx_simd_fmadd_f(CA15
, x4
, CA11
);
1127 pA
= gmx_simd_fmadd_f(pA
, x4
, CA9
);
1128 pB
= gmx_simd_fmadd_f(pB
, x4
, CA7
);
1129 pA
= gmx_simd_fmadd_f(pA
, x4
, CA5
);
1130 pB
= gmx_simd_fmadd_f(pB
, x4
, CA3
);
1131 pA
= gmx_simd_fmadd_f(pA
, x2
, pB
);
1132 pA
= gmx_simd_fmadd_f(pA
, x3
, x
);
1134 pA
= gmx_simd_blendv_f(pA
, gmx_simd_sub_f(halfpi
, pA
), mask2
);
1135 pA
= gmx_simd_blendv_f(pA
, gmx_simd_fneg_f(pA
), mask
);
1140 /*! \brief SIMD float atan2(y,x).
1142 * You should normally call the real-precision routine \ref gmx_simd_atan2_r.
1144 * \param y Y component of vector, any quartile
1145 * \param x X component of vector, any quartile
1146 * \result Atan(y,x), same argument/value range as standard math library.
1148 * \note This routine should provide correct results for all finite
1149 * non-zero or positive-zero arguments. However, negative zero arguments will
1150 * be treated as positive zero, which means the return value will deviate from
1151 * the standard math library atan2(y,x) for those cases. That should not be
1152 * of any concern in Gromacs, and in particular it will not affect calculations
1153 * of angles from vectors.
1155 static gmx_inline gmx_simd_float_t gmx_simdcall
1156 gmx_simd_atan2_f(gmx_simd_float_t y
, gmx_simd_float_t x
)
1158 const gmx_simd_float_t pi
= gmx_simd_set1_f(M_PI
);
1159 const gmx_simd_float_t halfpi
= gmx_simd_set1_f(M_PI
/2.0);
1160 gmx_simd_float_t xinv
, p
, aoffset
;
1161 gmx_simd_fbool_t mask_x0
, mask_y0
, mask_xlt0
, mask_ylt0
;
1163 mask_x0
= gmx_simd_cmpeq_f(x
, gmx_simd_setzero_f());
1164 mask_y0
= gmx_simd_cmpeq_f(y
, gmx_simd_setzero_f());
1165 mask_xlt0
= gmx_simd_cmplt_f(x
, gmx_simd_setzero_f());
1166 mask_ylt0
= gmx_simd_cmplt_f(y
, gmx_simd_setzero_f());
1168 aoffset
= gmx_simd_blendzero_f(halfpi
, mask_x0
);
1169 aoffset
= gmx_simd_blendnotzero_f(aoffset
, mask_y0
);
1171 aoffset
= gmx_simd_blendv_f(aoffset
, pi
, mask_xlt0
);
1172 aoffset
= gmx_simd_blendv_f(aoffset
, gmx_simd_fneg_f(aoffset
), mask_ylt0
);
1174 xinv
= gmx_simd_blendnotzero_f(gmx_simd_inv_notmaskfpe_f(x
, mask_x0
), mask_x0
);
1175 p
= gmx_simd_mul_f(y
, xinv
);
1176 p
= gmx_simd_atan_f(p
);
1177 p
= gmx_simd_add_f(p
, aoffset
);
1182 /*! \brief Calculate the force correction due to PME analytically in SIMD float.
1184 * You should normally call the real-precision routine \ref gmx_simd_pmecorrF_r.
1186 * \param z2 \f$(r \beta)^2\f$ - see below for details.
1187 * \result Correction factor to coulomb force - see below for details.
1189 * This routine is meant to enable analytical evaluation of the
1190 * direct-space PME electrostatic force to avoid tables.
1192 * The direct-space potential should be \f$ \mbox{erfc}(\beta r)/r\f$, but there
1193 * are some problems evaluating that:
1195 * First, the error function is difficult (read: expensive) to
1196 * approxmiate accurately for intermediate to large arguments, and
1197 * this happens already in ranges of \f$(\beta r)\f$ that occur in simulations.
1198 * Second, we now try to avoid calculating potentials in Gromacs but
1199 * use forces directly.
1201 * We can simply things slight by noting that the PME part is really
1202 * a correction to the normal Coulomb force since \f$\mbox{erfc}(z)=1-\mbox{erf}(z)\f$, i.e.
1204 * V = \frac{1}{r} - \frac{\mbox{erf}(\beta r)}{r}
1206 * The first term we already have from the inverse square root, so
1207 * that we can leave out of this routine.
1209 * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
1210 * the argument \f$beta r\f$ will be in the range 0.15 to ~4, which is
1211 * the range used for the minimax fit. Use your favorite plotting program
1212 * to realize how well-behaved \f$\frac{\mbox{erf}(z)}{z}\f$ is in this range!
1214 * We approximate \f$f(z)=\mbox{erf}(z)/z\f$ with a rational minimax polynomial.
1215 * However, it turns out it is more efficient to approximate \f$f(z)/z\f$ and
1216 * then only use even powers. This is another minor optimization, since
1217 * we actually \a want \f$f(z)/z\f$, because it is going to be multiplied by
1218 * the vector between the two atoms to get the vectorial force. The
1219 * fastest flops are the ones we can avoid calculating!
1221 * So, here's how it should be used:
1223 * 1. Calculate \f$r^2\f$.
1224 * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=(\beta r)^2\f$.
1225 * 3. Evaluate this routine with \f$z^2\f$ as the argument.
1226 * 4. The return value is the expression:
1229 * \frac{2 \exp{-z^2}}{\sqrt{\pi} z^2}-\frac{\mbox{erf}(z)}{z^3}
1232 * 5. Multiply the entire expression by \f$\beta^3\f$. This will get you
1235 * \frac{2 \beta^3 \exp(-z^2)}{\sqrt{\pi} z^2} - \frac{\beta^3 \mbox{erf}(z)}{z^3}
1238 * or, switching back to \f$r\f$ (since \f$z=r \beta\f$):
1241 * \frac{2 \beta \exp(-r^2 \beta^2)}{\sqrt{\pi} r^2} - \frac{\mbox{erf}(r \beta)}{r^3}
1244 * With a bit of math exercise you should be able to confirm that
1248 * \frac{\frac{d}{dr}\left( \frac{\mbox{erf}(\beta r)}{r} \right)}{r}
1251 * 6. Add the result to \f$r^{-3}\f$, multiply by the product of the charges,
1252 * and you have your force (divided by \f$r\f$). A final multiplication
1253 * with the vector connecting the two particles and you have your
1254 * vectorial force to add to the particles.
1256 * This approximation achieves an error slightly lower than 1e-6
1257 * in single precision and 1e-11 in double precision
1258 * for arguments smaller than 16 (\f$\beta r \leq 4 \f$);
1259 * when added to \f$1/r\f$ the error will be insignificant.
1260 * For \f$\beta r \geq 7206\f$ the return value can be inf or NaN.
1263 static gmx_inline gmx_simd_float_t gmx_simdcall
1264 gmx_simd_pmecorrF_f(gmx_simd_float_t z2
)
1266 const gmx_simd_float_t FN6
= gmx_simd_set1_f(-1.7357322914161492954e-8f
);
1267 const gmx_simd_float_t FN5
= gmx_simd_set1_f(1.4703624142580877519e-6f
);
1268 const gmx_simd_float_t FN4
= gmx_simd_set1_f(-0.000053401640219807709149f
);
1269 const gmx_simd_float_t FN3
= gmx_simd_set1_f(0.0010054721316683106153f
);
1270 const gmx_simd_float_t FN2
= gmx_simd_set1_f(-0.019278317264888380590f
);
1271 const gmx_simd_float_t FN1
= gmx_simd_set1_f(0.069670166153766424023f
);
1272 const gmx_simd_float_t FN0
= gmx_simd_set1_f(-0.75225204789749321333f
);
1274 const gmx_simd_float_t FD4
= gmx_simd_set1_f(0.0011193462567257629232f
);
1275 const gmx_simd_float_t FD3
= gmx_simd_set1_f(0.014866955030185295499f
);
1276 const gmx_simd_float_t FD2
= gmx_simd_set1_f(0.11583842382862377919f
);
1277 const gmx_simd_float_t FD1
= gmx_simd_set1_f(0.50736591960530292870f
);
1278 const gmx_simd_float_t FD0
= gmx_simd_set1_f(1.0f
);
1280 gmx_simd_float_t z4
;
1281 gmx_simd_float_t polyFN0
, polyFN1
, polyFD0
, polyFD1
;
1283 z4
= gmx_simd_mul_f(z2
, z2
);
1285 polyFD0
= gmx_simd_fmadd_f(FD4
, z4
, FD2
);
1286 polyFD1
= gmx_simd_fmadd_f(FD3
, z4
, FD1
);
1287 polyFD0
= gmx_simd_fmadd_f(polyFD0
, z4
, FD0
);
1288 polyFD0
= gmx_simd_fmadd_f(polyFD1
, z2
, polyFD0
);
1290 polyFD0
= gmx_simd_inv_f(polyFD0
);
1292 polyFN0
= gmx_simd_fmadd_f(FN6
, z4
, FN4
);
1293 polyFN1
= gmx_simd_fmadd_f(FN5
, z4
, FN3
);
1294 polyFN0
= gmx_simd_fmadd_f(polyFN0
, z4
, FN2
);
1295 polyFN1
= gmx_simd_fmadd_f(polyFN1
, z4
, FN1
);
1296 polyFN0
= gmx_simd_fmadd_f(polyFN0
, z4
, FN0
);
1297 polyFN0
= gmx_simd_fmadd_f(polyFN1
, z2
, polyFN0
);
1299 return gmx_simd_mul_f(polyFN0
, polyFD0
);
1304 /*! \brief Calculate the potential correction due to PME analytically in SIMD float.
1306 * You should normally call the real-precision routine \ref gmx_simd_pmecorrV_r.
1308 * \param z2 \f$(r \beta)^2\f$ - see below for details.
1309 * \result Correction factor to coulomb potential - see below for details.
1311 * See \ref gmx_simd_pmecorrF_f for details about the approximation.
1313 * This routine calculates \f$\mbox{erf}(z)/z\f$, although you should provide \f$z^2\f$
1314 * as the input argument.
1316 * Here's how it should be used:
1318 * 1. Calculate \f$r^2\f$.
1319 * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=\beta^2*r^2\f$.
1320 * 3. Evaluate this routine with z^2 as the argument.
1321 * 4. The return value is the expression:
1324 * \frac{\mbox{erf}(z)}{z}
1327 * 5. Multiply the entire expression by beta and switching back to \f$r\f$ (since \f$z=r \beta\f$):
1330 * \frac{\mbox{erf}(r \beta)}{r}
1333 * 6. Subtract the result from \f$1/r\f$, multiply by the product of the charges,
1334 * and you have your potential.
1336 * This approximation achieves an error slightly lower than 1e-6
1337 * in single precision and 4e-11 in double precision
1338 * for arguments smaller than 16 (\f$ 0.15 \leq \beta r \leq 4 \f$);
1339 * for \f$ \beta r \leq 0.15\f$ the error can be twice as high;
1340 * when added to \f$1/r\f$ the error will be insignificant.
1341 * For \f$\beta r \geq 7142\f$ the return value can be inf or NaN.
1343 static gmx_inline gmx_simd_float_t gmx_simdcall
1344 gmx_simd_pmecorrV_f(gmx_simd_float_t z2
)
1346 const gmx_simd_float_t VN6
= gmx_simd_set1_f(1.9296833005951166339e-8f
);
1347 const gmx_simd_float_t VN5
= gmx_simd_set1_f(-1.4213390571557850962e-6f
);
1348 const gmx_simd_float_t VN4
= gmx_simd_set1_f(0.000041603292906656984871f
);
1349 const gmx_simd_float_t VN3
= gmx_simd_set1_f(-0.00013134036773265025626f
);
1350 const gmx_simd_float_t VN2
= gmx_simd_set1_f(0.038657983986041781264f
);
1351 const gmx_simd_float_t VN1
= gmx_simd_set1_f(0.11285044772717598220f
);
1352 const gmx_simd_float_t VN0
= gmx_simd_set1_f(1.1283802385263030286f
);
1354 const gmx_simd_float_t VD3
= gmx_simd_set1_f(0.0066752224023576045451f
);
1355 const gmx_simd_float_t VD2
= gmx_simd_set1_f(0.078647795836373922256f
);
1356 const gmx_simd_float_t VD1
= gmx_simd_set1_f(0.43336185284710920150f
);
1357 const gmx_simd_float_t VD0
= gmx_simd_set1_f(1.0f
);
1359 gmx_simd_float_t z4
;
1360 gmx_simd_float_t polyVN0
, polyVN1
, polyVD0
, polyVD1
;
1362 z4
= gmx_simd_mul_f(z2
, z2
);
1364 polyVD1
= gmx_simd_fmadd_f(VD3
, z4
, VD1
);
1365 polyVD0
= gmx_simd_fmadd_f(VD2
, z4
, VD0
);
1366 polyVD0
= gmx_simd_fmadd_f(polyVD1
, z2
, polyVD0
);
1368 polyVD0
= gmx_simd_inv_f(polyVD0
);
1370 polyVN0
= gmx_simd_fmadd_f(VN6
, z4
, VN4
);
1371 polyVN1
= gmx_simd_fmadd_f(VN5
, z4
, VN3
);
1372 polyVN0
= gmx_simd_fmadd_f(polyVN0
, z4
, VN2
);
1373 polyVN1
= gmx_simd_fmadd_f(polyVN1
, z4
, VN1
);
1374 polyVN0
= gmx_simd_fmadd_f(polyVN0
, z4
, VN0
);
1375 polyVN0
= gmx_simd_fmadd_f(polyVN1
, z2
, polyVN0
);
1377 return gmx_simd_mul_f(polyVN0
, polyVD0
);
1383 #ifdef GMX_SIMD_HAVE_DOUBLE
1385 /*! \name Double precision SIMD math functions
1387 * \note In most cases you should use the real-precision functions instead.
1391 /****************************************
1392 * DOUBLE PRECISION SIMD MATH FUNCTIONS *
1393 ****************************************/
1395 /*! \brief SIMD utility function to sum a+b+c+d for SIMD doubles.
1397 * \copydetails gmx_simd_sum4_f
1399 static gmx_inline gmx_simd_double_t gmx_simdcall
1400 gmx_simd_sum4_d(gmx_simd_double_t a
, gmx_simd_double_t b
,
1401 gmx_simd_double_t c
, gmx_simd_double_t d
)
1403 return gmx_simd_add_d(gmx_simd_add_d(a
, b
), gmx_simd_add_d(c
, d
));
1406 /*! \brief Return -a if b is negative, SIMD double.
1408 * You should normally call the real-precision routine \ref gmx_simd_xor_sign_r.
1410 * \param a Values to set sign for
1411 * \param b Values used to set sign
1412 * \return if b is negative, the sign of a will be changed.
1414 * This is equivalent to doing an xor operation on a with the sign bit of b,
1415 * with the exception that negative zero is not considered to be negative
1416 * on architectures where \ref GMX_SIMD_HAVE_LOGICAL is not set.
1418 static gmx_inline gmx_simd_double_t gmx_simdcall
1419 gmx_simd_xor_sign_d(gmx_simd_double_t a
, gmx_simd_double_t b
)
1421 #ifdef GMX_SIMD_HAVE_LOGICAL
1422 return gmx_simd_xor_d(a
, gmx_simd_and_d(gmx_simd_set1_d(GMX_DOUBLE_NEGZERO
), b
));
1424 return gmx_simd_blendv_d(a
, gmx_simd_fneg_d(a
), gmx_simd_cmplt_d(b
, gmx_simd_setzero_d()));
1428 #ifndef gmx_simd_rsqrt_iter_d
1429 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD double.
1431 * \copydetails gmx_simd_rsqrt_iter_f
1433 static gmx_inline gmx_simd_double_t gmx_simdcall
1434 gmx_simd_rsqrt_iter_d(gmx_simd_double_t lu
, gmx_simd_double_t x
)
1436 #ifdef GMX_SIMD_HAVE_FMA
1437 return gmx_simd_fmadd_d(gmx_simd_fnmadd_d(x
, gmx_simd_mul_d(lu
, lu
), gmx_simd_set1_d(1.0)), gmx_simd_mul_d(lu
, gmx_simd_set1_d(0.5)), lu
);
1439 return gmx_simd_mul_d(gmx_simd_set1_d(0.5), gmx_simd_mul_d(gmx_simd_sub_d(gmx_simd_set1_d(3.0), gmx_simd_mul_d(gmx_simd_mul_d(lu
, lu
), x
)), lu
));
1444 /*! \brief Calculate 1/sqrt(x) for SIMD double
1446 * \copydetails gmx_simd_invsqrt_f
1448 static gmx_inline gmx_simd_double_t gmx_simdcall
1449 gmx_simd_invsqrt_d(gmx_simd_double_t x
)
1451 gmx_simd_double_t lu
= gmx_simd_rsqrt_d(x
);
1452 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1453 lu
= gmx_simd_rsqrt_iter_d(lu
, x
);
1455 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1456 lu
= gmx_simd_rsqrt_iter_d(lu
, x
);
1458 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1459 lu
= gmx_simd_rsqrt_iter_d(lu
, x
);
1461 #if (GMX_SIMD_RSQRT_BITS*8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1462 lu
= gmx_simd_rsqrt_iter_d(lu
, x
);
1467 /*! \brief Calculate 1/sqrt(x) for masked entries of SIMD double.
1469 * \copydetails gmx_simd_invsqrt_maskfpe_f
1471 static gmx_inline gmx_simd_double_t
1472 gmx_simd_invsqrt_maskfpe_d(gmx_simd_double_t x
, gmx_simd_dbool_t gmx_unused m
)
1475 return gmx_simd_invsqrt_d(x
);
1477 return gmx_simd_invsqrt_d(gmx_simd_blendv_d(gmx_simd_set1_d(1.0), x
, m
));
1481 /*! \brief Calculate 1/sqrt(x) for non-masked entries of SIMD double.
1483 * \copydetails gmx_simd_invsqrt_notmaskfpe_f
1485 static gmx_inline gmx_simd_double_t
1486 gmx_simd_invsqrt_notmaskfpe_d(gmx_simd_double_t x
, gmx_simd_dbool_t gmx_unused m
)
1489 return gmx_simd_invsqrt_d(x
);
1491 return gmx_simd_invsqrt_d(gmx_simd_blendv_d(x
, gmx_simd_set1_d(1.0), m
));
1495 /*! \brief Calculate 1/sqrt(x) for two SIMD doubles.
1497 * \copydetails gmx_simd_invsqrt_pair_f
1499 static gmx_inline
void gmx_simdcall
1500 gmx_simd_invsqrt_pair_d(gmx_simd_double_t x0
, gmx_simd_double_t x1
,
1501 gmx_simd_double_t
*out0
, gmx_simd_double_t
*out1
)
1503 #if (defined GMX_SIMD_HAVE_FLOAT) && (GMX_SIMD_FLOAT_WIDTH == 2*GMX_SIMD_DOUBLE_WIDTH) && (GMX_SIMD_RSQRT_BITS < 22)
1504 gmx_simd_float_t xf
= gmx_simd_cvt_dd2f(x0
, x1
);
1505 gmx_simd_float_t luf
= gmx_simd_rsqrt_f(xf
);
1506 gmx_simd_double_t lu0
, lu1
;
1507 /* Intermediate target is single - mantissa+1 bits */
1508 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
1509 luf
= gmx_simd_rsqrt_iter_f(luf
, xf
);
1511 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
1512 luf
= gmx_simd_rsqrt_iter_f(luf
, xf
);
1514 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
1515 luf
= gmx_simd_rsqrt_iter_f(luf
, xf
);
1517 gmx_simd_cvt_f2dd(luf
, &lu0
, &lu1
);
1518 /* Last iteration(s) performed in double - if we had 22 bits, this gets us to 44 (~1e-15) */
1519 #if (GMX_SIMD_ACCURACY_BITS_SINGLE < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1520 lu0
= gmx_simd_rsqrt_iter_d(lu0
, x0
);
1521 lu1
= gmx_simd_rsqrt_iter_d(lu1
, x1
);
1523 #if (GMX_SIMD_ACCURACY_BITS_SINGLE*2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1524 lu0
= gmx_simd_rsqrt_iter_d(lu0
, x0
);
1525 lu1
= gmx_simd_rsqrt_iter_d(lu1
, x1
);
1530 *out0
= gmx_simd_invsqrt_d(x0
);
1531 *out1
= gmx_simd_invsqrt_d(x1
);
1535 #ifndef gmx_simd_rcp_iter_d
1536 /*! \brief Perform one Newton-Raphson iteration to improve 1/x for SIMD double.
1538 * \copydetails gmx_simd_rcp_iter_f
1540 static gmx_inline gmx_simd_double_t gmx_simdcall
1541 gmx_simd_rcp_iter_d(gmx_simd_double_t lu
, gmx_simd_double_t x
)
1543 return gmx_simd_mul_d(lu
, gmx_simd_fnmadd_d(lu
, x
, gmx_simd_set1_d(2.0)));
1547 /*! \brief Calculate 1/x for SIMD double.
1549 * \copydetails gmx_simd_inv_f
1551 static gmx_inline gmx_simd_double_t gmx_simdcall
1552 gmx_simd_inv_d(gmx_simd_double_t x
)
1554 gmx_simd_double_t lu
= gmx_simd_rcp_d(x
);
1555 #if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1556 lu
= gmx_simd_rcp_iter_d(lu
, x
);
1558 #if (GMX_SIMD_RCP_BITS*2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1559 lu
= gmx_simd_rcp_iter_d(lu
, x
);
1561 #if (GMX_SIMD_RCP_BITS*4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1562 lu
= gmx_simd_rcp_iter_d(lu
, x
);
1564 #if (GMX_SIMD_RCP_BITS*8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1565 lu
= gmx_simd_rcp_iter_d(lu
, x
);
1570 /*! \brief Calculate 1/x for masked entries of SIMD double.
1572 * \copydetails gmx_simd_inv_maskfpe_f
1574 static gmx_inline gmx_simd_double_t
1575 gmx_simd_inv_maskfpe_d(gmx_simd_double_t x
, gmx_simd_dbool_t gmx_unused m
)
1578 return gmx_simd_inv_d(x
);
1580 return gmx_simd_inv_d(gmx_simd_blendv_d(gmx_simd_set1_d(1.0), x
, m
));
1584 /*! \brief Calculate 1/x for non-masked entries of SIMD double.
1586 * \copydetails gmx_simd_inv_notmaskfpe_f
1588 static gmx_inline gmx_simd_double_t
1589 gmx_simd_inv_notmaskfpe_d(gmx_simd_double_t x
, gmx_simd_dbool_t gmx_unused m
)
1592 return gmx_simd_inv_d(x
);
1594 return gmx_simd_inv_d(gmx_simd_blendv_d(x
, gmx_simd_set1_d(1.0), m
));
1598 /*! \brief Calculate sqrt(x) correctly for SIMD doubles, including argument 0.0.
1600 * \copydetails gmx_simd_sqrt_f
1602 static gmx_inline gmx_simd_double_t gmx_simdcall
1603 gmx_simd_sqrt_d(gmx_simd_double_t x
)
1605 gmx_simd_dbool_t mask
;
1606 gmx_simd_double_t res
;
1608 mask
= gmx_simd_cmpeq_d(x
, gmx_simd_setzero_d());
1609 res
= gmx_simd_blendnotzero_d(gmx_simd_invsqrt_notmaskfpe_d(x
, mask
), mask
);
1610 return gmx_simd_mul_d(res
, x
);
1613 /*! \brief SIMD double log(x). This is the natural logarithm.
1615 * \copydetails gmx_simd_log_f
1617 static gmx_inline gmx_simd_double_t gmx_simdcall
1618 gmx_simd_log_d(gmx_simd_double_t x
)
1620 const gmx_simd_double_t half
= gmx_simd_set1_d(0.5);
1621 const gmx_simd_double_t one
= gmx_simd_set1_d(1.0);
1622 const gmx_simd_double_t sqrt2
= gmx_simd_set1_d(sqrt(2.0));
1623 const gmx_simd_double_t corr
= gmx_simd_set1_d(0.693147180559945286226764);
1624 const gmx_simd_double_t CL15
= gmx_simd_set1_d(0.148197055177935105296783);
1625 const gmx_simd_double_t CL13
= gmx_simd_set1_d(0.153108178020442575739679);
1626 const gmx_simd_double_t CL11
= gmx_simd_set1_d(0.181837339521549679055568);
1627 const gmx_simd_double_t CL9
= gmx_simd_set1_d(0.22222194152736701733275);
1628 const gmx_simd_double_t CL7
= gmx_simd_set1_d(0.285714288030134544449368);
1629 const gmx_simd_double_t CL5
= gmx_simd_set1_d(0.399999999989941956712869);
1630 const gmx_simd_double_t CL3
= gmx_simd_set1_d(0.666666666666685503450651);
1631 const gmx_simd_double_t CL1
= gmx_simd_set1_d(2.0);
1632 gmx_simd_double_t fexp
, x2
, p
;
1633 gmx_simd_dbool_t mask
;
1635 fexp
= gmx_simd_get_exponent_d(x
);
1636 x
= gmx_simd_get_mantissa_d(x
);
1638 mask
= gmx_simd_cmplt_d(sqrt2
, x
);
1639 /* Adjust to non-IEEE format for x>sqrt(2): exponent += 1, mantissa *= 0.5 */
1640 fexp
= gmx_simd_add_d(fexp
, gmx_simd_blendzero_d(one
, mask
));
1641 x
= gmx_simd_mul_d(x
, gmx_simd_blendv_d(one
, half
, mask
));
1643 x
= gmx_simd_mul_d( gmx_simd_sub_d(x
, one
), gmx_simd_inv_d( gmx_simd_add_d(x
, one
) ) );
1644 x2
= gmx_simd_mul_d(x
, x
);
1646 p
= gmx_simd_fmadd_d(CL15
, x2
, CL13
);
1647 p
= gmx_simd_fmadd_d(p
, x2
, CL11
);
1648 p
= gmx_simd_fmadd_d(p
, x2
, CL9
);
1649 p
= gmx_simd_fmadd_d(p
, x2
, CL7
);
1650 p
= gmx_simd_fmadd_d(p
, x2
, CL5
);
1651 p
= gmx_simd_fmadd_d(p
, x2
, CL3
);
1652 p
= gmx_simd_fmadd_d(p
, x2
, CL1
);
1653 p
= gmx_simd_fmadd_d(p
, x
, gmx_simd_mul_d(corr
, fexp
));
1658 /*! \brief SIMD double 2^x.
1660 * \copydetails gmx_simd_exp2_f
1662 static gmx_inline gmx_simd_double_t gmx_simdcall
1663 gmx_simd_exp2_d(gmx_simd_double_t x
)
1665 const gmx_simd_double_t arglimit
= gmx_simd_set1_d(1022.0);
1666 const gmx_simd_double_t CE11
= gmx_simd_set1_d(4.435280790452730022081181e-10);
1667 const gmx_simd_double_t CE10
= gmx_simd_set1_d(7.074105630863314448024247e-09);
1668 const gmx_simd_double_t CE9
= gmx_simd_set1_d(1.017819803432096698472621e-07);
1669 const gmx_simd_double_t CE8
= gmx_simd_set1_d(1.321543308956718799557863e-06);
1670 const gmx_simd_double_t CE7
= gmx_simd_set1_d(0.00001525273348995851746990884);
1671 const gmx_simd_double_t CE6
= gmx_simd_set1_d(0.0001540353046251466849082632);
1672 const gmx_simd_double_t CE5
= gmx_simd_set1_d(0.001333355814678995257307880);
1673 const gmx_simd_double_t CE4
= gmx_simd_set1_d(0.009618129107588335039176502);
1674 const gmx_simd_double_t CE3
= gmx_simd_set1_d(0.05550410866481992147457793);
1675 const gmx_simd_double_t CE2
= gmx_simd_set1_d(0.2402265069591015620470894);
1676 const gmx_simd_double_t CE1
= gmx_simd_set1_d(0.6931471805599453304615075);
1677 const gmx_simd_double_t one
= gmx_simd_set1_d(1.0);
1678 gmx_simd_double_t fexppart
;
1679 gmx_simd_double_t intpart
;
1680 gmx_simd_double_t p
;
1681 gmx_simd_dbool_t valuemask
;
1683 fexppart
= gmx_simd_set_exponent_d(x
); /* rounds to nearest int internally */
1684 intpart
= gmx_simd_round_d(x
); /* use same rounding mode here */
1685 valuemask
= gmx_simd_cmple_d(gmx_simd_fabs_d(x
), arglimit
);
1686 fexppart
= gmx_simd_blendzero_d(fexppart
, valuemask
);
1687 x
= gmx_simd_sub_d(x
, intpart
);
1689 p
= gmx_simd_fmadd_d(CE11
, x
, CE10
);
1690 p
= gmx_simd_fmadd_d(p
, x
, CE9
);
1691 p
= gmx_simd_fmadd_d(p
, x
, CE8
);
1692 p
= gmx_simd_fmadd_d(p
, x
, CE7
);
1693 p
= gmx_simd_fmadd_d(p
, x
, CE6
);
1694 p
= gmx_simd_fmadd_d(p
, x
, CE5
);
1695 p
= gmx_simd_fmadd_d(p
, x
, CE4
);
1696 p
= gmx_simd_fmadd_d(p
, x
, CE3
);
1697 p
= gmx_simd_fmadd_d(p
, x
, CE2
);
1698 p
= gmx_simd_fmadd_d(p
, x
, CE1
);
1699 p
= gmx_simd_fmadd_d(p
, x
, one
);
1700 x
= gmx_simd_mul_d(p
, fexppart
);
1704 /*! \brief SIMD double exp(x).
1706 * \copydetails gmx_simd_exp_f
1708 static gmx_inline gmx_simd_double_t gmx_simdcall
1709 gmx_simd_exp_d(gmx_simd_double_t x
)
1711 const gmx_simd_double_t argscale
= gmx_simd_set1_d(1.44269504088896340735992468100);
1712 const gmx_simd_double_t arglimit
= gmx_simd_set1_d(1022.0);
1713 const gmx_simd_double_t invargscale0
= gmx_simd_set1_d(-0.69314718055966295651160180568695068359375);
1714 const gmx_simd_double_t invargscale1
= gmx_simd_set1_d(-2.8235290563031577122588448175013436025525412068e-13);
1715 const gmx_simd_double_t CE12
= gmx_simd_set1_d(2.078375306791423699350304e-09);
1716 const gmx_simd_double_t CE11
= gmx_simd_set1_d(2.518173854179933105218635e-08);
1717 const gmx_simd_double_t CE10
= gmx_simd_set1_d(2.755842049600488770111608e-07);
1718 const gmx_simd_double_t CE9
= gmx_simd_set1_d(2.755691815216689746619849e-06);
1719 const gmx_simd_double_t CE8
= gmx_simd_set1_d(2.480158383706245033920920e-05);
1720 const gmx_simd_double_t CE7
= gmx_simd_set1_d(0.0001984127043518048611841321);
1721 const gmx_simd_double_t CE6
= gmx_simd_set1_d(0.001388888889360258341755930);
1722 const gmx_simd_double_t CE5
= gmx_simd_set1_d(0.008333333332907368102819109);
1723 const gmx_simd_double_t CE4
= gmx_simd_set1_d(0.04166666666663836745814631);
1724 const gmx_simd_double_t CE3
= gmx_simd_set1_d(0.1666666666666796929434570);
1725 const gmx_simd_double_t CE2
= gmx_simd_set1_d(0.5);
1726 const gmx_simd_double_t one
= gmx_simd_set1_d(1.0);
1727 gmx_simd_double_t fexppart
;
1728 gmx_simd_double_t intpart
;
1729 gmx_simd_double_t y
, p
;
1730 gmx_simd_dbool_t valuemask
;
1732 y
= gmx_simd_mul_d(x
, argscale
);
1733 fexppart
= gmx_simd_set_exponent_d(y
); /* rounds to nearest int internally */
1734 intpart
= gmx_simd_round_d(y
); /* use same rounding mode here */
1735 valuemask
= gmx_simd_cmple_d(gmx_simd_fabs_d(y
), arglimit
);
1736 fexppart
= gmx_simd_blendzero_d(fexppart
, valuemask
);
1738 /* Extended precision arithmetics */
1739 x
= gmx_simd_fmadd_d(invargscale0
, intpart
, x
);
1740 x
= gmx_simd_fmadd_d(invargscale1
, intpart
, x
);
1742 p
= gmx_simd_fmadd_d(CE12
, x
, CE11
);
1743 p
= gmx_simd_fmadd_d(p
, x
, CE10
);
1744 p
= gmx_simd_fmadd_d(p
, x
, CE9
);
1745 p
= gmx_simd_fmadd_d(p
, x
, CE8
);
1746 p
= gmx_simd_fmadd_d(p
, x
, CE7
);
1747 p
= gmx_simd_fmadd_d(p
, x
, CE6
);
1748 p
= gmx_simd_fmadd_d(p
, x
, CE5
);
1749 p
= gmx_simd_fmadd_d(p
, x
, CE4
);
1750 p
= gmx_simd_fmadd_d(p
, x
, CE3
);
1751 p
= gmx_simd_fmadd_d(p
, x
, CE2
);
1752 p
= gmx_simd_fmadd_d(p
, gmx_simd_mul_d(x
, x
), gmx_simd_add_d(x
, one
));
1753 x
= gmx_simd_mul_d(p
, fexppart
);
1757 /*! \brief SIMD double erf(x).
1759 * \copydetails gmx_simd_erf_f
1761 static gmx_inline gmx_simd_double_t gmx_simdcall
1762 gmx_simd_erf_d(gmx_simd_double_t x
)
1764 /* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75] */
1765 const gmx_simd_double_t CAP4
= gmx_simd_set1_d(-0.431780540597889301512e-4);
1766 const gmx_simd_double_t CAP3
= gmx_simd_set1_d(-0.00578562306260059236059);
1767 const gmx_simd_double_t CAP2
= gmx_simd_set1_d(-0.028593586920219752446);
1768 const gmx_simd_double_t CAP1
= gmx_simd_set1_d(-0.315924962948621698209);
1769 const gmx_simd_double_t CAP0
= gmx_simd_set1_d(0.14952975608477029151);
1771 const gmx_simd_double_t CAQ5
= gmx_simd_set1_d(-0.374089300177174709737e-5);
1772 const gmx_simd_double_t CAQ4
= gmx_simd_set1_d(0.00015126584532155383535);
1773 const gmx_simd_double_t CAQ3
= gmx_simd_set1_d(0.00536692680669480725423);
1774 const gmx_simd_double_t CAQ2
= gmx_simd_set1_d(0.0668686825594046122636);
1775 const gmx_simd_double_t CAQ1
= gmx_simd_set1_d(0.402604990869284362773);
1777 const gmx_simd_double_t CAoffset
= gmx_simd_set1_d(0.9788494110107421875);
1779 /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5] */
1780 const gmx_simd_double_t CBP6
= gmx_simd_set1_d(2.49650423685462752497647637088e-10);
1781 const gmx_simd_double_t CBP5
= gmx_simd_set1_d(0.00119770193298159629350136085658);
1782 const gmx_simd_double_t CBP4
= gmx_simd_set1_d(0.0164944422378370965881008942733);
1783 const gmx_simd_double_t CBP3
= gmx_simd_set1_d(0.0984581468691775932063932439252);
1784 const gmx_simd_double_t CBP2
= gmx_simd_set1_d(0.317364595806937763843589437418);
1785 const gmx_simd_double_t CBP1
= gmx_simd_set1_d(0.554167062641455850932670067075);
1786 const gmx_simd_double_t CBP0
= gmx_simd_set1_d(0.427583576155807163756925301060);
1787 const gmx_simd_double_t CBQ7
= gmx_simd_set1_d(0.00212288829699830145976198384930);
1788 const gmx_simd_double_t CBQ6
= gmx_simd_set1_d(0.0334810979522685300554606393425);
1789 const gmx_simd_double_t CBQ5
= gmx_simd_set1_d(0.2361713785181450957579508850717);
1790 const gmx_simd_double_t CBQ4
= gmx_simd_set1_d(0.955364736493055670530981883072);
1791 const gmx_simd_double_t CBQ3
= gmx_simd_set1_d(2.36815675631420037315349279199);
1792 const gmx_simd_double_t CBQ2
= gmx_simd_set1_d(3.55261649184083035537184223542);
1793 const gmx_simd_double_t CBQ1
= gmx_simd_set1_d(2.93501136050160872574376997993);
1796 /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf] */
1797 const gmx_simd_double_t CCP6
= gmx_simd_set1_d(-2.8175401114513378771);
1798 const gmx_simd_double_t CCP5
= gmx_simd_set1_d(-3.22729451764143718517);
1799 const gmx_simd_double_t CCP4
= gmx_simd_set1_d(-2.5518551727311523996);
1800 const gmx_simd_double_t CCP3
= gmx_simd_set1_d(-0.687717681153649930619);
1801 const gmx_simd_double_t CCP2
= gmx_simd_set1_d(-0.212652252872804219852);
1802 const gmx_simd_double_t CCP1
= gmx_simd_set1_d(0.0175389834052493308818);
1803 const gmx_simd_double_t CCP0
= gmx_simd_set1_d(0.00628057170626964891937);
1805 const gmx_simd_double_t CCQ6
= gmx_simd_set1_d(5.48409182238641741584);
1806 const gmx_simd_double_t CCQ5
= gmx_simd_set1_d(13.5064170191802889145);
1807 const gmx_simd_double_t CCQ4
= gmx_simd_set1_d(22.9367376522880577224);
1808 const gmx_simd_double_t CCQ3
= gmx_simd_set1_d(15.930646027911794143);
1809 const gmx_simd_double_t CCQ2
= gmx_simd_set1_d(11.0567237927800161565);
1810 const gmx_simd_double_t CCQ1
= gmx_simd_set1_d(2.79257750980575282228);
1812 const gmx_simd_double_t CCoffset
= gmx_simd_set1_d(0.5579090118408203125);
1814 const gmx_simd_double_t one
= gmx_simd_set1_d(1.0);
1815 const gmx_simd_double_t two
= gmx_simd_set1_d(2.0);
1817 gmx_simd_double_t xabs
, x2
, x4
, t
, t2
, w
, w2
;
1818 gmx_simd_double_t PolyAP0
, PolyAP1
, PolyAQ0
, PolyAQ1
;
1819 gmx_simd_double_t PolyBP0
, PolyBP1
, PolyBQ0
, PolyBQ1
;
1820 gmx_simd_double_t PolyCP0
, PolyCP1
, PolyCQ0
, PolyCQ1
;
1821 gmx_simd_double_t res_erf
, res_erfcB
, res_erfcC
, res_erfc
, res
;
1822 gmx_simd_double_t expmx2
;
1823 gmx_simd_dbool_t mask
, mask_erf
;
1825 /* Calculate erf() */
1826 xabs
= gmx_simd_fabs_d(x
);
1827 mask_erf
= gmx_simd_cmplt_d(xabs
, one
);
1828 x2
= gmx_simd_mul_d(x
, x
);
1829 x4
= gmx_simd_mul_d(x2
, x2
);
1831 PolyAP0
= gmx_simd_mul_d(CAP4
, x4
);
1832 PolyAP1
= gmx_simd_mul_d(CAP3
, x4
);
1833 PolyAP0
= gmx_simd_add_d(PolyAP0
, CAP2
);
1834 PolyAP1
= gmx_simd_add_d(PolyAP1
, CAP1
);
1835 PolyAP0
= gmx_simd_mul_d(PolyAP0
, x4
);
1836 PolyAP1
= gmx_simd_mul_d(PolyAP1
, x2
);
1837 PolyAP0
= gmx_simd_add_d(PolyAP0
, CAP0
);
1838 PolyAP0
= gmx_simd_add_d(PolyAP0
, PolyAP1
);
1840 PolyAQ1
= gmx_simd_mul_d(CAQ5
, x4
);
1841 PolyAQ0
= gmx_simd_mul_d(CAQ4
, x4
);
1842 PolyAQ1
= gmx_simd_add_d(PolyAQ1
, CAQ3
);
1843 PolyAQ0
= gmx_simd_add_d(PolyAQ0
, CAQ2
);
1844 PolyAQ1
= gmx_simd_mul_d(PolyAQ1
, x4
);
1845 PolyAQ0
= gmx_simd_mul_d(PolyAQ0
, x4
);
1846 PolyAQ1
= gmx_simd_add_d(PolyAQ1
, CAQ1
);
1847 PolyAQ0
= gmx_simd_add_d(PolyAQ0
, one
);
1848 PolyAQ1
= gmx_simd_mul_d(PolyAQ1
, x2
);
1849 PolyAQ0
= gmx_simd_add_d(PolyAQ0
, PolyAQ1
);
1851 res_erf
= gmx_simd_mul_d(PolyAP0
, gmx_simd_inv_maskfpe_d(PolyAQ0
, mask_erf
));
1852 res_erf
= gmx_simd_add_d(CAoffset
, res_erf
);
1853 res_erf
= gmx_simd_mul_d(x
, res_erf
);
1855 /* Calculate erfc() in range [1,4.5] */
1856 t
= gmx_simd_sub_d(xabs
, one
);
1857 t2
= gmx_simd_mul_d(t
, t
);
1859 PolyBP0
= gmx_simd_mul_d(CBP6
, t2
);
1860 PolyBP1
= gmx_simd_mul_d(CBP5
, t2
);
1861 PolyBP0
= gmx_simd_add_d(PolyBP0
, CBP4
);
1862 PolyBP1
= gmx_simd_add_d(PolyBP1
, CBP3
);
1863 PolyBP0
= gmx_simd_mul_d(PolyBP0
, t2
);
1864 PolyBP1
= gmx_simd_mul_d(PolyBP1
, t2
);
1865 PolyBP0
= gmx_simd_add_d(PolyBP0
, CBP2
);
1866 PolyBP1
= gmx_simd_add_d(PolyBP1
, CBP1
);
1867 PolyBP0
= gmx_simd_mul_d(PolyBP0
, t2
);
1868 PolyBP1
= gmx_simd_mul_d(PolyBP1
, t
);
1869 PolyBP0
= gmx_simd_add_d(PolyBP0
, CBP0
);
1870 PolyBP0
= gmx_simd_add_d(PolyBP0
, PolyBP1
);
1872 PolyBQ1
= gmx_simd_mul_d(CBQ7
, t2
);
1873 PolyBQ0
= gmx_simd_mul_d(CBQ6
, t2
);
1874 PolyBQ1
= gmx_simd_add_d(PolyBQ1
, CBQ5
);
1875 PolyBQ0
= gmx_simd_add_d(PolyBQ0
, CBQ4
);
1876 PolyBQ1
= gmx_simd_mul_d(PolyBQ1
, t2
);
1877 PolyBQ0
= gmx_simd_mul_d(PolyBQ0
, t2
);
1878 PolyBQ1
= gmx_simd_add_d(PolyBQ1
, CBQ3
);
1879 PolyBQ0
= gmx_simd_add_d(PolyBQ0
, CBQ2
);
1880 PolyBQ1
= gmx_simd_mul_d(PolyBQ1
, t2
);
1881 PolyBQ0
= gmx_simd_mul_d(PolyBQ0
, t2
);
1882 PolyBQ1
= gmx_simd_add_d(PolyBQ1
, CBQ1
);
1883 PolyBQ0
= gmx_simd_add_d(PolyBQ0
, one
);
1884 PolyBQ1
= gmx_simd_mul_d(PolyBQ1
, t
);
1885 PolyBQ0
= gmx_simd_add_d(PolyBQ0
, PolyBQ1
);
1887 res_erfcB
= gmx_simd_mul_d(PolyBP0
, gmx_simd_inv_notmaskfpe_d(PolyBQ0
, mask_erf
));
1889 res_erfcB
= gmx_simd_mul_d(res_erfcB
, xabs
);
1891 /* Calculate erfc() in range [4.5,inf] */
1892 w
= gmx_simd_inv_notmaskfpe_d(xabs
, mask_erf
);
1893 w2
= gmx_simd_mul_d(w
, w
);
1895 PolyCP0
= gmx_simd_mul_d(CCP6
, w2
);
1896 PolyCP1
= gmx_simd_mul_d(CCP5
, w2
);
1897 PolyCP0
= gmx_simd_add_d(PolyCP0
, CCP4
);
1898 PolyCP1
= gmx_simd_add_d(PolyCP1
, CCP3
);
1899 PolyCP0
= gmx_simd_mul_d(PolyCP0
, w2
);
1900 PolyCP1
= gmx_simd_mul_d(PolyCP1
, w2
);
1901 PolyCP0
= gmx_simd_add_d(PolyCP0
, CCP2
);
1902 PolyCP1
= gmx_simd_add_d(PolyCP1
, CCP1
);
1903 PolyCP0
= gmx_simd_mul_d(PolyCP0
, w2
);
1904 PolyCP1
= gmx_simd_mul_d(PolyCP1
, w
);
1905 PolyCP0
= gmx_simd_add_d(PolyCP0
, CCP0
);
1906 PolyCP0
= gmx_simd_add_d(PolyCP0
, PolyCP1
);
1908 PolyCQ0
= gmx_simd_mul_d(CCQ6
, w2
);
1909 PolyCQ1
= gmx_simd_mul_d(CCQ5
, w2
);
1910 PolyCQ0
= gmx_simd_add_d(PolyCQ0
, CCQ4
);
1911 PolyCQ1
= gmx_simd_add_d(PolyCQ1
, CCQ3
);
1912 PolyCQ0
= gmx_simd_mul_d(PolyCQ0
, w2
);
1913 PolyCQ1
= gmx_simd_mul_d(PolyCQ1
, w2
);
1914 PolyCQ0
= gmx_simd_add_d(PolyCQ0
, CCQ2
);
1915 PolyCQ1
= gmx_simd_add_d(PolyCQ1
, CCQ1
);
1916 PolyCQ0
= gmx_simd_mul_d(PolyCQ0
, w2
);
1917 PolyCQ1
= gmx_simd_mul_d(PolyCQ1
, w
);
1918 PolyCQ0
= gmx_simd_add_d(PolyCQ0
, one
);
1919 PolyCQ0
= gmx_simd_add_d(PolyCQ0
, PolyCQ1
);
1921 expmx2
= gmx_simd_exp_d( gmx_simd_fneg_d(x2
) );
1923 res_erfcC
= gmx_simd_mul_d(PolyCP0
, gmx_simd_inv_notmaskfpe_d(PolyCQ0
, mask_erf
));
1924 res_erfcC
= gmx_simd_add_d(res_erfcC
, CCoffset
);
1925 res_erfcC
= gmx_simd_mul_d(res_erfcC
, w
);
1927 mask
= gmx_simd_cmplt_d(gmx_simd_set1_d(4.5), xabs
);
1928 res_erfc
= gmx_simd_blendv_d(res_erfcB
, res_erfcC
, mask
);
1930 res_erfc
= gmx_simd_mul_d(res_erfc
, expmx2
);
1932 /* erfc(x<0) = 2-erfc(|x|) */
1933 mask
= gmx_simd_cmplt_d(x
, gmx_simd_setzero_d());
1934 res_erfc
= gmx_simd_blendv_d(res_erfc
, gmx_simd_sub_d(two
, res_erfc
), mask
);
1936 /* Select erf() or erfc() */
1937 res
= gmx_simd_blendv_d(gmx_simd_sub_d(one
, res_erfc
), res_erf
, mask_erf
);
1942 /*! \brief SIMD double erfc(x).
1944 * \copydetails gmx_simd_erfc_f
1946 static gmx_inline gmx_simd_double_t gmx_simdcall
1947 gmx_simd_erfc_d(gmx_simd_double_t x
)
1949 /* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75] */
1950 const gmx_simd_double_t CAP4
= gmx_simd_set1_d(-0.431780540597889301512e-4);
1951 const gmx_simd_double_t CAP3
= gmx_simd_set1_d(-0.00578562306260059236059);
1952 const gmx_simd_double_t CAP2
= gmx_simd_set1_d(-0.028593586920219752446);
1953 const gmx_simd_double_t CAP1
= gmx_simd_set1_d(-0.315924962948621698209);
1954 const gmx_simd_double_t CAP0
= gmx_simd_set1_d(0.14952975608477029151);
1956 const gmx_simd_double_t CAQ5
= gmx_simd_set1_d(-0.374089300177174709737e-5);
1957 const gmx_simd_double_t CAQ4
= gmx_simd_set1_d(0.00015126584532155383535);
1958 const gmx_simd_double_t CAQ3
= gmx_simd_set1_d(0.00536692680669480725423);
1959 const gmx_simd_double_t CAQ2
= gmx_simd_set1_d(0.0668686825594046122636);
1960 const gmx_simd_double_t CAQ1
= gmx_simd_set1_d(0.402604990869284362773);
1962 const gmx_simd_double_t CAoffset
= gmx_simd_set1_d(0.9788494110107421875);
1964 /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5] */
1965 const gmx_simd_double_t CBP6
= gmx_simd_set1_d(2.49650423685462752497647637088e-10);
1966 const gmx_simd_double_t CBP5
= gmx_simd_set1_d(0.00119770193298159629350136085658);
1967 const gmx_simd_double_t CBP4
= gmx_simd_set1_d(0.0164944422378370965881008942733);
1968 const gmx_simd_double_t CBP3
= gmx_simd_set1_d(0.0984581468691775932063932439252);
1969 const gmx_simd_double_t CBP2
= gmx_simd_set1_d(0.317364595806937763843589437418);
1970 const gmx_simd_double_t CBP1
= gmx_simd_set1_d(0.554167062641455850932670067075);
1971 const gmx_simd_double_t CBP0
= gmx_simd_set1_d(0.427583576155807163756925301060);
1972 const gmx_simd_double_t CBQ7
= gmx_simd_set1_d(0.00212288829699830145976198384930);
1973 const gmx_simd_double_t CBQ6
= gmx_simd_set1_d(0.0334810979522685300554606393425);
1974 const gmx_simd_double_t CBQ5
= gmx_simd_set1_d(0.2361713785181450957579508850717);
1975 const gmx_simd_double_t CBQ4
= gmx_simd_set1_d(0.955364736493055670530981883072);
1976 const gmx_simd_double_t CBQ3
= gmx_simd_set1_d(2.36815675631420037315349279199);
1977 const gmx_simd_double_t CBQ2
= gmx_simd_set1_d(3.55261649184083035537184223542);
1978 const gmx_simd_double_t CBQ1
= gmx_simd_set1_d(2.93501136050160872574376997993);
1981 /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf] */
1982 const gmx_simd_double_t CCP6
= gmx_simd_set1_d(-2.8175401114513378771);
1983 const gmx_simd_double_t CCP5
= gmx_simd_set1_d(-3.22729451764143718517);
1984 const gmx_simd_double_t CCP4
= gmx_simd_set1_d(-2.5518551727311523996);
1985 const gmx_simd_double_t CCP3
= gmx_simd_set1_d(-0.687717681153649930619);
1986 const gmx_simd_double_t CCP2
= gmx_simd_set1_d(-0.212652252872804219852);
1987 const gmx_simd_double_t CCP1
= gmx_simd_set1_d(0.0175389834052493308818);
1988 const gmx_simd_double_t CCP0
= gmx_simd_set1_d(0.00628057170626964891937);
1990 const gmx_simd_double_t CCQ6
= gmx_simd_set1_d(5.48409182238641741584);
1991 const gmx_simd_double_t CCQ5
= gmx_simd_set1_d(13.5064170191802889145);
1992 const gmx_simd_double_t CCQ4
= gmx_simd_set1_d(22.9367376522880577224);
1993 const gmx_simd_double_t CCQ3
= gmx_simd_set1_d(15.930646027911794143);
1994 const gmx_simd_double_t CCQ2
= gmx_simd_set1_d(11.0567237927800161565);
1995 const gmx_simd_double_t CCQ1
= gmx_simd_set1_d(2.79257750980575282228);
1997 const gmx_simd_double_t CCoffset
= gmx_simd_set1_d(0.5579090118408203125);
1999 const gmx_simd_double_t one
= gmx_simd_set1_d(1.0);
2000 const gmx_simd_double_t two
= gmx_simd_set1_d(2.0);
2002 gmx_simd_double_t xabs
, x2
, x4
, t
, t2
, w
, w2
;
2003 gmx_simd_double_t PolyAP0
, PolyAP1
, PolyAQ0
, PolyAQ1
;
2004 gmx_simd_double_t PolyBP0
, PolyBP1
, PolyBQ0
, PolyBQ1
;
2005 gmx_simd_double_t PolyCP0
, PolyCP1
, PolyCQ0
, PolyCQ1
;
2006 gmx_simd_double_t res_erf
, res_erfcB
, res_erfcC
, res_erfc
, res
;
2007 gmx_simd_double_t expmx2
;
2008 gmx_simd_dbool_t mask
, mask_erf
;
2010 /* Calculate erf() */
2011 xabs
= gmx_simd_fabs_d(x
);
2012 mask_erf
= gmx_simd_cmplt_d(xabs
, one
);
2013 x2
= gmx_simd_mul_d(x
, x
);
2014 x4
= gmx_simd_mul_d(x2
, x2
);
2016 PolyAP0
= gmx_simd_mul_d(CAP4
, x4
);
2017 PolyAP1
= gmx_simd_mul_d(CAP3
, x4
);
2018 PolyAP0
= gmx_simd_add_d(PolyAP0
, CAP2
);
2019 PolyAP1
= gmx_simd_add_d(PolyAP1
, CAP1
);
2020 PolyAP0
= gmx_simd_mul_d(PolyAP0
, x4
);
2021 PolyAP1
= gmx_simd_mul_d(PolyAP1
, x2
);
2022 PolyAP0
= gmx_simd_add_d(PolyAP0
, CAP0
);
2023 PolyAP0
= gmx_simd_add_d(PolyAP0
, PolyAP1
);
2025 PolyAQ1
= gmx_simd_mul_d(CAQ5
, x4
);
2026 PolyAQ0
= gmx_simd_mul_d(CAQ4
, x4
);
2027 PolyAQ1
= gmx_simd_add_d(PolyAQ1
, CAQ3
);
2028 PolyAQ0
= gmx_simd_add_d(PolyAQ0
, CAQ2
);
2029 PolyAQ1
= gmx_simd_mul_d(PolyAQ1
, x4
);
2030 PolyAQ0
= gmx_simd_mul_d(PolyAQ0
, x4
);
2031 PolyAQ1
= gmx_simd_add_d(PolyAQ1
, CAQ1
);
2032 PolyAQ0
= gmx_simd_add_d(PolyAQ0
, one
);
2033 PolyAQ1
= gmx_simd_mul_d(PolyAQ1
, x2
);
2034 PolyAQ0
= gmx_simd_add_d(PolyAQ0
, PolyAQ1
);
2036 res_erf
= gmx_simd_mul_d(PolyAP0
, gmx_simd_inv_maskfpe_d(PolyAQ0
, mask_erf
));
2037 res_erf
= gmx_simd_add_d(CAoffset
, res_erf
);
2038 res_erf
= gmx_simd_mul_d(x
, res_erf
);
2040 /* Calculate erfc() in range [1,4.5] */
2041 t
= gmx_simd_sub_d(xabs
, one
);
2042 t2
= gmx_simd_mul_d(t
, t
);
2044 PolyBP0
= gmx_simd_mul_d(CBP6
, t2
);
2045 PolyBP1
= gmx_simd_mul_d(CBP5
, t2
);
2046 PolyBP0
= gmx_simd_add_d(PolyBP0
, CBP4
);
2047 PolyBP1
= gmx_simd_add_d(PolyBP1
, CBP3
);
2048 PolyBP0
= gmx_simd_mul_d(PolyBP0
, t2
);
2049 PolyBP1
= gmx_simd_mul_d(PolyBP1
, t2
);
2050 PolyBP0
= gmx_simd_add_d(PolyBP0
, CBP2
);
2051 PolyBP1
= gmx_simd_add_d(PolyBP1
, CBP1
);
2052 PolyBP0
= gmx_simd_mul_d(PolyBP0
, t2
);
2053 PolyBP1
= gmx_simd_mul_d(PolyBP1
, t
);
2054 PolyBP0
= gmx_simd_add_d(PolyBP0
, CBP0
);
2055 PolyBP0
= gmx_simd_add_d(PolyBP0
, PolyBP1
);
2057 PolyBQ1
= gmx_simd_mul_d(CBQ7
, t2
);
2058 PolyBQ0
= gmx_simd_mul_d(CBQ6
, t2
);
2059 PolyBQ1
= gmx_simd_add_d(PolyBQ1
, CBQ5
);
2060 PolyBQ0
= gmx_simd_add_d(PolyBQ0
, CBQ4
);
2061 PolyBQ1
= gmx_simd_mul_d(PolyBQ1
, t2
);
2062 PolyBQ0
= gmx_simd_mul_d(PolyBQ0
, t2
);
2063 PolyBQ1
= gmx_simd_add_d(PolyBQ1
, CBQ3
);
2064 PolyBQ0
= gmx_simd_add_d(PolyBQ0
, CBQ2
);
2065 PolyBQ1
= gmx_simd_mul_d(PolyBQ1
, t2
);
2066 PolyBQ0
= gmx_simd_mul_d(PolyBQ0
, t2
);
2067 PolyBQ1
= gmx_simd_add_d(PolyBQ1
, CBQ1
);
2068 PolyBQ0
= gmx_simd_add_d(PolyBQ0
, one
);
2069 PolyBQ1
= gmx_simd_mul_d(PolyBQ1
, t
);
2070 PolyBQ0
= gmx_simd_add_d(PolyBQ0
, PolyBQ1
);
2072 res_erfcB
= gmx_simd_mul_d(PolyBP0
, gmx_simd_inv_notmaskfpe_d(PolyBQ0
, mask_erf
));
2074 res_erfcB
= gmx_simd_mul_d(res_erfcB
, xabs
);
2076 /* Calculate erfc() in range [4.5,inf] */
2077 w
= gmx_simd_inv_notmaskfpe_d(xabs
, mask_erf
);
2078 w2
= gmx_simd_mul_d(w
, w
);
2080 PolyCP0
= gmx_simd_mul_d(CCP6
, w2
);
2081 PolyCP1
= gmx_simd_mul_d(CCP5
, w2
);
2082 PolyCP0
= gmx_simd_add_d(PolyCP0
, CCP4
);
2083 PolyCP1
= gmx_simd_add_d(PolyCP1
, CCP3
);
2084 PolyCP0
= gmx_simd_mul_d(PolyCP0
, w2
);
2085 PolyCP1
= gmx_simd_mul_d(PolyCP1
, w2
);
2086 PolyCP0
= gmx_simd_add_d(PolyCP0
, CCP2
);
2087 PolyCP1
= gmx_simd_add_d(PolyCP1
, CCP1
);
2088 PolyCP0
= gmx_simd_mul_d(PolyCP0
, w2
);
2089 PolyCP1
= gmx_simd_mul_d(PolyCP1
, w
);
2090 PolyCP0
= gmx_simd_add_d(PolyCP0
, CCP0
);
2091 PolyCP0
= gmx_simd_add_d(PolyCP0
, PolyCP1
);
2093 PolyCQ0
= gmx_simd_mul_d(CCQ6
, w2
);
2094 PolyCQ1
= gmx_simd_mul_d(CCQ5
, w2
);
2095 PolyCQ0
= gmx_simd_add_d(PolyCQ0
, CCQ4
);
2096 PolyCQ1
= gmx_simd_add_d(PolyCQ1
, CCQ3
);
2097 PolyCQ0
= gmx_simd_mul_d(PolyCQ0
, w2
);
2098 PolyCQ1
= gmx_simd_mul_d(PolyCQ1
, w2
);
2099 PolyCQ0
= gmx_simd_add_d(PolyCQ0
, CCQ2
);
2100 PolyCQ1
= gmx_simd_add_d(PolyCQ1
, CCQ1
);
2101 PolyCQ0
= gmx_simd_mul_d(PolyCQ0
, w2
);
2102 PolyCQ1
= gmx_simd_mul_d(PolyCQ1
, w
);
2103 PolyCQ0
= gmx_simd_add_d(PolyCQ0
, one
);
2104 PolyCQ0
= gmx_simd_add_d(PolyCQ0
, PolyCQ1
);
2106 expmx2
= gmx_simd_exp_d( gmx_simd_fneg_d(x2
) );
2108 res_erfcC
= gmx_simd_mul_d(PolyCP0
, gmx_simd_inv_notmaskfpe_d(PolyCQ0
, mask_erf
));
2109 res_erfcC
= gmx_simd_add_d(res_erfcC
, CCoffset
);
2110 res_erfcC
= gmx_simd_mul_d(res_erfcC
, w
);
2112 mask
= gmx_simd_cmplt_d(gmx_simd_set1_d(4.5), xabs
);
2113 res_erfc
= gmx_simd_blendv_d(res_erfcB
, res_erfcC
, mask
);
2115 res_erfc
= gmx_simd_mul_d(res_erfc
, expmx2
);
2117 /* erfc(x<0) = 2-erfc(|x|) */
2118 mask
= gmx_simd_cmplt_d(x
, gmx_simd_setzero_d());
2119 res_erfc
= gmx_simd_blendv_d(res_erfc
, gmx_simd_sub_d(two
, res_erfc
), mask
);
2121 /* Select erf() or erfc() */
2122 res
= gmx_simd_blendv_d(res_erfc
, gmx_simd_sub_d(one
, res_erf
), mask_erf
);
2127 /*! \brief SIMD double sin \& cos.
2129 * \copydetails gmx_simd_sincos_f
2131 static gmx_inline
void gmx_simdcall
2132 gmx_simd_sincos_d(gmx_simd_double_t x
, gmx_simd_double_t
*sinval
, gmx_simd_double_t
*cosval
)
2134 /* Constants to subtract Pi/4*x from y while minimizing precision loss */
2135 const gmx_simd_double_t argred0
= gmx_simd_set1_d(-2*0.78539816290140151978);
2136 const gmx_simd_double_t argred1
= gmx_simd_set1_d(-2*4.9604678871439933374e-10);
2137 const gmx_simd_double_t argred2
= gmx_simd_set1_d(-2*1.1258708853173288931e-18);
2138 const gmx_simd_double_t argred3
= gmx_simd_set1_d(-2*1.7607799325916000908e-27);
2139 const gmx_simd_double_t two_over_pi
= gmx_simd_set1_d(2.0/M_PI
);
2140 const gmx_simd_double_t const_sin5
= gmx_simd_set1_d( 1.58938307283228937328511e-10);
2141 const gmx_simd_double_t const_sin4
= gmx_simd_set1_d(-2.50506943502539773349318e-08);
2142 const gmx_simd_double_t const_sin3
= gmx_simd_set1_d( 2.75573131776846360512547e-06);
2143 const gmx_simd_double_t const_sin2
= gmx_simd_set1_d(-0.000198412698278911770864914);
2144 const gmx_simd_double_t const_sin1
= gmx_simd_set1_d( 0.0083333333333191845961746);
2145 const gmx_simd_double_t const_sin0
= gmx_simd_set1_d(-0.166666666666666130709393);
2147 const gmx_simd_double_t const_cos7
= gmx_simd_set1_d(-1.13615350239097429531523e-11);
2148 const gmx_simd_double_t const_cos6
= gmx_simd_set1_d( 2.08757471207040055479366e-09);
2149 const gmx_simd_double_t const_cos5
= gmx_simd_set1_d(-2.75573144028847567498567e-07);
2150 const gmx_simd_double_t const_cos4
= gmx_simd_set1_d( 2.48015872890001867311915e-05);
2151 const gmx_simd_double_t const_cos3
= gmx_simd_set1_d(-0.00138888888888714019282329);
2152 const gmx_simd_double_t const_cos2
= gmx_simd_set1_d( 0.0416666666666665519592062);
2153 const gmx_simd_double_t half
= gmx_simd_set1_d(0.5);
2154 const gmx_simd_double_t one
= gmx_simd_set1_d(1.0);
2155 gmx_simd_double_t ssign
, csign
;
2156 gmx_simd_double_t x2
, y
, z
, psin
, pcos
, sss
, ccc
;
2157 gmx_simd_dbool_t mask
;
2158 #if (defined GMX_SIMD_HAVE_DINT32) && (defined GMX_SIMD_HAVE_DINT32_ARITHMETICS) && (defined GMX_SIMD_HAVE_LOGICAL)
2159 const gmx_simd_dint32_t ione
= gmx_simd_set1_di(1);
2160 const gmx_simd_dint32_t itwo
= gmx_simd_set1_di(2);
2161 gmx_simd_dint32_t iy
;
2163 z
= gmx_simd_mul_d(x
, two_over_pi
);
2164 iy
= gmx_simd_cvt_d2i(z
);
2165 y
= gmx_simd_round_d(z
);
2167 mask
= gmx_simd_cvt_dib2db(gmx_simd_cmpeq_di(gmx_simd_and_di(iy
, ione
), gmx_simd_setzero_di()));
2168 ssign
= gmx_simd_blendzero_d(gmx_simd_set1_d(GMX_DOUBLE_NEGZERO
), gmx_simd_cvt_dib2db(gmx_simd_cmpeq_di(gmx_simd_and_di(iy
, itwo
), itwo
)));
2169 csign
= gmx_simd_blendzero_d(gmx_simd_set1_d(GMX_DOUBLE_NEGZERO
), gmx_simd_cvt_dib2db(gmx_simd_cmpeq_di(gmx_simd_and_di(gmx_simd_add_di(iy
, ione
), itwo
), itwo
)));
2171 const gmx_simd_double_t quarter
= gmx_simd_set1_d(0.25);
2172 const gmx_simd_double_t minusquarter
= gmx_simd_set1_d(-0.25);
2173 gmx_simd_double_t q
;
2174 gmx_simd_dbool_t m1
, m2
, m3
;
2176 /* The most obvious way to find the arguments quadrant in the unit circle
2177 * to calculate the sign is to use integer arithmetic, but that is not
2178 * present in all SIMD implementations. As an alternative, we have devised a
2179 * pure floating-point algorithm that uses truncation for argument reduction
2180 * so that we get a new value 0<=q<1 over the unit circle, and then
2181 * do floating-point comparisons with fractions. This is likely to be
2182 * slightly slower (~10%) due to the longer latencies of floating-point, so
2183 * we only use it when integer SIMD arithmetic is not present.
2186 x
= gmx_simd_fabs_d(x
);
2187 /* It is critical that half-way cases are rounded down */
2188 z
= gmx_simd_fmadd_d(x
, two_over_pi
, half
);
2189 y
= gmx_simd_trunc_d(z
);
2190 q
= gmx_simd_mul_d(z
, quarter
);
2191 q
= gmx_simd_sub_d(q
, gmx_simd_trunc_d(q
));
2192 /* z now starts at 0.0 for x=-pi/4 (although neg. values cannot occur), and
2193 * then increased by 1.0 as x increases by 2*Pi, when it resets to 0.0.
2194 * This removes the 2*Pi periodicity without using any integer arithmetic.
2195 * First check if y had the value 2 or 3, set csign if true.
2197 q
= gmx_simd_sub_d(q
, half
);
2198 /* If we have logical operations we can work directly on the signbit, which
2199 * saves instructions. Otherwise we need to represent signs as +1.0/-1.0.
2200 * Thus, if you are altering defines to debug alternative code paths, the
2201 * two GMX_SIMD_HAVE_LOGICAL sections in this routine must either both be
2202 * active or inactive - you will get errors if only one is used.
2204 # ifdef GMX_SIMD_HAVE_LOGICAL
2205 ssign
= gmx_simd_and_d(ssign
, gmx_simd_set1_d(GMX_DOUBLE_NEGZERO
));
2206 csign
= gmx_simd_andnot_d(q
, gmx_simd_set1_d(GMX_DOUBLE_NEGZERO
));
2207 ssign
= gmx_simd_xor_d(ssign
, csign
);
2209 csign
= gmx_simd_xor_sign_d(gmx_simd_set1_d(-1.0), q
);
2210 ssign
= gmx_simd_xor_sign_d(ssign
, csign
); /* swap ssign if csign was set. */
2212 /* Check if y had value 1 or 3 (remember we subtracted 0.5 from q) */
2213 m1
= gmx_simd_cmplt_d(q
, minusquarter
);
2214 m2
= gmx_simd_cmple_d(gmx_simd_setzero_d(), q
);
2215 m3
= gmx_simd_cmplt_d(q
, quarter
);
2216 m2
= gmx_simd_and_db(m2
, m3
);
2217 mask
= gmx_simd_or_db(m1
, m2
);
2218 /* where mask is FALSE, set sign. */
2219 csign
= gmx_simd_xor_sign_d(csign
, gmx_simd_blendv_d(gmx_simd_set1_d(-1.0), one
, mask
));
2221 x
= gmx_simd_fmadd_d(y
, argred0
, x
);
2222 x
= gmx_simd_fmadd_d(y
, argred1
, x
);
2223 x
= gmx_simd_fmadd_d(y
, argred2
, x
);
2224 x
= gmx_simd_fmadd_d(y
, argred3
, x
);
2225 x2
= gmx_simd_mul_d(x
, x
);
2227 psin
= gmx_simd_fmadd_d(const_sin5
, x2
, const_sin4
);
2228 psin
= gmx_simd_fmadd_d(psin
, x2
, const_sin3
);
2229 psin
= gmx_simd_fmadd_d(psin
, x2
, const_sin2
);
2230 psin
= gmx_simd_fmadd_d(psin
, x2
, const_sin1
);
2231 psin
= gmx_simd_fmadd_d(psin
, x2
, const_sin0
);
2232 psin
= gmx_simd_fmadd_d(psin
, gmx_simd_mul_d(x2
, x
), x
);
2234 pcos
= gmx_simd_fmadd_d(const_cos7
, x2
, const_cos6
);
2235 pcos
= gmx_simd_fmadd_d(pcos
, x2
, const_cos5
);
2236 pcos
= gmx_simd_fmadd_d(pcos
, x2
, const_cos4
);
2237 pcos
= gmx_simd_fmadd_d(pcos
, x2
, const_cos3
);
2238 pcos
= gmx_simd_fmadd_d(pcos
, x2
, const_cos2
);
2239 pcos
= gmx_simd_fmsub_d(pcos
, x2
, half
);
2240 pcos
= gmx_simd_fmadd_d(pcos
, x2
, one
);
2242 sss
= gmx_simd_blendv_d(pcos
, psin
, mask
);
2243 ccc
= gmx_simd_blendv_d(psin
, pcos
, mask
);
2244 /* See comment for GMX_SIMD_HAVE_LOGICAL section above. */
2245 #ifdef GMX_SIMD_HAVE_LOGICAL
2246 *sinval
= gmx_simd_xor_d(sss
, ssign
);
2247 *cosval
= gmx_simd_xor_d(ccc
, csign
);
2249 *sinval
= gmx_simd_xor_sign_d(sss
, ssign
);
2250 *cosval
= gmx_simd_xor_sign_d(ccc
, csign
);
2254 /*! \brief SIMD double sin(x).
2256 * \copydetails gmx_simd_sin_f
2258 static gmx_inline gmx_simd_double_t gmx_simdcall
2259 gmx_simd_sin_d(gmx_simd_double_t x
)
2261 gmx_simd_double_t s
, c
;
2262 gmx_simd_sincos_d(x
, &s
, &c
);
2266 /*! \brief SIMD double cos(x).
2268 * \copydetails gmx_simd_cos_f
2270 static gmx_inline gmx_simd_double_t gmx_simdcall
2271 gmx_simd_cos_d(gmx_simd_double_t x
)
2273 gmx_simd_double_t s
, c
;
2274 gmx_simd_sincos_d(x
, &s
, &c
);
2278 /*! \brief SIMD double tan(x).
2280 * \copydetails gmx_simd_tan_f
2282 static gmx_inline gmx_simd_double_t gmx_simdcall
2283 gmx_simd_tan_d(gmx_simd_double_t x
)
2285 const gmx_simd_double_t argred0
= gmx_simd_set1_d(-2*0.78539816290140151978);
2286 const gmx_simd_double_t argred1
= gmx_simd_set1_d(-2*4.9604678871439933374e-10);
2287 const gmx_simd_double_t argred2
= gmx_simd_set1_d(-2*1.1258708853173288931e-18);
2288 const gmx_simd_double_t argred3
= gmx_simd_set1_d(-2*1.7607799325916000908e-27);
2289 const gmx_simd_double_t two_over_pi
= gmx_simd_set1_d(2.0/M_PI
);
2290 const gmx_simd_double_t CT15
= gmx_simd_set1_d(1.01419718511083373224408e-05);
2291 const gmx_simd_double_t CT14
= gmx_simd_set1_d(-2.59519791585924697698614e-05);
2292 const gmx_simd_double_t CT13
= gmx_simd_set1_d(5.23388081915899855325186e-05);
2293 const gmx_simd_double_t CT12
= gmx_simd_set1_d(-3.05033014433946488225616e-05);
2294 const gmx_simd_double_t CT11
= gmx_simd_set1_d(7.14707504084242744267497e-05);
2295 const gmx_simd_double_t CT10
= gmx_simd_set1_d(8.09674518280159187045078e-05);
2296 const gmx_simd_double_t CT9
= gmx_simd_set1_d(0.000244884931879331847054404);
2297 const gmx_simd_double_t CT8
= gmx_simd_set1_d(0.000588505168743587154904506);
2298 const gmx_simd_double_t CT7
= gmx_simd_set1_d(0.00145612788922812427978848);
2299 const gmx_simd_double_t CT6
= gmx_simd_set1_d(0.00359208743836906619142924);
2300 const gmx_simd_double_t CT5
= gmx_simd_set1_d(0.00886323944362401618113356);
2301 const gmx_simd_double_t CT4
= gmx_simd_set1_d(0.0218694882853846389592078);
2302 const gmx_simd_double_t CT3
= gmx_simd_set1_d(0.0539682539781298417636002);
2303 const gmx_simd_double_t CT2
= gmx_simd_set1_d(0.133333333333125941821962);
2304 const gmx_simd_double_t CT1
= gmx_simd_set1_d(0.333333333333334980164153);
2306 gmx_simd_double_t x2
, p
, y
, z
;
2307 gmx_simd_dbool_t mask
;
2309 #if (defined GMX_SIMD_HAVE_DINT32) && (defined GMX_SIMD_HAVE_DINT32_ARITHMETICS) && (defined GMX_SIMD_HAVE_LOGICAL)
2310 gmx_simd_dint32_t iy
;
2311 gmx_simd_dint32_t ione
= gmx_simd_set1_di(1);
2313 z
= gmx_simd_mul_d(x
, two_over_pi
);
2314 iy
= gmx_simd_cvt_d2i(z
);
2315 y
= gmx_simd_round_d(z
);
2316 mask
= gmx_simd_cvt_dib2db(gmx_simd_cmpeq_di(gmx_simd_and_di(iy
, ione
), ione
));
2318 x
= gmx_simd_fmadd_d(y
, argred0
, x
);
2319 x
= gmx_simd_fmadd_d(y
, argred1
, x
);
2320 x
= gmx_simd_fmadd_d(y
, argred2
, x
);
2321 x
= gmx_simd_fmadd_d(y
, argred3
, x
);
2322 x
= gmx_simd_xor_d(gmx_simd_blendzero_d(gmx_simd_set1_d(GMX_DOUBLE_NEGZERO
), mask
), x
);
2324 const gmx_simd_double_t quarter
= gmx_simd_set1_d(0.25);
2325 const gmx_simd_double_t half
= gmx_simd_set1_d(0.5);
2326 const gmx_simd_double_t threequarter
= gmx_simd_set1_d(0.75);
2327 gmx_simd_double_t w
, q
;
2328 gmx_simd_dbool_t m1
, m2
, m3
;
2330 w
= gmx_simd_fabs_d(x
);
2331 z
= gmx_simd_fmadd_d(w
, two_over_pi
, half
);
2332 y
= gmx_simd_trunc_d(z
);
2333 q
= gmx_simd_mul_d(z
, quarter
);
2334 q
= gmx_simd_sub_d(q
, gmx_simd_trunc_d(q
));
2335 m1
= gmx_simd_cmple_d(quarter
, q
);
2336 m2
= gmx_simd_cmplt_d(q
, half
);
2337 m3
= gmx_simd_cmple_d(threequarter
, q
);
2338 m1
= gmx_simd_and_db(m1
, m2
);
2339 mask
= gmx_simd_or_db(m1
, m3
);
2340 w
= gmx_simd_fmadd_d(y
, argred0
, w
);
2341 w
= gmx_simd_fmadd_d(y
, argred1
, w
);
2342 w
= gmx_simd_fmadd_d(y
, argred2
, w
);
2343 w
= gmx_simd_fmadd_d(y
, argred3
, w
);
2345 w
= gmx_simd_blendv_d(w
, gmx_simd_fneg_d(w
), mask
);
2346 x
= gmx_simd_xor_sign_d(w
, x
);
2348 x2
= gmx_simd_mul_d(x
, x
);
2349 p
= gmx_simd_fmadd_d(CT15
, x2
, CT14
);
2350 p
= gmx_simd_fmadd_d(p
, x2
, CT13
);
2351 p
= gmx_simd_fmadd_d(p
, x2
, CT12
);
2352 p
= gmx_simd_fmadd_d(p
, x2
, CT11
);
2353 p
= gmx_simd_fmadd_d(p
, x2
, CT10
);
2354 p
= gmx_simd_fmadd_d(p
, x2
, CT9
);
2355 p
= gmx_simd_fmadd_d(p
, x2
, CT8
);
2356 p
= gmx_simd_fmadd_d(p
, x2
, CT7
);
2357 p
= gmx_simd_fmadd_d(p
, x2
, CT6
);
2358 p
= gmx_simd_fmadd_d(p
, x2
, CT5
);
2359 p
= gmx_simd_fmadd_d(p
, x2
, CT4
);
2360 p
= gmx_simd_fmadd_d(p
, x2
, CT3
);
2361 p
= gmx_simd_fmadd_d(p
, x2
, CT2
);
2362 p
= gmx_simd_fmadd_d(p
, x2
, CT1
);
2363 p
= gmx_simd_fmadd_d(x2
, gmx_simd_mul_d(p
, x
), x
);
2365 p
= gmx_simd_blendv_d( p
, gmx_simd_inv_maskfpe_d(p
, mask
), mask
);
2369 /*! \brief SIMD double asin(x).
2371 * \copydetails gmx_simd_asin_f
2373 static gmx_inline gmx_simd_double_t gmx_simdcall
2374 gmx_simd_asin_d(gmx_simd_double_t x
)
2376 /* Same algorithm as cephes library */
2377 const gmx_simd_double_t limit1
= gmx_simd_set1_d(0.625);
2378 const gmx_simd_double_t limit2
= gmx_simd_set1_d(1e-8);
2379 const gmx_simd_double_t one
= gmx_simd_set1_d(1.0);
2380 const gmx_simd_double_t quarterpi
= gmx_simd_set1_d(M_PI
/4.0);
2381 const gmx_simd_double_t morebits
= gmx_simd_set1_d(6.123233995736765886130e-17);
2383 const gmx_simd_double_t P5
= gmx_simd_set1_d(4.253011369004428248960e-3);
2384 const gmx_simd_double_t P4
= gmx_simd_set1_d(-6.019598008014123785661e-1);
2385 const gmx_simd_double_t P3
= gmx_simd_set1_d(5.444622390564711410273e0
);
2386 const gmx_simd_double_t P2
= gmx_simd_set1_d(-1.626247967210700244449e1
);
2387 const gmx_simd_double_t P1
= gmx_simd_set1_d(1.956261983317594739197e1
);
2388 const gmx_simd_double_t P0
= gmx_simd_set1_d(-8.198089802484824371615e0
);
2390 const gmx_simd_double_t Q4
= gmx_simd_set1_d(-1.474091372988853791896e1
);
2391 const gmx_simd_double_t Q3
= gmx_simd_set1_d(7.049610280856842141659e1
);
2392 const gmx_simd_double_t Q2
= gmx_simd_set1_d(-1.471791292232726029859e2
);
2393 const gmx_simd_double_t Q1
= gmx_simd_set1_d(1.395105614657485689735e2
);
2394 const gmx_simd_double_t Q0
= gmx_simd_set1_d(-4.918853881490881290097e1
);
2396 const gmx_simd_double_t R4
= gmx_simd_set1_d(2.967721961301243206100e-3);
2397 const gmx_simd_double_t R3
= gmx_simd_set1_d(-5.634242780008963776856e-1);
2398 const gmx_simd_double_t R2
= gmx_simd_set1_d(6.968710824104713396794e0
);
2399 const gmx_simd_double_t R1
= gmx_simd_set1_d(-2.556901049652824852289e1
);
2400 const gmx_simd_double_t R0
= gmx_simd_set1_d(2.853665548261061424989e1
);
2402 const gmx_simd_double_t S3
= gmx_simd_set1_d(-2.194779531642920639778e1
);
2403 const gmx_simd_double_t S2
= gmx_simd_set1_d(1.470656354026814941758e2
);
2404 const gmx_simd_double_t S1
= gmx_simd_set1_d(-3.838770957603691357202e2
);
2405 const gmx_simd_double_t S0
= gmx_simd_set1_d(3.424398657913078477438e2
);
2407 gmx_simd_double_t xabs
;
2408 gmx_simd_double_t zz
, ww
, z
, q
, w
, zz2
, ww2
;
2409 gmx_simd_double_t PA
, PB
;
2410 gmx_simd_double_t QA
, QB
;
2411 gmx_simd_double_t RA
, RB
;
2412 gmx_simd_double_t SA
, SB
;
2413 gmx_simd_double_t nom
, denom
;
2414 gmx_simd_dbool_t mask
, mask2
;
2416 xabs
= gmx_simd_fabs_d(x
);
2418 mask
= gmx_simd_cmplt_d(limit1
, xabs
);
2420 zz
= gmx_simd_sub_d(one
, xabs
);
2421 ww
= gmx_simd_mul_d(xabs
, xabs
);
2422 zz2
= gmx_simd_mul_d(zz
, zz
);
2423 ww2
= gmx_simd_mul_d(ww
, ww
);
2426 RA
= gmx_simd_mul_d(R4
, zz2
);
2427 RB
= gmx_simd_mul_d(R3
, zz2
);
2428 RA
= gmx_simd_add_d(RA
, R2
);
2429 RB
= gmx_simd_add_d(RB
, R1
);
2430 RA
= gmx_simd_mul_d(RA
, zz2
);
2431 RB
= gmx_simd_mul_d(RB
, zz
);
2432 RA
= gmx_simd_add_d(RA
, R0
);
2433 RA
= gmx_simd_add_d(RA
, RB
);
2436 SB
= gmx_simd_mul_d(S3
, zz2
);
2437 SA
= gmx_simd_add_d(zz2
, S2
);
2438 SB
= gmx_simd_add_d(SB
, S1
);
2439 SA
= gmx_simd_mul_d(SA
, zz2
);
2440 SB
= gmx_simd_mul_d(SB
, zz
);
2441 SA
= gmx_simd_add_d(SA
, S0
);
2442 SA
= gmx_simd_add_d(SA
, SB
);
2445 PA
= gmx_simd_mul_d(P5
, ww2
);
2446 PB
= gmx_simd_mul_d(P4
, ww2
);
2447 PA
= gmx_simd_add_d(PA
, P3
);
2448 PB
= gmx_simd_add_d(PB
, P2
);
2449 PA
= gmx_simd_mul_d(PA
, ww2
);
2450 PB
= gmx_simd_mul_d(PB
, ww2
);
2451 PA
= gmx_simd_add_d(PA
, P1
);
2452 PB
= gmx_simd_add_d(PB
, P0
);
2453 PA
= gmx_simd_mul_d(PA
, ww
);
2454 PA
= gmx_simd_add_d(PA
, PB
);
2457 QB
= gmx_simd_mul_d(Q4
, ww2
);
2458 QA
= gmx_simd_add_d(ww2
, Q3
);
2459 QB
= gmx_simd_add_d(QB
, Q2
);
2460 QA
= gmx_simd_mul_d(QA
, ww2
);
2461 QB
= gmx_simd_mul_d(QB
, ww2
);
2462 QA
= gmx_simd_add_d(QA
, Q1
);
2463 QB
= gmx_simd_add_d(QB
, Q0
);
2464 QA
= gmx_simd_mul_d(QA
, ww
);
2465 QA
= gmx_simd_add_d(QA
, QB
);
2467 RA
= gmx_simd_mul_d(RA
, zz
);
2468 PA
= gmx_simd_mul_d(PA
, ww
);
2470 nom
= gmx_simd_blendv_d( PA
, RA
, mask
);
2471 denom
= gmx_simd_blendv_d( QA
, SA
, mask
);
2473 mask2
= gmx_simd_cmplt_d(limit2
, xabs
);
2474 q
= gmx_simd_mul_d( nom
, gmx_simd_inv_maskfpe_d(denom
, mask2
) );
2476 zz
= gmx_simd_add_d(zz
, zz
);
2477 zz
= gmx_simd_sqrt_d(zz
);
2478 z
= gmx_simd_sub_d(quarterpi
, zz
);
2479 zz
= gmx_simd_mul_d(zz
, q
);
2480 zz
= gmx_simd_sub_d(zz
, morebits
);
2481 z
= gmx_simd_sub_d(z
, zz
);
2482 z
= gmx_simd_add_d(z
, quarterpi
);
2484 w
= gmx_simd_mul_d(xabs
, q
);
2485 w
= gmx_simd_add_d(w
, xabs
);
2487 z
= gmx_simd_blendv_d( w
, z
, mask
);
2489 z
= gmx_simd_blendv_d( xabs
, z
, mask2
);
2491 z
= gmx_simd_xor_sign_d(z
, x
);
2496 /*! \brief SIMD double acos(x).
2498 * \copydetails gmx_simd_acos_f
2500 static gmx_inline gmx_simd_double_t gmx_simdcall
2501 gmx_simd_acos_d(gmx_simd_double_t x
)
2503 const gmx_simd_double_t one
= gmx_simd_set1_d(1.0);
2504 const gmx_simd_double_t half
= gmx_simd_set1_d(0.5);
2505 const gmx_simd_double_t quarterpi0
= gmx_simd_set1_d(7.85398163397448309616e-1);
2506 const gmx_simd_double_t quarterpi1
= gmx_simd_set1_d(6.123233995736765886130e-17);
2508 gmx_simd_dbool_t mask1
;
2509 gmx_simd_double_t z
, z1
, z2
;
2511 mask1
= gmx_simd_cmplt_d(half
, x
);
2512 z1
= gmx_simd_mul_d(half
, gmx_simd_sub_d(one
, x
));
2513 z1
= gmx_simd_sqrt_d(z1
);
2514 z
= gmx_simd_blendv_d( x
, z1
, mask1
);
2516 z
= gmx_simd_asin_d(z
);
2518 z1
= gmx_simd_add_d(z
, z
);
2520 z2
= gmx_simd_sub_d(quarterpi0
, z
);
2521 z2
= gmx_simd_add_d(z2
, quarterpi1
);
2522 z2
= gmx_simd_add_d(z2
, quarterpi0
);
2524 z
= gmx_simd_blendv_d(z2
, z1
, mask1
);
2529 /*! \brief SIMD double atan(x).
2531 * \copydetails gmx_simd_atan_f
2533 static gmx_inline gmx_simd_double_t gmx_simdcall
2534 gmx_simd_atan_d(gmx_simd_double_t x
)
2536 /* Same algorithm as cephes library */
2537 const gmx_simd_double_t limit1
= gmx_simd_set1_d(0.66);
2538 const gmx_simd_double_t limit2
= gmx_simd_set1_d(2.41421356237309504880);
2539 const gmx_simd_double_t quarterpi
= gmx_simd_set1_d(M_PI
/4.0);
2540 const gmx_simd_double_t halfpi
= gmx_simd_set1_d(M_PI
/2.0);
2541 const gmx_simd_double_t mone
= gmx_simd_set1_d(-1.0);
2542 const gmx_simd_double_t morebits1
= gmx_simd_set1_d(0.5*6.123233995736765886130E-17);
2543 const gmx_simd_double_t morebits2
= gmx_simd_set1_d(6.123233995736765886130E-17);
2545 const gmx_simd_double_t P4
= gmx_simd_set1_d(-8.750608600031904122785E-1);
2546 const gmx_simd_double_t P3
= gmx_simd_set1_d(-1.615753718733365076637E1
);
2547 const gmx_simd_double_t P2
= gmx_simd_set1_d(-7.500855792314704667340E1
);
2548 const gmx_simd_double_t P1
= gmx_simd_set1_d(-1.228866684490136173410E2
);
2549 const gmx_simd_double_t P0
= gmx_simd_set1_d(-6.485021904942025371773E1
);
2551 const gmx_simd_double_t Q4
= gmx_simd_set1_d(2.485846490142306297962E1
);
2552 const gmx_simd_double_t Q3
= gmx_simd_set1_d(1.650270098316988542046E2
);
2553 const gmx_simd_double_t Q2
= gmx_simd_set1_d(4.328810604912902668951E2
);
2554 const gmx_simd_double_t Q1
= gmx_simd_set1_d(4.853903996359136964868E2
);
2555 const gmx_simd_double_t Q0
= gmx_simd_set1_d(1.945506571482613964425E2
);
2557 gmx_simd_double_t y
, xabs
, t1
, t2
;
2558 gmx_simd_double_t z
, z2
;
2559 gmx_simd_double_t P_A
, P_B
, Q_A
, Q_B
;
2560 gmx_simd_dbool_t mask1
, mask2
;
2562 xabs
= gmx_simd_fabs_d(x
);
2564 mask1
= gmx_simd_cmplt_d(limit1
, xabs
);
2565 mask2
= gmx_simd_cmplt_d(limit2
, xabs
);
2567 t1
= gmx_simd_mul_d(gmx_simd_add_d(xabs
, mone
),
2568 gmx_simd_inv_maskfpe_d(gmx_simd_sub_d(xabs
, mone
), mask1
));
2569 t2
= gmx_simd_mul_d(mone
, gmx_simd_inv_maskfpe_d(xabs
, mask2
));
2571 y
= gmx_simd_blendzero_d(quarterpi
, mask1
);
2572 y
= gmx_simd_blendv_d(y
, halfpi
, mask2
);
2573 xabs
= gmx_simd_blendv_d(xabs
, t1
, mask1
);
2574 xabs
= gmx_simd_blendv_d(xabs
, t2
, mask2
);
2576 z
= gmx_simd_mul_d(xabs
, xabs
);
2577 z2
= gmx_simd_mul_d(z
, z
);
2579 P_A
= gmx_simd_mul_d(P4
, z2
);
2580 P_B
= gmx_simd_mul_d(P3
, z2
);
2581 P_A
= gmx_simd_add_d(P_A
, P2
);
2582 P_B
= gmx_simd_add_d(P_B
, P1
);
2583 P_A
= gmx_simd_mul_d(P_A
, z2
);
2584 P_B
= gmx_simd_mul_d(P_B
, z
);
2585 P_A
= gmx_simd_add_d(P_A
, P0
);
2586 P_A
= gmx_simd_add_d(P_A
, P_B
);
2589 Q_B
= gmx_simd_mul_d(Q4
, z2
);
2590 Q_A
= gmx_simd_add_d(z2
, Q3
);
2591 Q_B
= gmx_simd_add_d(Q_B
, Q2
);
2592 Q_A
= gmx_simd_mul_d(Q_A
, z2
);
2593 Q_B
= gmx_simd_mul_d(Q_B
, z2
);
2594 Q_A
= gmx_simd_add_d(Q_A
, Q1
);
2595 Q_B
= gmx_simd_add_d(Q_B
, Q0
);
2596 Q_A
= gmx_simd_mul_d(Q_A
, z
);
2597 Q_A
= gmx_simd_add_d(Q_A
, Q_B
);
2599 z
= gmx_simd_mul_d(z
, P_A
);
2600 z
= gmx_simd_mul_d(z
, gmx_simd_inv_d(Q_A
));
2601 z
= gmx_simd_mul_d(z
, xabs
);
2602 z
= gmx_simd_add_d(z
, xabs
);
2604 t1
= gmx_simd_blendzero_d(morebits1
, mask1
);
2605 t1
= gmx_simd_blendv_d(t1
, morebits2
, mask2
);
2607 z
= gmx_simd_add_d(z
, t1
);
2608 y
= gmx_simd_add_d(y
, z
);
2610 y
= gmx_simd_xor_sign_d(y
, x
);
2615 /*! \brief SIMD double atan2(y,x).
2617 * \copydetails gmx_simd_atan2_f
2619 static gmx_inline gmx_simd_double_t gmx_simdcall
2620 gmx_simd_atan2_d(gmx_simd_double_t y
, gmx_simd_double_t x
)
2622 const gmx_simd_double_t pi
= gmx_simd_set1_d(M_PI
);
2623 const gmx_simd_double_t halfpi
= gmx_simd_set1_d(M_PI
/2.0);
2624 gmx_simd_double_t xinv
, p
, aoffset
;
2625 gmx_simd_dbool_t mask_x0
, mask_y0
, mask_xlt0
, mask_ylt0
;
2627 mask_x0
= gmx_simd_cmpeq_d(x
, gmx_simd_setzero_d());
2628 mask_y0
= gmx_simd_cmpeq_d(y
, gmx_simd_setzero_d());
2629 mask_xlt0
= gmx_simd_cmplt_d(x
, gmx_simd_setzero_d());
2630 mask_ylt0
= gmx_simd_cmplt_d(y
, gmx_simd_setzero_d());
2632 aoffset
= gmx_simd_blendzero_d(halfpi
, mask_x0
);
2633 aoffset
= gmx_simd_blendnotzero_d(aoffset
, mask_y0
);
2635 aoffset
= gmx_simd_blendv_d(aoffset
, pi
, mask_xlt0
);
2636 aoffset
= gmx_simd_blendv_d(aoffset
, gmx_simd_fneg_d(aoffset
), mask_ylt0
);
2638 xinv
= gmx_simd_blendnotzero_d(gmx_simd_inv_notmaskfpe_d(x
, mask_x0
), mask_x0
);
2639 p
= gmx_simd_mul_d(y
, xinv
);
2640 p
= gmx_simd_atan_d(p
);
2641 p
= gmx_simd_add_d(p
, aoffset
);
2647 /*! \brief Calculate the force correction due to PME analytically for SIMD double.
2649 * \copydetails gmx_simd_pmecorrF_f
2651 static gmx_inline gmx_simd_double_t gmx_simdcall
2652 gmx_simd_pmecorrF_d(gmx_simd_double_t z2
)
2654 const gmx_simd_double_t FN10
= gmx_simd_set1_d(-8.0072854618360083154e-14);
2655 const gmx_simd_double_t FN9
= gmx_simd_set1_d(1.1859116242260148027e-11);
2656 const gmx_simd_double_t FN8
= gmx_simd_set1_d(-8.1490406329798423616e-10);
2657 const gmx_simd_double_t FN7
= gmx_simd_set1_d(3.4404793543907847655e-8);
2658 const gmx_simd_double_t FN6
= gmx_simd_set1_d(-9.9471420832602741006e-7);
2659 const gmx_simd_double_t FN5
= gmx_simd_set1_d(0.000020740315999115847456);
2660 const gmx_simd_double_t FN4
= gmx_simd_set1_d(-0.00031991745139313364005);
2661 const gmx_simd_double_t FN3
= gmx_simd_set1_d(0.0035074449373659008203);
2662 const gmx_simd_double_t FN2
= gmx_simd_set1_d(-0.031750380176100813405);
2663 const gmx_simd_double_t FN1
= gmx_simd_set1_d(0.13884101728898463426);
2664 const gmx_simd_double_t FN0
= gmx_simd_set1_d(-0.75225277815249618847);
2666 const gmx_simd_double_t FD5
= gmx_simd_set1_d(0.000016009278224355026701);
2667 const gmx_simd_double_t FD4
= gmx_simd_set1_d(0.00051055686934806966046);
2668 const gmx_simd_double_t FD3
= gmx_simd_set1_d(0.0081803507497974289008);
2669 const gmx_simd_double_t FD2
= gmx_simd_set1_d(0.077181146026670287235);
2670 const gmx_simd_double_t FD1
= gmx_simd_set1_d(0.41543303143712535988);
2671 const gmx_simd_double_t FD0
= gmx_simd_set1_d(1.0);
2673 gmx_simd_double_t z4
;
2674 gmx_simd_double_t polyFN0
, polyFN1
, polyFD0
, polyFD1
;
2676 z4
= gmx_simd_mul_d(z2
, z2
);
2678 polyFD1
= gmx_simd_fmadd_d(FD5
, z4
, FD3
);
2679 polyFD1
= gmx_simd_fmadd_d(polyFD1
, z4
, FD1
);
2680 polyFD1
= gmx_simd_mul_d(polyFD1
, z2
);
2681 polyFD0
= gmx_simd_fmadd_d(FD4
, z4
, FD2
);
2682 polyFD0
= gmx_simd_fmadd_d(polyFD0
, z4
, FD0
);
2683 polyFD0
= gmx_simd_add_d(polyFD0
, polyFD1
);
2685 polyFD0
= gmx_simd_inv_d(polyFD0
);
2687 polyFN0
= gmx_simd_fmadd_d(FN10
, z4
, FN8
);
2688 polyFN0
= gmx_simd_fmadd_d(polyFN0
, z4
, FN6
);
2689 polyFN0
= gmx_simd_fmadd_d(polyFN0
, z4
, FN4
);
2690 polyFN0
= gmx_simd_fmadd_d(polyFN0
, z4
, FN2
);
2691 polyFN0
= gmx_simd_fmadd_d(polyFN0
, z4
, FN0
);
2692 polyFN1
= gmx_simd_fmadd_d(FN9
, z4
, FN7
);
2693 polyFN1
= gmx_simd_fmadd_d(polyFN1
, z4
, FN5
);
2694 polyFN1
= gmx_simd_fmadd_d(polyFN1
, z4
, FN3
);
2695 polyFN1
= gmx_simd_fmadd_d(polyFN1
, z4
, FN1
);
2696 polyFN0
= gmx_simd_fmadd_d(polyFN1
, z2
, polyFN0
);
2699 return gmx_simd_mul_d(polyFN0
, polyFD0
);
2704 /*! \brief Calculate the potential correction due to PME analytically for SIMD double.
2706 * \copydetails gmx_simd_pmecorrV_f
2708 static gmx_inline gmx_simd_double_t gmx_simdcall
2709 gmx_simd_pmecorrV_d(gmx_simd_double_t z2
)
2711 const gmx_simd_double_t VN9
= gmx_simd_set1_d(-9.3723776169321855475e-13);
2712 const gmx_simd_double_t VN8
= gmx_simd_set1_d(1.2280156762674215741e-10);
2713 const gmx_simd_double_t VN7
= gmx_simd_set1_d(-7.3562157912251309487e-9);
2714 const gmx_simd_double_t VN6
= gmx_simd_set1_d(2.6215886208032517509e-7);
2715 const gmx_simd_double_t VN5
= gmx_simd_set1_d(-4.9532491651265819499e-6);
2716 const gmx_simd_double_t VN4
= gmx_simd_set1_d(0.00025907400778966060389);
2717 const gmx_simd_double_t VN3
= gmx_simd_set1_d(0.0010585044856156469792);
2718 const gmx_simd_double_t VN2
= gmx_simd_set1_d(0.045247661136833092885);
2719 const gmx_simd_double_t VN1
= gmx_simd_set1_d(0.11643931522926034421);
2720 const gmx_simd_double_t VN0
= gmx_simd_set1_d(1.1283791671726767970);
2722 const gmx_simd_double_t VD5
= gmx_simd_set1_d(0.000021784709867336150342);
2723 const gmx_simd_double_t VD4
= gmx_simd_set1_d(0.00064293662010911388448);
2724 const gmx_simd_double_t VD3
= gmx_simd_set1_d(0.0096311444822588683504);
2725 const gmx_simd_double_t VD2
= gmx_simd_set1_d(0.085608012351550627051);
2726 const gmx_simd_double_t VD1
= gmx_simd_set1_d(0.43652499166614811084);
2727 const gmx_simd_double_t VD0
= gmx_simd_set1_d(1.0);
2729 gmx_simd_double_t z4
;
2730 gmx_simd_double_t polyVN0
, polyVN1
, polyVD0
, polyVD1
;
2732 z4
= gmx_simd_mul_d(z2
, z2
);
2734 polyVD1
= gmx_simd_fmadd_d(VD5
, z4
, VD3
);
2735 polyVD0
= gmx_simd_fmadd_d(VD4
, z4
, VD2
);
2736 polyVD1
= gmx_simd_fmadd_d(polyVD1
, z4
, VD1
);
2737 polyVD0
= gmx_simd_fmadd_d(polyVD0
, z4
, VD0
);
2738 polyVD0
= gmx_simd_fmadd_d(polyVD1
, z2
, polyVD0
);
2740 polyVD0
= gmx_simd_inv_d(polyVD0
);
2742 polyVN1
= gmx_simd_fmadd_d(VN9
, z4
, VN7
);
2743 polyVN0
= gmx_simd_fmadd_d(VN8
, z4
, VN6
);
2744 polyVN1
= gmx_simd_fmadd_d(polyVN1
, z4
, VN5
);
2745 polyVN0
= gmx_simd_fmadd_d(polyVN0
, z4
, VN4
);
2746 polyVN1
= gmx_simd_fmadd_d(polyVN1
, z4
, VN3
);
2747 polyVN0
= gmx_simd_fmadd_d(polyVN0
, z4
, VN2
);
2748 polyVN1
= gmx_simd_fmadd_d(polyVN1
, z4
, VN1
);
2749 polyVN0
= gmx_simd_fmadd_d(polyVN0
, z4
, VN0
);
2750 polyVN0
= gmx_simd_fmadd_d(polyVN1
, z2
, polyVN0
);
2752 return gmx_simd_mul_d(polyVN0
, polyVD0
);
2758 /*! \name SIMD math functions for double prec. data, single prec. accuracy
2760 * \note In some cases we do not need full double accuracy of individual
2761 * SIMD math functions, although the data is stored in double precision
2762 * SIMD registers. This might be the case for special algorithms, or
2763 * if the architecture does not support single precision.
2764 * Since the full double precision evaluation of math functions
2765 * typically require much more expensive polynomial approximations
2766 * these functions implement the algorithms used in the single precision
2767 * SIMD math functions, but they operate on double precision
2770 * \note You should normally not use these functions directly, but the
2771 * real-precision wrappers instead. When Gromacs is compiled in single
2772 * precision, those will be aliases to the normal single precision
2773 * SIMD math functions.
2777 /*********************************************************************
2778 * SIMD MATH FUNCTIONS WITH DOUBLE PREC. DATA, SINGLE PREC. ACCURACY *
2779 *********************************************************************/
2781 /*! \brief Calculate 1/sqrt(x) for SIMD double, but in single accuracy.
2783 * You should normally call the real-precision routine
2784 * \ref gmx_simd_invsqrt_singleaccuracy_r.
2786 * \param x Argument that must be >0. This routine does not check arguments.
2787 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
2789 static gmx_inline gmx_simd_double_t gmx_simdcall
2790 gmx_simd_invsqrt_singleaccuracy_d(gmx_simd_double_t x
)
2792 gmx_simd_double_t lu
= gmx_simd_rsqrt_d(x
);
2793 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
2794 lu
= gmx_simd_rsqrt_iter_d(lu
, x
);
2796 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
2797 lu
= gmx_simd_rsqrt_iter_d(lu
, x
);
2799 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
2800 lu
= gmx_simd_rsqrt_iter_d(lu
, x
);
2805 /*! \brief 1/sqrt(x) for masked entries of SIMD double, but in single accuracy.
2807 * \copydetails gmx_simd_invsqrt_maskfpe_f
2809 static gmx_inline gmx_simd_double_t
2810 gmx_simd_invsqrt_maskfpe_singleaccuracy_d(gmx_simd_double_t x
, gmx_simd_dbool_t gmx_unused m
)
2813 return gmx_simd_invsqrt_singleaccuracy_d(x
);
2815 return gmx_simd_invsqrt_singleaccuracy_d(gmx_simd_blendv_d(gmx_simd_set1_d(1.0), x
, m
));
2819 /*! \brief 1/sqrt(x) for non-masked entries of SIMD double, in single accuracy.
2821 * \copydetails gmx_simd_invsqrt_notmaskfpe_f
2823 static gmx_inline gmx_simd_double_t
2824 gmx_simd_invsqrt_notmaskfpe_singleaccuracy_d(gmx_simd_double_t x
, gmx_simd_dbool_t gmx_unused m
)
2827 return gmx_simd_invsqrt_singleaccuracy_d(x
);
2829 return gmx_simd_invsqrt_singleaccuracy_d(gmx_simd_blendv_d(x
, gmx_simd_set1_d(1.0), m
));
2833 /*! \brief Calculate 1/sqrt(x) for two SIMD doubles, but single accuracy.
2835 * You should normally call the real-precision routine
2836 * \ref gmx_simd_invsqrt_pair_singleaccuracy_r.
2838 * \param x0 First set of arguments, x0 must be positive - no argument checking.
2839 * \param x1 Second set of arguments, x1 must be positive - no argument checking.
2840 * \param[out] out0 Result 1/sqrt(x0)
2841 * \param[out] out1 Result 1/sqrt(x1)
2843 * In particular for double precision we can sometimes calculate square root
2844 * pairs slightly faster by using single precision until the very last step.
2846 static gmx_inline
void gmx_simdcall
2847 gmx_simd_invsqrt_pair_singleaccuracy_d(gmx_simd_double_t x0
, gmx_simd_double_t x1
,
2848 gmx_simd_double_t
*out0
, gmx_simd_double_t
*out1
)
2850 #if (defined GMX_SIMD_HAVE_FLOAT) && (GMX_SIMD_FLOAT_WIDTH == 2*GMX_SIMD_DOUBLE_WIDTH) && (GMX_SIMD_RSQRT_BITS < 22)
2851 gmx_simd_float_t xf
= gmx_simd_cvt_dd2f(x0
, x1
);
2852 gmx_simd_float_t luf
= gmx_simd_rsqrt_f(xf
);
2853 gmx_simd_double_t lu0
, lu1
;
2854 /* Intermediate target is single - mantissa+1 bits */
2855 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
2856 luf
= gmx_simd_rsqrt_iter_f(luf
, xf
);
2858 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
2859 luf
= gmx_simd_rsqrt_iter_f(luf
, xf
);
2861 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
2862 luf
= gmx_simd_rsqrt_iter_f(luf
, xf
);
2864 gmx_simd_cvt_f2dd(luf
, &lu0
, &lu1
);
2865 /* We now have single-precision accuracy values in lu0/lu1 */
2869 *out0
= gmx_simd_invsqrt_singleaccuracy_d(x0
);
2870 *out1
= gmx_simd_invsqrt_singleaccuracy_d(x1
);
2875 /*! \brief Calculate 1/x for SIMD double, but in single accuracy.
2877 * You should normally call the real-precision routine
2878 * \ref gmx_simd_inv_singleaccuracy_r.
2880 * \param x Argument that must be nonzero. This routine does not check arguments.
2881 * \return 1/x. Result is undefined if your argument was invalid.
2883 static gmx_inline gmx_simd_double_t gmx_simdcall
2884 gmx_simd_inv_singleaccuracy_d(gmx_simd_double_t x
)
2886 gmx_simd_double_t lu
= gmx_simd_rcp_d(x
);
2887 #if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
2888 lu
= gmx_simd_rcp_iter_d(lu
, x
);
2890 #if (GMX_SIMD_RCP_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
2891 lu
= gmx_simd_rcp_iter_d(lu
, x
);
2893 #if (GMX_SIMD_RCP_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
2894 lu
= gmx_simd_rcp_iter_d(lu
, x
);
2899 /*! \brief 1/x for masked entries of SIMD double, single accuracy.
2901 * \copydetails gmx_simd_inv_maskfpe_f
2903 static gmx_inline gmx_simd_double_t
2904 gmx_simd_inv_maskfpe_singleaccuracy_d(gmx_simd_double_t x
, gmx_simd_dbool_t gmx_unused m
)
2907 return gmx_simd_inv_singleaccuracy_d(x
);
2909 return gmx_simd_inv_singleaccuracy_d(gmx_simd_blendv_d(gmx_simd_set1_d(1.0), x
, m
));
2913 /*! \brief 1/x for non-masked entries of SIMD double, single accuracy.
2915 * \copydetails gmx_simd_inv_notmaskfpe_f
2917 static gmx_inline gmx_simd_double_t
2918 gmx_simd_inv_notmaskfpe_singleaccuracy_d(gmx_simd_double_t x
, gmx_simd_dbool_t gmx_unused m
)
2921 return gmx_simd_inv_singleaccuracy_d(x
);
2923 return gmx_simd_inv_singleaccuracy_d(gmx_simd_blendv_d(x
, gmx_simd_set1_d(1.0), m
));
2927 /*! \brief Calculate sqrt(x) (correct for 0.0) for SIMD double, single accuracy.
2929 * You should normally call the real-precision routine \ref gmx_simd_sqrt_r.
2931 * \param x Argument that must be >=0.
2932 * \return sqrt(x). If x=0, the result will correctly be set to 0.
2933 * The result is undefined if the input value is negative.
2935 static gmx_inline gmx_simd_double_t gmx_simdcall
2936 gmx_simd_sqrt_singleaccuracy_d(gmx_simd_double_t x
)
2938 gmx_simd_dbool_t mask
;
2939 gmx_simd_double_t res
;
2941 mask
= gmx_simd_cmpeq_d(x
, gmx_simd_setzero_d());
2942 res
= gmx_simd_blendnotzero_d(gmx_simd_invsqrt_notmaskfpe_singleaccuracy_d(x
, mask
), mask
);
2943 return gmx_simd_mul_d(res
, x
);
2946 /*! \brief SIMD log(x). Double precision SIMD data, single accuracy.
2948 * You should normally call the real-precision routine
2949 * \ref gmx_simd_log_singleaccuracy_r.
2951 * \param x Argument, should be >0.
2952 * \result The natural logarithm of x. Undefined if argument is invalid.
2954 #ifndef gmx_simd_log_singleaccuracy_d
2955 static gmx_inline gmx_simd_double_t gmx_simdcall
2956 gmx_simd_log_singleaccuracy_d(gmx_simd_double_t x
)
2958 const gmx_simd_double_t half
= gmx_simd_set1_d(0.5);
2959 const gmx_simd_double_t one
= gmx_simd_set1_d(1.0);
2960 const gmx_simd_double_t sqrt2
= gmx_simd_set1_d(sqrt(2.0));
2961 const gmx_simd_double_t corr
= gmx_simd_set1_d(0.693147180559945286226764);
2962 const gmx_simd_double_t CL9
= gmx_simd_set1_d(0.2371599674224853515625);
2963 const gmx_simd_double_t CL7
= gmx_simd_set1_d(0.285279005765914916992188);
2964 const gmx_simd_double_t CL5
= gmx_simd_set1_d(0.400005519390106201171875);
2965 const gmx_simd_double_t CL3
= gmx_simd_set1_d(0.666666567325592041015625);
2966 const gmx_simd_double_t CL1
= gmx_simd_set1_d(2.0);
2967 gmx_simd_double_t fexp
, x2
, p
;
2968 gmx_simd_dbool_t mask
;
2970 fexp
= gmx_simd_get_exponent_d(x
);
2971 x
= gmx_simd_get_mantissa_d(x
);
2973 mask
= gmx_simd_cmplt_d(sqrt2
, x
);
2974 /* Adjust to non-IEEE format for x>sqrt(2): exponent += 1, mantissa *= 0.5 */
2975 fexp
= gmx_simd_add_d(fexp
, gmx_simd_blendzero_d(one
, mask
));
2976 x
= gmx_simd_mul_d(x
, gmx_simd_blendv_d(one
, half
, mask
));
2978 x
= gmx_simd_mul_d( gmx_simd_sub_d(x
, one
), gmx_simd_inv_singleaccuracy_d( gmx_simd_add_d(x
, one
) ) );
2979 x2
= gmx_simd_mul_d(x
, x
);
2981 p
= gmx_simd_fmadd_d(CL9
, x2
, CL7
);
2982 p
= gmx_simd_fmadd_d(p
, x2
, CL5
);
2983 p
= gmx_simd_fmadd_d(p
, x2
, CL3
);
2984 p
= gmx_simd_fmadd_d(p
, x2
, CL1
);
2985 p
= gmx_simd_fmadd_d(p
, x
, gmx_simd_mul_d(corr
, fexp
));
2991 #ifndef gmx_simd_exp2_singleaccuracy_d
2992 /*! \brief SIMD 2^x. Double precision SIMD data, single accuracy.
2994 * You should normally call the real-precision routine
2995 * \ref gmx_simd_exp2_singleaccuracy_r.
2997 * \param x Argument.
2998 * \result 2^x. Undefined if input argument caused overflow.
3000 static gmx_inline gmx_simd_double_t gmx_simdcall
3001 gmx_simd_exp2_singleaccuracy_d(gmx_simd_double_t x
)
3003 /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
3004 const gmx_simd_double_t arglimit
= gmx_simd_set1_d(126.0);
3005 const gmx_simd_double_t CC6
= gmx_simd_set1_d(0.0001534581200287996416911311);
3006 const gmx_simd_double_t CC5
= gmx_simd_set1_d(0.001339993121934088894618990);
3007 const gmx_simd_double_t CC4
= gmx_simd_set1_d(0.009618488957115180159497841);
3008 const gmx_simd_double_t CC3
= gmx_simd_set1_d(0.05550328776964726865751735);
3009 const gmx_simd_double_t CC2
= gmx_simd_set1_d(0.2402264689063408646490722);
3010 const gmx_simd_double_t CC1
= gmx_simd_set1_d(0.6931472057372680777553816);
3011 const gmx_simd_double_t one
= gmx_simd_set1_d(1.0);
3013 gmx_simd_double_t fexppart
;
3014 gmx_simd_double_t intpart
;
3015 gmx_simd_double_t p
;
3016 gmx_simd_dbool_t valuemask
;
3018 fexppart
= gmx_simd_set_exponent_d(x
);
3019 intpart
= gmx_simd_round_d(x
);
3020 valuemask
= gmx_simd_cmple_d(gmx_simd_fabs_d(x
), arglimit
);
3021 fexppart
= gmx_simd_blendzero_d(fexppart
, valuemask
);
3022 x
= gmx_simd_sub_d(x
, intpart
);
3024 p
= gmx_simd_fmadd_d(CC6
, x
, CC5
);
3025 p
= gmx_simd_fmadd_d(p
, x
, CC4
);
3026 p
= gmx_simd_fmadd_d(p
, x
, CC3
);
3027 p
= gmx_simd_fmadd_d(p
, x
, CC2
);
3028 p
= gmx_simd_fmadd_d(p
, x
, CC1
);
3029 p
= gmx_simd_fmadd_d(p
, x
, one
);
3030 x
= gmx_simd_mul_d(p
, fexppart
);
3035 #ifndef gmx_simd_exp_singleaccuracy_d
3036 /*! \brief SIMD exp(x). Double precision SIMD data, single accuracy.
3038 * You should normally call the real-precision routine
3039 * \ref gmx_simd_exp_singleaccuracy_r.
3041 * \param x Argument.
3042 * \result exp(x). Undefined if input argument caused overflow.
3044 static gmx_inline gmx_simd_double_t gmx_simdcall
3045 gmx_simd_exp_singleaccuracy_d(gmx_simd_double_t x
)
3047 const gmx_simd_double_t argscale
= gmx_simd_set1_d(1.44269504088896341);
3048 /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
3049 const gmx_simd_double_t arglimit
= gmx_simd_set1_d(126.0);
3050 const gmx_simd_double_t invargscale
= gmx_simd_set1_d(0.69314718055994528623);
3051 const gmx_simd_double_t CC4
= gmx_simd_set1_d(0.00136324646882712841033936);
3052 const gmx_simd_double_t CC3
= gmx_simd_set1_d(0.00836596917361021041870117);
3053 const gmx_simd_double_t CC2
= gmx_simd_set1_d(0.0416710823774337768554688);
3054 const gmx_simd_double_t CC1
= gmx_simd_set1_d(0.166665524244308471679688);
3055 const gmx_simd_double_t CC0
= gmx_simd_set1_d(0.499999850988388061523438);
3056 const gmx_simd_double_t one
= gmx_simd_set1_d(1.0);
3057 gmx_simd_double_t fexppart
;
3058 gmx_simd_double_t intpart
;
3059 gmx_simd_double_t y
, p
;
3060 gmx_simd_dbool_t valuemask
;
3062 y
= gmx_simd_mul_d(x
, argscale
);
3063 fexppart
= gmx_simd_set_exponent_d(y
); /* rounds to nearest int internally */
3064 intpart
= gmx_simd_round_d(y
); /* use same rounding algorithm here */
3065 valuemask
= gmx_simd_cmple_d(gmx_simd_fabs_d(y
), arglimit
);
3066 fexppart
= gmx_simd_blendzero_d(fexppart
, valuemask
);
3068 /* Extended precision arithmetics not needed since
3069 * we have double precision and only need single accuracy.
3071 x
= gmx_simd_fnmadd_d(invargscale
, intpart
, x
);
3073 p
= gmx_simd_fmadd_d(CC4
, x
, CC3
);
3074 p
= gmx_simd_fmadd_d(p
, x
, CC2
);
3075 p
= gmx_simd_fmadd_d(p
, x
, CC1
);
3076 p
= gmx_simd_fmadd_d(p
, x
, CC0
);
3077 p
= gmx_simd_fmadd_d(gmx_simd_mul_d(x
, x
), p
, x
);
3078 p
= gmx_simd_add_d(p
, one
);
3079 x
= gmx_simd_mul_d(p
, fexppart
);
3084 /*! \brief SIMD erf(x). Double precision SIMD data, single accuracy.
3086 * You should normally call the real-precision routine
3087 * \ref gmx_simd_erf_singleaccuracy_r.
3089 * \param x The value to calculate erf(x) for.
3092 * This routine achieves very close to single precision, but we do not care about
3093 * the last bit or the subnormal result range.
3095 static gmx_inline gmx_simd_double_t gmx_simdcall
3096 gmx_simd_erf_singleaccuracy_d(gmx_simd_double_t x
)
3098 /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
3099 const gmx_simd_double_t CA6
= gmx_simd_set1_d(7.853861353153693e-5);
3100 const gmx_simd_double_t CA5
= gmx_simd_set1_d(-8.010193625184903e-4);
3101 const gmx_simd_double_t CA4
= gmx_simd_set1_d(5.188327685732524e-3);
3102 const gmx_simd_double_t CA3
= gmx_simd_set1_d(-2.685381193529856e-2);
3103 const gmx_simd_double_t CA2
= gmx_simd_set1_d(1.128358514861418e-1);
3104 const gmx_simd_double_t CA1
= gmx_simd_set1_d(-3.761262582423300e-1);
3105 const gmx_simd_double_t CA0
= gmx_simd_set1_d(1.128379165726710);
3106 /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
3107 const gmx_simd_double_t CB9
= gmx_simd_set1_d(-0.0018629930017603923);
3108 const gmx_simd_double_t CB8
= gmx_simd_set1_d(0.003909821287598495);
3109 const gmx_simd_double_t CB7
= gmx_simd_set1_d(-0.0052094582210355615);
3110 const gmx_simd_double_t CB6
= gmx_simd_set1_d(0.005685614362160572);
3111 const gmx_simd_double_t CB5
= gmx_simd_set1_d(-0.0025367682853477272);
3112 const gmx_simd_double_t CB4
= gmx_simd_set1_d(-0.010199799682318782);
3113 const gmx_simd_double_t CB3
= gmx_simd_set1_d(0.04369575504816542);
3114 const gmx_simd_double_t CB2
= gmx_simd_set1_d(-0.11884063474674492);
3115 const gmx_simd_double_t CB1
= gmx_simd_set1_d(0.2732120154030589);
3116 const gmx_simd_double_t CB0
= gmx_simd_set1_d(0.42758357702025784);
3117 /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
3118 const gmx_simd_double_t CC10
= gmx_simd_set1_d(-0.0445555913112064);
3119 const gmx_simd_double_t CC9
= gmx_simd_set1_d(0.21376355144663348);
3120 const gmx_simd_double_t CC8
= gmx_simd_set1_d(-0.3473187200259257);
3121 const gmx_simd_double_t CC7
= gmx_simd_set1_d(0.016690861551248114);
3122 const gmx_simd_double_t CC6
= gmx_simd_set1_d(0.7560973182491192);
3123 const gmx_simd_double_t CC5
= gmx_simd_set1_d(-1.2137903600145787);
3124 const gmx_simd_double_t CC4
= gmx_simd_set1_d(0.8411872321232948);
3125 const gmx_simd_double_t CC3
= gmx_simd_set1_d(-0.08670413896296343);
3126 const gmx_simd_double_t CC2
= gmx_simd_set1_d(-0.27124782687240334);
3127 const gmx_simd_double_t CC1
= gmx_simd_set1_d(-0.0007502488047806069);
3128 const gmx_simd_double_t CC0
= gmx_simd_set1_d(0.5642114853803148);
3129 const gmx_simd_double_t one
= gmx_simd_set1_d(1.0);
3130 const gmx_simd_double_t two
= gmx_simd_set1_d(2.0);
3132 gmx_simd_double_t x2
, x4
, y
;
3133 gmx_simd_double_t t
, t2
, w
, w2
;
3134 gmx_simd_double_t pA0
, pA1
, pB0
, pB1
, pC0
, pC1
;
3135 gmx_simd_double_t expmx2
;
3136 gmx_simd_double_t res_erf
, res_erfc
, res
;
3137 gmx_simd_dbool_t mask
, msk_erf
;
3139 /* Calculate erf() */
3140 x2
= gmx_simd_mul_d(x
, x
);
3141 x4
= gmx_simd_mul_d(x2
, x2
);
3143 pA0
= gmx_simd_fmadd_d(CA6
, x4
, CA4
);
3144 pA1
= gmx_simd_fmadd_d(CA5
, x4
, CA3
);
3145 pA0
= gmx_simd_fmadd_d(pA0
, x4
, CA2
);
3146 pA1
= gmx_simd_fmadd_d(pA1
, x4
, CA1
);
3147 pA0
= gmx_simd_mul_d(pA0
, x4
);
3148 pA0
= gmx_simd_fmadd_d(pA1
, x2
, pA0
);
3149 /* Constant term must come last for precision reasons */
3150 pA0
= gmx_simd_add_d(pA0
, CA0
);
3152 res_erf
= gmx_simd_mul_d(x
, pA0
);
3154 /* Calculate erfc */
3155 y
= gmx_simd_fabs_d(x
);
3156 msk_erf
= gmx_simd_cmplt_d(y
, gmx_simd_set1_d(0.75));
3157 t
= gmx_simd_inv_notmaskfpe_singleaccuracy_d(y
, msk_erf
);
3158 w
= gmx_simd_sub_d(t
, one
);
3159 t2
= gmx_simd_mul_d(t
, t
);
3160 w2
= gmx_simd_mul_d(w
, w
);
3162 expmx2
= gmx_simd_exp_singleaccuracy_d( gmx_simd_fneg_d( gmx_simd_mul_d(y
, y
)));
3164 pB1
= gmx_simd_fmadd_d(CB9
, w2
, CB7
);
3165 pB0
= gmx_simd_fmadd_d(CB8
, w2
, CB6
);
3166 pB1
= gmx_simd_fmadd_d(pB1
, w2
, CB5
);
3167 pB0
= gmx_simd_fmadd_d(pB0
, w2
, CB4
);
3168 pB1
= gmx_simd_fmadd_d(pB1
, w2
, CB3
);
3169 pB0
= gmx_simd_fmadd_d(pB0
, w2
, CB2
);
3170 pB1
= gmx_simd_fmadd_d(pB1
, w2
, CB1
);
3171 pB0
= gmx_simd_fmadd_d(pB0
, w2
, CB0
);
3172 pB0
= gmx_simd_fmadd_d(pB1
, w
, pB0
);
3174 pC0
= gmx_simd_fmadd_d(CC10
, t2
, CC8
);
3175 pC1
= gmx_simd_fmadd_d(CC9
, t2
, CC7
);
3176 pC0
= gmx_simd_fmadd_d(pC0
, t2
, CC6
);
3177 pC1
= gmx_simd_fmadd_d(pC1
, t2
, CC5
);
3178 pC0
= gmx_simd_fmadd_d(pC0
, t2
, CC4
);
3179 pC1
= gmx_simd_fmadd_d(pC1
, t2
, CC3
);
3180 pC0
= gmx_simd_fmadd_d(pC0
, t2
, CC2
);
3181 pC1
= gmx_simd_fmadd_d(pC1
, t2
, CC1
);
3183 pC0
= gmx_simd_fmadd_d(pC0
, t2
, CC0
);
3184 pC0
= gmx_simd_fmadd_d(pC1
, t
, pC0
);
3185 pC0
= gmx_simd_mul_d(pC0
, t
);
3187 /* SELECT pB0 or pC0 for erfc() */
3188 mask
= gmx_simd_cmplt_d(two
, y
);
3189 res_erfc
= gmx_simd_blendv_d(pB0
, pC0
, mask
);
3190 res_erfc
= gmx_simd_mul_d(res_erfc
, expmx2
);
3192 /* erfc(x<0) = 2-erfc(|x|) */
3193 mask
= gmx_simd_cmplt_d(x
, gmx_simd_setzero_d());
3194 res_erfc
= gmx_simd_blendv_d(res_erfc
, gmx_simd_sub_d(two
, res_erfc
), mask
);
3196 /* Select erf() or erfc() */
3197 mask
= gmx_simd_cmplt_d(y
, gmx_simd_set1_d(0.75));
3198 res
= gmx_simd_blendv_d(gmx_simd_sub_d(one
, res_erfc
), res_erf
, mask
);
3203 /*! \brief SIMD erfc(x). Double precision SIMD data, single accuracy.
3205 * You should normally call the real-precision routine
3206 * \ref gmx_simd_erfc_singleaccuracy_r.
3208 * \param x The value to calculate erfc(x) for.
3211 * This routine achieves singleprecision (bar the last bit) over most of the
3212 * input range, but for large arguments where the result is getting close
3213 * to the minimum representable numbers we accept slightly larger errors
3214 * (think results that are in the ballpark of 10^-30) since that is not
3217 static gmx_inline gmx_simd_double_t gmx_simdcall
3218 gmx_simd_erfc_singleaccuracy_d(gmx_simd_double_t x
)
3220 /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
3221 const gmx_simd_double_t CA6
= gmx_simd_set1_d(7.853861353153693e-5);
3222 const gmx_simd_double_t CA5
= gmx_simd_set1_d(-8.010193625184903e-4);
3223 const gmx_simd_double_t CA4
= gmx_simd_set1_d(5.188327685732524e-3);
3224 const gmx_simd_double_t CA3
= gmx_simd_set1_d(-2.685381193529856e-2);
3225 const gmx_simd_double_t CA2
= gmx_simd_set1_d(1.128358514861418e-1);
3226 const gmx_simd_double_t CA1
= gmx_simd_set1_d(-3.761262582423300e-1);
3227 const gmx_simd_double_t CA0
= gmx_simd_set1_d(1.128379165726710);
3228 /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
3229 const gmx_simd_double_t CB9
= gmx_simd_set1_d(-0.0018629930017603923);
3230 const gmx_simd_double_t CB8
= gmx_simd_set1_d(0.003909821287598495);
3231 const gmx_simd_double_t CB7
= gmx_simd_set1_d(-0.0052094582210355615);
3232 const gmx_simd_double_t CB6
= gmx_simd_set1_d(0.005685614362160572);
3233 const gmx_simd_double_t CB5
= gmx_simd_set1_d(-0.0025367682853477272);
3234 const gmx_simd_double_t CB4
= gmx_simd_set1_d(-0.010199799682318782);
3235 const gmx_simd_double_t CB3
= gmx_simd_set1_d(0.04369575504816542);
3236 const gmx_simd_double_t CB2
= gmx_simd_set1_d(-0.11884063474674492);
3237 const gmx_simd_double_t CB1
= gmx_simd_set1_d(0.2732120154030589);
3238 const gmx_simd_double_t CB0
= gmx_simd_set1_d(0.42758357702025784);
3239 /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
3240 const gmx_simd_double_t CC10
= gmx_simd_set1_d(-0.0445555913112064);
3241 const gmx_simd_double_t CC9
= gmx_simd_set1_d(0.21376355144663348);
3242 const gmx_simd_double_t CC8
= gmx_simd_set1_d(-0.3473187200259257);
3243 const gmx_simd_double_t CC7
= gmx_simd_set1_d(0.016690861551248114);
3244 const gmx_simd_double_t CC6
= gmx_simd_set1_d(0.7560973182491192);
3245 const gmx_simd_double_t CC5
= gmx_simd_set1_d(-1.2137903600145787);
3246 const gmx_simd_double_t CC4
= gmx_simd_set1_d(0.8411872321232948);
3247 const gmx_simd_double_t CC3
= gmx_simd_set1_d(-0.08670413896296343);
3248 const gmx_simd_double_t CC2
= gmx_simd_set1_d(-0.27124782687240334);
3249 const gmx_simd_double_t CC1
= gmx_simd_set1_d(-0.0007502488047806069);
3250 const gmx_simd_double_t CC0
= gmx_simd_set1_d(0.5642114853803148);
3251 const gmx_simd_double_t one
= gmx_simd_set1_d(1.0);
3252 const gmx_simd_double_t two
= gmx_simd_set1_d(2.0);
3254 gmx_simd_double_t x2
, x4
, y
;
3255 gmx_simd_double_t t
, t2
, w
, w2
;
3256 gmx_simd_double_t pA0
, pA1
, pB0
, pB1
, pC0
, pC1
;
3257 gmx_simd_double_t expmx2
;
3258 gmx_simd_double_t res_erf
, res_erfc
, res
;
3259 gmx_simd_dbool_t mask
, msk_erf
;
3261 /* Calculate erf() */
3262 x2
= gmx_simd_mul_d(x
, x
);
3263 x4
= gmx_simd_mul_d(x2
, x2
);
3265 pA0
= gmx_simd_fmadd_d(CA6
, x4
, CA4
);
3266 pA1
= gmx_simd_fmadd_d(CA5
, x4
, CA3
);
3267 pA0
= gmx_simd_fmadd_d(pA0
, x4
, CA2
);
3268 pA1
= gmx_simd_fmadd_d(pA1
, x4
, CA1
);
3269 pA1
= gmx_simd_mul_d(pA1
, x2
);
3270 pA0
= gmx_simd_fmadd_d(pA0
, x4
, pA1
);
3271 /* Constant term must come last for precision reasons */
3272 pA0
= gmx_simd_add_d(pA0
, CA0
);
3274 res_erf
= gmx_simd_mul_d(x
, pA0
);
3276 /* Calculate erfc */
3277 y
= gmx_simd_fabs_d(x
);
3278 msk_erf
= gmx_simd_cmplt_d(y
, gmx_simd_set1_d(0.75));
3279 t
= gmx_simd_inv_notmaskfpe_singleaccuracy_d(y
, msk_erf
);
3280 w
= gmx_simd_sub_d(t
, one
);
3281 t2
= gmx_simd_mul_d(t
, t
);
3282 w2
= gmx_simd_mul_d(w
, w
);
3284 expmx2
= gmx_simd_exp_singleaccuracy_d( gmx_simd_fneg_d( gmx_simd_mul_d(y
, y
) ) );
3286 pB1
= gmx_simd_fmadd_d(CB9
, w2
, CB7
);
3287 pB0
= gmx_simd_fmadd_d(CB8
, w2
, CB6
);
3288 pB1
= gmx_simd_fmadd_d(pB1
, w2
, CB5
);
3289 pB0
= gmx_simd_fmadd_d(pB0
, w2
, CB4
);
3290 pB1
= gmx_simd_fmadd_d(pB1
, w2
, CB3
);
3291 pB0
= gmx_simd_fmadd_d(pB0
, w2
, CB2
);
3292 pB1
= gmx_simd_fmadd_d(pB1
, w2
, CB1
);
3293 pB0
= gmx_simd_fmadd_d(pB0
, w2
, CB0
);
3294 pB0
= gmx_simd_fmadd_d(pB1
, w
, pB0
);
3296 pC0
= gmx_simd_fmadd_d(CC10
, t2
, CC8
);
3297 pC1
= gmx_simd_fmadd_d(CC9
, t2
, CC7
);
3298 pC0
= gmx_simd_fmadd_d(pC0
, t2
, CC6
);
3299 pC1
= gmx_simd_fmadd_d(pC1
, t2
, CC5
);
3300 pC0
= gmx_simd_fmadd_d(pC0
, t2
, CC4
);
3301 pC1
= gmx_simd_fmadd_d(pC1
, t2
, CC3
);
3302 pC0
= gmx_simd_fmadd_d(pC0
, t2
, CC2
);
3303 pC1
= gmx_simd_fmadd_d(pC1
, t2
, CC1
);
3305 pC0
= gmx_simd_fmadd_d(pC0
, t2
, CC0
);
3306 pC0
= gmx_simd_fmadd_d(pC1
, t
, pC0
);
3307 pC0
= gmx_simd_mul_d(pC0
, t
);
3309 /* SELECT pB0 or pC0 for erfc() */
3310 mask
= gmx_simd_cmplt_d(two
, y
);
3311 res_erfc
= gmx_simd_blendv_d(pB0
, pC0
, mask
);
3312 res_erfc
= gmx_simd_mul_d(res_erfc
, expmx2
);
3314 /* erfc(x<0) = 2-erfc(|x|) */
3315 mask
= gmx_simd_cmplt_d(x
, gmx_simd_setzero_d());
3316 res_erfc
= gmx_simd_blendv_d(res_erfc
, gmx_simd_sub_d(two
, res_erfc
), mask
);
3318 /* Select erf() or erfc() */
3319 mask
= gmx_simd_cmplt_d(y
, gmx_simd_set1_d(0.75));
3320 res
= gmx_simd_blendv_d(res_erfc
, gmx_simd_sub_d(one
, res_erf
), mask
);
3325 /*! \brief SIMD sin \& cos. Double precision SIMD data, single accuracy.
3327 * You should normally call the real-precision routine
3328 * \ref gmx_simd_sincos_singleaccuracy_r.
3330 * \param x The argument to evaluate sin/cos for
3331 * \param[out] sinval Sin(x)
3332 * \param[out] cosval Cos(x)
3335 static gmx_inline
void gmx_simdcall
3336 gmx_simd_sincos_singleaccuracy_d(gmx_simd_double_t x
, gmx_simd_double_t
*sinval
, gmx_simd_double_t
*cosval
)
3338 /* Constants to subtract Pi/4*x from y while minimizing precision loss */
3339 const gmx_simd_double_t argred0
= gmx_simd_set1_d(2*0.78539816290140151978);
3340 const gmx_simd_double_t argred1
= gmx_simd_set1_d(2*4.9604678871439933374e-10);
3341 const gmx_simd_double_t argred2
= gmx_simd_set1_d(2*1.1258708853173288931e-18);
3342 const gmx_simd_double_t two_over_pi
= gmx_simd_set1_d(2.0/M_PI
);
3343 const gmx_simd_double_t const_sin2
= gmx_simd_set1_d(-1.9515295891e-4);
3344 const gmx_simd_double_t const_sin1
= gmx_simd_set1_d( 8.3321608736e-3);
3345 const gmx_simd_double_t const_sin0
= gmx_simd_set1_d(-1.6666654611e-1);
3346 const gmx_simd_double_t const_cos2
= gmx_simd_set1_d( 2.443315711809948e-5);
3347 const gmx_simd_double_t const_cos1
= gmx_simd_set1_d(-1.388731625493765e-3);
3348 const gmx_simd_double_t const_cos0
= gmx_simd_set1_d( 4.166664568298827e-2);
3350 const gmx_simd_double_t half
= gmx_simd_set1_d(0.5);
3351 const gmx_simd_double_t one
= gmx_simd_set1_d(1.0);
3352 gmx_simd_double_t ssign
, csign
;
3353 gmx_simd_double_t x2
, y
, z
, psin
, pcos
, sss
, ccc
;
3354 gmx_simd_dbool_t mask
;
3355 #if (defined GMX_SIMD_HAVE_FINT32) && (defined GMX_SIMD_HAVE_FINT32_ARITHMETICS) && (defined GMX_SIMD_HAVE_LOGICAL)
3356 const gmx_simd_dint32_t ione
= gmx_simd_set1_di(1);
3357 const gmx_simd_dint32_t itwo
= gmx_simd_set1_di(2);
3358 gmx_simd_dint32_t iy
;
3360 z
= gmx_simd_mul_d(x
, two_over_pi
);
3361 iy
= gmx_simd_cvt_d2i(z
);
3362 y
= gmx_simd_round_d(z
);
3364 mask
= gmx_simd_cvt_dib2db(gmx_simd_cmpeq_di(gmx_simd_and_di(iy
, ione
), gmx_simd_setzero_di()));
3365 ssign
= gmx_simd_blendzero_d(gmx_simd_set1_d(-0.0), gmx_simd_cvt_dib2db(gmx_simd_cmpeq_di(gmx_simd_and_di(iy
, itwo
), itwo
)));
3366 csign
= gmx_simd_blendzero_d(gmx_simd_set1_d(-0.0), gmx_simd_cvt_dib2db(gmx_simd_cmpeq_di(gmx_simd_and_di(gmx_simd_add_di(iy
, ione
), itwo
), itwo
)));
3368 const gmx_simd_double_t quarter
= gmx_simd_set1_d(0.25);
3369 const gmx_simd_double_t minusquarter
= gmx_simd_set1_d(-0.25);
3370 gmx_simd_double_t q
;
3371 gmx_simd_dbool_t m1
, m2
, m3
;
3373 /* The most obvious way to find the arguments quadrant in the unit circle
3374 * to calculate the sign is to use integer arithmetic, but that is not
3375 * present in all SIMD implementations. As an alternative, we have devised a
3376 * pure floating-point algorithm that uses truncation for argument reduction
3377 * so that we get a new value 0<=q<1 over the unit circle, and then
3378 * do floating-point comparisons with fractions. This is likely to be
3379 * slightly slower (~10%) due to the longer latencies of floating-point, so
3380 * we only use it when integer SIMD arithmetic is not present.
3383 x
= gmx_simd_fabs_d(x
);
3384 /* It is critical that half-way cases are rounded down */
3385 z
= gmx_simd_fmadd_d(x
, two_over_pi
, half
);
3386 y
= gmx_simd_trunc_d(z
);
3387 q
= gmx_simd_mul_d(z
, quarter
);
3388 q
= gmx_simd_sub_d(q
, gmx_simd_trunc_d(q
));
3389 /* z now starts at 0.0 for x=-pi/4 (although neg. values cannot occur), and
3390 * then increased by 1.0 as x increases by 2*Pi, when it resets to 0.0.
3391 * This removes the 2*Pi periodicity without using any integer arithmetic.
3392 * First check if y had the value 2 or 3, set csign if true.
3394 q
= gmx_simd_sub_d(q
, half
);
3395 /* If we have logical operations we can work directly on the signbit, which
3396 * saves instructions. Otherwise we need to represent signs as +1.0/-1.0.
3397 * Thus, if you are altering defines to debug alternative code paths, the
3398 * two GMX_SIMD_HAVE_LOGICAL sections in this routine must either both be
3399 * active or inactive - you will get errors if only one is used.
3401 # ifdef GMX_SIMD_HAVE_LOGICAL
3402 ssign
= gmx_simd_and_d(ssign
, gmx_simd_set1_d(-0.0));
3403 csign
= gmx_simd_andnot_d(q
, gmx_simd_set1_d(-0.0));
3404 ssign
= gmx_simd_xor_d(ssign
, csign
);
3406 csign
= gmx_simd_xor_sign_d(gmx_simd_set1_d(-1.0), q
);
3408 ssign
= gmx_simd_xor_sign_d(ssign
, csign
); /* swap ssign if csign was set. */
3410 /* Check if y had value 1 or 3 (remember we subtracted 0.5 from q) */
3411 m1
= gmx_simd_cmplt_d(q
, minusquarter
);
3412 m2
= gmx_simd_cmple_d(gmx_simd_setzero_d(), q
);
3413 m3
= gmx_simd_cmplt_d(q
, quarter
);
3414 m2
= gmx_simd_and_db(m2
, m3
);
3415 mask
= gmx_simd_or_db(m1
, m2
);
3416 /* where mask is FALSE, set sign. */
3417 csign
= gmx_simd_xor_sign_d(csign
, gmx_simd_blendv_d(gmx_simd_set1_d(-1.0), one
, mask
));
3419 x
= gmx_simd_fnmadd_d(y
, argred0
, x
);
3420 x
= gmx_simd_fnmadd_d(y
, argred1
, x
);
3421 x
= gmx_simd_fnmadd_d(y
, argred2
, x
);
3422 x2
= gmx_simd_mul_d(x
, x
);
3424 psin
= gmx_simd_fmadd_d(const_sin2
, x2
, const_sin1
);
3425 psin
= gmx_simd_fmadd_d(psin
, x2
, const_sin0
);
3426 psin
= gmx_simd_fmadd_d(psin
, gmx_simd_mul_d(x
, x2
), x
);
3427 pcos
= gmx_simd_fmadd_d(const_cos2
, x2
, const_cos1
);
3428 pcos
= gmx_simd_fmadd_d(pcos
, x2
, const_cos0
);
3429 pcos
= gmx_simd_fmsub_d(pcos
, x2
, half
);
3430 pcos
= gmx_simd_fmadd_d(pcos
, x2
, one
);
3432 sss
= gmx_simd_blendv_d(pcos
, psin
, mask
);
3433 ccc
= gmx_simd_blendv_d(psin
, pcos
, mask
);
3434 /* See comment for GMX_SIMD_HAVE_LOGICAL section above. */
3435 #ifdef GMX_SIMD_HAVE_LOGICAL
3436 *sinval
= gmx_simd_xor_d(sss
, ssign
);
3437 *cosval
= gmx_simd_xor_d(ccc
, csign
);
3439 *sinval
= gmx_simd_xor_sign_d(sss
, ssign
);
3440 *cosval
= gmx_simd_xor_sign_d(ccc
, csign
);
3444 /*! \brief SIMD sin(x). Double precision SIMD data, single accuracy.
3446 * You should normally call the real-precision routine
3447 * \ref gmx_simd_sin_singleaccuracy_r.
3449 * \param x The argument to evaluate sin for
3452 * \attention Do NOT call both sin & cos if you need both results, since each of them
3453 * will then call \ref gmx_simd_sincos_r and waste a factor 2 in performance.
3455 static gmx_inline gmx_simd_double_t gmx_simdcall
3456 gmx_simd_sin_singleaccuracy_d(gmx_simd_double_t x
)
3458 gmx_simd_double_t s
, c
;
3459 gmx_simd_sincos_singleaccuracy_d(x
, &s
, &c
);
3463 /*! \brief SIMD cos(x). Double precision SIMD data, single accuracy.
3465 * You should normally call the real-precision routine
3466 * \ref gmx_simd_cos_singleaccuracy_r.
3468 * \param x The argument to evaluate cos for
3471 * \attention Do NOT call both sin & cos if you need both results, since each of them
3472 * will then call \ref gmx_simd_sincos_r and waste a factor 2 in performance.
3474 static gmx_inline gmx_simd_double_t gmx_simdcall
3475 gmx_simd_cos_singleaccuracy_d(gmx_simd_double_t x
)
3477 gmx_simd_double_t s
, c
;
3478 gmx_simd_sincos_singleaccuracy_d(x
, &s
, &c
);
3482 /*! \brief SIMD tan(x). Double precision SIMD data, single accuracy.
3484 * You should normally call the real-precision routine
3485 * \ref gmx_simd_tan_singleaccuracy_r.
3487 * \param x The argument to evaluate tan for
3490 static gmx_inline gmx_simd_double_t gmx_simdcall
3491 gmx_simd_tan_singleaccuracy_d(gmx_simd_double_t x
)
3493 const gmx_simd_double_t argred0
= gmx_simd_set1_d(2*0.78539816290140151978);
3494 const gmx_simd_double_t argred1
= gmx_simd_set1_d(2*4.9604678871439933374e-10);
3495 const gmx_simd_double_t argred2
= gmx_simd_set1_d(2*1.1258708853173288931e-18);
3496 const gmx_simd_double_t two_over_pi
= gmx_simd_set1_d(2.0/M_PI
);
3497 const gmx_simd_double_t CT6
= gmx_simd_set1_d(0.009498288995810566122993911);
3498 const gmx_simd_double_t CT5
= gmx_simd_set1_d(0.002895755790837379295226923);
3499 const gmx_simd_double_t CT4
= gmx_simd_set1_d(0.02460087336161924491836265);
3500 const gmx_simd_double_t CT3
= gmx_simd_set1_d(0.05334912882656359828045988);
3501 const gmx_simd_double_t CT2
= gmx_simd_set1_d(0.1333989091464957704418495);
3502 const gmx_simd_double_t CT1
= gmx_simd_set1_d(0.3333307599244198227797507);
3504 gmx_simd_double_t x2
, p
, y
, z
;
3505 gmx_simd_dbool_t mask
;
3507 #if (defined GMX_SIMD_HAVE_FINT32) && (defined GMX_SIMD_HAVE_FINT32_ARITHMETICS) && (defined GMX_SIMD_HAVE_LOGICAL)
3508 gmx_simd_dint32_t iy
;
3509 gmx_simd_dint32_t ione
= gmx_simd_set1_di(1);
3511 z
= gmx_simd_mul_d(x
, two_over_pi
);
3512 iy
= gmx_simd_cvt_d2i(z
);
3513 y
= gmx_simd_round_d(z
);
3514 mask
= gmx_simd_cvt_dib2db(gmx_simd_cmpeq_di(gmx_simd_and_di(iy
, ione
), ione
));
3516 x
= gmx_simd_fnmadd_d(y
, argred0
, x
);
3517 x
= gmx_simd_fnmadd_d(y
, argred1
, x
);
3518 x
= gmx_simd_fnmadd_d(y
, argred2
, x
);
3519 x
= gmx_simd_xor_d(gmx_simd_blendzero_d(gmx_simd_set1_d(-0.0), mask
), x
);
3521 const gmx_simd_double_t quarter
= gmx_simd_set1_d(0.25);
3522 const gmx_simd_double_t half
= gmx_simd_set1_d(0.5);
3523 const gmx_simd_double_t threequarter
= gmx_simd_set1_d(0.75);
3524 gmx_simd_double_t w
, q
;
3525 gmx_simd_dbool_t m1
, m2
, m3
;
3527 w
= gmx_simd_fabs_d(x
);
3528 z
= gmx_simd_fmadd_d(w
, two_over_pi
, half
);
3529 y
= gmx_simd_trunc_d(z
);
3530 q
= gmx_simd_mul_d(z
, quarter
);
3531 q
= gmx_simd_sub_d(q
, gmx_simd_trunc_d(q
));
3532 m1
= gmx_simd_cmple_d(quarter
, q
);
3533 m2
= gmx_simd_cmplt_d(q
, half
);
3534 m3
= gmx_simd_cmple_d(threequarter
, q
);
3535 m1
= gmx_simd_and_db(m1
, m2
);
3536 mask
= gmx_simd_or_db(m1
, m3
);
3537 w
= gmx_simd_fnmadd_d(y
, argred0
, w
);
3538 w
= gmx_simd_fnmadd_d(y
, argred1
, w
);
3539 w
= gmx_simd_fnmadd_d(y
, argred2
, w
);
3541 w
= gmx_simd_blendv_d(w
, gmx_simd_fneg_d(w
), mask
);
3542 x
= gmx_simd_xor_sign_d(w
, x
);
3544 x2
= gmx_simd_mul_d(x
, x
);
3545 p
= gmx_simd_fmadd_d(CT6
, x2
, CT5
);
3546 p
= gmx_simd_fmadd_d(p
, x2
, CT4
);
3547 p
= gmx_simd_fmadd_d(p
, x2
, CT3
);
3548 p
= gmx_simd_fmadd_d(p
, x2
, CT2
);
3549 p
= gmx_simd_fmadd_d(p
, x2
, CT1
);
3550 p
= gmx_simd_fmadd_d(x2
, gmx_simd_mul_d(p
, x
), x
);
3552 p
= gmx_simd_blendv_d( p
, gmx_simd_inv_maskfpe_singleaccuracy_d(p
, mask
), mask
);
3556 /*! \brief SIMD asin(x). Double precision SIMD data, single accuracy.
3558 * You should normally call the real-precision routine
3559 * \ref gmx_simd_asin_singleaccuracy_r.
3561 * \param x The argument to evaluate asin for
3564 static gmx_inline gmx_simd_double_t gmx_simdcall
3565 gmx_simd_asin_singleaccuracy_d(gmx_simd_double_t x
)
3567 const gmx_simd_double_t limitlow
= gmx_simd_set1_d(1e-4);
3568 const gmx_simd_double_t half
= gmx_simd_set1_d(0.5);
3569 const gmx_simd_double_t one
= gmx_simd_set1_d(1.0);
3570 const gmx_simd_double_t halfpi
= gmx_simd_set1_d(M_PI
/2.0);
3571 const gmx_simd_double_t CC5
= gmx_simd_set1_d(4.2163199048E-2);
3572 const gmx_simd_double_t CC4
= gmx_simd_set1_d(2.4181311049E-2);
3573 const gmx_simd_double_t CC3
= gmx_simd_set1_d(4.5470025998E-2);
3574 const gmx_simd_double_t CC2
= gmx_simd_set1_d(7.4953002686E-2);
3575 const gmx_simd_double_t CC1
= gmx_simd_set1_d(1.6666752422E-1);
3576 gmx_simd_double_t xabs
;
3577 gmx_simd_double_t z
, z1
, z2
, q
, q1
, q2
;
3578 gmx_simd_double_t pA
, pB
;
3579 gmx_simd_dbool_t mask
, mask2
;
3581 xabs
= gmx_simd_fabs_d(x
);
3582 mask
= gmx_simd_cmplt_d(half
, xabs
);
3583 z1
= gmx_simd_mul_d(half
, gmx_simd_sub_d(one
, xabs
));
3584 mask2
= gmx_simd_cmpeq_d(xabs
, one
);
3585 q1
= gmx_simd_mul_d(z1
, gmx_simd_invsqrt_notmaskfpe_singleaccuracy_d(z1
, mask2
));
3586 q1
= gmx_simd_blendnotzero_d(q1
, gmx_simd_cmpeq_d(xabs
, one
));
3588 z2
= gmx_simd_mul_d(q2
, q2
);
3589 z
= gmx_simd_blendv_d(z2
, z1
, mask
);
3590 q
= gmx_simd_blendv_d(q2
, q1
, mask
);
3592 z2
= gmx_simd_mul_d(z
, z
);
3593 pA
= gmx_simd_fmadd_d(CC5
, z2
, CC3
);
3594 pB
= gmx_simd_fmadd_d(CC4
, z2
, CC2
);
3595 pA
= gmx_simd_fmadd_d(pA
, z2
, CC1
);
3596 pA
= gmx_simd_mul_d(pA
, z
);
3597 z
= gmx_simd_fmadd_d(pB
, z2
, pA
);
3598 z
= gmx_simd_fmadd_d(z
, q
, q
);
3599 q2
= gmx_simd_sub_d(halfpi
, z
);
3600 q2
= gmx_simd_sub_d(q2
, z
);
3601 z
= gmx_simd_blendv_d(z
, q2
, mask
);
3603 mask
= gmx_simd_cmplt_d(limitlow
, xabs
);
3604 z
= gmx_simd_blendv_d( xabs
, z
, mask
);
3605 z
= gmx_simd_xor_sign_d(z
, x
);
3610 /*! \brief SIMD acos(x). Double precision SIMD data, single accuracy.
3612 * You should normally call the real-precision routine
3613 * \ref gmx_simd_acos_singleaccuracy_r.
3615 * \param x The argument to evaluate acos for
3618 static gmx_inline gmx_simd_double_t gmx_simdcall
3619 gmx_simd_acos_singleaccuracy_d(gmx_simd_double_t x
)
3621 const gmx_simd_double_t one
= gmx_simd_set1_d(1.0);
3622 const gmx_simd_double_t half
= gmx_simd_set1_d(0.5);
3623 const gmx_simd_double_t pi
= gmx_simd_set1_d(M_PI
);
3624 const gmx_simd_double_t halfpi
= gmx_simd_set1_d(M_PI
/2.0);
3625 gmx_simd_double_t xabs
;
3626 gmx_simd_double_t z
, z1
, z2
, z3
;
3627 gmx_simd_dbool_t mask1
, mask2
, mask3
;
3629 xabs
= gmx_simd_fabs_d(x
);
3630 mask1
= gmx_simd_cmplt_d(half
, xabs
);
3631 mask2
= gmx_simd_cmplt_d(gmx_simd_setzero_d(), x
);
3633 z
= gmx_simd_mul_d(half
, gmx_simd_sub_d(one
, xabs
));
3634 mask3
= gmx_simd_cmpeq_d(xabs
, one
);
3635 z
= gmx_simd_mul_d(z
, gmx_simd_invsqrt_notmaskfpe_singleaccuracy_d(z
, mask3
));
3636 z
= gmx_simd_blendnotzero_d(z
, gmx_simd_cmpeq_d(xabs
, one
));
3637 z
= gmx_simd_blendv_d(x
, z
, mask1
);
3638 z
= gmx_simd_asin_singleaccuracy_d(z
);
3640 z2
= gmx_simd_add_d(z
, z
);
3641 z1
= gmx_simd_sub_d(pi
, z2
);
3642 z3
= gmx_simd_sub_d(halfpi
, z
);
3643 z
= gmx_simd_blendv_d(z1
, z2
, mask2
);
3644 z
= gmx_simd_blendv_d(z3
, z
, mask1
);
3649 /*! \brief SIMD asin(x). Double precision SIMD data, single accuracy.
3651 * You should normally call the real-precision routine
3652 * \ref gmx_simd_atan_singleaccuracy_r.
3654 * \param x The argument to evaluate atan for
3655 * \result Atan(x), same argument/value range as standard math library.
3657 static gmx_inline gmx_simd_double_t gmx_simdcall
3658 gmx_simd_atan_singleaccuracy_d(gmx_simd_double_t x
)
3660 const gmx_simd_double_t halfpi
= gmx_simd_set1_d(M_PI
/2);
3661 const gmx_simd_double_t CA17
= gmx_simd_set1_d(0.002823638962581753730774);
3662 const gmx_simd_double_t CA15
= gmx_simd_set1_d(-0.01595690287649631500244);
3663 const gmx_simd_double_t CA13
= gmx_simd_set1_d(0.04250498861074447631836);
3664 const gmx_simd_double_t CA11
= gmx_simd_set1_d(-0.07489009201526641845703);
3665 const gmx_simd_double_t CA9
= gmx_simd_set1_d(0.1063479334115982055664);
3666 const gmx_simd_double_t CA7
= gmx_simd_set1_d(-0.1420273631811141967773);
3667 const gmx_simd_double_t CA5
= gmx_simd_set1_d(0.1999269574880599975585);
3668 const gmx_simd_double_t CA3
= gmx_simd_set1_d(-0.3333310186862945556640);
3669 gmx_simd_double_t x2
, x3
, x4
, pA
, pB
;
3670 gmx_simd_dbool_t mask
, mask2
;
3672 mask
= gmx_simd_cmplt_d(x
, gmx_simd_setzero_d());
3673 x
= gmx_simd_fabs_d(x
);
3674 mask2
= gmx_simd_cmplt_d(gmx_simd_set1_d(1.0), x
);
3675 x
= gmx_simd_blendv_d(x
, gmx_simd_inv_maskfpe_singleaccuracy_d(x
, mask2
), mask2
);
3677 x2
= gmx_simd_mul_d(x
, x
);
3678 x3
= gmx_simd_mul_d(x2
, x
);
3679 x4
= gmx_simd_mul_d(x2
, x2
);
3680 pA
= gmx_simd_fmadd_d(CA17
, x4
, CA13
);
3681 pB
= gmx_simd_fmadd_d(CA15
, x4
, CA11
);
3682 pA
= gmx_simd_fmadd_d(pA
, x4
, CA9
);
3683 pB
= gmx_simd_fmadd_d(pB
, x4
, CA7
);
3684 pA
= gmx_simd_fmadd_d(pA
, x4
, CA5
);
3685 pB
= gmx_simd_fmadd_d(pB
, x4
, CA3
);
3686 pA
= gmx_simd_fmadd_d(pA
, x2
, pB
);
3687 pA
= gmx_simd_fmadd_d(pA
, x3
, x
);
3689 pA
= gmx_simd_blendv_d(pA
, gmx_simd_sub_d(halfpi
, pA
), mask2
);
3690 pA
= gmx_simd_blendv_d(pA
, gmx_simd_fneg_d(pA
), mask
);
3695 /*! \brief SIMD atan2(y,x). Double precision SIMD data, single accuracy.
3697 * You should normally call the real-precision routine
3698 * \ref gmx_simd_atan2_singleaccuracy_r.
3700 * \param y Y component of vector, any quartile
3701 * \param x X component of vector, any quartile
3702 * \result Atan(y,x), same argument/value range as standard math library.
3704 * \note This routine should provide correct results for all finite
3705 * non-zero or positive-zero arguments. However, negative zero arguments will
3706 * be treated as positive zero, which means the return value will deviate from
3707 * the standard math library atan2(y,x) for those cases. That should not be
3708 * of any concern in Gromacs, and in particular it will not affect calculations
3709 * of angles from vectors.
3711 static gmx_inline gmx_simd_double_t gmx_simdcall
3712 gmx_simd_atan2_singleaccuracy_d(gmx_simd_double_t y
, gmx_simd_double_t x
)
3714 const gmx_simd_double_t pi
= gmx_simd_set1_d(M_PI
);
3715 const gmx_simd_double_t halfpi
= gmx_simd_set1_d(M_PI
/2.0);
3716 gmx_simd_double_t xinv
, p
, aoffset
;
3717 gmx_simd_dbool_t mask_x0
, mask_y0
, mask_xlt0
, mask_ylt0
;
3719 mask_x0
= gmx_simd_cmpeq_d(x
, gmx_simd_setzero_d());
3720 mask_y0
= gmx_simd_cmpeq_d(y
, gmx_simd_setzero_d());
3721 mask_xlt0
= gmx_simd_cmplt_d(x
, gmx_simd_setzero_d());
3722 mask_ylt0
= gmx_simd_cmplt_d(y
, gmx_simd_setzero_d());
3724 aoffset
= gmx_simd_blendzero_d(halfpi
, mask_x0
);
3725 aoffset
= gmx_simd_blendnotzero_d(aoffset
, mask_y0
);
3727 aoffset
= gmx_simd_blendv_d(aoffset
, pi
, mask_xlt0
);
3728 aoffset
= gmx_simd_blendv_d(aoffset
, gmx_simd_fneg_d(aoffset
), mask_ylt0
);
3730 xinv
= gmx_simd_blendnotzero_d(gmx_simd_inv_notmaskfpe_singleaccuracy_d(x
, mask_x0
), mask_x0
);
3731 p
= gmx_simd_mul_d(y
, xinv
);
3732 p
= gmx_simd_atan_singleaccuracy_d(p
);
3733 p
= gmx_simd_add_d(p
, aoffset
);
3738 /*! \brief Analytical PME force correction, double SIMD data, single accuracy.
3740 * You should normally call the real-precision routine
3741 * \ref gmx_simd_pmecorrF_singleaccuracy_r.
3743 * \param z2 \f$(r \beta)^2\f$ - see below for details.
3744 * \result Correction factor to coulomb force - see below for details.
3746 * This routine is meant to enable analytical evaluation of the
3747 * direct-space PME electrostatic force to avoid tables.
3749 * The direct-space potential should be \f$ \mbox{erfc}(\beta r)/r\f$, but there
3750 * are some problems evaluating that:
3752 * First, the error function is difficult (read: expensive) to
3753 * approxmiate accurately for intermediate to large arguments, and
3754 * this happens already in ranges of \f$(\beta r)\f$ that occur in simulations.
3755 * Second, we now try to avoid calculating potentials in Gromacs but
3756 * use forces directly.
3758 * We can simply things slight by noting that the PME part is really
3759 * a correction to the normal Coulomb force since \f$\mbox{erfc}(z)=1-\mbox{erf}(z)\f$, i.e.
3761 * V = \frac{1}{r} - \frac{\mbox{erf}(\beta r)}{r}
3763 * The first term we already have from the inverse square root, so
3764 * that we can leave out of this routine.
3766 * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
3767 * the argument \f$beta r\f$ will be in the range 0.15 to ~4. Use your
3768 * favorite plotting program to realize how well-behaved \f$\frac{\mbox{erf}(z)}{z}\f$ is
3771 * We approximate \f$f(z)=\mbox{erf}(z)/z\f$ with a rational minimax polynomial.
3772 * However, it turns out it is more efficient to approximate \f$f(z)/z\f$ and
3773 * then only use even powers. This is another minor optimization, since
3774 * we actually \a want \f$f(z)/z\f$, because it is going to be multiplied by
3775 * the vector between the two atoms to get the vectorial force. The
3776 * fastest flops are the ones we can avoid calculating!
3778 * So, here's how it should be used:
3780 * 1. Calculate \f$r^2\f$.
3781 * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=(\beta r)^2\f$.
3782 * 3. Evaluate this routine with \f$z^2\f$ as the argument.
3783 * 4. The return value is the expression:
3786 * \frac{2 \exp{-z^2}}{\sqrt{\pi} z^2}-\frac{\mbox{erf}(z)}{z^3}
3789 * 5. Multiply the entire expression by \f$\beta^3\f$. This will get you
3792 * \frac{2 \beta^3 \exp(-z^2)}{\sqrt{\pi} z^2} - \frac{\beta^3 \mbox{erf}(z)}{z^3}
3795 * or, switching back to \f$r\f$ (since \f$z=r \beta\f$):
3798 * \frac{2 \beta \exp(-r^2 \beta^2)}{\sqrt{\pi} r^2} - \frac{\mbox{erf}(r \beta)}{r^3}
3801 * With a bit of math exercise you should be able to confirm that
3805 * \frac{\frac{d}{dr}\left( \frac{\mbox{erf}(\beta r)}{r} \right)}{r}
3808 * 6. Add the result to \f$r^{-3}\f$, multiply by the product of the charges,
3809 * and you have your force (divided by \f$r\f$). A final multiplication
3810 * with the vector connecting the two particles and you have your
3811 * vectorial force to add to the particles.
3813 * This approximation achieves an accuracy slightly lower than 1e-6; when
3814 * added to \f$1/r\f$ the error will be insignificant.
3817 static gmx_simd_double_t gmx_simdcall
3818 gmx_simd_pmecorrF_singleaccuracy_d(gmx_simd_double_t z2
)
3820 const gmx_simd_double_t FN6
= gmx_simd_set1_d(-1.7357322914161492954e-8);
3821 const gmx_simd_double_t FN5
= gmx_simd_set1_d(1.4703624142580877519e-6);
3822 const gmx_simd_double_t FN4
= gmx_simd_set1_d(-0.000053401640219807709149);
3823 const gmx_simd_double_t FN3
= gmx_simd_set1_d(0.0010054721316683106153);
3824 const gmx_simd_double_t FN2
= gmx_simd_set1_d(-0.019278317264888380590);
3825 const gmx_simd_double_t FN1
= gmx_simd_set1_d(0.069670166153766424023);
3826 const gmx_simd_double_t FN0
= gmx_simd_set1_d(-0.75225204789749321333);
3828 const gmx_simd_double_t FD4
= gmx_simd_set1_d(0.0011193462567257629232);
3829 const gmx_simd_double_t FD3
= gmx_simd_set1_d(0.014866955030185295499);
3830 const gmx_simd_double_t FD2
= gmx_simd_set1_d(0.11583842382862377919);
3831 const gmx_simd_double_t FD1
= gmx_simd_set1_d(0.50736591960530292870);
3832 const gmx_simd_double_t FD0
= gmx_simd_set1_d(1.0);
3834 gmx_simd_double_t z4
;
3835 gmx_simd_double_t polyFN0
, polyFN1
, polyFD0
, polyFD1
;
3837 z4
= gmx_simd_mul_d(z2
, z2
);
3839 polyFD0
= gmx_simd_fmadd_d(FD4
, z4
, FD2
);
3840 polyFD1
= gmx_simd_fmadd_d(FD3
, z4
, FD1
);
3841 polyFD0
= gmx_simd_fmadd_d(polyFD0
, z4
, FD0
);
3842 polyFD0
= gmx_simd_fmadd_d(polyFD1
, z2
, polyFD0
);
3844 polyFD0
= gmx_simd_inv_singleaccuracy_d(polyFD0
);
3846 polyFN0
= gmx_simd_fmadd_d(FN6
, z4
, FN4
);
3847 polyFN1
= gmx_simd_fmadd_d(FN5
, z4
, FN3
);
3848 polyFN0
= gmx_simd_fmadd_d(polyFN0
, z4
, FN2
);
3849 polyFN1
= gmx_simd_fmadd_d(polyFN1
, z4
, FN1
);
3850 polyFN0
= gmx_simd_fmadd_d(polyFN0
, z4
, FN0
);
3851 polyFN0
= gmx_simd_fmadd_d(polyFN1
, z2
, polyFN0
);
3853 return gmx_simd_mul_d(polyFN0
, polyFD0
);
3858 /*! \brief Analytical PME potential correction, double SIMD data, single accuracy.
3860 * You should normally call the real-precision routine
3861 * \ref gmx_simd_pmecorrV_singleaccuracy_r.
3863 * \param z2 \f$(r \beta)^2\f$ - see below for details.
3864 * \result Correction factor to coulomb potential - see below for details.
3866 * See \ref gmx_simd_pmecorrF_f for details about the approximation.
3868 * This routine calculates \f$\mbox{erf}(z)/z\f$, although you should provide \f$z^2\f$
3869 * as the input argument.
3871 * Here's how it should be used:
3873 * 1. Calculate \f$r^2\f$.
3874 * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=\beta^2*r^2\f$.
3875 * 3. Evaluate this routine with z^2 as the argument.
3876 * 4. The return value is the expression:
3879 * \frac{\mbox{erf}(z)}{z}
3882 * 5. Multiply the entire expression by beta and switching back to \f$r\f$ (since \f$z=r \beta\f$):
3885 * \frac{\mbox{erf}(r \beta)}{r}
3888 * 6. Subtract the result from \f$1/r\f$, multiply by the product of the charges,
3889 * and you have your potential.
3891 * This approximation achieves an accuracy slightly lower than 1e-6; when
3892 * added to \f$1/r\f$ the error will be insignificant.
3894 static gmx_simd_double_t gmx_simdcall
3895 gmx_simd_pmecorrV_singleaccuracy_d(gmx_simd_double_t z2
)
3897 const gmx_simd_double_t VN6
= gmx_simd_set1_d(1.9296833005951166339e-8);
3898 const gmx_simd_double_t VN5
= gmx_simd_set1_d(-1.4213390571557850962e-6);
3899 const gmx_simd_double_t VN4
= gmx_simd_set1_d(0.000041603292906656984871);
3900 const gmx_simd_double_t VN3
= gmx_simd_set1_d(-0.00013134036773265025626);
3901 const gmx_simd_double_t VN2
= gmx_simd_set1_d(0.038657983986041781264);
3902 const gmx_simd_double_t VN1
= gmx_simd_set1_d(0.11285044772717598220);
3903 const gmx_simd_double_t VN0
= gmx_simd_set1_d(1.1283802385263030286);
3905 const gmx_simd_double_t VD3
= gmx_simd_set1_d(0.0066752224023576045451);
3906 const gmx_simd_double_t VD2
= gmx_simd_set1_d(0.078647795836373922256);
3907 const gmx_simd_double_t VD1
= gmx_simd_set1_d(0.43336185284710920150);
3908 const gmx_simd_double_t VD0
= gmx_simd_set1_d(1.0);
3910 gmx_simd_double_t z4
;
3911 gmx_simd_double_t polyVN0
, polyVN1
, polyVD0
, polyVD1
;
3913 z4
= gmx_simd_mul_d(z2
, z2
);
3915 polyVD1
= gmx_simd_fmadd_d(VD3
, z4
, VD1
);
3916 polyVD0
= gmx_simd_fmadd_d(VD2
, z4
, VD0
);
3917 polyVD0
= gmx_simd_fmadd_d(polyVD1
, z2
, polyVD0
);
3919 polyVD0
= gmx_simd_inv_singleaccuracy_d(polyVD0
);
3921 polyVN0
= gmx_simd_fmadd_d(VN6
, z4
, VN4
);
3922 polyVN1
= gmx_simd_fmadd_d(VN5
, z4
, VN3
);
3923 polyVN0
= gmx_simd_fmadd_d(polyVN0
, z4
, VN2
);
3924 polyVN1
= gmx_simd_fmadd_d(polyVN1
, z4
, VN1
);
3925 polyVN0
= gmx_simd_fmadd_d(polyVN0
, z4
, VN0
);
3926 polyVN0
= gmx_simd_fmadd_d(polyVN1
, z2
, polyVN0
);
3928 return gmx_simd_mul_d(polyVN0
, polyVD0
);
3934 /*! \name SIMD4 math functions
3936 * \note Only a subset of the math functions are implemented for SIMD4.
3941 #ifdef GMX_SIMD4_HAVE_FLOAT
3943 /*************************************************************************
3944 * SINGLE PRECISION SIMD4 MATH FUNCTIONS - JUST A SMALL SUBSET SUPPORTED *
3945 *************************************************************************/
3947 /*! \brief SIMD4 utility function to sum a+b+c+d for SIMD4 floats.
3949 * \copydetails gmx_simd_sum4_f
3951 static gmx_inline gmx_simd4_float_t gmx_simdcall
3952 gmx_simd4_sum4_f(gmx_simd4_float_t a
, gmx_simd4_float_t b
,
3953 gmx_simd4_float_t c
, gmx_simd4_float_t d
)
3955 return gmx_simd4_add_f(gmx_simd4_add_f(a
, b
), gmx_simd4_add_f(c
, d
));
3958 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD4 float.
3960 * \copydetails gmx_simd_rsqrt_iter_f
3962 static gmx_inline gmx_simd4_float_t gmx_simdcall
3963 gmx_simd4_rsqrt_iter_f(gmx_simd4_float_t lu
, gmx_simd4_float_t x
)
3965 # ifdef GMX_SIMD_HAVE_FMA
3966 return gmx_simd4_fmadd_f(gmx_simd4_fnmadd_f(x
, gmx_simd4_mul_f(lu
, lu
), gmx_simd4_set1_f(1.0f
)), gmx_simd4_mul_f(lu
, gmx_simd4_set1_f(0.5f
)), lu
);
3968 return gmx_simd4_mul_f(gmx_simd4_set1_f(0.5f
), gmx_simd4_mul_f(gmx_simd4_sub_f(gmx_simd4_set1_f(3.0f
), gmx_simd4_mul_f(gmx_simd4_mul_f(lu
, lu
), x
)), lu
));
3972 /*! \brief Calculate 1/sqrt(x) for SIMD4 float.
3974 * \copydetails gmx_simd_invsqrt_f
3976 static gmx_inline gmx_simd4_float_t gmx_simdcall
3977 gmx_simd4_invsqrt_f(gmx_simd4_float_t x
)
3979 gmx_simd4_float_t lu
= gmx_simd4_rsqrt_f(x
);
3980 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
3981 lu
= gmx_simd4_rsqrt_iter_f(lu
, x
);
3983 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3984 lu
= gmx_simd4_rsqrt_iter_f(lu
, x
);
3986 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3987 lu
= gmx_simd4_rsqrt_iter_f(lu
, x
);
3992 #endif /* GMX_SIMD4_HAVE_FLOAT */
3996 #ifdef GMX_SIMD4_HAVE_DOUBLE
3997 /*************************************************************************
3998 * DOUBLE PRECISION SIMD4 MATH FUNCTIONS - JUST A SMALL SUBSET SUPPORTED *
3999 *************************************************************************/
4002 /*! \brief SIMD4 utility function to sum a+b+c+d for SIMD4 doubles.
4004 * \copydetails gmx_simd_sum4_f
4006 static gmx_inline gmx_simd4_double_t gmx_simdcall
4007 gmx_simd4_sum4_d(gmx_simd4_double_t a
, gmx_simd4_double_t b
,
4008 gmx_simd4_double_t c
, gmx_simd4_double_t d
)
4010 return gmx_simd4_add_d(gmx_simd4_add_d(a
, b
), gmx_simd4_add_d(c
, d
));
4013 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD4 double.
4015 * \copydetails gmx_simd_rsqrt_iter_f
4017 static gmx_inline gmx_simd4_double_t gmx_simdcall
4018 gmx_simd4_rsqrt_iter_d(gmx_simd4_double_t lu
, gmx_simd4_double_t x
)
4020 #ifdef GMX_SIMD_HAVE_FMA
4021 return gmx_simd4_fmadd_d(gmx_simd4_fnmadd_d(x
, gmx_simd4_mul_d(lu
, lu
), gmx_simd4_set1_d(1.0)), gmx_simd4_mul_d(lu
, gmx_simd4_set1_d(0.5)), lu
);
4023 return gmx_simd4_mul_d(gmx_simd4_set1_d(0.5), gmx_simd4_mul_d(gmx_simd4_sub_d(gmx_simd4_set1_d(3.0), gmx_simd4_mul_d(gmx_simd4_mul_d(lu
, lu
), x
)), lu
));
4027 /*! \brief Calculate 1/sqrt(x) for SIMD4 double.
4029 * \copydetails gmx_simd_invsqrt_f
4031 static gmx_inline gmx_simd4_double_t gmx_simdcall
4032 gmx_simd4_invsqrt_d(gmx_simd4_double_t x
)
4034 gmx_simd4_double_t lu
= gmx_simd4_rsqrt_d(x
);
4035 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
4036 lu
= gmx_simd4_rsqrt_iter_d(lu
, x
);
4038 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
4039 lu
= gmx_simd4_rsqrt_iter_d(lu
, x
);
4041 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
4042 lu
= gmx_simd4_rsqrt_iter_d(lu
, x
);
4044 #if (GMX_SIMD_RSQRT_BITS*8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
4045 lu
= gmx_simd4_rsqrt_iter_d(lu
, x
);
4050 /**********************************************************************
4051 * SIMD4 MATH FUNCTIONS WITH DOUBLE PREC. DATA, SINGLE PREC. ACCURACY *
4052 **********************************************************************/
4054 /*! \brief Calculate 1/sqrt(x) for SIMD4 double, but in single accuracy.
4056 * \copydetails gmx_simd_invsqrt_singleaccuracy_d
4058 static gmx_inline gmx_simd4_double_t gmx_simdcall
4059 gmx_simd4_invsqrt_singleaccuracy_d(gmx_simd4_double_t x
)
4061 gmx_simd4_double_t lu
= gmx_simd4_rsqrt_d(x
);
4062 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
4063 lu
= gmx_simd4_rsqrt_iter_d(lu
, x
);
4065 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
4066 lu
= gmx_simd4_rsqrt_iter_d(lu
, x
);
4068 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
4069 lu
= gmx_simd4_rsqrt_iter_d(lu
, x
);
4075 #endif /* GMX_SIMD4_HAVE_DOUBLE */
4080 /* Set defines based on default Gromacs precision */
4082 /* Documentation in single branch below */
4084 # define gmx_simd_sum4_r gmx_simd_sum4_d
4085 # define gmx_simd_xor_sign_r gmx_simd_xor_sign_d
4086 # define gmx_simd4_sum4_r gmx_simd4_sum4_d
4088 /* On hardware that only supports double precision SIMD it is possible to use
4089 * the faster _singleaccuracy_d routines everywhere by setting the requested SIMD
4090 * accuracy to single precision.
4092 #if (GMX_SIMD_ACCURACY_BITS_DOUBLE > GMX_SIMD_ACCURACY_BITS_SINGLE)
4094 # define gmx_simd_invsqrt_r gmx_simd_invsqrt_d
4095 # define gmx_simd_invsqrt_pair_r gmx_simd_invsqrt_pair_d
4096 # define gmx_simd_sqrt_r gmx_simd_sqrt_d
4097 # define gmx_simd_inv_r gmx_simd_inv_d
4098 # define gmx_simd_log_r gmx_simd_log_d
4099 # define gmx_simd_exp2_r gmx_simd_exp2_d
4100 # define gmx_simd_exp_r gmx_simd_exp_d
4101 # define gmx_simd_erf_r gmx_simd_erf_d
4102 # define gmx_simd_erfc_r gmx_simd_erfc_d
4103 # define gmx_simd_sincos_r gmx_simd_sincos_d
4104 # define gmx_simd_sin_r gmx_simd_sin_d
4105 # define gmx_simd_cos_r gmx_simd_cos_d
4106 # define gmx_simd_tan_r gmx_simd_tan_d
4107 # define gmx_simd_asin_r gmx_simd_asin_d
4108 # define gmx_simd_acos_r gmx_simd_acos_d
4109 # define gmx_simd_atan_r gmx_simd_atan_d
4110 # define gmx_simd_atan2_r gmx_simd_atan2_d
4111 # define gmx_simd_pmecorrF_r gmx_simd_pmecorrF_d
4112 # define gmx_simd_pmecorrV_r gmx_simd_pmecorrV_d
4113 # define gmx_simd4_invsqrt_r gmx_simd4_invsqrt_d
4117 # define gmx_simd_invsqrt_r gmx_simd_invsqrt_singleaccuracy_d
4118 # define gmx_simd_invsqrt_pair_r gmx_simd_invsqrt_pair_singleaccuracy_d
4119 # define gmx_simd_sqrt_r gmx_simd_sqrt_singleaccuracy_d
4120 # define gmx_simd_inv_r gmx_simd_inv_singleaccuracy_d
4121 # define gmx_simd_log_r gmx_simd_log_singleaccuracy_d
4122 # define gmx_simd_exp2_r gmx_simd_exp2_singleaccuracy_d
4123 # define gmx_simd_exp_r gmx_simd_exp_singleaccuracy_d
4124 # define gmx_simd_erf_r gmx_simd_erf_singleaccuracy_d
4125 # define gmx_simd_erfc_r gmx_simd_erfc_singleaccuracy_d
4126 # define gmx_simd_sincos_r gmx_simd_sincos_singleaccuracy_d
4127 # define gmx_simd_sin_r gmx_simd_sin_singleaccuracy_d
4128 # define gmx_simd_cos_r gmx_simd_cos_singleaccuracy_d
4129 # define gmx_simd_tan_r gmx_simd_tan_singleaccuracy_d
4130 # define gmx_simd_asin_r gmx_simd_asin_singleaccuracy_d
4131 # define gmx_simd_acos_r gmx_simd_acos_singleaccuracy_d
4132 # define gmx_simd_atan_r gmx_simd_atan_singleaccuracy_d
4133 # define gmx_simd_atan2_r gmx_simd_atan2_singleaccuracy_d
4134 # define gmx_simd_pmecorrF_r gmx_simd_pmecorrF_singleaccuracy_d
4135 # define gmx_simd_pmecorrV_r gmx_simd_pmecorrV_singleaccuracy_d
4136 # define gmx_simd4_invsqrt_r gmx_simd4_invsqrt_singleaccuracy_d
4140 # define gmx_simd_invsqrt_singleaccuracy_r gmx_simd_invsqrt_singleaccuracy_d
4141 # define gmx_simd_invsqrt_pair_singleaccuracy_r gmx_simd_invsqrt_pair_singleaccuracy_d
4142 # define gmx_simd_sqrt_singleaccuracy_r gmx_simd_sqrt_singleaccuracy_d
4143 # define gmx_simd_inv_singleaccuracy_r gmx_simd_inv_singleaccuracy_d
4144 # define gmx_simd_log_singleaccuracy_r gmx_simd_log_singleaccuracy_d
4145 # define gmx_simd_exp2_singleaccuracy_r gmx_simd_exp2_singleaccuracy_d
4146 # define gmx_simd_exp_singleaccuracy_r gmx_simd_exp_singleaccuracy_d
4147 # define gmx_simd_erf_singleaccuracy_r gmx_simd_erf_singleaccuracy_d
4148 # define gmx_simd_erfc_singleaccuracy_r gmx_simd_erfc_singleaccuracy_d
4149 # define gmx_simd_sincos_singleaccuracy_r gmx_simd_sincos_singleaccuracy_d
4150 # define gmx_simd_sin_singleaccuracy_r gmx_simd_sin_singleaccuracy_d
4151 # define gmx_simd_cos_singleaccuracy_r gmx_simd_cos_singleaccuracy_d
4152 # define gmx_simd_tan_singleaccuracy_r gmx_simd_tan_singleaccuracy_d
4153 # define gmx_simd_asin_singleaccuracy_r gmx_simd_asin_singleaccuracy_d
4154 # define gmx_simd_acos_singleaccuracy_r gmx_simd_acos_singleaccuracy_d
4155 # define gmx_simd_atan_singleaccuracy_r gmx_simd_atan_singleaccuracy_d
4156 # define gmx_simd_atan2_singleaccuracy_r gmx_simd_atan2_singleaccuracy_d
4157 # define gmx_simd_pmecorrF_singleaccuracy_r gmx_simd_pmecorrF_singleaccuracy_d
4158 # define gmx_simd_pmecorrV_singleaccuracy_r gmx_simd_pmecorrV_singleaccuracy_d
4159 # define gmx_simd4_invsqrt_singleaccuracy_r gmx_simd4_invsqrt_singleaccuracy_d
4161 #else /* GMX_DOUBLE */
4163 /*! \name Real-precision SIMD math functions
4165 * These are the ones you should typically call in Gromacs.
4169 /*! \brief SIMD utility function to sum a+b+c+d for SIMD reals.
4171 * \copydetails gmx_simd_sum4_f
4173 # define gmx_simd_sum4_r gmx_simd_sum4_f
4175 /*! \brief Return -a if b is negative, SIMD real.
4177 * \copydetails gmx_simd_xor_sign_f
4179 # define gmx_simd_xor_sign_r gmx_simd_xor_sign_f
4181 /*! \brief Calculate 1/sqrt(x) for SIMD real.
4183 * \copydetails gmx_simd_invsqrt_f
4185 # define gmx_simd_invsqrt_r gmx_simd_invsqrt_f
4187 /*! \brief Calculate 1/sqrt(x) for two SIMD reals.
4189 * \copydetails gmx_simd_invsqrt_pair_f
4191 # define gmx_simd_invsqrt_pair_r gmx_simd_invsqrt_pair_f
4193 /*! \brief Calculate sqrt(x) correctly for SIMD real, including argument 0.0.
4195 * \copydetails gmx_simd_sqrt_f
4197 # define gmx_simd_sqrt_r gmx_simd_sqrt_f
4199 /*! \brief Calculate 1/x for SIMD real.
4201 * \copydetails gmx_simd_inv_f
4203 # define gmx_simd_inv_r gmx_simd_inv_f
4205 /*! \brief SIMD real log(x). This is the natural logarithm.
4207 * \copydetails gmx_simd_log_f
4209 # define gmx_simd_log_r gmx_simd_log_f
4211 /*! \brief SIMD real 2^x.
4213 * \copydetails gmx_simd_exp2_f
4215 # define gmx_simd_exp2_r gmx_simd_exp2_f
4217 /*! \brief SIMD real e^x.
4219 * \copydetails gmx_simd_exp_f
4221 # define gmx_simd_exp_r gmx_simd_exp_f
4223 /*! \brief SIMD real erf(x).
4225 * \copydetails gmx_simd_erf_f
4227 # define gmx_simd_erf_r gmx_simd_erf_f
4229 /*! \brief SIMD real erfc(x).
4231 * \copydetails gmx_simd_erfc_f
4233 # define gmx_simd_erfc_r gmx_simd_erfc_f
4235 /*! \brief SIMD real sin \& cos.
4237 * \copydetails gmx_simd_sincos_f
4239 # define gmx_simd_sincos_r gmx_simd_sincos_f
4241 /*! \brief SIMD real sin(x).
4243 * \copydetails gmx_simd_sin_f
4245 # define gmx_simd_sin_r gmx_simd_sin_f
4247 /*! \brief SIMD real cos(x).
4249 * \copydetails gmx_simd_cos_f
4251 # define gmx_simd_cos_r gmx_simd_cos_f
4253 /*! \brief SIMD real tan(x).
4255 * \copydetails gmx_simd_tan_f
4257 # define gmx_simd_tan_r gmx_simd_tan_f
4259 /*! \brief SIMD real asin(x).
4261 * \copydetails gmx_simd_asin_f
4263 # define gmx_simd_asin_r gmx_simd_asin_f
4265 /*! \brief SIMD real acos(x).
4267 * \copydetails gmx_simd_acos_f
4269 # define gmx_simd_acos_r gmx_simd_acos_f
4271 /*! \brief SIMD real atan(x).
4273 * \copydetails gmx_simd_atan_f
4275 # define gmx_simd_atan_r gmx_simd_atan_f
4277 /*! \brief SIMD real atan2(y,x).
4279 * \copydetails gmx_simd_atan2_f
4281 # define gmx_simd_atan2_r gmx_simd_atan2_f
4283 /*! \brief SIMD Analytic PME force correction.
4285 * \copydetails gmx_simd_pmecorrF_f
4287 # define gmx_simd_pmecorrF_r gmx_simd_pmecorrF_f
4289 /*! \brief SIMD Analytic PME potential correction.
4291 * \copydetails gmx_simd_pmecorrV_f
4293 # define gmx_simd_pmecorrV_r gmx_simd_pmecorrV_f
4295 /*! \brief Calculate 1/sqrt(x) for SIMD, only targeting single accuracy.
4297 * \copydetails gmx_simd_invsqrt_r
4299 * \note This is a performance-targeted function that only achieves single
4300 * precision accuracy, even when the SIMD data is double precision.
4302 # define gmx_simd_invsqrt_singleaccuracy_r gmx_simd_invsqrt_f
4304 /*! \brief Calculate 1/sqrt(x) for SIMD pair, only targeting single accuracy.
4306 * \copydetails gmx_simd_invsqrt_pair_r
4308 * \note This is a performance-targeted function that only achieves single
4309 * precision accuracy, even when the SIMD data is double precision.
4311 # define gmx_simd_invsqrt_pair_singleaccuracy_r gmx_simd_invsqrt_pair_f
4313 /*! \brief Calculate sqrt(x), only targeting single accuracy.
4315 * \copydetails gmx_simd_sqrt_r
4317 * \note This is a performance-targeted function that only achieves single
4318 * precision accuracy, even when the SIMD data is double precision.
4320 # define gmx_simd_sqrt_singleaccuracy_r gmx_simd_sqrt_f
4322 /*! \brief Calculate 1/x for SIMD real, only targeting single accuracy.
4324 * \copydetails gmx_simd_inv_r
4326 * \note This is a performance-targeted function that only achieves single
4327 * precision accuracy, even when the SIMD data is double precision.
4329 # define gmx_simd_inv_singleaccuracy_r gmx_simd_inv_f
4331 /*! \brief SIMD real log(x), only targeting single accuracy.
4333 * \copydetails gmx_simd_log_r
4335 * \note This is a performance-targeted function that only achieves single
4336 * precision accuracy, even when the SIMD data is double precision.
4338 # define gmx_simd_log_singleaccuracy_r gmx_simd_log_f
4340 /*! \brief SIMD real 2^x, only targeting single accuracy.
4342 * \copydetails gmx_simd_exp2_r
4344 * \note This is a performance-targeted function that only achieves single
4345 * precision accuracy, even when the SIMD data is double precision.
4347 # define gmx_simd_exp2_singleaccuracy_r gmx_simd_exp2_f
4349 /*! \brief SIMD real e^x, only targeting single accuracy.
4351 * \copydetails gmx_simd_exp_r
4353 * \note This is a performance-targeted function that only achieves single
4354 * precision accuracy, even when the SIMD data is double precision.
4356 # define gmx_simd_exp_singleaccuracy_r gmx_simd_exp_f
4358 /*! \brief SIMD real erf(x), only targeting single accuracy.
4360 * \copydetails gmx_simd_erf_r
4362 * \note This is a performance-targeted function that only achieves single
4363 * precision accuracy, even when the SIMD data is double precision.
4365 # define gmx_simd_erf_singleaccuracy_r gmx_simd_erf_f
4367 /*! \brief SIMD real erfc(x), only targeting single accuracy.
4369 * \copydetails gmx_simd_erfc_r
4371 * \note This is a performance-targeted function that only achieves single
4372 * precision accuracy, even when the SIMD data is double precision.
4374 # define gmx_simd_erfc_singleaccuracy_r gmx_simd_erfc_f
4376 /*! \brief SIMD real sin \& cos, only targeting single accuracy.
4378 * \copydetails gmx_simd_sincos_r
4380 * \note This is a performance-targeted function that only achieves single
4381 * precision accuracy, even when the SIMD data is double precision.
4383 # define gmx_simd_sincos_singleaccuracy_r gmx_simd_sincos_f
4385 /*! \brief SIMD real sin(x), only targeting single accuracy.
4387 * \copydetails gmx_simd_sin_r
4389 * \note This is a performance-targeted function that only achieves single
4390 * precision accuracy, even when the SIMD data is double precision.
4392 # define gmx_simd_sin_singleaccuracy_r gmx_simd_sin_f
4394 /*! \brief SIMD real cos(x), only targeting single accuracy.
4396 * \copydetails gmx_simd_cos_r
4398 * \note This is a performance-targeted function that only achieves single
4399 * precision accuracy, even when the SIMD data is double precision.
4401 # define gmx_simd_cos_singleaccuracy_r gmx_simd_cos_f
4403 /*! \brief SIMD real tan(x), only targeting single accuracy.
4405 * \copydetails gmx_simd_tan_r
4407 * \note This is a performance-targeted function that only achieves single
4408 * precision accuracy, even when the SIMD data is double precision.
4410 # define gmx_simd_tan_singleaccuracy_r gmx_simd_tan_f
4412 /*! \brief SIMD real asin(x), only targeting single accuracy.
4414 * \copydetails gmx_simd_asin_r
4416 * \note This is a performance-targeted function that only achieves single
4417 * precision accuracy, even when the SIMD data is double precision.
4419 # define gmx_simd_asin_singleaccuracy_r gmx_simd_asin_f
4421 /*! \brief SIMD real acos(x), only targeting single accuracy.
4423 * \copydetails gmx_simd_acos_r
4425 * \note This is a performance-targeted function that only achieves single
4426 * precision accuracy, even when the SIMD data is double precision.
4428 # define gmx_simd_acos_singleaccuracy_r gmx_simd_acos_f
4430 /*! \brief SIMD real atan(x), only targeting single accuracy.
4432 * \copydetails gmx_simd_atan_r
4434 * \note This is a performance-targeted function that only achieves single
4435 * precision accuracy, even when the SIMD data is double precision.
4437 # define gmx_simd_atan_singleaccuracy_r gmx_simd_atan_f
4439 /*! \brief SIMD real atan2(y,x), only targeting single accuracy.
4441 * \copydetails gmx_simd_atan2_r
4443 * \note This is a performance-targeted function that only achieves single
4444 * precision accuracy, even when the SIMD data is double precision.
4446 # define gmx_simd_atan2_singleaccuracy_r gmx_simd_atan2_f
4448 /*! \brief SIMD Analytic PME force corr., only targeting single accuracy.
4450 * \copydetails gmx_simd_pmecorrF_r
4452 * \note This is a performance-targeted function that only achieves single
4453 * precision accuracy, even when the SIMD data is double precision.
4455 # define gmx_simd_pmecorrF_singleaccuracy_r gmx_simd_pmecorrF_f
4457 /*! \brief SIMD Analytic PME potential corr., only targeting single accuracy.
4459 * \copydetails gmx_simd_pmecorrV_r
4461 * \note This is a performance-targeted function that only achieves single
4462 * precision accuracy, even when the SIMD data is double precision.
4464 # define gmx_simd_pmecorrV_singleaccuracy_r gmx_simd_pmecorrV_f
4468 * \name SIMD4 math functions
4472 /*! \brief SIMD4 utility function to sum a+b+c+d for SIMD4 reals.
4474 * \copydetails gmx_simd_sum4_f
4476 # define gmx_simd4_sum4_r gmx_simd4_sum4_f
4478 /*! \brief Calculate 1/sqrt(x) for SIMD4 real.
4480 * \copydetails gmx_simd_invsqrt_f
4482 # define gmx_simd4_invsqrt_r gmx_simd4_invsqrt_f
4484 /*! \brief 1/sqrt(x) for SIMD4 real. Single accuracy, even for double prec.
4486 * \copydetails gmx_simd4_invsqrt_r
4488 * \note This is a performance-targeted function that only achieves single
4489 * precision accuracy, even when the SIMD data is double precision.
4491 # define gmx_simd4_invsqrt_singleaccuracy_r gmx_simd4_invsqrt_f
4495 #endif /* GMX_DOUBLE */
4500 #endif /* GMX_SIMD_SIMD_MATH_H_ */