include/gmx_math_x86_sse2_double.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013, by the GROMACS development team, led by
   5  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
   6  * others, as listed in the AUTHORS file in the top-level source
   7  * directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 #ifndef _gmx_math_x86_sse2_double_h_
  36 #define _gmx_math_x86_sse2_double_h_
  37
  38
  39 #include <stdio.h>
  40 #include <math.h>
  41
  42 #include "gmx_x86_sse2.h"
  43
  44
  45 #ifndef M_PI
  46 #  define M_PI 3.14159265358979323846264338327950288
  47 #endif
  48
  49
  50
  51 /************************
  52  *                      *
  53  * Simple math routines *
  54  *                      *
  55  ************************/
  56
  57 /* 1.0/sqrt(x) */
  58 static gmx_inline __m128d
  59 gmx_mm_invsqrt_pd(__m128d x)
  60 {
  61     const __m128d half  = _mm_set1_pd(0.5);
  62     const __m128d three = _mm_set1_pd(3.0);
  63
  64     /* Lookup instruction only exists in single precision, convert back and forth... */
  65     __m128d lu = _mm_cvtps_pd(_mm_rsqrt_ps( _mm_cvtpd_ps(x)));
  66
  67     lu = _mm_mul_pd(half, _mm_mul_pd(_mm_sub_pd(three, _mm_mul_pd(_mm_mul_pd(lu, lu), x)), lu));
  68     return _mm_mul_pd(half, _mm_mul_pd(_mm_sub_pd(three, _mm_mul_pd(_mm_mul_pd(lu, lu), x)), lu));
  69 }
  70
  71 /* 1.0/sqrt(x), done for a pair of arguments to improve throughput */
  72 static void
  73 gmx_mm_invsqrt_pair_pd(__m128d x1, __m128d x2, __m128d *invsqrt1, __m128d *invsqrt2)
  74 {
  75     const __m128d half   = _mm_set1_pd(0.5);
  76     const __m128d three  = _mm_set1_pd(3.0);
  77     const __m128  halff  = _mm_set1_ps(0.5f);
  78     const __m128  threef = _mm_set1_ps(3.0f);
  79
  80     __m128        xf, luf;
  81     __m128d       lu1, lu2;
  82
  83     /* Do first N-R step in float for 2x throughput */
  84     xf  = _mm_shuffle_ps(_mm_cvtpd_ps(x1), _mm_cvtpd_ps(x2), _MM_SHUFFLE(1, 0, 1, 0));
  85     luf = _mm_rsqrt_ps(xf);
  86     luf = _mm_mul_ps(halff, _mm_mul_ps(_mm_sub_ps(threef, _mm_mul_ps(_mm_mul_ps(luf, luf), xf)), luf));
  87
  88     lu2 = _mm_cvtps_pd(_mm_shuffle_ps(luf, luf, _MM_SHUFFLE(3, 2, 3, 2)));
  89     lu1 = _mm_cvtps_pd(luf);
  90
  91     *invsqrt1 = _mm_mul_pd(half, _mm_mul_pd(_mm_sub_pd(three, _mm_mul_pd(_mm_mul_pd(lu1, lu1), x1)), lu1));
  92     *invsqrt2 = _mm_mul_pd(half, _mm_mul_pd(_mm_sub_pd(three, _mm_mul_pd(_mm_mul_pd(lu2, lu2), x2)), lu2));
  93 }
  94
  95
  96 /* sqrt(x) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
  97 static gmx_inline __m128d
  98 gmx_mm_sqrt_pd(__m128d x)
  99 {
 100     __m128d mask;
 101     __m128d res;
 102
 103     mask = _mm_cmpeq_pd(x, _mm_setzero_pd());
 104     res  = _mm_andnot_pd(mask, gmx_mm_invsqrt_pd(x));
 105
 106     res  = _mm_mul_pd(x, res);
 107
 108     return res;
 109 }
 110
 111 /* 1.0/x */
 112 static gmx_inline __m128d
 113 gmx_mm_inv_pd(__m128d x)
 114 {
 115     const __m128d two  = _mm_set1_pd(2.0);
 116
 117     /* Lookup instruction only exists in single precision, convert back and forth... */
 118     __m128d lu = _mm_cvtps_pd(_mm_rcp_ps( _mm_cvtpd_ps(x)));
 119
 120     /* Perform two N-R steps for double precision */
 121     lu         = _mm_mul_pd(lu, _mm_sub_pd(two, _mm_mul_pd(x, lu)));
 122     return _mm_mul_pd(lu, _mm_sub_pd(two, _mm_mul_pd(x, lu)));
 123 }
 124
 125 static gmx_inline __m128d
 126 gmx_mm_abs_pd(__m128d x)
 127 {
 128     const __m128d signmask  = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
 129
 130     return _mm_and_pd(x, signmask);
 131 }
 132
 133
 134 /*
 135  * 2^x function.
 136  *
 137  * The 2^w term is calculated from a (6,0)-th order (no denominator) Minimax polynomia on the interval
 138  * [-0.5,0.5].
 139  *
 140  * The approximation on [-0.5,0.5] is a rational Padé approximation, 1+2*P(x^2)/(Q(x^2)-P(x^2)),
 141  * according to the same algorithm as used in the Cephes/netlib math routines.
 142  */
 143 static __m128d
 144 gmx_mm_exp2_pd(__m128d x)
 145 {
 146     /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
 147     const __m128d arglimit = _mm_set1_pd(1022.0);
 148     const __m128i expbase  = _mm_set1_epi32(1023);
 149
 150     const __m128d P2       = _mm_set1_pd(2.30933477057345225087e-2);
 151     const __m128d P1       = _mm_set1_pd(2.02020656693165307700e1);
 152     const __m128d P0       = _mm_set1_pd(1.51390680115615096133e3);
 153     /* Q2 == 1.0 */
 154     const __m128d Q1       = _mm_set1_pd(2.33184211722314911771e2);
 155     const __m128d Q0       = _mm_set1_pd(4.36821166879210612817e3);
 156     const __m128d one      = _mm_set1_pd(1.0);
 157     const __m128d two      = _mm_set1_pd(2.0);
 158
 159     __m128d       valuemask;
 160     __m128i       iexppart;
 161     __m128d       fexppart;
 162     __m128d       intpart;
 163     __m128d       z, z2;
 164     __m128d       PolyP, PolyQ;
 165
 166     iexppart  = _mm_cvtpd_epi32(x);
 167     intpart   = _mm_cvtepi32_pd(iexppart);
 168
 169     /* The two lowest elements of iexppart now contains 32-bit numbers with a correctly biased exponent.
 170      * To be able to shift it into the exponent for a double precision number we first need to
 171      * shuffle so that the lower half contains the first element, and the upper half the second.
 172      * This should really be done as a zero-extension, but since the next instructions will shift
 173      * the registers left by 52 bits it doesn't matter what we put there - it will be shifted out.
 174      * (thus we just use element 2 from iexppart).
 175      */
 176     iexppart  = _mm_shuffle_epi32(iexppart, _MM_SHUFFLE(2, 1, 2, 0));
 177
 178     /* Do the shift operation on the 64-bit registers */
 179     iexppart  = _mm_add_epi32(iexppart, expbase);
 180     iexppart  = _mm_slli_epi64(iexppart, 52);
 181
 182     valuemask = _mm_cmpge_pd(arglimit, gmx_mm_abs_pd(x));
 183     fexppart  = _mm_and_pd(valuemask, gmx_mm_castsi128_pd(iexppart));
 184
 185     z         = _mm_sub_pd(x, intpart);
 186     z2        = _mm_mul_pd(z, z);
 187
 188     PolyP     = _mm_mul_pd(P2, z2);
 189     PolyP     = _mm_add_pd(PolyP, P1);
 190     PolyQ     = _mm_add_pd(z2, Q1);
 191     PolyP     = _mm_mul_pd(PolyP, z2);
 192     PolyQ     = _mm_mul_pd(PolyQ, z2);
 193     PolyP     = _mm_add_pd(PolyP, P0);
 194     PolyQ     = _mm_add_pd(PolyQ, Q0);
 195     PolyP     = _mm_mul_pd(PolyP, z);
 196
 197     z         = _mm_mul_pd(PolyP, gmx_mm_inv_pd(_mm_sub_pd(PolyQ, PolyP)));
 198     z         = _mm_add_pd(one, _mm_mul_pd(two, z));
 199
 200     z         = _mm_mul_pd(z, fexppart);
 201
 202     return z;
 203 }
 204
 205 /* Exponential function. This could be calculated from 2^x as Exp(x)=2^(y), where y=log2(e)*x,
 206  * but there will then be a small rounding error since we lose some precision due to the
 207  * multiplication. This will then be magnified a lot by the exponential.
 208  *
 209  * Instead, we calculate the fractional part directly as a Padé approximation of
 210  * Exp(z) on [-0.5,0.5]. We use extended precision arithmetics to calculate the fraction
 211  * remaining after 2^y, which avoids the precision-loss.
 212  */
 213 static __m128d
 214 gmx_mm_exp_pd(__m128d exparg)
 215 {
 216     const __m128d argscale = _mm_set1_pd(1.4426950408889634073599);
 217     /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
 218     const __m128d arglimit = _mm_set1_pd(1022.0);
 219     const __m128i expbase  = _mm_set1_epi32(1023);
 220
 221     const __m128d invargscale0  = _mm_set1_pd(6.93145751953125e-1);
 222     const __m128d invargscale1  = _mm_set1_pd(1.42860682030941723212e-6);
 223
 224     const __m128d P2       = _mm_set1_pd(1.26177193074810590878e-4);
 225     const __m128d P1       = _mm_set1_pd(3.02994407707441961300e-2);
 226     /* P0 == 1.0 */
 227     const __m128d Q3       = _mm_set1_pd(3.00198505138664455042E-6);
 228     const __m128d Q2       = _mm_set1_pd(2.52448340349684104192E-3);
 229     const __m128d Q1       = _mm_set1_pd(2.27265548208155028766E-1);
 230     /* Q0 == 2.0 */
 231     const __m128d one      = _mm_set1_pd(1.0);
 232     const __m128d two      = _mm_set1_pd(2.0);
 233
 234     __m128d       valuemask;
 235     __m128i       iexppart;
 236     __m128d       fexppart;
 237     __m128d       intpart;
 238     __m128d       x, z, z2;
 239     __m128d       PolyP, PolyQ;
 240
 241     x             = _mm_mul_pd(exparg, argscale);
 242
 243     iexppart  = _mm_cvtpd_epi32(x);
 244     intpart   = _mm_cvtepi32_pd(iexppart);
 245
 246     /* The two lowest elements of iexppart now contains 32-bit numbers with a correctly biased exponent.
 247      * To be able to shift it into the exponent for a double precision number we first need to
 248      * shuffle so that the lower half contains the first element, and the upper half the second.
 249      * This should really be done as a zero-extension, but since the next instructions will shift
 250      * the registers left by 52 bits it doesn't matter what we put there - it will be shifted out.
 251      * (thus we just use element 2 from iexppart).
 252      */
 253     iexppart  = _mm_shuffle_epi32(iexppart, _MM_SHUFFLE(2, 1, 2, 0));
 254
 255     /* Do the shift operation on the 64-bit registers */
 256     iexppart  = _mm_add_epi32(iexppart, expbase);
 257     iexppart  = _mm_slli_epi64(iexppart, 52);
 258
 259     valuemask = _mm_cmpge_pd(arglimit, gmx_mm_abs_pd(x));
 260     fexppart  = _mm_and_pd(valuemask, gmx_mm_castsi128_pd(iexppart));
 261
 262     z         = _mm_sub_pd(exparg, _mm_mul_pd(invargscale0, intpart));
 263     z         = _mm_sub_pd(z, _mm_mul_pd(invargscale1, intpart));
 264
 265     z2        = _mm_mul_pd(z, z);
 266
 267     PolyQ     = _mm_mul_pd(Q3, z2);
 268     PolyQ     = _mm_add_pd(PolyQ, Q2);
 269     PolyP     = _mm_mul_pd(P2, z2);
 270     PolyQ     = _mm_mul_pd(PolyQ, z2);
 271     PolyP     = _mm_add_pd(PolyP, P1);
 272     PolyQ     = _mm_add_pd(PolyQ, Q1);
 273     PolyP     = _mm_mul_pd(PolyP, z2);
 274     PolyQ     = _mm_mul_pd(PolyQ, z2);
 275     PolyP     = _mm_add_pd(PolyP, one);
 276     PolyQ     = _mm_add_pd(PolyQ, two);
 277
 278     PolyP     = _mm_mul_pd(PolyP, z);
 279
 280     z         = _mm_mul_pd(PolyP, gmx_mm_inv_pd(_mm_sub_pd(PolyQ, PolyP)));
 281     z         = _mm_add_pd(one, _mm_mul_pd(two, z));
 282
 283     z         = _mm_mul_pd(z, fexppart);
 284
 285     return z;
 286 }
 287
 288
 289
 290 static __m128d
 291 gmx_mm_log_pd(__m128d x)
 292 {
 293     /* Same algorithm as cephes library */
 294     const __m128d expmask    = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000) );
 295
 296     const __m128i expbase_m1 = _mm_set1_epi32(1023-1); /* We want non-IEEE format */
 297
 298     const __m128d half       = _mm_set1_pd(0.5);
 299     const __m128d one        = _mm_set1_pd(1.0);
 300     const __m128d two        = _mm_set1_pd(2.0);
 301     const __m128d invsq2     = _mm_set1_pd(1.0/sqrt(2.0));
 302
 303     const __m128d corr1      = _mm_set1_pd(-2.121944400546905827679e-4);
 304     const __m128d corr2      = _mm_set1_pd(0.693359375);
 305
 306     const __m128d P5         = _mm_set1_pd(1.01875663804580931796e-4);
 307     const __m128d P4         = _mm_set1_pd(4.97494994976747001425e-1);
 308     const __m128d P3         = _mm_set1_pd(4.70579119878881725854e0);
 309     const __m128d P2         = _mm_set1_pd(1.44989225341610930846e1);
 310     const __m128d P1         = _mm_set1_pd(1.79368678507819816313e1);
 311     const __m128d P0         = _mm_set1_pd(7.70838733755885391666e0);
 312
 313     const __m128d Q4         = _mm_set1_pd(1.12873587189167450590e1);
 314     const __m128d Q3         = _mm_set1_pd(4.52279145837532221105e1);
 315     const __m128d Q2         = _mm_set1_pd(8.29875266912776603211e1);
 316     const __m128d Q1         = _mm_set1_pd(7.11544750618563894466e1);
 317     const __m128d Q0         = _mm_set1_pd(2.31251620126765340583e1);
 318
 319     const __m128d R2         = _mm_set1_pd(-7.89580278884799154124e-1);
 320     const __m128d R1         = _mm_set1_pd(1.63866645699558079767e1);
 321     const __m128d R0         = _mm_set1_pd(-6.41409952958715622951e1);
 322
 323     const __m128d S2         = _mm_set1_pd(-3.56722798256324312549E1);
 324     const __m128d S1         = _mm_set1_pd(3.12093766372244180303E2);
 325     const __m128d S0         = _mm_set1_pd(-7.69691943550460008604E2);
 326
 327     __m128d       fexp;
 328     __m128i       iexp;
 329
 330     __m128d       mask1, mask2;
 331     __m128d       corr, t1, t2, q;
 332     __m128d       zA, yA, xA, zB, yB, xB, z;
 333     __m128d       polyR, polyS;
 334     __m128d       polyP1, polyP2, polyQ1, polyQ2;
 335
 336     /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
 337     fexp   = _mm_and_pd(x, expmask);
 338     iexp   = gmx_mm_castpd_si128(fexp);
 339     iexp   = _mm_srli_epi64(iexp, 52);
 340     iexp   = _mm_sub_epi32(iexp, expbase_m1);
 341     iexp   = _mm_shuffle_epi32(iexp, _MM_SHUFFLE(1, 1, 2, 0) );
 342     fexp   = _mm_cvtepi32_pd(iexp);
 343
 344     x      = _mm_andnot_pd(expmask, x);
 345     x      = _mm_or_pd(x, one);
 346     x      = _mm_mul_pd(x, half);
 347
 348     mask1     = _mm_cmpgt_pd(gmx_mm_abs_pd(fexp), two);
 349     mask2     = _mm_cmplt_pd(x, invsq2);
 350
 351     fexp   = _mm_sub_pd(fexp, _mm_and_pd(mask2, one));
 352
 353     /* If mask1 is set ('A') */
 354     zA     = _mm_sub_pd(x, half);
 355     t1     = _mm_or_pd( _mm_andnot_pd(mask2, zA), _mm_and_pd(mask2, x) );
 356     zA     = _mm_sub_pd(t1, half);
 357     t2     = _mm_or_pd( _mm_andnot_pd(mask2, x), _mm_and_pd(mask2, zA) );
 358     yA     = _mm_mul_pd(half, _mm_add_pd(t2, one));
 359
 360     xA     = _mm_mul_pd(zA, gmx_mm_inv_pd(yA));
 361     zA     = _mm_mul_pd(xA, xA);
 362
 363     /* EVALUATE POLY */
 364     polyR  = _mm_mul_pd(R2, zA);
 365     polyR  = _mm_add_pd(polyR, R1);
 366     polyR  = _mm_mul_pd(polyR, zA);
 367     polyR  = _mm_add_pd(polyR, R0);
 368
 369     polyS  = _mm_add_pd(zA, S2);
 370     polyS  = _mm_mul_pd(polyS, zA);
 371     polyS  = _mm_add_pd(polyS, S1);
 372     polyS  = _mm_mul_pd(polyS, zA);
 373     polyS  = _mm_add_pd(polyS, S0);
 374
 375     q      = _mm_mul_pd(polyR, gmx_mm_inv_pd(polyS));
 376     zA     = _mm_mul_pd(_mm_mul_pd(xA, zA), q);
 377
 378     zA     = _mm_add_pd(zA, _mm_mul_pd(corr1, fexp));
 379     zA     = _mm_add_pd(zA, xA);
 380     zA     = _mm_add_pd(zA, _mm_mul_pd(corr2, fexp));
 381
 382     /* If mask1 is not set ('B') */
 383     corr   = _mm_and_pd(mask2, x);
 384     xB     = _mm_add_pd(x, corr);
 385     xB     = _mm_sub_pd(xB, one);
 386     zB     = _mm_mul_pd(xB, xB);
 387
 388     polyP1 = _mm_mul_pd(P5, zB);
 389     polyP2 = _mm_mul_pd(P4, zB);
 390     polyP1 = _mm_add_pd(polyP1, P3);
 391     polyP2 = _mm_add_pd(polyP2, P2);
 392     polyP1 = _mm_mul_pd(polyP1, zB);
 393     polyP2 = _mm_mul_pd(polyP2, zB);
 394     polyP1 = _mm_add_pd(polyP1, P1);
 395     polyP2 = _mm_add_pd(polyP2, P0);
 396     polyP1 = _mm_mul_pd(polyP1, xB);
 397     polyP1 = _mm_add_pd(polyP1, polyP2);
 398
 399     polyQ2 = _mm_mul_pd(Q4, zB);
 400     polyQ1 = _mm_add_pd(zB, Q3);
 401     polyQ2 = _mm_add_pd(polyQ2, Q2);
 402     polyQ1 = _mm_mul_pd(polyQ1, zB);
 403     polyQ2 = _mm_mul_pd(polyQ2, zB);
 404     polyQ1 = _mm_add_pd(polyQ1, Q1);
 405     polyQ2 = _mm_add_pd(polyQ2, Q0);
 406     polyQ1 = _mm_mul_pd(polyQ1, xB);
 407     polyQ1 = _mm_add_pd(polyQ1, polyQ2);
 408
 409     fexp   = _mm_and_pd(fexp, _mm_cmpneq_pd(fexp, _mm_setzero_pd()));
 410
 411     q      = _mm_mul_pd(polyP1, gmx_mm_inv_pd(polyQ1));
 412     yB     = _mm_mul_pd(_mm_mul_pd(xB, zB), q);
 413
 414     yB     = _mm_add_pd(yB, _mm_mul_pd(corr1, fexp));
 415     yB     = _mm_sub_pd(yB, _mm_mul_pd(half, zB));
 416     zB     = _mm_add_pd(xB, yB);
 417     zB     = _mm_add_pd(zB, _mm_mul_pd(corr2, fexp));
 418
 419     z      = _mm_or_pd( _mm_andnot_pd(mask1, zB), _mm_and_pd(mask1, zA) );
 420
 421     return z;
 422 }
 423
 424
 425
 426 static __m128d
 427 gmx_mm_erf_pd(__m128d x)
 428 {
 429     /* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75] */
 430     const __m128d CAP4      = _mm_set1_pd(-0.431780540597889301512e-4);
 431     const __m128d CAP3      = _mm_set1_pd(-0.00578562306260059236059);
 432     const __m128d CAP2      = _mm_set1_pd(-0.028593586920219752446);
 433     const __m128d CAP1      = _mm_set1_pd(-0.315924962948621698209);
 434     const __m128d CAP0      = _mm_set1_pd(0.14952975608477029151);
 435
 436     const __m128d CAQ5      = _mm_set1_pd(-0.374089300177174709737e-5);
 437     const __m128d CAQ4      = _mm_set1_pd(0.00015126584532155383535);
 438     const __m128d CAQ3      = _mm_set1_pd(0.00536692680669480725423);
 439     const __m128d CAQ2      = _mm_set1_pd(0.0668686825594046122636);
 440     const __m128d CAQ1      = _mm_set1_pd(0.402604990869284362773);
 441     /* CAQ0 == 1.0 */
 442     const __m128d CAoffset  = _mm_set1_pd(0.9788494110107421875);
 443
 444     /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5] */
 445     const __m128d CBP6      = _mm_set1_pd(2.49650423685462752497647637088e-10);
 446     const __m128d CBP5      = _mm_set1_pd(0.00119770193298159629350136085658);
 447     const __m128d CBP4      = _mm_set1_pd(0.0164944422378370965881008942733);
 448     const __m128d CBP3      = _mm_set1_pd(0.0984581468691775932063932439252);
 449     const __m128d CBP2      = _mm_set1_pd(0.317364595806937763843589437418);
 450     const __m128d CBP1      = _mm_set1_pd(0.554167062641455850932670067075);
 451     const __m128d CBP0      = _mm_set1_pd(0.427583576155807163756925301060);
 452     const __m128d CBQ7      = _mm_set1_pd(0.00212288829699830145976198384930);
 453     const __m128d CBQ6      = _mm_set1_pd(0.0334810979522685300554606393425);
 454     const __m128d CBQ5      = _mm_set1_pd(0.2361713785181450957579508850717);
 455     const __m128d CBQ4      = _mm_set1_pd(0.955364736493055670530981883072);
 456     const __m128d CBQ3      = _mm_set1_pd(2.36815675631420037315349279199);
 457     const __m128d CBQ2      = _mm_set1_pd(3.55261649184083035537184223542);
 458     const __m128d CBQ1      = _mm_set1_pd(2.93501136050160872574376997993);
 459     /* CBQ0 == 1.0 */
 460
 461     /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf] */
 462     const __m128d CCP6      = _mm_set1_pd(-2.8175401114513378771);
 463     const __m128d CCP5      = _mm_set1_pd(-3.22729451764143718517);
 464     const __m128d CCP4      = _mm_set1_pd(-2.5518551727311523996);
 465     const __m128d CCP3      = _mm_set1_pd(-0.687717681153649930619);
 466     const __m128d CCP2      = _mm_set1_pd(-0.212652252872804219852);
 467     const __m128d CCP1      = _mm_set1_pd(0.0175389834052493308818);
 468     const __m128d CCP0      = _mm_set1_pd(0.00628057170626964891937);
 469
 470     const __m128d CCQ6      = _mm_set1_pd(5.48409182238641741584);
 471     const __m128d CCQ5      = _mm_set1_pd(13.5064170191802889145);
 472     const __m128d CCQ4      = _mm_set1_pd(22.9367376522880577224);
 473     const __m128d CCQ3      = _mm_set1_pd(15.930646027911794143);
 474     const __m128d CCQ2      = _mm_set1_pd(11.0567237927800161565);
 475     const __m128d CCQ1      = _mm_set1_pd(2.79257750980575282228);
 476     /* CCQ0 == 1.0 */
 477     const __m128d CCoffset  = _mm_set1_pd(0.5579090118408203125);
 478
 479     const __m128d one       = _mm_set1_pd(1.0);
 480     const __m128d two       = _mm_set1_pd(2.0);
 481
 482     const __m128d signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000, 0x00000000, 0x80000000, 0x00000000) );
 483
 484     __m128d       xabs, x2, x4, t, t2, w, w2;
 485     __m128d       PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
 486     __m128d       PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
 487     __m128d       PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
 488     __m128d       res_erf, res_erfcB, res_erfcC, res_erfc, res;
 489     __m128d       mask, expmx2;
 490
 491     /* Calculate erf() */
 492     xabs     = gmx_mm_abs_pd(x);
 493     x2       = _mm_mul_pd(x, x);
 494     x4       = _mm_mul_pd(x2, x2);
 495
 496     PolyAP0  = _mm_mul_pd(CAP4, x4);
 497     PolyAP1  = _mm_mul_pd(CAP3, x4);
 498     PolyAP0  = _mm_add_pd(PolyAP0, CAP2);
 499     PolyAP1  = _mm_add_pd(PolyAP1, CAP1);
 500     PolyAP0  = _mm_mul_pd(PolyAP0, x4);
 501     PolyAP1  = _mm_mul_pd(PolyAP1, x2);
 502     PolyAP0  = _mm_add_pd(PolyAP0, CAP0);
 503     PolyAP0  = _mm_add_pd(PolyAP0, PolyAP1);
 504
 505     PolyAQ1  = _mm_mul_pd(CAQ5, x4);
 506     PolyAQ0  = _mm_mul_pd(CAQ4, x4);
 507     PolyAQ1  = _mm_add_pd(PolyAQ1, CAQ3);
 508     PolyAQ0  = _mm_add_pd(PolyAQ0, CAQ2);
 509     PolyAQ1  = _mm_mul_pd(PolyAQ1, x4);
 510     PolyAQ0  = _mm_mul_pd(PolyAQ0, x4);
 511     PolyAQ1  = _mm_add_pd(PolyAQ1, CAQ1);
 512     PolyAQ0  = _mm_add_pd(PolyAQ0, one);
 513     PolyAQ1  = _mm_mul_pd(PolyAQ1, x2);
 514     PolyAQ0  = _mm_add_pd(PolyAQ0, PolyAQ1);
 515
 516     res_erf  = _mm_mul_pd(PolyAP0, gmx_mm_inv_pd(PolyAQ0));
 517     res_erf  = _mm_add_pd(CAoffset, res_erf);
 518     res_erf  = _mm_mul_pd(x, res_erf);
 519
 520     /* Calculate erfc() in range [1,4.5] */
 521     t       = _mm_sub_pd(xabs, one);
 522     t2      = _mm_mul_pd(t, t);
 523
 524     PolyBP0  = _mm_mul_pd(CBP6, t2);
 525     PolyBP1  = _mm_mul_pd(CBP5, t2);
 526     PolyBP0  = _mm_add_pd(PolyBP0, CBP4);
 527     PolyBP1  = _mm_add_pd(PolyBP1, CBP3);
 528     PolyBP0  = _mm_mul_pd(PolyBP0, t2);
 529     PolyBP1  = _mm_mul_pd(PolyBP1, t2);
 530     PolyBP0  = _mm_add_pd(PolyBP0, CBP2);
 531     PolyBP1  = _mm_add_pd(PolyBP1, CBP1);
 532     PolyBP0  = _mm_mul_pd(PolyBP0, t2);
 533     PolyBP1  = _mm_mul_pd(PolyBP1, t);
 534     PolyBP0  = _mm_add_pd(PolyBP0, CBP0);
 535     PolyBP0  = _mm_add_pd(PolyBP0, PolyBP1);
 536
 537     PolyBQ1 = _mm_mul_pd(CBQ7, t2);
 538     PolyBQ0 = _mm_mul_pd(CBQ6, t2);
 539     PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ5);
 540     PolyBQ0 = _mm_add_pd(PolyBQ0, CBQ4);
 541     PolyBQ1 = _mm_mul_pd(PolyBQ1, t2);
 542     PolyBQ0 = _mm_mul_pd(PolyBQ0, t2);
 543     PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ3);
 544     PolyBQ0 = _mm_add_pd(PolyBQ0, CBQ2);
 545     PolyBQ1 = _mm_mul_pd(PolyBQ1, t2);
 546     PolyBQ0 = _mm_mul_pd(PolyBQ0, t2);
 547     PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ1);
 548     PolyBQ0 = _mm_add_pd(PolyBQ0, one);
 549     PolyBQ1 = _mm_mul_pd(PolyBQ1, t);
 550     PolyBQ0 = _mm_add_pd(PolyBQ0, PolyBQ1);
 551
 552     res_erfcB = _mm_mul_pd(PolyBP0, gmx_mm_inv_pd(PolyBQ0));
 553
 554     res_erfcB = _mm_mul_pd(res_erfcB, xabs);
 555
 556     /* Calculate erfc() in range [4.5,inf] */
 557     w       = gmx_mm_inv_pd(xabs);
 558     w2      = _mm_mul_pd(w, w);
 559
 560     PolyCP0  = _mm_mul_pd(CCP6, w2);
 561     PolyCP1  = _mm_mul_pd(CCP5, w2);
 562     PolyCP0  = _mm_add_pd(PolyCP0, CCP4);
 563     PolyCP1  = _mm_add_pd(PolyCP1, CCP3);
 564     PolyCP0  = _mm_mul_pd(PolyCP0, w2);
 565     PolyCP1  = _mm_mul_pd(PolyCP1, w2);
 566     PolyCP0  = _mm_add_pd(PolyCP0, CCP2);
 567     PolyCP1  = _mm_add_pd(PolyCP1, CCP1);
 568     PolyCP0  = _mm_mul_pd(PolyCP0, w2);
 569     PolyCP1  = _mm_mul_pd(PolyCP1, w);
 570     PolyCP0  = _mm_add_pd(PolyCP0, CCP0);
 571     PolyCP0  = _mm_add_pd(PolyCP0, PolyCP1);
 572
 573     PolyCQ0  = _mm_mul_pd(CCQ6, w2);
 574     PolyCQ1  = _mm_mul_pd(CCQ5, w2);
 575     PolyCQ0  = _mm_add_pd(PolyCQ0, CCQ4);
 576     PolyCQ1  = _mm_add_pd(PolyCQ1, CCQ3);
 577     PolyCQ0  = _mm_mul_pd(PolyCQ0, w2);
 578     PolyCQ1  = _mm_mul_pd(PolyCQ1, w2);
 579     PolyCQ0  = _mm_add_pd(PolyCQ0, CCQ2);
 580     PolyCQ1  = _mm_add_pd(PolyCQ1, CCQ1);
 581     PolyCQ0  = _mm_mul_pd(PolyCQ0, w2);
 582     PolyCQ1  = _mm_mul_pd(PolyCQ1, w);
 583     PolyCQ0  = _mm_add_pd(PolyCQ0, one);
 584     PolyCQ0  = _mm_add_pd(PolyCQ0, PolyCQ1);
 585
 586     expmx2   = gmx_mm_exp_pd( _mm_or_pd(signbit, x2) );
 587
 588     res_erfcC = _mm_mul_pd(PolyCP0, gmx_mm_inv_pd(PolyCQ0));
 589     res_erfcC = _mm_add_pd(res_erfcC, CCoffset);
 590     res_erfcC = _mm_mul_pd(res_erfcC, w);
 591
 592     mask     = _mm_cmpgt_pd(xabs, _mm_set1_pd(4.5));
 593     res_erfc = _mm_or_pd(_mm_andnot_pd(mask, res_erfcB), _mm_and_pd(mask, res_erfcC));
 594
 595     res_erfc = _mm_mul_pd(res_erfc, expmx2);
 596
 597     /* erfc(x<0) = 2-erfc(|x|) */
 598     mask     = _mm_cmplt_pd(x, _mm_setzero_pd());
 599     res_erfc = _mm_or_pd(_mm_andnot_pd(mask, res_erfc), _mm_and_pd(mask, _mm_sub_pd(two, res_erfc)));
 600
 601     /* Select erf() or erfc() */
 602     mask = _mm_cmplt_pd(xabs, one);
 603     res  = _mm_or_pd(_mm_andnot_pd(mask, _mm_sub_pd(one, res_erfc)), _mm_and_pd(mask, res_erf));
 604
 605     return res;
 606 }
 607
 608
 609 static __m128d
 610 gmx_mm_erfc_pd(__m128d x)
 611 {
 612     /* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75] */
 613     const __m128d CAP4      = _mm_set1_pd(-0.431780540597889301512e-4);
 614     const __m128d CAP3      = _mm_set1_pd(-0.00578562306260059236059);
 615     const __m128d CAP2      = _mm_set1_pd(-0.028593586920219752446);
 616     const __m128d CAP1      = _mm_set1_pd(-0.315924962948621698209);
 617     const __m128d CAP0      = _mm_set1_pd(0.14952975608477029151);
 618
 619     const __m128d CAQ5      = _mm_set1_pd(-0.374089300177174709737e-5);
 620     const __m128d CAQ4      = _mm_set1_pd(0.00015126584532155383535);
 621     const __m128d CAQ3      = _mm_set1_pd(0.00536692680669480725423);
 622     const __m128d CAQ2      = _mm_set1_pd(0.0668686825594046122636);
 623     const __m128d CAQ1      = _mm_set1_pd(0.402604990869284362773);
 624     /* CAQ0 == 1.0 */
 625     const __m128d CAoffset  = _mm_set1_pd(0.9788494110107421875);
 626
 627     /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5] */
 628     const __m128d CBP6      = _mm_set1_pd(2.49650423685462752497647637088e-10);
 629     const __m128d CBP5      = _mm_set1_pd(0.00119770193298159629350136085658);
 630     const __m128d CBP4      = _mm_set1_pd(0.0164944422378370965881008942733);
 631     const __m128d CBP3      = _mm_set1_pd(0.0984581468691775932063932439252);
 632     const __m128d CBP2      = _mm_set1_pd(0.317364595806937763843589437418);
 633     const __m128d CBP1      = _mm_set1_pd(0.554167062641455850932670067075);
 634     const __m128d CBP0      = _mm_set1_pd(0.427583576155807163756925301060);
 635     const __m128d CBQ7      = _mm_set1_pd(0.00212288829699830145976198384930);
 636     const __m128d CBQ6      = _mm_set1_pd(0.0334810979522685300554606393425);
 637     const __m128d CBQ5      = _mm_set1_pd(0.2361713785181450957579508850717);
 638     const __m128d CBQ4      = _mm_set1_pd(0.955364736493055670530981883072);
 639     const __m128d CBQ3      = _mm_set1_pd(2.36815675631420037315349279199);
 640     const __m128d CBQ2      = _mm_set1_pd(3.55261649184083035537184223542);
 641     const __m128d CBQ1      = _mm_set1_pd(2.93501136050160872574376997993);
 642     /* CBQ0 == 1.0 */
 643
 644     /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf] */
 645     const __m128d CCP6      = _mm_set1_pd(-2.8175401114513378771);
 646     const __m128d CCP5      = _mm_set1_pd(-3.22729451764143718517);
 647     const __m128d CCP4      = _mm_set1_pd(-2.5518551727311523996);
 648     const __m128d CCP3      = _mm_set1_pd(-0.687717681153649930619);
 649     const __m128d CCP2      = _mm_set1_pd(-0.212652252872804219852);
 650     const __m128d CCP1      = _mm_set1_pd(0.0175389834052493308818);
 651     const __m128d CCP0      = _mm_set1_pd(0.00628057170626964891937);
 652
 653     const __m128d CCQ6      = _mm_set1_pd(5.48409182238641741584);
 654     const __m128d CCQ5      = _mm_set1_pd(13.5064170191802889145);
 655     const __m128d CCQ4      = _mm_set1_pd(22.9367376522880577224);
 656     const __m128d CCQ3      = _mm_set1_pd(15.930646027911794143);
 657     const __m128d CCQ2      = _mm_set1_pd(11.0567237927800161565);
 658     const __m128d CCQ1      = _mm_set1_pd(2.79257750980575282228);
 659     /* CCQ0 == 1.0 */
 660     const __m128d CCoffset  = _mm_set1_pd(0.5579090118408203125);
 661
 662     const __m128d one       = _mm_set1_pd(1.0);
 663     const __m128d two       = _mm_set1_pd(2.0);
 664
 665     const __m128d signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000, 0x00000000, 0x80000000, 0x00000000) );
 666
 667     __m128d       xabs, x2, x4, t, t2, w, w2;
 668     __m128d       PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
 669     __m128d       PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
 670     __m128d       PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
 671     __m128d       res_erf, res_erfcB, res_erfcC, res_erfc, res;
 672     __m128d       mask, expmx2;
 673
 674     /* Calculate erf() */
 675     xabs     = gmx_mm_abs_pd(x);
 676     x2       = _mm_mul_pd(x, x);
 677     x4       = _mm_mul_pd(x2, x2);
 678
 679     PolyAP0  = _mm_mul_pd(CAP4, x4);
 680     PolyAP1  = _mm_mul_pd(CAP3, x4);
 681     PolyAP0  = _mm_add_pd(PolyAP0, CAP2);
 682     PolyAP1  = _mm_add_pd(PolyAP1, CAP1);
 683     PolyAP0  = _mm_mul_pd(PolyAP0, x4);
 684     PolyAP1  = _mm_mul_pd(PolyAP1, x2);
 685     PolyAP0  = _mm_add_pd(PolyAP0, CAP0);
 686     PolyAP0  = _mm_add_pd(PolyAP0, PolyAP1);
 687
 688     PolyAQ1  = _mm_mul_pd(CAQ5, x4);
 689     PolyAQ0  = _mm_mul_pd(CAQ4, x4);
 690     PolyAQ1  = _mm_add_pd(PolyAQ1, CAQ3);
 691     PolyAQ0  = _mm_add_pd(PolyAQ0, CAQ2);
 692     PolyAQ1  = _mm_mul_pd(PolyAQ1, x4);
 693     PolyAQ0  = _mm_mul_pd(PolyAQ0, x4);
 694     PolyAQ1  = _mm_add_pd(PolyAQ1, CAQ1);
 695     PolyAQ0  = _mm_add_pd(PolyAQ0, one);
 696     PolyAQ1  = _mm_mul_pd(PolyAQ1, x2);
 697     PolyAQ0  = _mm_add_pd(PolyAQ0, PolyAQ1);
 698
 699     res_erf  = _mm_mul_pd(PolyAP0, gmx_mm_inv_pd(PolyAQ0));
 700     res_erf  = _mm_add_pd(CAoffset, res_erf);
 701     res_erf  = _mm_mul_pd(x, res_erf);
 702
 703     /* Calculate erfc() in range [1,4.5] */
 704     t       = _mm_sub_pd(xabs, one);
 705     t2      = _mm_mul_pd(t, t);
 706
 707     PolyBP0  = _mm_mul_pd(CBP6, t2);
 708     PolyBP1  = _mm_mul_pd(CBP5, t2);
 709     PolyBP0  = _mm_add_pd(PolyBP0, CBP4);
 710     PolyBP1  = _mm_add_pd(PolyBP1, CBP3);
 711     PolyBP0  = _mm_mul_pd(PolyBP0, t2);
 712     PolyBP1  = _mm_mul_pd(PolyBP1, t2);
 713     PolyBP0  = _mm_add_pd(PolyBP0, CBP2);
 714     PolyBP1  = _mm_add_pd(PolyBP1, CBP1);
 715     PolyBP0  = _mm_mul_pd(PolyBP0, t2);
 716     PolyBP1  = _mm_mul_pd(PolyBP1, t);
 717     PolyBP0  = _mm_add_pd(PolyBP0, CBP0);
 718     PolyBP0  = _mm_add_pd(PolyBP0, PolyBP1);
 719
 720     PolyBQ1 = _mm_mul_pd(CBQ7, t2);
 721     PolyBQ0 = _mm_mul_pd(CBQ6, t2);
 722     PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ5);
 723     PolyBQ0 = _mm_add_pd(PolyBQ0, CBQ4);
 724     PolyBQ1 = _mm_mul_pd(PolyBQ1, t2);
 725     PolyBQ0 = _mm_mul_pd(PolyBQ0, t2);
 726     PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ3);
 727     PolyBQ0 = _mm_add_pd(PolyBQ0, CBQ2);
 728     PolyBQ1 = _mm_mul_pd(PolyBQ1, t2);
 729     PolyBQ0 = _mm_mul_pd(PolyBQ0, t2);
 730     PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ1);
 731     PolyBQ0 = _mm_add_pd(PolyBQ0, one);
 732     PolyBQ1 = _mm_mul_pd(PolyBQ1, t);
 733     PolyBQ0 = _mm_add_pd(PolyBQ0, PolyBQ1);
 734
 735     res_erfcB = _mm_mul_pd(PolyBP0, gmx_mm_inv_pd(PolyBQ0));
 736
 737     res_erfcB = _mm_mul_pd(res_erfcB, xabs);
 738
 739     /* Calculate erfc() in range [4.5,inf] */
 740     w       = gmx_mm_inv_pd(xabs);
 741     w2      = _mm_mul_pd(w, w);
 742
 743     PolyCP0  = _mm_mul_pd(CCP6, w2);
 744     PolyCP1  = _mm_mul_pd(CCP5, w2);
 745     PolyCP0  = _mm_add_pd(PolyCP0, CCP4);
 746     PolyCP1  = _mm_add_pd(PolyCP1, CCP3);
 747     PolyCP0  = _mm_mul_pd(PolyCP0, w2);
 748     PolyCP1  = _mm_mul_pd(PolyCP1, w2);
 749     PolyCP0  = _mm_add_pd(PolyCP0, CCP2);
 750     PolyCP1  = _mm_add_pd(PolyCP1, CCP1);
 751     PolyCP0  = _mm_mul_pd(PolyCP0, w2);
 752     PolyCP1  = _mm_mul_pd(PolyCP1, w);
 753     PolyCP0  = _mm_add_pd(PolyCP0, CCP0);
 754     PolyCP0  = _mm_add_pd(PolyCP0, PolyCP1);
 755
 756     PolyCQ0  = _mm_mul_pd(CCQ6, w2);
 757     PolyCQ1  = _mm_mul_pd(CCQ5, w2);
 758     PolyCQ0  = _mm_add_pd(PolyCQ0, CCQ4);
 759     PolyCQ1  = _mm_add_pd(PolyCQ1, CCQ3);
 760     PolyCQ0  = _mm_mul_pd(PolyCQ0, w2);
 761     PolyCQ1  = _mm_mul_pd(PolyCQ1, w2);
 762     PolyCQ0  = _mm_add_pd(PolyCQ0, CCQ2);
 763     PolyCQ1  = _mm_add_pd(PolyCQ1, CCQ1);
 764     PolyCQ0  = _mm_mul_pd(PolyCQ0, w2);
 765     PolyCQ1  = _mm_mul_pd(PolyCQ1, w);
 766     PolyCQ0  = _mm_add_pd(PolyCQ0, one);
 767     PolyCQ0  = _mm_add_pd(PolyCQ0, PolyCQ1);
 768
 769     expmx2   = gmx_mm_exp_pd( _mm_or_pd(signbit, x2) );
 770
 771     res_erfcC = _mm_mul_pd(PolyCP0, gmx_mm_inv_pd(PolyCQ0));
 772     res_erfcC = _mm_add_pd(res_erfcC, CCoffset);
 773     res_erfcC = _mm_mul_pd(res_erfcC, w);
 774
 775     mask     = _mm_cmpgt_pd(xabs, _mm_set1_pd(4.5));
 776     res_erfc = _mm_or_pd(_mm_andnot_pd(mask, res_erfcB), _mm_and_pd(mask, res_erfcC));
 777
 778     res_erfc = _mm_mul_pd(res_erfc, expmx2);
 779
 780     /* erfc(x<0) = 2-erfc(|x|) */
 781     mask     = _mm_cmplt_pd(x, _mm_setzero_pd());
 782     res_erfc = _mm_or_pd(_mm_andnot_pd(mask, res_erfc), _mm_and_pd(mask, _mm_sub_pd(two, res_erfc)));
 783
 784     /* Select erf() or erfc() */
 785     mask = _mm_cmplt_pd(xabs, one);
 786     res  = _mm_or_pd(_mm_andnot_pd(mask, res_erfc), _mm_and_pd(mask, _mm_sub_pd(one, res_erf)));
 787
 788     return res;
 789 }
 790
 791
 792 /* Calculate the force correction due to PME analytically.
 793  *
 794  * This routine is meant to enable analytical evaluation of the
 795  * direct-space PME electrostatic force to avoid tables.
 796  *
 797  * The direct-space potential should be Erfc(beta*r)/r, but there
 798  * are some problems evaluating that:
 799  *
 800  * First, the error function is difficult (read: expensive) to
 801  * approxmiate accurately for intermediate to large arguments, and
 802  * this happens already in ranges of beta*r that occur in simulations.
 803  * Second, we now try to avoid calculating potentials in Gromacs but
 804  * use forces directly.
 805  *
 806  * We can simply things slight by noting that the PME part is really
 807  * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
 808  *
 809  * V= 1/r - Erf(beta*r)/r
 810  *
 811  * The first term we already have from the inverse square root, so
 812  * that we can leave out of this routine.
 813  *
 814  * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
 815  * the argument beta*r will be in the range 0.15 to ~4. Use your
 816  * favorite plotting program to realize how well-behaved Erf(z)/z is
 817  * in this range!
 818  *
 819  * We approximate f(z)=erf(z)/z with a rational minimax polynomial.
 820  * However, it turns out it is more efficient to approximate f(z)/z and
 821  * then only use even powers. This is another minor optimization, since
 822  * we actually WANT f(z)/z, because it is going to be multiplied by
 823  * the vector between the two atoms to get the vectorial force. The
 824  * fastest flops are the ones we can avoid calculating!
 825  *
 826  * So, here's how it should be used:
 827  *
 828  * 1. Calculate r^2.
 829  * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
 830  * 3. Evaluate this routine with z^2 as the argument.
 831  * 4. The return value is the expression:
 832  *
 833  *
 834  *       2*exp(-z^2)     erf(z)
 835  *       ------------ - --------
 836  *       sqrt(Pi)*z^2      z^3
 837  *
 838  * 5. Multiply the entire expression by beta^3. This will get you
 839  *
 840  *       beta^3*2*exp(-z^2)     beta^3*erf(z)
 841  *       ------------------  - ---------------
 842  *          sqrt(Pi)*z^2            z^3
 843  *
 844  *    or, switching back to r (z=r*beta):
 845  *
 846  *       2*beta*exp(-r^2*beta^2)   erf(r*beta)
 847  *       ----------------------- - -----------
 848  *            sqrt(Pi)*r^2            r^3
 849  *
 850  *
 851  *    With a bit of math exercise you should be able to confirm that
 852  *    this is exactly D[Erf[beta*r]/r,r] divided by r another time.
 853  *
 854  * 6. Add the result to 1/r^3, multiply by the product of the charges,
 855  *    and you have your force (divided by r). A final multiplication
 856  *    with the vector connecting the two particles and you have your
 857  *    vectorial force to add to the particles.
 858  *
 859  */
 860 static __m128d
 861 gmx_mm_pmecorrF_pd(__m128d z2)
 862 {
 863     const __m128d  FN10     = _mm_set1_pd(-8.0072854618360083154e-14);
 864     const __m128d  FN9      = _mm_set1_pd(1.1859116242260148027e-11);
 865     const __m128d  FN8      = _mm_set1_pd(-8.1490406329798423616e-10);
 866     const __m128d  FN7      = _mm_set1_pd(3.4404793543907847655e-8);
 867     const __m128d  FN6      = _mm_set1_pd(-9.9471420832602741006e-7);
 868     const __m128d  FN5      = _mm_set1_pd(0.000020740315999115847456);
 869     const __m128d  FN4      = _mm_set1_pd(-0.00031991745139313364005);
 870     const __m128d  FN3      = _mm_set1_pd(0.0035074449373659008203);
 871     const __m128d  FN2      = _mm_set1_pd(-0.031750380176100813405);
 872     const __m128d  FN1      = _mm_set1_pd(0.13884101728898463426);
 873     const __m128d  FN0      = _mm_set1_pd(-0.75225277815249618847);
 874
 875     const __m128d  FD5      = _mm_set1_pd(0.000016009278224355026701);
 876     const __m128d  FD4      = _mm_set1_pd(0.00051055686934806966046);
 877     const __m128d  FD3      = _mm_set1_pd(0.0081803507497974289008);
 878     const __m128d  FD2      = _mm_set1_pd(0.077181146026670287235);
 879     const __m128d  FD1      = _mm_set1_pd(0.41543303143712535988);
 880     const __m128d  FD0      = _mm_set1_pd(1.0);
 881
 882     __m128d        z4;
 883     __m128d        polyFN0, polyFN1, polyFD0, polyFD1;
 884
 885     z4             = _mm_mul_pd(z2, z2);
 886
 887     polyFD1        = _mm_mul_pd(FD5, z4);
 888     polyFD0        = _mm_mul_pd(FD4, z4);
 889     polyFD1        = _mm_add_pd(polyFD1, FD3);
 890     polyFD0        = _mm_add_pd(polyFD0, FD2);
 891     polyFD1        = _mm_mul_pd(polyFD1, z4);
 892     polyFD0        = _mm_mul_pd(polyFD0, z4);
 893     polyFD1        = _mm_add_pd(polyFD1, FD1);
 894     polyFD0        = _mm_add_pd(polyFD0, FD0);
 895     polyFD1        = _mm_mul_pd(polyFD1, z2);
 896     polyFD0        = _mm_add_pd(polyFD0, polyFD1);
 897
 898     polyFD0        = gmx_mm_inv_pd(polyFD0);
 899
 900     polyFN0        = _mm_mul_pd(FN10, z4);
 901     polyFN1        = _mm_mul_pd(FN9, z4);
 902     polyFN0        = _mm_add_pd(polyFN0, FN8);
 903     polyFN1        = _mm_add_pd(polyFN1, FN7);
 904     polyFN0        = _mm_mul_pd(polyFN0, z4);
 905     polyFN1        = _mm_mul_pd(polyFN1, z4);
 906     polyFN0        = _mm_add_pd(polyFN0, FN6);
 907     polyFN1        = _mm_add_pd(polyFN1, FN5);
 908     polyFN0        = _mm_mul_pd(polyFN0, z4);
 909     polyFN1        = _mm_mul_pd(polyFN1, z4);
 910     polyFN0        = _mm_add_pd(polyFN0, FN4);
 911     polyFN1        = _mm_add_pd(polyFN1, FN3);
 912     polyFN0        = _mm_mul_pd(polyFN0, z4);
 913     polyFN1        = _mm_mul_pd(polyFN1, z4);
 914     polyFN0        = _mm_add_pd(polyFN0, FN2);
 915     polyFN1        = _mm_add_pd(polyFN1, FN1);
 916     polyFN0        = _mm_mul_pd(polyFN0, z4);
 917     polyFN1        = _mm_mul_pd(polyFN1, z2);
 918     polyFN0        = _mm_add_pd(polyFN0, FN0);
 919     polyFN0        = _mm_add_pd(polyFN0, polyFN1);
 920
 921     return _mm_mul_pd(polyFN0, polyFD0);
 922 }
 923
 924
 925
 926
 927 /* Calculate the potential correction due to PME analytically.
 928  *
 929  * See gmx_mm256_pmecorrF_ps() for details about the approximation.
 930  *
 931  * This routine calculates Erf(z)/z, although you should provide z^2
 932  * as the input argument.
 933  *
 934  * Here's how it should be used:
 935  *
 936  * 1. Calculate r^2.
 937  * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
 938  * 3. Evaluate this routine with z^2 as the argument.
 939  * 4. The return value is the expression:
 940  *
 941  *
 942  *        erf(z)
 943  *       --------
 944  *          z
 945  *
 946  * 5. Multiply the entire expression by beta and switching back to r (z=r*beta):
 947  *
 948  *       erf(r*beta)
 949  *       -----------
 950  *           r
 951  *
 952  * 6. Subtract the result from 1/r, multiply by the product of the charges,
 953  *    and you have your potential.
 954  *
 955  */
 956 static __m128d
 957 gmx_mm_pmecorrV_pd(__m128d z2)
 958 {
 959     const __m128d  VN9      = _mm_set1_pd(-9.3723776169321855475e-13);
 960     const __m128d  VN8      = _mm_set1_pd(1.2280156762674215741e-10);
 961     const __m128d  VN7      = _mm_set1_pd(-7.3562157912251309487e-9);
 962     const __m128d  VN6      = _mm_set1_pd(2.6215886208032517509e-7);
 963     const __m128d  VN5      = _mm_set1_pd(-4.9532491651265819499e-6);
 964     const __m128d  VN4      = _mm_set1_pd(0.00025907400778966060389);
 965     const __m128d  VN3      = _mm_set1_pd(0.0010585044856156469792);
 966     const __m128d  VN2      = _mm_set1_pd(0.045247661136833092885);
 967     const __m128d  VN1      = _mm_set1_pd(0.11643931522926034421);
 968     const __m128d  VN0      = _mm_set1_pd(1.1283791671726767970);
 969
 970     const __m128d  VD5      = _mm_set1_pd(0.000021784709867336150342);
 971     const __m128d  VD4      = _mm_set1_pd(0.00064293662010911388448);
 972     const __m128d  VD3      = _mm_set1_pd(0.0096311444822588683504);
 973     const __m128d  VD2      = _mm_set1_pd(0.085608012351550627051);
 974     const __m128d  VD1      = _mm_set1_pd(0.43652499166614811084);
 975     const __m128d  VD0      = _mm_set1_pd(1.0);
 976
 977     __m128d        z4;
 978     __m128d        polyVN0, polyVN1, polyVD0, polyVD1;
 979
 980     z4             = _mm_mul_pd(z2, z2);
 981
 982     polyVD1        = _mm_mul_pd(VD5, z4);
 983     polyVD0        = _mm_mul_pd(VD4, z4);
 984     polyVD1        = _mm_add_pd(polyVD1, VD3);
 985     polyVD0        = _mm_add_pd(polyVD0, VD2);
 986     polyVD1        = _mm_mul_pd(polyVD1, z4);
 987     polyVD0        = _mm_mul_pd(polyVD0, z4);
 988     polyVD1        = _mm_add_pd(polyVD1, VD1);
 989     polyVD0        = _mm_add_pd(polyVD0, VD0);
 990     polyVD1        = _mm_mul_pd(polyVD1, z2);
 991     polyVD0        = _mm_add_pd(polyVD0, polyVD1);
 992
 993     polyVD0        = gmx_mm_inv_pd(polyVD0);
 994
 995     polyVN1        = _mm_mul_pd(VN9, z4);
 996     polyVN0        = _mm_mul_pd(VN8, z4);
 997     polyVN1        = _mm_add_pd(polyVN1, VN7);
 998     polyVN0        = _mm_add_pd(polyVN0, VN6);
 999     polyVN1        = _mm_mul_pd(polyVN1, z4);
1000     polyVN0        = _mm_mul_pd(polyVN0, z4);
1001     polyVN1        = _mm_add_pd(polyVN1, VN5);
1002     polyVN0        = _mm_add_pd(polyVN0, VN4);
1003     polyVN1        = _mm_mul_pd(polyVN1, z4);
1004     polyVN0        = _mm_mul_pd(polyVN0, z4);
1005     polyVN1        = _mm_add_pd(polyVN1, VN3);
1006     polyVN0        = _mm_add_pd(polyVN0, VN2);
1007     polyVN1        = _mm_mul_pd(polyVN1, z4);
1008     polyVN0        = _mm_mul_pd(polyVN0, z4);
1009     polyVN1        = _mm_add_pd(polyVN1, VN1);
1010     polyVN0        = _mm_add_pd(polyVN0, VN0);
1011     polyVN1        = _mm_mul_pd(polyVN1, z2);
1012     polyVN0        = _mm_add_pd(polyVN0, polyVN1);
1013
1014     return _mm_mul_pd(polyVN0, polyVD0);
1015 }
1016
1017
1018
1019 static int
1020 gmx_mm_sincos_pd(__m128d  x,
1021                  __m128d *sinval,
1022                  __m128d *cosval)
1023 {
1024 #ifdef _MSC_VER
1025     __declspec(align(16))
1026     const double sintable[34] =
1027     {
1028         1.00000000000000000e+00, 0.00000000000000000e+00,
1029         9.95184726672196929e-01, 9.80171403295606036e-02,
1030         9.80785280403230431e-01, 1.95090322016128248e-01,
1031         9.56940335732208824e-01, 2.90284677254462331e-01,
1032         9.23879532511286738e-01, 3.82683432365089782e-01,
1033         8.81921264348355050e-01, 4.71396736825997642e-01,
1034         8.31469612302545236e-01, 5.55570233019602178e-01,
1035         7.73010453362736993e-01, 6.34393284163645488e-01,
1036         7.07106781186547573e-01, 7.07106781186547462e-01,
1037         6.34393284163645599e-01, 7.73010453362736882e-01,
1038         5.55570233019602289e-01, 8.31469612302545125e-01,
1039         4.71396736825997809e-01, 8.81921264348354939e-01,
1040         3.82683432365089837e-01, 9.23879532511286738e-01,
1041         2.90284677254462276e-01, 9.56940335732208935e-01,
1042         1.95090322016128304e-01, 9.80785280403230431e-01,
1043         9.80171403295607702e-02, 9.95184726672196818e-01,
1044         0.0, 1.00000000000000000e+00
1045     };
1046 #else
1047     const __m128d sintable[17] =
1048     {
1049         _mm_set_pd( 0.0, 1.0 ),
1050         _mm_set_pd( sin(  1.0 * (M_PI/2.0) / 16.0), cos(  1.0 * (M_PI/2.0) / 16.0) ),
1051         _mm_set_pd( sin(  2.0 * (M_PI/2.0) / 16.0), cos(  2.0 * (M_PI/2.0) / 16.0) ),
1052         _mm_set_pd( sin(  3.0 * (M_PI/2.0) / 16.0), cos(  3.0 * (M_PI/2.0) / 16.0) ),
1053         _mm_set_pd( sin(  4.0 * (M_PI/2.0) / 16.0), cos(  4.0 * (M_PI/2.0) / 16.0) ),
1054         _mm_set_pd( sin(  5.0 * (M_PI/2.0) / 16.0), cos(  5.0 * (M_PI/2.0) / 16.0) ),
1055         _mm_set_pd( sin(  6.0 * (M_PI/2.0) / 16.0), cos(  6.0 * (M_PI/2.0) / 16.0) ),
1056         _mm_set_pd( sin(  7.0 * (M_PI/2.0) / 16.0), cos(  7.0 * (M_PI/2.0) / 16.0) ),
1057         _mm_set_pd( sin(  8.0 * (M_PI/2.0) / 16.0), cos(  8.0 * (M_PI/2.0) / 16.0) ),
1058         _mm_set_pd( sin(  9.0 * (M_PI/2.0) / 16.0), cos(  9.0 * (M_PI/2.0) / 16.0) ),
1059         _mm_set_pd( sin( 10.0 * (M_PI/2.0) / 16.0), cos( 10.0 * (M_PI/2.0) / 16.0) ),
1060         _mm_set_pd( sin( 11.0 * (M_PI/2.0) / 16.0), cos( 11.0 * (M_PI/2.0) / 16.0) ),
1061         _mm_set_pd( sin( 12.0 * (M_PI/2.0) / 16.0), cos( 12.0 * (M_PI/2.0) / 16.0) ),
1062         _mm_set_pd( sin( 13.0 * (M_PI/2.0) / 16.0), cos( 13.0 * (M_PI/2.0) / 16.0) ),
1063         _mm_set_pd( sin( 14.0 * (M_PI/2.0) / 16.0), cos( 14.0 * (M_PI/2.0) / 16.0) ),
1064         _mm_set_pd( sin( 15.0 * (M_PI/2.0) / 16.0), cos( 15.0 * (M_PI/2.0) / 16.0) ),
1065         _mm_set_pd(  1.0, 0.0 )
1066     };
1067 #endif
1068
1069     const __m128d signmask       = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
1070     const __m128i signbit_epi32  = _mm_set1_epi32(0x80000000);
1071
1072     const __m128d tabscale      = _mm_set1_pd(32.0/M_PI);
1073     const __m128d invtabscale0  = _mm_set1_pd(9.81747508049011230469e-02);
1074     const __m128d invtabscale1  = _mm_set1_pd(1.96197799156550576057e-08);
1075     const __m128i ione          = _mm_set1_epi32(1);
1076     const __m128i i32           = _mm_set1_epi32(32);
1077     const __m128i i16           = _mm_set1_epi32(16);
1078     const __m128i tabmask       = _mm_set1_epi32(0x3F);
1079     const __m128d sinP7         = _mm_set1_pd(-1.0/5040.0);
1080     const __m128d sinP5         = _mm_set1_pd(1.0/120.0);
1081     const __m128d sinP3         = _mm_set1_pd(-1.0/6.0);
1082     const __m128d sinP1         = _mm_set1_pd(1.0);
1083
1084     const __m128d cosP6         = _mm_set1_pd(-1.0/720.0);
1085     const __m128d cosP4         = _mm_set1_pd(1.0/24.0);
1086     const __m128d cosP2         = _mm_set1_pd(-1.0/2.0);
1087     const __m128d cosP0         = _mm_set1_pd(1.0);
1088
1089     __m128d       scalex;
1090     __m128i       tabidx, corridx;
1091     __m128d       xabs, z, z2, polySin, polyCos;
1092     __m128d       xpoint;
1093     __m128d       ypoint0, ypoint1;
1094
1095     __m128d       sinpoint, cospoint;
1096     __m128d       xsign, ssign, csign;
1097     __m128i       imask, sswapsign, cswapsign;
1098     __m128d       minusone;
1099
1100     xsign    = _mm_andnot_pd(signmask, x);
1101     xabs     = _mm_and_pd(x, signmask);
1102
1103     scalex   = _mm_mul_pd(tabscale, xabs);
1104     tabidx   = _mm_cvtpd_epi32(scalex);
1105
1106     xpoint   = _mm_cvtepi32_pd(tabidx);
1107
1108     /* Extended precision arithmetics */
1109     z        = _mm_sub_pd(xabs, _mm_mul_pd(invtabscale0, xpoint));
1110     z        = _mm_sub_pd(z, _mm_mul_pd(invtabscale1, xpoint));
1111
1112     /* Range reduction to 0..2*Pi */
1113     tabidx   = _mm_and_si128(tabidx, tabmask);
1114
1115     /* tabidx is now in range [0,..,64] */
1116     imask     = _mm_cmpgt_epi32(tabidx, i32);
1117     sswapsign = imask;
1118     cswapsign = imask;
1119     corridx   = _mm_and_si128(imask, i32);
1120     tabidx    = _mm_sub_epi32(tabidx, corridx);
1121
1122     /* tabidx is now in range [0..32] */
1123     imask     = _mm_cmpgt_epi32(tabidx, i16);
1124     cswapsign = _mm_xor_si128(cswapsign, imask);
1125     corridx   = _mm_sub_epi32(i32, tabidx);
1126     tabidx    = _mm_or_si128( _mm_and_si128(imask, corridx), _mm_andnot_si128(imask, tabidx) );
1127     /* tabidx is now in range [0..16] */
1128     ssign     = _mm_cvtepi32_pd( _mm_or_si128( sswapsign, ione ) );
1129     csign     = _mm_cvtepi32_pd( _mm_or_si128( cswapsign, ione ) );
1130
1131 #ifdef _MSC_VER
1132     ypoint0  = _mm_load_pd(sintable + 2*gmx_mm_extract_epi32(tabidx, 0));
1133     ypoint1  = _mm_load_pd(sintable + 2*gmx_mm_extract_epi32(tabidx, 1));
1134 #else
1135     ypoint0  = sintable[gmx_mm_extract_epi32(tabidx, 0)];
1136     ypoint1  = sintable[gmx_mm_extract_epi32(tabidx, 1)];
1137 #endif
1138     sinpoint = _mm_unpackhi_pd(ypoint0, ypoint1);
1139     cospoint = _mm_unpacklo_pd(ypoint0, ypoint1);
1140
1141     sinpoint = _mm_mul_pd(sinpoint, ssign);
1142     cospoint = _mm_mul_pd(cospoint, csign);
1143
1144     z2       = _mm_mul_pd(z, z);
1145
1146     polySin  = _mm_mul_pd(sinP7, z2);
1147     polySin  = _mm_add_pd(polySin, sinP5);
1148     polySin  = _mm_mul_pd(polySin, z2);
1149     polySin  = _mm_add_pd(polySin, sinP3);
1150     polySin  = _mm_mul_pd(polySin, z2);
1151     polySin  = _mm_add_pd(polySin, sinP1);
1152     polySin  = _mm_mul_pd(polySin, z);
1153
1154     polyCos  = _mm_mul_pd(cosP6, z2);
1155     polyCos  = _mm_add_pd(polyCos, cosP4);
1156     polyCos  = _mm_mul_pd(polyCos, z2);
1157     polyCos  = _mm_add_pd(polyCos, cosP2);
1158     polyCos  = _mm_mul_pd(polyCos, z2);
1159     polyCos  = _mm_add_pd(polyCos, cosP0);
1160
1161     *sinval  = _mm_xor_pd(_mm_add_pd( _mm_mul_pd(sinpoint, polyCos), _mm_mul_pd(cospoint, polySin) ), xsign);
1162     *cosval  = _mm_sub_pd( _mm_mul_pd(cospoint, polyCos), _mm_mul_pd(sinpoint, polySin) );
1163
1164     return 0;
1165 }
1166
1167 /*
1168  * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
1169  * will then call the sincos() routine and waste a factor 2 in performance!
1170  */
1171 static __m128d
1172 gmx_mm_sin_pd(__m128d x)
1173 {
1174     __m128d s, c;
1175     gmx_mm_sincos_pd(x, &s, &c);
1176     return s;
1177 }
1178
1179 /*
1180  * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
1181  * will then call the sincos() routine and waste a factor 2 in performance!
1182  */
1183 static __m128d
1184 gmx_mm_cos_pd(__m128d x)
1185 {
1186     __m128d s, c;
1187     gmx_mm_sincos_pd(x, &s, &c);
1188     return c;
1189 }
1190
1191
1192
1193 static __m128d
1194 gmx_mm_tan_pd(__m128d x)
1195 {
1196     __m128d sinval, cosval;
1197     __m128d tanval;
1198
1199     gmx_mm_sincos_pd(x, &sinval, &cosval);
1200
1201     tanval = _mm_mul_pd(sinval, gmx_mm_inv_pd(cosval));
1202
1203     return tanval;
1204 }
1205
1206
1207
1208 static __m128d
1209 gmx_mm_asin_pd(__m128d x)
1210 {
1211     /* Same algorithm as cephes library */
1212     const __m128d signmask  = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
1213     const __m128d limit1    = _mm_set1_pd(0.625);
1214     const __m128d limit2    = _mm_set1_pd(1e-8);
1215     const __m128d one       = _mm_set1_pd(1.0);
1216     const __m128d halfpi    = _mm_set1_pd(M_PI/2.0);
1217     const __m128d quarterpi = _mm_set1_pd(M_PI/4.0);
1218     const __m128d morebits  = _mm_set1_pd(6.123233995736765886130e-17);
1219
1220     const __m128d P5        = _mm_set1_pd(4.253011369004428248960e-3);
1221     const __m128d P4        = _mm_set1_pd(-6.019598008014123785661e-1);
1222     const __m128d P3        = _mm_set1_pd(5.444622390564711410273e0);
1223     const __m128d P2        = _mm_set1_pd(-1.626247967210700244449e1);
1224     const __m128d P1        = _mm_set1_pd(1.956261983317594739197e1);
1225     const __m128d P0        = _mm_set1_pd(-8.198089802484824371615e0);
1226
1227     const __m128d Q4        = _mm_set1_pd(-1.474091372988853791896e1);
1228     const __m128d Q3        = _mm_set1_pd(7.049610280856842141659e1);
1229     const __m128d Q2        = _mm_set1_pd(-1.471791292232726029859e2);
1230     const __m128d Q1        = _mm_set1_pd(1.395105614657485689735e2);
1231     const __m128d Q0        = _mm_set1_pd(-4.918853881490881290097e1);
1232
1233     const __m128d R4        = _mm_set1_pd(2.967721961301243206100e-3);
1234     const __m128d R3        = _mm_set1_pd(-5.634242780008963776856e-1);
1235     const __m128d R2        = _mm_set1_pd(6.968710824104713396794e0);
1236     const __m128d R1        = _mm_set1_pd(-2.556901049652824852289e1);
1237     const __m128d R0        = _mm_set1_pd(2.853665548261061424989e1);
1238
1239     const __m128d S3        = _mm_set1_pd(-2.194779531642920639778e1);
1240     const __m128d S2        = _mm_set1_pd(1.470656354026814941758e2);
1241     const __m128d S1        = _mm_set1_pd(-3.838770957603691357202e2);
1242     const __m128d S0        = _mm_set1_pd(3.424398657913078477438e2);
1243
1244     __m128d       sign;
1245     __m128d       mask;
1246     __m128d       xabs;
1247     __m128d       zz, ww, z, q, w, y, zz2, ww2;
1248     __m128d       PA, PB;
1249     __m128d       QA, QB;
1250     __m128d       RA, RB;
1251     __m128d       SA, SB;
1252     __m128d       nom, denom;
1253
1254     sign  = _mm_andnot_pd(signmask, x);
1255     xabs  = _mm_and_pd(x, signmask);
1256
1257     mask  = _mm_cmpgt_pd(xabs, limit1);
1258
1259     zz    = _mm_sub_pd(one, xabs);
1260     ww    = _mm_mul_pd(xabs, xabs);
1261     zz2   = _mm_mul_pd(zz, zz);
1262     ww2   = _mm_mul_pd(ww, ww);
1263
1264     /* R */
1265     RA    = _mm_mul_pd(R4, zz2);
1266     RB    = _mm_mul_pd(R3, zz2);
1267     RA    = _mm_add_pd(RA, R2);
1268     RB    = _mm_add_pd(RB, R1);
1269     RA    = _mm_mul_pd(RA, zz2);
1270     RB    = _mm_mul_pd(RB, zz);
1271     RA    = _mm_add_pd(RA, R0);
1272     RA    = _mm_add_pd(RA, RB);
1273
1274     /* S, SA = zz2 */
1275     SB    = _mm_mul_pd(S3, zz2);
1276     SA    = _mm_add_pd(zz2, S2);
1277     SB    = _mm_add_pd(SB, S1);
1278     SA    = _mm_mul_pd(SA, zz2);
1279     SB    = _mm_mul_pd(SB, zz);
1280     SA    = _mm_add_pd(SA, S0);
1281     SA    = _mm_add_pd(SA, SB);
1282
1283     /* P */
1284     PA    = _mm_mul_pd(P5, ww2);
1285     PB    = _mm_mul_pd(P4, ww2);
1286     PA    = _mm_add_pd(PA, P3);
1287     PB    = _mm_add_pd(PB, P2);
1288     PA    = _mm_mul_pd(PA, ww2);
1289     PB    = _mm_mul_pd(PB, ww2);
1290     PA    = _mm_add_pd(PA, P1);
1291     PB    = _mm_add_pd(PB, P0);
1292     PA    = _mm_mul_pd(PA, ww);
1293     PA    = _mm_add_pd(PA, PB);
1294
1295     /* Q, QA = ww2 */
1296     QB    = _mm_mul_pd(Q4, ww2);
1297     QA    = _mm_add_pd(ww2, Q3);
1298     QB    = _mm_add_pd(QB, Q2);
1299     QA    = _mm_mul_pd(QA, ww2);
1300     QB    = _mm_mul_pd(QB, ww2);
1301     QA    = _mm_add_pd(QA, Q1);
1302     QB    = _mm_add_pd(QB, Q0);
1303     QA    = _mm_mul_pd(QA, ww);
1304     QA    = _mm_add_pd(QA, QB);
1305
1306     RA    = _mm_mul_pd(RA, zz);
1307     PA    = _mm_mul_pd(PA, ww);
1308
1309     nom   = _mm_or_pd( _mm_andnot_pd(mask, PA), _mm_and_pd(mask, RA) );
1310     denom = _mm_or_pd( _mm_andnot_pd(mask, QA), _mm_and_pd(mask, SA) );
1311
1312     q     = _mm_mul_pd( nom, gmx_mm_inv_pd(denom) );
1313
1314     zz    = _mm_add_pd(zz, zz);
1315     zz    = gmx_mm_sqrt_pd(zz);
1316     z     = _mm_sub_pd(quarterpi, zz);
1317     zz    = _mm_mul_pd(zz, q);
1318     zz    = _mm_sub_pd(zz, morebits);
1319     z     = _mm_sub_pd(z, zz);
1320     z     = _mm_add_pd(z, quarterpi);
1321
1322     w     = _mm_mul_pd(xabs, q);
1323     w     = _mm_add_pd(w, xabs);
1324
1325     z     = _mm_or_pd( _mm_andnot_pd(mask, w), _mm_and_pd(mask, z) );
1326
1327     mask  = _mm_cmpgt_pd(xabs, limit2);
1328     z     = _mm_or_pd( _mm_andnot_pd(mask, xabs), _mm_and_pd(mask, z) );
1329
1330     z = _mm_xor_pd(z, sign);
1331
1332     return z;
1333 }
1334
1335
1336 static __m128d
1337 gmx_mm_acos_pd(__m128d x)
1338 {
1339     const __m128d signmask   = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
1340     const __m128d one        = _mm_set1_pd(1.0);
1341     const __m128d half       = _mm_set1_pd(0.5);
1342     const __m128d pi         = _mm_set1_pd(M_PI);
1343     const __m128d quarterpi0 = _mm_set1_pd(7.85398163397448309616e-1);
1344     const __m128d quarterpi1 = _mm_set1_pd(6.123233995736765886130e-17);
1345
1346
1347     __m128d mask1;
1348
1349     __m128d z, z1, z2;
1350
1351     mask1 = _mm_cmpgt_pd(x, half);
1352     z1    = _mm_mul_pd(half, _mm_sub_pd(one, x));
1353     z1    = gmx_mm_sqrt_pd(z1);
1354     z     = _mm_or_pd( _mm_andnot_pd(mask1, x), _mm_and_pd(mask1, z1) );
1355
1356     z     = gmx_mm_asin_pd(z);
1357
1358     z1    = _mm_add_pd(z, z);
1359
1360     z2    = _mm_sub_pd(quarterpi0, z);
1361     z2    = _mm_add_pd(z2, quarterpi1);
1362     z2    = _mm_add_pd(z2, quarterpi0);
1363
1364     z     = _mm_or_pd(_mm_andnot_pd(mask1, z2), _mm_and_pd(mask1, z1));
1365
1366     return z;
1367 }
1368
1369 static __m128d
1370 gmx_mm_atan_pd(__m128d x)
1371 {
1372     /* Same algorithm as cephes library */
1373     const __m128d signmask  = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
1374     const __m128d limit1    = _mm_set1_pd(0.66);
1375     const __m128d limit2    = _mm_set1_pd(2.41421356237309504880);
1376     const __m128d quarterpi = _mm_set1_pd(M_PI/4.0);
1377     const __m128d halfpi    = _mm_set1_pd(M_PI/2.0);
1378     const __m128d mone      = _mm_set1_pd(-1.0);
1379     const __m128d morebits1 = _mm_set1_pd(0.5*6.123233995736765886130E-17);
1380     const __m128d morebits2 = _mm_set1_pd(6.123233995736765886130E-17);
1381
1382     const __m128d P4        = _mm_set1_pd(-8.750608600031904122785E-1);
1383     const __m128d P3        = _mm_set1_pd(-1.615753718733365076637E1);
1384     const __m128d P2        = _mm_set1_pd(-7.500855792314704667340E1);
1385     const __m128d P1        = _mm_set1_pd(-1.228866684490136173410E2);
1386     const __m128d P0        = _mm_set1_pd(-6.485021904942025371773E1);
1387
1388     const __m128d Q4        = _mm_set1_pd(2.485846490142306297962E1);
1389     const __m128d Q3        = _mm_set1_pd(1.650270098316988542046E2);
1390     const __m128d Q2        = _mm_set1_pd(4.328810604912902668951E2);
1391     const __m128d Q1        = _mm_set1_pd(4.853903996359136964868E2);
1392     const __m128d Q0        = _mm_set1_pd(1.945506571482613964425E2);
1393
1394     __m128d       sign;
1395     __m128d       mask1, mask2;
1396     __m128d       y, t1, t2;
1397     __m128d       z, z2;
1398     __m128d       P_A, P_B, Q_A, Q_B;
1399
1400     sign   = _mm_andnot_pd(signmask, x);
1401     x      = _mm_and_pd(x, signmask);
1402
1403     mask1  = _mm_cmpgt_pd(x, limit1);
1404     mask2  = _mm_cmpgt_pd(x, limit2);
1405
1406     t1     = _mm_mul_pd(_mm_add_pd(x, mone), gmx_mm_inv_pd(_mm_sub_pd(x, mone)));
1407     t2     = _mm_mul_pd(mone, gmx_mm_inv_pd(x));
1408
1409     y      = _mm_and_pd(mask1, quarterpi);
1410     y      = _mm_or_pd( _mm_and_pd(mask2, halfpi), _mm_andnot_pd(mask2, y) );
1411
1412     x      = _mm_or_pd( _mm_and_pd(mask1, t1), _mm_andnot_pd(mask1, x) );
1413     x      = _mm_or_pd( _mm_and_pd(mask2, t2), _mm_andnot_pd(mask2, x) );
1414
1415     z      = _mm_mul_pd(x, x);
1416     z2     = _mm_mul_pd(z, z);
1417
1418     P_A    = _mm_mul_pd(P4, z2);
1419     P_B    = _mm_mul_pd(P3, z2);
1420     P_A    = _mm_add_pd(P_A, P2);
1421     P_B    = _mm_add_pd(P_B, P1);
1422     P_A    = _mm_mul_pd(P_A, z2);
1423     P_B    = _mm_mul_pd(P_B, z);
1424     P_A    = _mm_add_pd(P_A, P0);
1425     P_A    = _mm_add_pd(P_A, P_B);
1426
1427     /* Q_A = z2 */
1428     Q_B    = _mm_mul_pd(Q4, z2);
1429     Q_A    = _mm_add_pd(z2, Q3);
1430     Q_B    = _mm_add_pd(Q_B, Q2);
1431     Q_A    = _mm_mul_pd(Q_A, z2);
1432     Q_B    = _mm_mul_pd(Q_B, z2);
1433     Q_A    = _mm_add_pd(Q_A, Q1);
1434     Q_B    = _mm_add_pd(Q_B, Q0);
1435     Q_A    = _mm_mul_pd(Q_A, z);
1436     Q_A    = _mm_add_pd(Q_A, Q_B);
1437
1438     z      = _mm_mul_pd(z, P_A);
1439     z      = _mm_mul_pd(z, gmx_mm_inv_pd(Q_A));
1440     z      = _mm_mul_pd(z, x);
1441     z      = _mm_add_pd(z, x);
1442
1443     t1     = _mm_and_pd(mask1, morebits1);
1444     t1     = _mm_or_pd( _mm_and_pd(mask2, morebits2), _mm_andnot_pd(mask2, t1) );
1445
1446     z      = _mm_add_pd(z, t1);
1447     y      = _mm_add_pd(y, z);
1448
1449     y      = _mm_xor_pd(y, sign);
1450
1451     return y;
1452 }
1453
1454
1455 static __m128d
1456 gmx_mm_atan2_pd(__m128d y, __m128d x)
1457 {
1458     const __m128d pi          = _mm_set1_pd(M_PI);
1459     const __m128d minuspi     = _mm_set1_pd(-M_PI);
1460     const __m128d halfpi      = _mm_set1_pd(M_PI/2.0);
1461     const __m128d minushalfpi = _mm_set1_pd(-M_PI/2.0);
1462
1463     __m128d       z, z1, z3, z4;
1464     __m128d       w;
1465     __m128d       maskx_lt, maskx_eq;
1466     __m128d       masky_lt, masky_eq;
1467     __m128d       mask1, mask2, mask3, mask4, maskall;
1468
1469     maskx_lt  = _mm_cmplt_pd(x, _mm_setzero_pd());
1470     masky_lt  = _mm_cmplt_pd(y, _mm_setzero_pd());
1471     maskx_eq  = _mm_cmpeq_pd(x, _mm_setzero_pd());
1472     masky_eq  = _mm_cmpeq_pd(y, _mm_setzero_pd());
1473
1474     z         = _mm_mul_pd(y, gmx_mm_inv_pd(x));
1475     z         = gmx_mm_atan_pd(z);
1476
1477     mask1     = _mm_and_pd(maskx_eq, masky_lt);
1478     mask2     = _mm_andnot_pd(maskx_lt, masky_eq);
1479     mask3     = _mm_andnot_pd( _mm_or_pd(masky_lt, masky_eq), maskx_eq);
1480     mask4     = _mm_and_pd(masky_eq, maskx_lt);
1481
1482     maskall   = _mm_or_pd( _mm_or_pd(mask1, mask2), _mm_or_pd(mask3, mask4) );
1483
1484     z         = _mm_andnot_pd(maskall, z);
1485     z1        = _mm_and_pd(mask1, minushalfpi);
1486     z3        = _mm_and_pd(mask3, halfpi);
1487     z4        = _mm_and_pd(mask4, pi);
1488
1489     z         = _mm_or_pd( _mm_or_pd(z, z1), _mm_or_pd(z3, z4) );
1490
1491     w         = _mm_or_pd(_mm_andnot_pd(masky_lt, pi), _mm_and_pd(masky_lt, minuspi));
1492     w         = _mm_and_pd(w, maskx_lt);
1493
1494     w         = _mm_andnot_pd(maskall, w);
1495
1496     z         = _mm_add_pd(z, w);
1497
1498     return z;
1499 }
1500
1501 #endif /*_gmx_math_x86_sse2_double_h_ */