Added CHARMM neutral lysine, LYN, to residuetypes.dat.
[gromacs/adressmacs.git] / include / gmx_sse2_double.h
blobd6635af150dd37dec874ce96c189e44c363cbb84
1 /*
2 * This source code is part of
4 * G R O M A C S
6 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
7 * Copyright (c) 2001-2009, The GROMACS Development Team
9 * Gromacs is a library for molecular simulation and trajectory analysis,
10 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
11 * a full list of developers and information, check out http://www.gromacs.org
13 * This program is free software; you can redistribute it and/or modify it under
14 * the terms of the GNU Lesser General Public License as published by the Free
15 * Software Foundation; either version 2 of the License, or (at your option) any
16 * later version.
17 * As a special exception, you may use this file as part of a free software
18 * library without restriction. Specifically, if other files instantiate
19 * templates or use macros or inline functions from this file, or you compile
20 * this file and link it with other files to produce an executable, this
21 * file does not by itself cause the resulting executable to be covered by
22 * the GNU Lesser General Public License.
24 * In plain-speak: do not worry about classes/macros/templates either - only
25 * changes to the library have to be LGPL, not an application linking with it.
27 * To help fund GROMACS development, we humbly ask that you cite
28 * the papers people have written on it - you can find them on the website!
30 /* We require SSE2 now! */
32 #include <math.h>
33 #include <xmmintrin.h> /* SSE */
34 #include <emmintrin.h> /* SSE2 */
35 #ifdef GMX_SSE3
36 # include <pmmintrin.h> /* SSE3 */
37 #endif
38 #ifdef GMX_SSE4
39 # include <smmintrin.h> /* SSE4.1 */
40 #endif
43 #include <stdio.h>
45 #include "types/simple.h"
47 /***************************************************
48 * *
49 * COMPILER RANT WARNING: *
50 * *
51 * Ideally, this header would be filled with *
52 * simple static inline functions. Unfortunately, *
53 * many vendors provide really braindead compilers *
54 * that either cannot handle more than 1-2 SSE *
55 * function parameters, and some cannot handle *
56 * pointers to SSE __m128 datatypes as parameters *
57 * at all. Thus, for portability we have had to *
58 * implement all but the simplest routines as *
59 * macros instead... *
60 * *
61 ***************************************************/
64 /***************************************************
65 * *
66 * Wrappers/replacements for some instructions *
67 * not available in all SSE versions. *
68 * *
69 ***************************************************/
71 #ifdef GMX_SSE4
72 # define gmx_mm_extract_epi32(x, imm) _mm_extract_epi32(x,imm)
73 #else
74 # define gmx_mm_extract_epi32(x, imm) _mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm)))
75 #endif
78 * Some compilers require a cast to change the interpretation
79 * of a register from FP to Int and vice versa, and not all of
80 * the provide instructions to do this. Roll our own wrappers...
83 #if (defined (_MSC_VER) || defined(__INTEL_COMPILER))
84 # define gmx_mm_castsi128_pd(a) _mm_castsi128_pd(a)
85 # define gmx_mm_castpd_si128(a) _mm_castpd_si128(a)
86 # define gmx_mm_castpd_pd128(a) (a)
87 #elif defined(__GNUC__)
88 # define gmx_mm_castsi128_pd(a) ((__m128d)(a))
89 # define gmx_mm_castpd_si128(a) ((__m128i)(a))
90 # define gmx_mm_castpd_pd128(a) ((__m128d)(a))
91 #else
92 static __m128d gmx_mm_castsi128_pd(__m128i a) { return *(__m128d *) &a; }
93 static __m128i gmx_mm_castpd_si128(__m128d a) { return *(__m128i *) &a; }
94 static __m128d gmx_mm_castpd_pd128(__m128d a) { return *(__m128d *) &a; }
95 #endif
97 #define gmx_mm_extract_epi64(x, imm) _mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm)))
101 static inline void
102 gmx_printxmm_pd(const char *s,__m128d xmm)
104 double f[2];
106 _mm_storeu_pd(f,xmm);
107 printf("%s: %15.10g %15.10g\n",s,f[0],f[1]);
111 static inline __m128d
112 gmx_mm_inv_pd(__m128d x)
114 const __m128d two = _mm_set1_pd(2.0);
116 /* Lookup instruction only exists in single precision, convert back and forth... */
117 __m128d lu = _mm_cvtps_pd(_mm_rcp_ps( _mm_cvtpd_ps(x)));
119 /* Perform two N-R steps for double precision */
120 lu = _mm_mul_pd(lu,_mm_sub_pd(two,_mm_mul_pd(x,lu)));
121 return _mm_mul_pd(lu,_mm_sub_pd(two,_mm_mul_pd(x,lu)));
125 static int gmx_mm_check_and_reset_overflow(void)
127 int MXCSR;
128 int sse_overflow;
130 MXCSR = _mm_getcsr();
131 /* The overflow flag is bit 3 in the register */
132 if (MXCSR & 0x0008)
134 sse_overflow = 1;
135 /* Set the overflow flag to zero */
136 MXCSR = MXCSR & 0xFFF7;
137 _mm_setcsr(MXCSR);
139 else
141 sse_overflow = 0;
144 return sse_overflow;
148 static inline __m128d
149 gmx_mm_invsqrt_pd(__m128d x)
151 const __m128d half = _mm_set1_pd(0.5);
152 const __m128d three = _mm_set1_pd(3.0);
154 /* Lookup instruction only exists in single precision, convert back and forth... */
155 __m128d lu = _mm_cvtps_pd(_mm_rsqrt_ps( _mm_cvtpd_ps(x)));
157 lu = _mm_mul_pd(half,_mm_mul_pd(_mm_sub_pd(three,_mm_mul_pd(_mm_mul_pd(lu,lu),x)),lu));
158 return _mm_mul_pd(half,_mm_mul_pd(_mm_sub_pd(three,_mm_mul_pd(_mm_mul_pd(lu,lu),x)),lu));
161 static inline __m128d
162 gmx_mm_sqrt_pd(__m128d x)
164 __m128d mask;
165 __m128d res;
167 mask = _mm_cmpeq_pd(x,_mm_setzero_pd());
168 res = _mm_andnot_pd(mask,gmx_mm_invsqrt_pd(x));
170 res = _mm_mul_pd(x,res);
172 return res;
177 * Exponential function.
179 * Exp(x) is calculate from the relation Exp(x)=2^(y), where y=log2(e)*x
180 * Thus, the contents of this routine is mostly about calculating 2^y.
182 * This is done by separating y=z+w, where z=[y] is an integer. For technical reasons it is easiest
183 * for us to round to the _nearest_ integer and have w in [-0.5,0.5] rather than always rounding down.
184 * (It is not until SSE4 there was an efficient operation to do rounding towards -infinity).
186 * With this we get 2^y=2^z*2^w
188 * Since we have IEEE fp representation, we can easily calculate 2^z by adding the FP exponent bias
189 * (1023 in double), and shifting the integer to the exponent field of the FP number (52 bits up).
191 * The 2^w term is calculated from a (10,0)-th order (no denominator) Minimax polynomia on the interval
192 * [-0.5,0.5]. The coefficiencts of this was derived in Mathematica using the command:
194 * MiniMaxApproximation[(2^x), {x, {-0.5, 0.5}, 10, 0}, WorkingPrecision -> 20]
196 * The lowest exponent we can represent in IEEE double-precision binary format is 2^-1022; below that
197 * it will wrap around and lead to very large positive numbers. This corresponds to a lower bound
198 * on the argument for exp(x) of roughly -708.39. For smaller arguments the return value will be 0.0.
200 * There appears to be a slight loss of precision for large arguments (~250), where the largest relative
201 * error reaches ~2e-14. However, since the actual value for that argument is around 1E100, it might
202 * not matter for typical workloads. This is likely caused by the polynomial evaluation,
203 * and the only way around would then be a table-based version, which I haven't managed to get the
204 * same performance from.
206 * The _average_ accuracy is about 51 bits in the range [-20,20], and the worst roughly 1 bit worse.
208 static __m128d
209 gmx_mm_exp_pd(__m128d x)
211 const __m128d argscale = _mm_set1_pd(1.442695040888963387);
212 /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
213 const __m128d arglimit = _mm_set1_pd(-1022.0/1.442695040888963387);
214 const __m128i expbase = _mm_set1_epi32(1023);
216 const __m128d CA0 = _mm_set1_pd(7.0372789822689374920e-9);
217 const __m128d CB0 = _mm_set1_pd(89.491964762085371);
218 const __m128d CB1 = _mm_set1_pd(-9.7373870675164587);
219 const __m128d CC0 = _mm_set1_pd(51.247261867992408);
220 const __m128d CC1 = _mm_set1_pd(-0.184020268133945);
221 const __m128d CD0 = _mm_set1_pd(36.82070153762337);
222 const __m128d CD1 = _mm_set1_pd(5.416849282638991);
223 const __m128d CE0 = _mm_set1_pd(30.34003452248759);
224 const __m128d CE1 = _mm_set1_pd(8.726173289493301);
225 const __m128d CF0 = _mm_set1_pd(27.73526969472330);
226 const __m128d CF1 = _mm_set1_pd(10.284755658866532);
228 __m128d valuemask;
229 __m128i iexppart;
230 __m128d fexppart;
231 __m128d intpart;
232 __m128d z,z2;
233 __m128d factB,factC,factD,factE,factF;
235 z = _mm_mul_pd(x,argscale);
236 iexppart = _mm_cvtpd_epi32(z);
238 #if GMX_SSE4
239 /* This reduces latency and speeds up the code by roughly 5% when supported */
240 intpart = _mm_round_pd(z,0);
241 #else
242 intpart = _mm_cvtepi32_pd(iexppart);
243 #endif
244 /* The two lowest elements of iexppart now contains 32-bit numbers with a correctly biased exponent.
245 * To be able to shift it into the exponent for a double precision number we first need to
246 * shuffle so that the lower half contains the first element, and the upper half the second.
247 * This should really be done as a zero-extension, but since the next instructions will shift
248 * the registers left by 52 bits it doesn't matter what we put there - it will be shifted out.
249 * (thus we just use element 2 from iexppart).
251 iexppart = _mm_shuffle_epi32(iexppart,_MM_SHUFFLE(2,1,2,0));
253 /* Do the shift operation on the 64-bit registers */
254 iexppart = _mm_add_epi32(iexppart,expbase);
255 iexppart = _mm_slli_epi64(iexppart,52);
256 valuemask = _mm_cmpgt_pd(x,arglimit);
258 z = _mm_sub_pd(z,intpart);
259 z2 = _mm_mul_pd(z,z);
261 fexppart = _mm_and_pd(valuemask,gmx_mm_castsi128_pd(iexppart));
263 /* Since SSE doubleing-point has relatively high latency it is faster to do
264 * factorized polynomial summation with independent terms than using alternating add/multiply, i.e.
265 * p(z) = A0 * (B0 + z) * (C0 + C1*z + z^2) * (D0 + D1*z + z^2) * (E0 + E1*z + z^2) * (F0 + F1*z + z^2)
268 factB = _mm_add_pd(CB0,_mm_mul_pd(CB1,z) );
269 factB = _mm_add_pd(factB,z2);
270 factC = _mm_add_pd(CC0,_mm_mul_pd(CC1,z) );
271 factC = _mm_add_pd(factC,z2);
272 factD = _mm_add_pd(CD0,_mm_mul_pd(CD1,z) );
273 factD = _mm_add_pd(factD,z2);
274 factE = _mm_add_pd(CE0,_mm_mul_pd(CE1,z) );
275 factE = _mm_add_pd(factE,z2);
276 factF = _mm_add_pd(CF0,_mm_mul_pd(CF1,z) );
277 factF = _mm_add_pd(factF,z2);
279 z = _mm_mul_pd(CA0,fexppart);
280 factB = _mm_mul_pd(factB,factC);
281 factD = _mm_mul_pd(factD,factE);
282 z = _mm_mul_pd(z,factF);
283 factB = _mm_mul_pd(factB,factD);
284 z = _mm_mul_pd(z,factB);
286 /* Currently uses 32 actual (real, not including casts) SSE instructions */
287 return z;
290 static __m128d
291 gmx_mm_log_pd(__m128d x)
293 /* Same algorithm as cephes library */
294 const __m128d expmask = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000) );
296 const __m128i expbase_m1 = _mm_set1_epi32(1023-1); /* We want non-IEEE format */
297 const __m128d half = _mm_set1_pd(0.5);
298 const __m128d one = _mm_set1_pd(1.0);
299 const __m128i itwo = _mm_set1_epi32(2);
300 const __m128d invsq2 = _mm_set1_pd(1.0/sqrt(2.0));
302 const __m128d corr1 = _mm_set1_pd(-2.121944400546905827679e-4);
303 const __m128d corr2 = _mm_set1_pd(0.693359375);
305 const __m128d P5 = _mm_set1_pd(1.01875663804580931796E-4);
306 const __m128d P4 = _mm_set1_pd(4.97494994976747001425E-1);
307 const __m128d P3 = _mm_set1_pd(4.70579119878881725854E0);
308 const __m128d P2 = _mm_set1_pd(1.44989225341610930846E1);
309 const __m128d P1 = _mm_set1_pd(1.79368678507819816313E1);
310 const __m128d P0 = _mm_set1_pd(7.70838733755885391666E0);
312 const __m128d Q4 = _mm_set1_pd(1.12873587189167450590E1);
313 const __m128d Q3 = _mm_set1_pd(4.52279145837532221105E1);
314 const __m128d Q2 = _mm_set1_pd(8.29875266912776603211E1);
315 const __m128d Q1 = _mm_set1_pd(7.11544750618563894466E1);
316 const __m128d Q0 = _mm_set1_pd(2.31251620126765340583E1);
318 const __m128d R2 = _mm_set1_pd(-7.89580278884799154124E-1);
319 const __m128d R1 = _mm_set1_pd(1.63866645699558079767E1);
320 const __m128d R0 = _mm_set1_pd(-6.41409952958715622951E1);
322 const __m128d S2 = _mm_set1_pd(-3.56722798256324312549E1);
323 const __m128d S1 = _mm_set1_pd(3.12093766372244180303E2);
324 const __m128d S0 = _mm_set1_pd(-7.69691943550460008604E2);
326 __m128d fexp;
327 __m128i iexp,iexp1,signbit,iexpabs;
328 __m128i imask1;
329 __m128d mask1,mask2;
330 __m128d corr,t1,t2,q;
331 __m128d zA,yA,xA,zB,yB,xB,z;
332 __m128d polyR,polyS;
333 __m128d polyP1,polyP2,polyQ1,polyQ2;
335 /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
336 fexp = _mm_and_pd(x,expmask);
337 iexp = gmx_mm_castpd_si128(fexp);
338 iexp = _mm_srli_epi64(iexp,52);
339 iexp = _mm_sub_epi32(iexp,expbase_m1);
340 iexp = _mm_shuffle_epi32(iexp, _MM_SHUFFLE(1,1,2,0) );
342 x = _mm_andnot_pd(expmask,x);
343 x = _mm_or_pd(x,one);
344 x = _mm_mul_pd(x,half);
346 signbit = _mm_srai_epi32(iexp,31);
347 iexpabs = _mm_sub_epi32(_mm_xor_si128(iexp,signbit),signbit);
349 imask1 = _mm_cmpgt_epi32( iexpabs, itwo );
350 mask2 = _mm_cmplt_pd(x,invsq2);
352 fexp = _mm_cvtepi32_pd(iexp);
353 corr = _mm_and_pd(mask2,one);
354 fexp = _mm_sub_pd(fexp,corr);
356 /* If mask1 is set ('A') */
357 zA = _mm_sub_pd(x,half);
358 t1 = _mm_or_pd( _mm_andnot_pd(mask2,zA), _mm_and_pd(mask2,x) );
359 zA = _mm_sub_pd(t1,half);
360 t2 = _mm_or_pd( _mm_andnot_pd(mask2,x), _mm_and_pd(mask2,zA) );
361 yA = _mm_mul_pd(half,_mm_add_pd(t2,one));
363 xA = _mm_mul_pd(zA,gmx_mm_inv_pd(yA));
364 zA = _mm_mul_pd(xA,xA);
366 /* EVALUATE POLY */
367 polyR = _mm_mul_pd(R2,zA);
368 polyR = _mm_add_pd(polyR,R1);
369 polyR = _mm_mul_pd(polyR,zA);
370 polyR = _mm_add_pd(polyR,R0);
372 polyS = _mm_add_pd(zA,S2);
373 polyS = _mm_mul_pd(polyS,zA);
374 polyS = _mm_add_pd(polyS,S1);
375 polyS = _mm_mul_pd(polyS,zA);
376 polyS = _mm_add_pd(polyS,S0);
378 q = _mm_mul_pd(polyR,gmx_mm_inv_pd(polyS));
379 zA = _mm_mul_pd(_mm_mul_pd(xA,zA),q);
381 zA = _mm_add_pd(zA,_mm_mul_pd(corr1,fexp));
382 zA = _mm_add_pd(zA,xA);
383 zA = _mm_add_pd(zA,_mm_mul_pd(corr2,fexp));
386 /* If mask1 is not set ('B') */
387 corr = _mm_and_pd(mask2,x);
388 xB = _mm_add_pd(x,corr);
389 xB = _mm_sub_pd(xB,one);
390 zB = _mm_mul_pd(xB,xB);
392 polyP1 = _mm_mul_pd(P5,zB);
393 polyP2 = _mm_mul_pd(P4,zB);
394 polyP1 = _mm_add_pd(polyP1,P3);
395 polyP2 = _mm_add_pd(polyP2,P2);
396 polyP1 = _mm_mul_pd(polyP1,zB);
397 polyP2 = _mm_mul_pd(polyP2,zB);
398 polyP1 = _mm_add_pd(polyP1,P1);
399 polyP2 = _mm_add_pd(polyP2,P0);
400 polyP1 = _mm_mul_pd(polyP1,xB);
401 polyP1 = _mm_add_pd(polyP1,polyP2);
403 polyQ2 = _mm_mul_pd(Q4,zB);
404 polyQ1 = _mm_add_pd(zB,Q3);
405 polyQ2 = _mm_add_pd(polyQ2,Q2);
406 polyQ1 = _mm_mul_pd(polyQ1,zB);
407 polyQ2 = _mm_mul_pd(polyQ2,zB);
408 polyQ1 = _mm_add_pd(polyQ1,Q1);
409 polyQ2 = _mm_add_pd(polyQ2,Q0);
410 polyQ1 = _mm_mul_pd(polyQ1,xB);
411 polyQ1 = _mm_add_pd(polyQ1,polyQ2);
413 q = _mm_mul_pd(polyP1,gmx_mm_inv_pd(polyQ1));
414 yB = _mm_mul_pd(_mm_mul_pd(xB,zB),q);
416 yB = _mm_add_pd(yB,_mm_mul_pd(corr1,fexp));
417 yB = _mm_sub_pd(yB,_mm_mul_pd(half,zB));
418 zB = _mm_add_pd(xB,yB);
419 zB = _mm_add_pd(zB,_mm_mul_pd(corr2,fexp));
422 mask1 = gmx_mm_castsi128_pd( _mm_shuffle_epi32(imask1, _MM_SHUFFLE(1,1,0,0)) );
423 z = _mm_or_pd( _mm_and_pd(mask1,zA), _mm_andnot_pd(mask1,zB) );
425 return z;
429 static int
430 gmx_mm_sincos_pd(__m128d x,
431 __m128d *sinval,
432 __m128d *cosval)
434 #ifdef _MSC_VER
435 __declspec(align(16))
436 const double sintable[34] =
438 1.00000000000000000e+00 , 0.00000000000000000e+00 ,
439 9.95184726672196929e-01 , 9.80171403295606036e-02 ,
440 9.80785280403230431e-01 , 1.95090322016128248e-01 ,
441 9.56940335732208824e-01 , 2.90284677254462331e-01 ,
442 9.23879532511286738e-01 , 3.82683432365089782e-01 ,
443 8.81921264348355050e-01 , 4.71396736825997642e-01 ,
444 8.31469612302545236e-01 , 5.55570233019602178e-01 ,
445 7.73010453362736993e-01 , 6.34393284163645488e-01 ,
446 7.07106781186547573e-01 , 7.07106781186547462e-01 ,
447 6.34393284163645599e-01 , 7.73010453362736882e-01 ,
448 5.55570233019602289e-01 , 8.31469612302545125e-01 ,
449 4.71396736825997809e-01 , 8.81921264348354939e-01 ,
450 3.82683432365089837e-01 , 9.23879532511286738e-01 ,
451 2.90284677254462276e-01 , 9.56940335732208935e-01 ,
452 1.95090322016128304e-01 , 9.80785280403230431e-01 ,
453 9.80171403295607702e-02 , 9.95184726672196818e-01 ,
454 6.12323399573676604e-17 , 1.00000000000000000e+00
456 #else
457 const __m128d sintable[17] =
459 _mm_set_pd( sin( 0.0 * (M_PI/2.0) / 16.0) , cos( 0.0 * (M_PI/2.0) / 16.0) ),
460 _mm_set_pd( sin( 1.0 * (M_PI/2.0) / 16.0) , cos( 1.0 * (M_PI/2.0) / 16.0) ),
461 _mm_set_pd( sin( 2.0 * (M_PI/2.0) / 16.0) , cos( 2.0 * (M_PI/2.0) / 16.0) ),
462 _mm_set_pd( sin( 3.0 * (M_PI/2.0) / 16.0) , cos( 3.0 * (M_PI/2.0) / 16.0) ),
463 _mm_set_pd( sin( 4.0 * (M_PI/2.0) / 16.0) , cos( 4.0 * (M_PI/2.0) / 16.0) ),
464 _mm_set_pd( sin( 5.0 * (M_PI/2.0) / 16.0) , cos( 5.0 * (M_PI/2.0) / 16.0) ),
465 _mm_set_pd( sin( 6.0 * (M_PI/2.0) / 16.0) , cos( 6.0 * (M_PI/2.0) / 16.0) ),
466 _mm_set_pd( sin( 7.0 * (M_PI/2.0) / 16.0) , cos( 7.0 * (M_PI/2.0) / 16.0) ),
467 _mm_set_pd( sin( 8.0 * (M_PI/2.0) / 16.0) , cos( 8.0 * (M_PI/2.0) / 16.0) ),
468 _mm_set_pd( sin( 9.0 * (M_PI/2.0) / 16.0) , cos( 9.0 * (M_PI/2.0) / 16.0) ),
469 _mm_set_pd( sin( 10.0 * (M_PI/2.0) / 16.0) , cos( 10.0 * (M_PI/2.0) / 16.0) ),
470 _mm_set_pd( sin( 11.0 * (M_PI/2.0) / 16.0) , cos( 11.0 * (M_PI/2.0) / 16.0) ),
471 _mm_set_pd( sin( 12.0 * (M_PI/2.0) / 16.0) , cos( 12.0 * (M_PI/2.0) / 16.0) ),
472 _mm_set_pd( sin( 13.0 * (M_PI/2.0) / 16.0) , cos( 13.0 * (M_PI/2.0) / 16.0) ),
473 _mm_set_pd( sin( 14.0 * (M_PI/2.0) / 16.0) , cos( 14.0 * (M_PI/2.0) / 16.0) ),
474 _mm_set_pd( sin( 15.0 * (M_PI/2.0) / 16.0) , cos( 15.0 * (M_PI/2.0) / 16.0) ),
475 _mm_set_pd( sin( 16.0 * (M_PI/2.0) / 16.0) , cos( 16.0 * (M_PI/2.0) / 16.0) )
477 #endif
479 const __m128d signmask = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF) );
480 const __m128d tabscale = _mm_set1_pd(32.0/M_PI);
481 const __m128d invtabscale = _mm_set1_pd(M_PI/32.0);
482 const __m128d one = _mm_set1_pd(1.0);
483 const __m128i i32 = _mm_set1_epi32(32);
484 const __m128i i16 = _mm_set1_epi32(16);
485 const __m128i tabmask = _mm_set1_epi32(0x3F);
486 const __m128d sinP7 = _mm_set1_pd(-1.9835958374985742404167983310657359E-4);
487 const __m128d sinP5 = _mm_set1_pd(8.3333330133863188710910904896670286347068944E-3);
488 const __m128d sinP3 = _mm_set1_pd(-1.66666666666049927649121240304808461825E-1);
489 const __m128d sinP1 = _mm_set1_pd(9.99999999999999814240922423058227089729828E-1);
491 const __m128d cosP6 = _mm_set1_pd(-1.3884108697213500852322439746988228E-3);
492 const __m128d cosP4 = _mm_set1_pd(4.16666637872444585215198198619627956E-2);
493 const __m128d cosP2 = _mm_set1_pd(-4.999999999944495616220671189143471E-1);
494 const __m128d cosP0 = _mm_set1_pd(9.999999999999983282334075742852867E-1);
496 __m128d scalex;
497 __m128i tabidx,corridx;
498 __m128d xabs,z,z2,polySin,polyCos;
499 __m128d xpoint;
500 __m128d ypoint0,ypoint1;
501 __m128d sinpoint,cospoint;
502 __m128d xsign,ssign,csign;
503 __m128i imask,sswapsign,cswapsign;
504 __m128d minusone;
506 xsign = _mm_andnot_pd(signmask,x);
507 xabs = _mm_and_pd(x,signmask);
509 scalex = _mm_mul_pd(tabscale,xabs);
510 tabidx = _mm_cvtpd_epi32(scalex);
511 xpoint = _mm_cvtepi32_pd(tabidx);
512 xpoint = _mm_mul_pd(xpoint,invtabscale);
513 z = _mm_sub_pd(xabs, xpoint);
515 /* Range reduction to 0..2*Pi */
516 tabidx = _mm_and_si128(tabidx,tabmask);
518 /* tabidx is now in range [0,..,64] */
519 imask = _mm_cmpgt_epi32(tabidx,i32);
520 sswapsign = imask;
521 cswapsign = imask;
522 corridx = _mm_and_si128(imask,i32);
523 tabidx = _mm_sub_epi32(tabidx,corridx);
525 /* tabidx is now in range [0..32] */
526 imask = _mm_cmpgt_epi32(tabidx,i16);
527 cswapsign = _mm_xor_si128(cswapsign,imask);
528 corridx = _mm_sub_epi32(i32,tabidx);
529 tabidx = _mm_or_si128( _mm_and_si128(imask,corridx), _mm_andnot_si128(imask,tabidx) );
531 /* tabidx is now in range [0..16] */
532 sswapsign = _mm_shuffle_epi32(sswapsign,_MM_SHUFFLE(1,1,0,0));
533 cswapsign = _mm_shuffle_epi32(cswapsign,_MM_SHUFFLE(1,1,0,0));
534 minusone = _mm_sub_pd(_mm_setzero_pd(),one);
536 ssign = _mm_or_pd(_mm_and_pd( gmx_mm_castsi128_pd(sswapsign),minusone ),
537 _mm_andnot_pd( gmx_mm_castsi128_pd(sswapsign),one ));
538 csign = _mm_or_pd(_mm_and_pd( gmx_mm_castsi128_pd(cswapsign),minusone ),
539 _mm_andnot_pd( gmx_mm_castsi128_pd(cswapsign),one ));
541 /* First lookup into table */
542 #ifdef _MSC_VER
543 ypoint0 = _mm_load_pd(sintable + 2*gmx_mm_extract_epi32(tabidx,0));
544 ypoint1 = _mm_load_pd(sintable + 2*gmx_mm_extract_epi32(tabidx,1));
545 #else
546 ypoint0 = sintable[gmx_mm_extract_epi32(tabidx,0)];
547 ypoint1 = sintable[gmx_mm_extract_epi32(tabidx,1)];
548 #endif
549 sinpoint = _mm_unpackhi_pd(ypoint0,ypoint1);
550 cospoint = _mm_unpacklo_pd(ypoint0,ypoint1);
552 sinpoint = _mm_mul_pd(sinpoint,ssign);
553 cospoint = _mm_mul_pd(cospoint,csign);
555 z2 = _mm_mul_pd(z,z);
557 polySin = _mm_mul_pd(sinP7,z2);
558 polySin = _mm_add_pd(polySin,sinP5);
559 polySin = _mm_mul_pd(polySin,z2);
560 polySin = _mm_add_pd(polySin,sinP3);
561 polySin = _mm_mul_pd(polySin,z2);
562 polySin = _mm_add_pd(polySin,sinP1);
563 polySin = _mm_mul_pd(polySin,z);
565 polyCos = _mm_mul_pd(cosP6,z2);
566 polyCos = _mm_add_pd(polyCos,cosP4);
567 polyCos = _mm_mul_pd(polyCos,z2);
568 polyCos = _mm_add_pd(polyCos,cosP2);
569 polyCos = _mm_mul_pd(polyCos,z2);
570 polyCos = _mm_add_pd(polyCos,cosP0);
572 *sinval = _mm_xor_pd(_mm_add_pd( _mm_mul_pd(sinpoint,polyCos) , _mm_mul_pd(cospoint,polySin) ),xsign);
573 *cosval = _mm_sub_pd( _mm_mul_pd(cospoint,polyCos) , _mm_mul_pd(sinpoint,polySin) );
575 return 0;
580 static __m128d
581 gmx_mm_tan_pd(__m128d x)
583 __m128d sinval,cosval;
584 __m128d tanval;
586 gmx_mm_sincos_pd(x,&sinval,&cosval);
588 tanval = _mm_mul_pd(sinval,gmx_mm_inv_pd(cosval));
590 return tanval;
595 static __m128d
596 gmx_mm_asin_pd(__m128d x)
598 /* Same algorithm as cephes library */
599 const __m128d signmask = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF) );
600 const __m128d limit1 = _mm_set1_pd(0.625);
601 const __m128d limit2 = _mm_set1_pd(1e-8);
602 const __m128d one = _mm_set1_pd(1.0);
603 const __m128d halfpi = _mm_set1_pd(M_PI/2.0);
604 const __m128d quarterpi = _mm_set1_pd(M_PI/4.0);
605 const __m128d morebits = _mm_set1_pd(6.123233995736765886130E-17);
607 const __m128d P5 = _mm_set1_pd(4.253011369004428248960E-3);
608 const __m128d P4 = _mm_set1_pd(-6.019598008014123785661E-1);
609 const __m128d P3 = _mm_set1_pd(5.444622390564711410273E0);
610 const __m128d P2 = _mm_set1_pd(-1.626247967210700244449E1);
611 const __m128d P1 = _mm_set1_pd(1.956261983317594739197E1);
612 const __m128d P0 = _mm_set1_pd(-8.198089802484824371615E0);
614 const __m128d Q4 = _mm_set1_pd(-1.474091372988853791896E1);
615 const __m128d Q3 = _mm_set1_pd(7.049610280856842141659E1);
616 const __m128d Q2 = _mm_set1_pd(-1.471791292232726029859E2);
617 const __m128d Q1 = _mm_set1_pd(1.395105614657485689735E2);
618 const __m128d Q0 = _mm_set1_pd(-4.918853881490881290097E1);
620 const __m128d R4 = _mm_set1_pd(2.967721961301243206100E-3);
621 const __m128d R3 = _mm_set1_pd(-5.634242780008963776856E-1);
622 const __m128d R2 = _mm_set1_pd(6.968710824104713396794E0);
623 const __m128d R1 = _mm_set1_pd(-2.556901049652824852289E1);
624 const __m128d R0 = _mm_set1_pd(2.853665548261061424989E1);
626 const __m128d S3 = _mm_set1_pd(-2.194779531642920639778E1);
627 const __m128d S2 = _mm_set1_pd(1.470656354026814941758E2);
628 const __m128d S1 = _mm_set1_pd(-3.838770957603691357202E2);
629 const __m128d S0 = _mm_set1_pd(3.424398657913078477438E2);
631 __m128d sign;
632 __m128d mask;
633 __m128d xabs;
634 __m128d zz,ww,z,q,w,y,zz2,ww2;
635 __m128d PA,PB;
636 __m128d QA,QB;
637 __m128d RA,RB;
638 __m128d SA,SB;
639 __m128d nom,denom;
641 sign = _mm_andnot_pd(signmask,x);
642 xabs = _mm_and_pd(x,signmask);
644 mask = _mm_cmpgt_pd(xabs,limit1);
646 zz = _mm_sub_pd(one,xabs);
647 ww = _mm_mul_pd(xabs,xabs);
648 zz2 = _mm_mul_pd(zz,zz);
649 ww2 = _mm_mul_pd(ww,ww);
651 /* R */
652 RA = _mm_mul_pd(R4,zz2);
653 RB = _mm_mul_pd(R3,zz2);
654 RA = _mm_add_pd(RA,R2);
655 RB = _mm_add_pd(RB,R1);
656 RA = _mm_mul_pd(RA,zz2);
657 RB = _mm_mul_pd(RB,zz);
658 RA = _mm_add_pd(RA,R0);
659 RA = _mm_add_pd(RA,RB);
661 /* S, SA = zz2 */
662 SB = _mm_mul_pd(S3,zz2);
663 SA = _mm_add_pd(zz2,S2);
664 SB = _mm_add_pd(SB,S1);
665 SA = _mm_mul_pd(SA,zz2);
666 SB = _mm_mul_pd(SB,zz);
667 SA = _mm_add_pd(SA,S0);
668 SA = _mm_add_pd(SA,SB);
670 /* P */
671 PA = _mm_mul_pd(P5,ww2);
672 PB = _mm_mul_pd(P4,ww2);
673 PA = _mm_add_pd(PA,P3);
674 PB = _mm_add_pd(PB,P2);
675 PA = _mm_mul_pd(PA,ww2);
676 PB = _mm_mul_pd(PB,ww2);
677 PA = _mm_add_pd(PA,P1);
678 PB = _mm_add_pd(PB,P0);
679 PA = _mm_mul_pd(PA,ww);
680 PA = _mm_add_pd(PA,PB);
682 /* Q, QA = ww2 */
683 QB = _mm_mul_pd(Q4,ww2);
684 QA = _mm_add_pd(ww2,Q3);
685 QB = _mm_add_pd(QB,Q2);
686 QA = _mm_mul_pd(QA,ww2);
687 QB = _mm_mul_pd(QB,ww2);
688 QA = _mm_add_pd(QA,Q1);
689 QB = _mm_add_pd(QB,Q0);
690 QA = _mm_mul_pd(QA,ww);
691 QA = _mm_add_pd(QA,QB);
693 RA = _mm_mul_pd(RA,zz);
694 PA = _mm_mul_pd(PA,ww);
696 nom = _mm_or_pd( _mm_and_pd(mask,RA) , _mm_andnot_pd(mask,PA) );
697 denom = _mm_or_pd( _mm_and_pd(mask,SA) , _mm_andnot_pd(mask,QA) );
699 q = _mm_mul_pd( nom, gmx_mm_inv_pd(denom) );
701 zz = _mm_add_pd(zz,zz);
702 zz = gmx_mm_sqrt_pd(zz);
703 z = _mm_sub_pd(quarterpi,zz);
704 zz = _mm_mul_pd(zz,q);
705 zz = _mm_sub_pd(zz,morebits);
706 z = _mm_sub_pd(z,zz);
707 z = _mm_add_pd(z,quarterpi);
709 w = _mm_mul_pd(xabs,q);
710 w = _mm_add_pd(w,xabs);
712 z = _mm_or_pd( _mm_and_pd(mask,z) , _mm_andnot_pd(mask,w) );
714 mask = _mm_cmpgt_pd(xabs,limit2);
715 z = _mm_or_pd( _mm_and_pd(mask,z) , _mm_andnot_pd(mask,xabs) );
717 z = _mm_xor_pd(z,sign);
719 return z;
723 static __m128d
724 gmx_mm_acos_pd(__m128d x)
726 const __m128d signmask = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF) );
727 const __m128d one_pd = _mm_set1_pd(1.0);
728 const __m128d half_pd = _mm_set1_pd(0.5);
729 const __m128d pi_pd = _mm_set1_pd(M_PI);
730 const __m128d halfpi_pd = _mm_set1_pd(M_PI/2.0);
732 __m128d mask1;
733 __m128d mask2;
734 __m128d xabs;
735 __m128d z,z1,z2,z3;
737 xabs = _mm_and_pd(x,signmask);
738 mask1 = _mm_cmpgt_pd(xabs,half_pd);
739 mask2 = _mm_cmpgt_pd(x,_mm_setzero_pd());
741 z = _mm_mul_pd(half_pd,_mm_sub_pd(one_pd,xabs));
742 z = _mm_mul_pd(z,gmx_mm_invsqrt_pd(z));
743 z = _mm_andnot_pd(_mm_cmpeq_pd(xabs,one_pd),z);
745 z = _mm_or_pd( _mm_and_pd(mask1,z) , _mm_andnot_pd(mask1,x) );
746 z = gmx_mm_asin_pd(z);
748 z2 = _mm_add_pd(z,z);
749 z1 = _mm_sub_pd(pi_pd,z2);
750 z3 = _mm_sub_pd(halfpi_pd,z);
752 z = _mm_or_pd( _mm_and_pd(mask2,z2) , _mm_andnot_pd(mask2,z1) );
753 z = _mm_or_pd( _mm_and_pd(mask1,z) , _mm_andnot_pd(mask1,z3) );
755 return z;
758 static __m128d
759 gmx_mm_atan_pd(__m128d x)
761 /* Same algorithm as cephes library */
762 const __m128d signmask = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF) );
763 const __m128d limit1 = _mm_set1_pd(0.66);
764 const __m128d limit2 = _mm_set1_pd(2.41421356237309504880);
765 const __m128d quarterpi = _mm_set1_pd(M_PI/4.0);
766 const __m128d halfpi = _mm_set1_pd(M_PI/2.0);
767 const __m128d mone = _mm_set1_pd(-1.0);
768 const __m128d morebits1 = _mm_set1_pd(0.5*6.123233995736765886130E-17);
769 const __m128d morebits2 = _mm_set1_pd(6.123233995736765886130E-17);
771 const __m128d P4 = _mm_set1_pd(-8.750608600031904122785E-1);
772 const __m128d P3 = _mm_set1_pd(-1.615753718733365076637E1);
773 const __m128d P2 = _mm_set1_pd(-7.500855792314704667340E1);
774 const __m128d P1 = _mm_set1_pd(-1.228866684490136173410E2);
775 const __m128d P0 = _mm_set1_pd(-6.485021904942025371773E1);
777 const __m128d Q4 = _mm_set1_pd(2.485846490142306297962E1);
778 const __m128d Q3 = _mm_set1_pd(1.650270098316988542046E2);
779 const __m128d Q2 = _mm_set1_pd(4.328810604912902668951E2);
780 const __m128d Q1 = _mm_set1_pd(4.853903996359136964868E2);
781 const __m128d Q0 = _mm_set1_pd(1.945506571482613964425E2);
783 __m128d sign;
784 __m128d mask1,mask2;
785 __m128d y,t1,t2;
786 __m128d z,z2;
787 __m128d P_A,P_B,Q_A,Q_B;
789 sign = _mm_andnot_pd(signmask,x);
790 x = _mm_and_pd(x,signmask);
792 mask1 = _mm_cmpgt_pd(x,limit1);
793 mask2 = _mm_cmpgt_pd(x,limit2);
795 t1 = _mm_mul_pd(_mm_add_pd(x,mone),gmx_mm_inv_pd(_mm_sub_pd(x,mone)));
796 t2 = _mm_mul_pd(mone,gmx_mm_inv_pd(x));
798 y = _mm_and_pd(mask1,quarterpi);
799 y = _mm_or_pd( _mm_and_pd(mask2,halfpi) , _mm_andnot_pd(mask2,y) );
801 x = _mm_or_pd( _mm_and_pd(mask1,t1) , _mm_andnot_pd(mask1,x) );
802 x = _mm_or_pd( _mm_and_pd(mask2,t2) , _mm_andnot_pd(mask2,x) );
804 z = _mm_mul_pd(x,x);
805 z2 = _mm_mul_pd(z,z);
807 P_A = _mm_mul_pd(P4,z2);
808 P_B = _mm_mul_pd(P3,z2);
809 P_A = _mm_add_pd(P_A,P2);
810 P_B = _mm_add_pd(P_B,P1);
811 P_A = _mm_mul_pd(P_A,z2);
812 P_B = _mm_mul_pd(P_B,z);
813 P_A = _mm_add_pd(P_A,P0);
814 P_A = _mm_add_pd(P_A,P_B);
816 /* Q_A = z2 */
817 Q_B = _mm_mul_pd(Q4,z2);
818 Q_A = _mm_add_pd(z2,Q3);
819 Q_B = _mm_add_pd(Q_B,Q2);
820 Q_A = _mm_mul_pd(Q_A,z2);
821 Q_B = _mm_mul_pd(Q_B,z2);
822 Q_A = _mm_add_pd(Q_A,Q1);
823 Q_B = _mm_add_pd(Q_B,Q0);
824 Q_A = _mm_mul_pd(Q_A,z);
825 Q_A = _mm_add_pd(Q_A,Q_B);
827 z = _mm_mul_pd(z,P_A);
828 z = _mm_mul_pd(z,gmx_mm_inv_pd(Q_A));
829 z = _mm_mul_pd(z,x);
830 z = _mm_add_pd(z,x);
832 t1 = _mm_and_pd(mask1,morebits1);
833 t1 = _mm_or_pd( _mm_and_pd(mask2,morebits2) , _mm_andnot_pd(mask2,t1) );
835 z = _mm_add_pd(z,t1);
836 y = _mm_add_pd(y,z);
838 y = _mm_xor_pd(y,sign);
840 return y;
843 static __m128d
844 gmx_mm_atan2_pd(__m128d y, __m128d x)
846 const __m128d pi = _mm_set1_pd(M_PI);
847 const __m128d minuspi = _mm_set1_pd(-M_PI);
848 const __m128d halfpi = _mm_set1_pd(M_PI/2.0);
849 const __m128d minushalfpi = _mm_set1_pd(-M_PI/2.0);
851 __m128d z,z1,z3,z4;
852 __m128d w;
853 __m128d maskx_lt,maskx_eq;
854 __m128d masky_lt,masky_eq;
855 __m128d mask1,mask2,mask3,mask4,maskall;
857 maskx_lt = _mm_cmplt_pd(x,_mm_setzero_pd());
858 masky_lt = _mm_cmplt_pd(y,_mm_setzero_pd());
859 maskx_eq = _mm_cmpeq_pd(x,_mm_setzero_pd());
860 masky_eq = _mm_cmpeq_pd(y,_mm_setzero_pd());
862 z = _mm_mul_pd(y,gmx_mm_inv_pd(x));
863 z = gmx_mm_atan_pd(z);
865 mask1 = _mm_and_pd(maskx_eq,masky_lt);
866 mask2 = _mm_andnot_pd(maskx_lt,masky_eq);
867 mask3 = _mm_andnot_pd( _mm_or_pd(masky_lt,masky_eq) , maskx_eq);
868 mask4 = _mm_and_pd(masky_eq,maskx_lt);
870 maskall = _mm_or_pd( _mm_or_pd(mask1,mask2), _mm_or_pd(mask3,mask4) );
872 z = _mm_andnot_pd(maskall,z);
873 z1 = _mm_and_pd(mask1,minushalfpi);
874 z3 = _mm_and_pd(mask3,halfpi);
875 z4 = _mm_and_pd(mask4,pi);
877 z = _mm_or_pd( _mm_or_pd(z,z1), _mm_or_pd(z3,z4) );
879 mask1 = _mm_andnot_pd(masky_lt,maskx_lt);
880 mask2 = _mm_and_pd(maskx_lt,masky_lt);
882 w = _mm_or_pd( _mm_and_pd(mask1,pi), _mm_and_pd(mask2,minuspi) );
883 w = _mm_andnot_pd(maskall,w);
885 z = _mm_add_pd(z,w);
887 return z;
891 #define GMX_MM_TRANSPOSE2_PD(row0, row1) { \
892 __m128d __gmx_t1 = row0; \
893 row0 = _mm_unpacklo_pd(row0,row1); \
894 row1 = _mm_unpackhi_pd(__gmx_t1,row1); \
897 #define GMX_MM_LOAD_2VALUES_PD(ptr1,ptr2,xmm1) \
899 __m128d _txmm2; \
900 xmm1 = _mm_load_sd(ptr1); \
901 _txmm2 = _mm_load_sd(ptr2); \
902 xmm1 = _mm_unpacklo_pd(xmm1,_txmm2); \
906 #define GMX_MM_LOAD_1VALUE_PD(ptr1,xmm1) \
908 xmm1 = _mm_load_sd(ptr1); \
912 #define GMX_MM_STORE_2VALUES_PD(ptr1,ptr2,xmm1) \
914 __m128d _txmm2; \
915 _txmm2 = _mm_unpackhi_pd(xmm1,xmm1); \
916 _mm_store_sd(ptr1,xmm1); \
917 _mm_store_sd(ptr2,_txmm2); \
921 #define GMX_MM_STORE_1VALUE_PD(ptr1,xmm1) \
923 _mm_store_sd(ptr1,xmm1); \
926 #define GMX_MM_INCREMENT_2VALUES_PD(ptr1,ptr2,xmm1) \
928 __m128d _tincr; \
929 GMX_MM_LOAD_2VALUES_PD(ptr1,ptr2,_tincr); \
930 _tincr = _mm_add_pd(_tincr,xmm1); \
931 GMX_MM_STORE_2VALUES_PD(ptr1,ptr2,_tincr); \
934 #define GMX_MM_INCREMENT_1VALUE_PD(ptr1,xmm1) \
936 __m128d _tincr; \
937 GMX_MM_LOAD_1VALUE_PD(ptr1,_tincr); \
938 _tincr = _mm_add_sd(_tincr,xmm1); \
939 GMX_MM_STORE_1VALUE_PD(ptr1,_tincr); \
943 #define GMX_MM_LOAD_2PAIRS_PD(ptr1,ptr2,c6,c12) \
945 __m128d _tmp1,_tmp2; \
946 _tmp1 = _mm_load_pd(ptr1); \
947 _tmp2 = _mm_load_pd(ptr2); \
948 c6 = _mm_unpacklo_pd(_tmp1,_tmp2); \
949 c12 = _mm_unpackhi_pd(_tmp1,_tmp2); \
952 #define GMX_MM_LOAD_1PAIR_PD(ptr1,c6,c12) \
954 c6 = _mm_load_sd(ptr1); \
955 c12 = _mm_load_sd(ptr1+1); \
961 /* Routines to load 1-4 rvecs from 1-2 places.
962 * We mainly use these to load coordinates. The extra routines
963 * are very efficient for the water-water loops, since we e.g.
964 * know that a TIP4p water has 4 atoms, so we should load 12 doubles from each pointer and shuffle.
966 #define GMX_MM_LOAD_1RVEC_1POINTER_PD(ptr1,jx1,jy1,jz1) { \
967 jx1 = _mm_load_sd(ptr1); \
968 jy1 = _mm_load_sd((ptr1)+1); \
969 jz1 = _mm_load_sd((ptr1)+2); \
972 #define GMX_MM_LOAD_2RVECS_1POINTER_PD(ptr1,jx1,jy1,jz1,jx2,jy2,jz2) { \
973 jx1 = _mm_load_sd(ptr1); \
974 jy1 = _mm_load_sd((ptr1)+1); \
975 jz1 = _mm_load_sd((ptr1)+2); \
976 jx2 = _mm_load_sd((ptr1)+3); \
977 jy2 = _mm_load_sd((ptr1)+4); \
978 jz2 = _mm_load_sd((ptr1)+5); \
982 #define GMX_MM_LOAD_3RVECS_1POINTER_PD(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
983 jx1 = _mm_load_sd(ptr1); \
984 jy1 = _mm_load_sd((ptr1)+1); \
985 jz1 = _mm_load_sd((ptr1)+2); \
986 jx2 = _mm_load_sd((ptr1)+3); \
987 jy2 = _mm_load_sd((ptr1)+4); \
988 jz2 = _mm_load_sd((ptr1)+5); \
989 jx3 = _mm_load_sd((ptr1)+6); \
990 jy3 = _mm_load_sd((ptr1)+7); \
991 jz3 = _mm_load_sd((ptr1)+8); \
995 #define GMX_MM_LOAD_4RVECS_1POINTER_PD(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
996 jx1 = _mm_load_sd(ptr1); \
997 jy1 = _mm_load_sd((ptr1)+1); \
998 jz1 = _mm_load_sd((ptr1)+2); \
999 jx2 = _mm_load_sd((ptr1)+3); \
1000 jy2 = _mm_load_sd((ptr1)+4); \
1001 jz2 = _mm_load_sd((ptr1)+5); \
1002 jx3 = _mm_load_sd((ptr1)+6); \
1003 jy3 = _mm_load_sd((ptr1)+7); \
1004 jz3 = _mm_load_sd((ptr1)+8); \
1005 jx4 = _mm_load_sd((ptr1)+9); \
1006 jy4 = _mm_load_sd((ptr1)+10); \
1007 jz4 = _mm_load_sd((ptr1)+11); \
1011 #define GMX_MM_LOAD_1RVEC_2POINTERS_PD(ptr1,ptr2,jx1,jy1,jz1) { \
1012 __m128d _tmp1; \
1013 _tmp1 = _mm_loadu_pd(ptr1); \
1014 jy1 = _mm_loadu_pd(ptr2); \
1015 jz1 = _mm_load_sd(ptr1+2); \
1016 jx1 = _mm_unpacklo_pd(_tmp1,jy1); \
1017 jy1 = _mm_unpackhi_pd(_tmp1,jy1); \
1018 jz1 = _mm_loadh_pd(jz1,ptr2+2); \
1021 #define GMX_MM_LOAD_2RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2) { \
1022 __m128d _tmp1, _tmp2,_tmp3; \
1023 _tmp1 = _mm_loadu_pd(ptr1); \
1024 jy1 = _mm_loadu_pd(ptr2); \
1025 _tmp2 = _mm_loadu_pd(ptr1+2); \
1026 jx2 = _mm_loadu_pd(ptr2+2); \
1027 _tmp3 = _mm_loadu_pd(ptr1+4); \
1028 jz2 = _mm_loadu_pd(ptr2+4); \
1029 jx1 = _mm_unpacklo_pd(_tmp1,jy1); \
1030 jy1 = _mm_unpackhi_pd(_tmp1,jy1); \
1031 jz1 = _mm_unpacklo_pd(_tmp2,jx2); \
1032 jx2 = _mm_unpackhi_pd(_tmp2,jx2); \
1033 jy2 = _mm_unpacklo_pd(_tmp3,jz2); \
1034 jz2 = _mm_unpackhi_pd(_tmp3,jz2); \
1038 #define GMX_MM_LOAD_3RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
1039 __m128d _tmp1, _tmp2, _tmp3, _tmp4, _tmp5; \
1040 _tmp1 = _mm_loadu_pd(ptr1); \
1041 jy1 = _mm_loadu_pd(ptr2); \
1042 _tmp2 = _mm_loadu_pd(ptr1+2); \
1043 jx2 = _mm_loadu_pd(ptr2+2); \
1044 _tmp3 = _mm_loadu_pd(ptr1+4); \
1045 jz2 = _mm_loadu_pd(ptr2+4); \
1046 _tmp4 = _mm_loadu_pd(ptr1+6); \
1047 jy3 = _mm_loadu_pd(ptr2+6); \
1048 jz3 = _mm_load_sd(ptr1+8); \
1049 _tmp5 = _mm_load_sd(ptr2+8); \
1050 jx1 = _mm_unpacklo_pd(_tmp1,jy1); \
1051 jy1 = _mm_unpackhi_pd(_tmp1,jy1); \
1052 jz1 = _mm_unpacklo_pd(_tmp2,jx2); \
1053 jx2 = _mm_unpackhi_pd(_tmp2,jx2); \
1054 jy2 = _mm_unpacklo_pd(_tmp3,jz2); \
1055 jz2 = _mm_unpackhi_pd(_tmp3,jz2); \
1056 jx3 = _mm_unpacklo_pd(_tmp4,jy3); \
1057 jy3 = _mm_unpackhi_pd(_tmp4,jy3); \
1058 jz3 = _mm_unpacklo_pd(jz2,_tmp5); \
1062 #define GMX_MM_LOAD_4RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
1063 __m128d _tmp1, _tmp2,_tmp3, _tmp4, _tmp5, _tmp6; \
1064 _tmp1 = _mm_loadu_pd(ptr1); \
1065 jy1 = _mm_loadu_pd(ptr2); \
1066 _tmp2 = _mm_loadu_pd(ptr1+2); \
1067 jx2 = _mm_loadu_pd(ptr2+2); \
1068 _tmp3 = _mm_loadu_pd(ptr1+4); \
1069 jz2 = _mm_loadu_pd(ptr2+4); \
1070 _tmp4 = _mm_loadu_pd(ptr1+6); \
1071 jy3 = _mm_loadu_pd(ptr2+6); \
1072 _tmp5 = _mm_loadu_pd(ptr1+8); \
1073 jx4 = _mm_loadu_pd(ptr2+8); \
1074 _tmp6 = _mm_loadu_pd(ptr1+10); \
1075 jz4 = _mm_loadu_pd(ptr2+10); \
1076 jx1 = _mm_unpacklo_pd(_tmp1,jy1); \
1077 jy1 = _mm_unpackhi_pd(_tmp1,jy1); \
1078 jz1 = _mm_unpacklo_pd(_tmp2,jx2); \
1079 jx2 = _mm_unpackhi_pd(_tmp2,jx2); \
1080 jy2 = _mm_unpacklo_pd(_tmp3,jz2); \
1081 jz2 = _mm_unpackhi_pd(_tmp3,jz2); \
1082 jx3 = _mm_unpacklo_pd(_tmp4,jy3); \
1083 jy3 = _mm_unpackhi_pd(_tmp4,jy3); \
1084 jz3 = _mm_unpacklo_pd(_tmp5,jx4); \
1085 jx4 = _mm_unpackhi_pd(_tmp5,jx4); \
1086 jy4 = _mm_unpacklo_pd(_tmp6,jz4); \
1087 jz4 = _mm_unpackhi_pd(_tmp6,jz4); \
1091 #define GMX_MM_INCREMENT_1RVEC_1POINTER_PD(ptr1,jx1,jy1,jz1) { \
1092 jx1 = _mm_unpacklo_pd(jx1,jy1); \
1093 _mm_storeu_pd(ptr1, _mm_add_pd( _mm_loadu_pd(ptr1), jx1 )); \
1094 _mm_store_sd(ptr1+2, _mm_add_sd( _mm_load_sd(ptr1+2), jz1)); \
1098 #define GMX_MM_INCREMENT_2RVECS_1POINTER_PD(ptr1,jx1,jy1,jz1,jx2,jy2,jz2) { \
1099 jx1 = _mm_unpacklo_pd(jx1,jy1); \
1100 jz1 = _mm_unpacklo_pd(jz1,jx2); \
1101 jy2 = _mm_unpacklo_pd(jy2,jz2); \
1102 _mm_storeu_pd(ptr1, _mm_add_pd( _mm_loadu_pd(ptr1), jx1 )); \
1103 _mm_storeu_pd(ptr1+2, _mm_add_pd( _mm_loadu_pd(ptr1+2), jz1 )); \
1104 _mm_storeu_pd(ptr1+4, _mm_add_pd( _mm_loadu_pd(ptr1+4), jy2 )); \
1108 #define GMX_MM_INCREMENT_3RVECS_1POINTER_PD(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
1109 jx1 = _mm_unpacklo_pd(jx1,jy1); \
1110 jz1 = _mm_unpacklo_pd(jz1,jx2); \
1111 jy2 = _mm_unpacklo_pd(jy2,jz2); \
1112 jx3 = _mm_unpacklo_pd(jx3,jy3); \
1113 _mm_storeu_pd(ptr1, _mm_add_pd( _mm_loadu_pd(ptr1), jx1 )); \
1114 _mm_storeu_pd(ptr1+2, _mm_add_pd( _mm_loadu_pd(ptr1+2), jz1 )); \
1115 _mm_storeu_pd(ptr1+4, _mm_add_pd( _mm_loadu_pd(ptr1+4), jy2 )); \
1116 _mm_storeu_pd(ptr1+6, _mm_add_pd( _mm_loadu_pd(ptr1+6), jx3 )); \
1117 _mm_store_sd(ptr1+8, _mm_add_pd( _mm_load_sd(ptr1+8), jz3 )); \
1121 #define GMX_MM_INCREMENT_4RVECS_1POINTER_PD(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
1122 jx1 = _mm_unpacklo_pd(jx1,jy1); \
1123 jz1 = _mm_unpacklo_pd(jz1,jx2); \
1124 jy2 = _mm_unpacklo_pd(jy2,jz2); \
1125 jx3 = _mm_unpacklo_pd(jx3,jy3); \
1126 jz3 = _mm_unpacklo_pd(jz3,jx4); \
1127 jy4 = _mm_unpacklo_pd(jy4,jz4); \
1128 _mm_storeu_pd(ptr1, _mm_add_pd( _mm_loadu_pd(ptr1), jx1 )); \
1129 _mm_storeu_pd(ptr1+2, _mm_add_pd( _mm_loadu_pd(ptr1+2), jz1 )); \
1130 _mm_storeu_pd(ptr1+4, _mm_add_pd( _mm_loadu_pd(ptr1+4), jy2 )); \
1131 _mm_storeu_pd(ptr1+6, _mm_add_pd( _mm_loadu_pd(ptr1+6), jx3 )); \
1132 _mm_storeu_pd(ptr1+8, _mm_add_pd( _mm_loadu_pd(ptr1+8), jz3 )); \
1133 _mm_storeu_pd(ptr1+10, _mm_add_pd( _mm_loadu_pd(ptr1+10), jy4 )); \
1137 #define GMX_MM_INCREMENT_1RVEC_2POINTERS_PD(ptr1,ptr2,jx1,jy1,jz1) { \
1138 __m128d _tmp1,_tmp2,_tmp3; \
1139 _tmp1 = _mm_unpacklo_pd(jx1,jy1); \
1140 _tmp2 = _mm_unpackhi_pd(jx1,jy1); \
1141 _tmp3 = _mm_unpackhi_pd(jz1,jz1); \
1142 _mm_storeu_pd(ptr1, _mm_add_pd( _mm_loadu_pd(ptr1), _tmp1 )); \
1143 _mm_storeu_pd(ptr2, _mm_add_pd( _mm_loadu_pd(ptr2), _tmp2 )); \
1144 _mm_store_sd(ptr1+2, _mm_add_pd( _mm_load_sd(ptr1+2), jz1 )); \
1145 _mm_store_sd(ptr2+2, _mm_add_sd( _mm_load_sd(ptr2+2), _tmp3 )); \
1149 #define GMX_MM_INCREMENT_2RVECS_2POINTERS_PD(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2) { \
1150 __m128d _tmp1,_tmp2,_tmp3; \
1151 _tmp1 = _mm_unpacklo_pd(jx1,jy1); \
1152 jy1 = _mm_unpackhi_pd(jx1,jy1); \
1153 _tmp2 = _mm_unpacklo_pd(jz1,jx2); \
1154 jx2 = _mm_unpackhi_pd(jz1,jx2); \
1155 _tmp3 = _mm_unpacklo_pd(jy2,jz2); \
1156 jz2 = _mm_unpackhi_pd(jy2,jz2); \
1157 _mm_storeu_pd(ptr1, _mm_add_pd( _mm_loadu_pd(ptr1), _tmp1 )); \
1158 _mm_storeu_pd(ptr2, _mm_add_pd( _mm_loadu_pd(ptr2), jy1 )); \
1159 _mm_storeu_pd(ptr1+2, _mm_add_pd( _mm_loadu_pd(ptr1+2), _tmp2 )); \
1160 _mm_storeu_pd(ptr2+2, _mm_add_pd( _mm_loadu_pd(ptr2+2), jx2 )); \
1161 _mm_storeu_pd(ptr1+4, _mm_add_pd( _mm_loadu_pd(ptr1+4), _tmp3 )); \
1162 _mm_storeu_pd(ptr2+4, _mm_add_pd( _mm_loadu_pd(ptr2+4), jz2 )); \
1166 #define GMX_MM_INCREMENT_3RVECS_2POINTERS_PD(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
1167 __m128d _tmp1,_tmp2,_tmp3,_tmp4,_tmp5; \
1168 _tmp1 = _mm_unpacklo_pd(jx1,jy1); \
1169 jy1 = _mm_unpackhi_pd(jx1,jy1); \
1170 _tmp2 = _mm_unpacklo_pd(jz1,jx2); \
1171 jx2 = _mm_unpackhi_pd(jz1,jx2); \
1172 _tmp3 = _mm_unpacklo_pd(jy2,jz2); \
1173 jz2 = _mm_unpackhi_pd(jy2,jz2); \
1174 _tmp4 = _mm_unpacklo_pd(jx3,jy3); \
1175 jy3 = _mm_unpackhi_pd(jx3,jy3); \
1176 _tmp5 = _mm_unpackhi_pd(jz3,jz3); \
1177 _mm_storeu_pd(ptr1, _mm_add_pd( _mm_loadu_pd(ptr1), _tmp1 )); \
1178 _mm_storeu_pd(ptr2, _mm_add_pd( _mm_loadu_pd(ptr2), jy1 )); \
1179 _mm_storeu_pd(ptr1+2, _mm_add_pd( _mm_loadu_pd(ptr1+2), _tmp2 )); \
1180 _mm_storeu_pd(ptr2+2, _mm_add_pd( _mm_loadu_pd(ptr2+2), jx2 )); \
1181 _mm_storeu_pd(ptr1+4, _mm_add_pd( _mm_loadu_pd(ptr1+4), _tmp3 )); \
1182 _mm_storeu_pd(ptr2+4, _mm_add_pd( _mm_loadu_pd(ptr2+4), jz2 )); \
1183 _mm_storeu_pd(ptr1+6, _mm_add_pd( _mm_loadu_pd(ptr1+6), _tmp4 )); \
1184 _mm_storeu_pd(ptr2+6, _mm_add_pd( _mm_loadu_pd(ptr2+6), jy3 )); \
1185 _mm_store_sd(ptr1+8, _mm_add_sd( _mm_load_sd(ptr1+8), jz3 )); \
1186 _mm_store_sd(ptr2+8, _mm_add_sd( _mm_load_sd(ptr2+8), _tmp5 )); \
1190 #define GMX_MM_INCREMENT_4RVECS_2POINTERS_PD(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
1191 __m128d _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6; \
1192 _tmp1 = _mm_unpacklo_pd(jx1,jy1); \
1193 jy1 = _mm_unpackhi_pd(jx1,jy1); \
1194 _tmp2 = _mm_unpacklo_pd(jz1,jx2); \
1195 jx2 = _mm_unpackhi_pd(jz1,jx2); \
1196 _tmp3 = _mm_unpacklo_pd(jy2,jz2); \
1197 jz2 = _mm_unpackhi_pd(jy2,jz2); \
1198 _tmp4 = _mm_unpacklo_pd(jx3,jy3); \
1199 jy3 = _mm_unpackhi_pd(jx3,jy3); \
1200 _tmp5 = _mm_unpacklo_pd(jz3,jx4); \
1201 jx4 = _mm_unpackhi_pd(jz3,jx4); \
1202 _tmp6 = _mm_unpacklo_pd(jy4,jz4); \
1203 jz4 = _mm_unpackhi_pd(jy4,jz4); \
1204 _mm_storeu_pd(ptr1, _mm_add_pd( _mm_loadu_pd(ptr1), _tmp1 )); \
1205 _mm_storeu_pd(ptr2, _mm_add_pd( _mm_loadu_pd(ptr2), jy1 )); \
1206 _mm_storeu_pd(ptr1+2, _mm_add_pd( _mm_loadu_pd(ptr1+2), _tmp2 )); \
1207 _mm_storeu_pd(ptr2+2, _mm_add_pd( _mm_loadu_pd(ptr2+2), jx2 )); \
1208 _mm_storeu_pd(ptr1+4, _mm_add_pd( _mm_loadu_pd(ptr1+4), _tmp3 )); \
1209 _mm_storeu_pd(ptr2+4, _mm_add_pd( _mm_loadu_pd(ptr2+4), jz2 )); \
1210 _mm_storeu_pd(ptr1+6, _mm_add_pd( _mm_loadu_pd(ptr1+6), _tmp4 )); \
1211 _mm_storeu_pd(ptr2+6, _mm_add_pd( _mm_loadu_pd(ptr2+6), jy3 )); \
1212 _mm_storeu_pd(ptr1+8, _mm_add_pd( _mm_loadu_pd(ptr1+8), _tmp5 )); \
1213 _mm_storeu_pd(ptr2+8, _mm_add_pd( _mm_loadu_pd(ptr2+8), jx4 )); \
1214 _mm_storeu_pd(ptr1+10,_mm_add_pd( _mm_loadu_pd(ptr1+10),_tmp6 )); \
1215 _mm_storeu_pd(ptr2+10,_mm_add_pd( _mm_loadu_pd(ptr2+10), jz4 )); \
1220 #define GMX_MM_DECREMENT_1RVEC_1POINTER_PD(ptr1,jx1,jy1,jz1) { \
1221 jx1 = _mm_unpacklo_pd(jx1,jy1); \
1222 _mm_storeu_pd(ptr1, _mm_sub_pd( _mm_loadu_pd(ptr1), jx1 )); \
1223 _mm_store_sd(ptr1+2, _mm_sub_sd( _mm_load_sd(ptr1+2), jz1)); \
1227 #define GMX_MM_DECREMENT_2RVECS_1POINTER_PD(ptr1,jx1,jy1,jz1,jx2,jy2,jz2) { \
1228 jx1 = _mm_unpacklo_pd(jx1,jy1); \
1229 jz1 = _mm_unpacklo_pd(jz1,jx2); \
1230 jy2 = _mm_unpacklo_pd(jy2,jz2); \
1231 _mm_storeu_pd(ptr1, _mm_sub_pd( _mm_loadu_pd(ptr1), jx1 )); \
1232 _mm_storeu_pd(ptr1+2, _mm_sub_pd( _mm_loadu_pd(ptr1+2), jz1 )); \
1233 _mm_storeu_pd(ptr1+4, _mm_sub_pd( _mm_loadu_pd(ptr1+4), jy2 )); \
1237 #define GMX_MM_DECREMENT_3RVECS_1POINTER_PD(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
1238 jx1 = _mm_unpacklo_pd(jx1,jy1); \
1239 jz1 = _mm_unpacklo_pd(jz1,jx2); \
1240 jy2 = _mm_unpacklo_pd(jy2,jz2); \
1241 jx3 = _mm_unpacklo_pd(jx3,jy3); \
1242 _mm_storeu_pd(ptr1, _mm_sub_pd( _mm_loadu_pd(ptr1), jx1 )); \
1243 _mm_storeu_pd(ptr1+2, _mm_sub_pd( _mm_loadu_pd(ptr1+2), jz1 )); \
1244 _mm_storeu_pd(ptr1+4, _mm_sub_pd( _mm_loadu_pd(ptr1+4), jy2 )); \
1245 _mm_storeu_pd(ptr1+6, _mm_sub_pd( _mm_loadu_pd(ptr1+6), jx3 )); \
1246 _mm_store_sd(ptr1+8, _mm_sub_sd( _mm_load_sd(ptr1+8), jz3 )); \
1250 #define GMX_MM_DECREMENT_4RVECS_1POINTER_PD(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
1251 jx1 = _mm_unpacklo_pd(jx1,jy1); \
1252 jz1 = _mm_unpacklo_pd(jz1,jx2); \
1253 jy2 = _mm_unpacklo_pd(jy2,jz2); \
1254 jx3 = _mm_unpacklo_pd(jx3,jy3); \
1255 jz3 = _mm_unpacklo_pd(jz3,jx4); \
1256 jy4 = _mm_unpacklo_pd(jy4,jz4); \
1257 _mm_storeu_pd(ptr1, _mm_sub_pd( _mm_loadu_pd(ptr1), jx1 )); \
1258 _mm_storeu_pd(ptr1+2, _mm_sub_pd( _mm_loadu_pd(ptr1+2), jz1 )); \
1259 _mm_storeu_pd(ptr1+4, _mm_sub_pd( _mm_loadu_pd(ptr1+4), jy2 )); \
1260 _mm_storeu_pd(ptr1+6, _mm_sub_pd( _mm_loadu_pd(ptr1+6), jx3 )); \
1261 _mm_storeu_pd(ptr1+8, _mm_sub_pd( _mm_loadu_pd(ptr1+8), jz3 )); \
1262 _mm_storeu_pd(ptr1+10, _mm_sub_pd( _mm_loadu_pd(ptr1+10), jy4 )); \
1266 #define GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(ptr1,ptr2,jx1,jy1,jz1) { \
1267 __m128d _tmp1,_tmp2,_tmp3; \
1268 _tmp1 = _mm_unpacklo_pd(jx1,jy1); \
1269 _tmp2 = _mm_unpackhi_pd(jx1,jy1); \
1270 _tmp3 = _mm_unpackhi_pd(jz1,jz1); \
1271 _mm_storeu_pd(ptr1, _mm_sub_pd( _mm_loadu_pd(ptr1), _tmp1 )); \
1272 _mm_storeu_pd(ptr2, _mm_sub_pd( _mm_loadu_pd(ptr2), _tmp2 )); \
1273 _mm_store_sd(ptr1+2, _mm_sub_pd( _mm_load_sd(ptr1+2), jz1 )); \
1274 _mm_store_sd(ptr2+2, _mm_sub_sd( _mm_load_sd(ptr2+2), _tmp3 )); \
1278 #define GMX_MM_DECREMENT_2RVECS_2POINTERS_PD(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2) { \
1279 __m128d _tmp1,_tmp2,_tmp3; \
1280 _tmp1 = _mm_unpacklo_pd(jx1,jy1); \
1281 jy1 = _mm_unpackhi_pd(jx1,jy1); \
1282 _tmp2 = _mm_unpacklo_pd(jz1,jx2); \
1283 jx2 = _mm_unpackhi_pd(jz1,jx2); \
1284 _tmp3 = _mm_unpacklo_pd(jy2,jz2); \
1285 jz2 = _mm_unpackhi_pd(jy2,jz2); \
1286 _mm_storeu_pd(ptr1, _mm_sub_pd( _mm_loadu_pd(ptr1), _tmp1 )); \
1287 _mm_storeu_pd(ptr2, _mm_sub_pd( _mm_loadu_pd(ptr2), jy1 )); \
1288 _mm_storeu_pd(ptr1+2, _mm_sub_pd( _mm_loadu_pd(ptr1+2), _tmp2 )); \
1289 _mm_storeu_pd(ptr2+2, _mm_sub_pd( _mm_loadu_pd(ptr2+2), jx2 )); \
1290 _mm_storeu_pd(ptr1+4, _mm_sub_pd( _mm_loadu_pd(ptr1+4), _tmp3 )); \
1291 _mm_storeu_pd(ptr2+4, _mm_sub_pd( _mm_loadu_pd(ptr2+4), jz2 )); \
1295 #define GMX_MM_DECREMENT_3RVECS_2POINTERS_PD(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
1296 __m128d _tmp1,_tmp2,_tmp3,_tmp4,_tmp5; \
1297 _tmp1 = _mm_unpacklo_pd(jx1,jy1); \
1298 jy1 = _mm_unpackhi_pd(jx1,jy1); \
1299 _tmp2 = _mm_unpacklo_pd(jz1,jx2); \
1300 jx2 = _mm_unpackhi_pd(jz1,jx2); \
1301 _tmp3 = _mm_unpacklo_pd(jy2,jz2); \
1302 jz2 = _mm_unpackhi_pd(jy2,jz2); \
1303 _tmp4 = _mm_unpacklo_pd(jx3,jy3); \
1304 jy3 = _mm_unpackhi_pd(jx3,jy3); \
1305 _tmp5 = _mm_unpackhi_pd(jz3,jz3); \
1306 _mm_storeu_pd(ptr1, _mm_sub_pd( _mm_loadu_pd(ptr1), _tmp1 )); \
1307 _mm_storeu_pd(ptr2, _mm_sub_pd( _mm_loadu_pd(ptr2), jy1 )); \
1308 _mm_storeu_pd(ptr1+2, _mm_sub_pd( _mm_loadu_pd(ptr1+2), _tmp2 )); \
1309 _mm_storeu_pd(ptr2+2, _mm_sub_pd( _mm_loadu_pd(ptr2+2), jx2 )); \
1310 _mm_storeu_pd(ptr1+4, _mm_sub_pd( _mm_loadu_pd(ptr1+4), _tmp3 )); \
1311 _mm_storeu_pd(ptr2+4, _mm_sub_pd( _mm_loadu_pd(ptr2+4), jz2 )); \
1312 _mm_storeu_pd(ptr1+6, _mm_sub_pd( _mm_loadu_pd(ptr1+6), _tmp4 )); \
1313 _mm_storeu_pd(ptr2+6, _mm_sub_pd( _mm_loadu_pd(ptr2+6), jy3 )); \
1314 _mm_store_sd(ptr1+8, _mm_sub_sd( _mm_load_sd(ptr1+8), jz3 )); \
1315 _mm_store_sd(ptr2+8, _mm_sub_sd( _mm_load_sd(ptr2+8), _tmp5 )); \
1319 #define GMX_MM_DECREMENT_4RVECS_2POINTERS_PD(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
1320 __m128d _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6; \
1321 _tmp1 = _mm_unpacklo_pd(jx1,jy1); \
1322 jy1 = _mm_unpackhi_pd(jx1,jy1); \
1323 _tmp2 = _mm_unpacklo_pd(jz1,jx2); \
1324 jx2 = _mm_unpackhi_pd(jz1,jx2); \
1325 _tmp3 = _mm_unpacklo_pd(jy2,jz2); \
1326 jz2 = _mm_unpackhi_pd(jy2,jz2); \
1327 _tmp4 = _mm_unpacklo_pd(jx3,jy3); \
1328 jy3 = _mm_unpackhi_pd(jx3,jy3); \
1329 _tmp5 = _mm_unpacklo_pd(jz3,jx4); \
1330 jx4 = _mm_unpackhi_pd(jz3,jx4); \
1331 _tmp6 = _mm_unpacklo_pd(jy4,jz4); \
1332 jz4 = _mm_unpackhi_pd(jy4,jz4); \
1333 _mm_storeu_pd(ptr1, _mm_sub_pd( _mm_loadu_pd(ptr1), _tmp1 )); \
1334 _mm_storeu_pd(ptr2, _mm_sub_pd( _mm_loadu_pd(ptr2), jy1 )); \
1335 _mm_storeu_pd(ptr1+2, _mm_sub_pd( _mm_loadu_pd(ptr1+2), _tmp2 )); \
1336 _mm_storeu_pd(ptr2+2, _mm_sub_pd( _mm_loadu_pd(ptr2+2), jx2 )); \
1337 _mm_storeu_pd(ptr1+4, _mm_sub_pd( _mm_loadu_pd(ptr1+4), _tmp3 )); \
1338 _mm_storeu_pd(ptr2+4, _mm_sub_pd( _mm_loadu_pd(ptr2+4), jz2 )); \
1339 _mm_storeu_pd(ptr1+6, _mm_sub_pd( _mm_loadu_pd(ptr1+6), _tmp4 )); \
1340 _mm_storeu_pd(ptr2+6, _mm_sub_pd( _mm_loadu_pd(ptr2+6), jy3 )); \
1341 _mm_storeu_pd(ptr1+8, _mm_sub_pd( _mm_loadu_pd(ptr1+8), _tmp5 )); \
1342 _mm_storeu_pd(ptr2+8, _mm_sub_pd( _mm_loadu_pd(ptr2+8), jx4 )); \
1343 _mm_storeu_pd(ptr1+10,_mm_sub_pd( _mm_loadu_pd(ptr1+10),_tmp6 )); \
1344 _mm_storeu_pd(ptr2+10,_mm_sub_pd( _mm_loadu_pd(ptr2+10), jz4 )); \
1350 static inline __m128d
1351 gmx_mm_scalarprod_pd(__m128d x, __m128d y, __m128d z)
1353 return _mm_add_pd(_mm_add_pd(_mm_mul_pd(x,x),_mm_mul_pd(y,y)),_mm_mul_pd(z,z));
1356 static inline __m128d
1357 gmx_mm_calc_rsq_pd(__m128d dx, __m128d dy, __m128d dz)
1359 return _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx), _mm_mul_pd(dy,dy) ), _mm_mul_pd(dz,dz) );
1364 static inline void
1365 gmx_mm_update_iforce_1atom_pd(__m128d *fix1, __m128d *fiy1, __m128d *fiz1,
1366 double *fptr,
1367 double *fshiftptr)
1369 __m128d t1,t2,t3;
1371 #ifdef GMX_SSE3
1372 *fix1 = _mm_hadd_pd(*fix1,*fiy1);
1373 /* fiz1 is fine as it is */
1374 #else
1375 /* SSE2 */
1376 /* transpose data */
1377 t1 = *fix1;
1378 *fix1 = _mm_unpacklo_pd(*fix1,*fiy1); /* y0 x0 */
1379 *fiy1 = _mm_unpackhi_pd(t1,*fiy1); /* y1 x1 */
1381 *fix1 = _mm_add_pd(*fix1,*fiy1);
1382 *fiz1 = _mm_add_sd( *fiz1, _mm_unpackhi_pd(*fiz1,*fiz1 ));
1383 #endif
1384 _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), *fix1 ));
1385 _mm_store_sd( fptr+2, _mm_add_sd( _mm_load_sd(fptr+2), *fiz1 ));
1387 _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), *fix1 ));
1388 _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), *fiz1 ));
1391 static inline void
1392 gmx_mm_update_iforce_2atoms_pd(__m128d *fix1, __m128d *fiy1, __m128d *fiz1,
1393 __m128d *fix2, __m128d *fiy2, __m128d *fiz2,
1394 double *fptr,
1395 double *fshiftptr)
1397 __m128d t1;
1399 #ifdef GMX_SSE3
1400 *fix1 = _mm_hadd_pd(*fix1,*fiy1);
1401 *fiz1 = _mm_hadd_pd(*fiz1,*fix2);
1402 *fiy2 = _mm_hadd_pd(*fiy2,*fiz2);
1403 #else
1404 /* SSE2 */
1405 /* transpose data */
1406 GMX_MM_TRANSPOSE2_PD(*fix1,*fiy1);
1407 GMX_MM_TRANSPOSE2_PD(*fiz1,*fix2);
1408 GMX_MM_TRANSPOSE2_PD(*fiy2,*fiz2);
1410 *fix1 = _mm_add_pd(*fix1,*fiy1);
1411 *fiz1 = _mm_add_pd(*fiz1,*fix2);
1412 *fiy2 = _mm_add_pd(*fiy2,*fiz2);
1413 #endif
1414 _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), *fix1 ));
1415 _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), *fiz1 ));
1416 _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), *fiy2 ));
1418 t1 = _mm_shuffle_pd(*fiz1,*fiy2,_MM_SHUFFLE2(0,1));
1419 *fix1 = _mm_add_pd(*fix1,t1); /* x and y sums */
1420 *fiz1 = _mm_add_sd(*fiz1, _mm_unpackhi_pd(*fiy2,*fiy2)); /* z sum */
1422 _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), *fix1 ));
1423 _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), *fiz1 ));
1428 static inline void
1429 gmx_mm_update_iforce_3atoms_pd(__m128d *fix1, __m128d *fiy1, __m128d *fiz1,
1430 __m128d *fix2, __m128d *fiy2, __m128d *fiz2,
1431 __m128d *fix3, __m128d *fiy3, __m128d *fiz3,
1432 double *fptr,
1433 double *fshiftptr)
1435 __m128d t1,t2;
1437 #ifdef GMX_SSE3
1438 *fix1 = _mm_hadd_pd(*fix1,*fiy1);
1439 *fiz1 = _mm_hadd_pd(*fiz1,*fix2);
1440 *fiy2 = _mm_hadd_pd(*fiy2,*fiz2);
1441 *fix3 = _mm_hadd_pd(*fix3,*fiy3);
1442 /* fiz3 is fine as it is */
1443 #else
1444 /* SSE2 */
1445 /* transpose data */
1446 GMX_MM_TRANSPOSE2_PD(*fix1,*fiy1);
1447 GMX_MM_TRANSPOSE2_PD(*fiz1,*fix2);
1448 GMX_MM_TRANSPOSE2_PD(*fiy2,*fiz2);
1449 t1 = *fix3;
1450 *fix3 = _mm_unpacklo_pd(*fix3,*fiy3); /* y0 x0 */
1451 *fiy3 = _mm_unpackhi_pd(t1,*fiy3); /* y1 x1 */
1453 *fix1 = _mm_add_pd(*fix1,*fiy1);
1454 *fiz1 = _mm_add_pd(*fiz1,*fix2);
1455 *fiy2 = _mm_add_pd(*fiy2,*fiz2);
1457 *fix3 = _mm_add_pd(*fix3,*fiy3);
1458 *fiz3 = _mm_add_sd( *fiz3, _mm_unpackhi_pd(*fiz3,*fiz3));
1459 #endif
1460 _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), *fix1 ));
1461 _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), *fiz1 ));
1462 _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), *fiy2 ));
1463 _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), *fix3 ));
1464 _mm_store_sd( fptr+8, _mm_add_sd( _mm_load_sd(fptr+8), *fiz3 ));
1466 *fix1 = _mm_add_pd(*fix1,*fix3);
1467 t1 = _mm_shuffle_pd(*fiz1,*fiy2,_MM_SHUFFLE2(0,1));
1468 *fix1 = _mm_add_pd(*fix1,t1); /* x and y sums */
1470 t2 = _mm_shuffle_pd(*fiy2,*fiy2,_MM_SHUFFLE2(1,1));
1471 *fiz1 = _mm_add_sd(*fiz1,*fiz3);
1472 *fiz1 = _mm_add_sd(*fiz1,t2); /* z sum */
1474 _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), *fix1 ));
1475 _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), *fiz1 ));
1480 static inline void
1481 gmx_mm_update_iforce_4atoms_pd(__m128d *fix1, __m128d *fiy1, __m128d *fiz1,
1482 __m128d *fix2, __m128d *fiy2, __m128d *fiz2,
1483 __m128d *fix3, __m128d *fiy3, __m128d *fiz3,
1484 __m128d *fix4, __m128d *fiy4, __m128d *fiz4,
1485 double *fptr,
1486 double *fshiftptr)
1488 __m128d t1,t2;
1490 #ifdef GMX_SSE3
1491 *fix1 = _mm_hadd_pd(*fix1,*fiy1);
1492 *fiz1 = _mm_hadd_pd(*fiz1,*fix2);
1493 *fiy2 = _mm_hadd_pd(*fiy2,*fiz2);
1494 *fix3 = _mm_hadd_pd(*fix3,*fiy3);
1495 *fiz3 = _mm_hadd_pd(*fiz3,*fix4);
1496 *fiy4 = _mm_hadd_pd(*fiy4,*fiz4);
1497 #else
1498 /* SSE2 */
1499 /* transpose data */
1500 GMX_MM_TRANSPOSE2_PD(*fix1,*fiy1);
1501 GMX_MM_TRANSPOSE2_PD(*fiz1,*fix2);
1502 GMX_MM_TRANSPOSE2_PD(*fiy2,*fiz2);
1503 GMX_MM_TRANSPOSE2_PD(*fix3,*fiy3);
1504 GMX_MM_TRANSPOSE2_PD(*fiz3,*fix4);
1505 GMX_MM_TRANSPOSE2_PD(*fiy4,*fiz4);
1507 *fix1 = _mm_add_pd(*fix1,*fiy1);
1508 *fiz1 = _mm_add_pd(*fiz1,*fix2);
1509 *fiy2 = _mm_add_pd(*fiy2,*fiz2);
1510 *fix3 = _mm_add_pd(*fix3,*fiy3);
1511 *fiz3 = _mm_add_pd(*fiz3,*fix4);
1512 *fiy4 = _mm_add_pd(*fiy4,*fiz4);
1513 #endif
1514 _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), *fix1 ));
1515 _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), *fiz1 ));
1516 _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), *fiy2 ));
1517 _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), *fix3 ));
1518 _mm_storeu_pd( fptr+8, _mm_add_pd( _mm_loadu_pd(fptr+8), *fiz3 ));
1519 _mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), *fiy4 ));
1521 t1 = _mm_shuffle_pd(*fiz1,*fiy2,_MM_SHUFFLE2(0,1));
1522 *fix1 = _mm_add_pd(*fix1,t1);
1523 t2 = _mm_shuffle_pd(*fiz3,*fiy4,_MM_SHUFFLE2(0,1));
1524 *fix3 = _mm_add_pd(*fix3,t2);
1525 *fix1 = _mm_add_pd(*fix1,*fix3); /* x and y sums */
1528 *fiz1 = _mm_add_sd(*fiz1, _mm_unpackhi_pd(*fiy2,*fiy2));
1529 *fiz3 = _mm_add_sd(*fiz3, _mm_unpackhi_pd(*fiy4,*fiy4));
1530 *fiz1 = _mm_add_sd(*fiz1,*fiz3); /* z sum */
1533 _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), *fix1 ));
1534 _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), *fiz1 ));
1538 static inline void
1539 gmx_mm_update_1pot_pd(__m128d pot1, double *ptr1)
1541 #ifdef GMX_SSE3
1542 pot1 = _mm_hadd_pd(pot1,pot1);
1543 #else
1544 /* SSE2 */
1545 pot1 = _mm_add_pd(pot1, _mm_unpackhi_pd(pot1,pot1));
1546 #endif
1547 _mm_store_sd(ptr1,_mm_add_sd(pot1,_mm_load_sd(ptr1)));
1550 static inline void
1551 gmx_mm_update_2pot_pd(__m128d pot1, double *ptr1, __m128d pot2, double *ptr2)
1553 #ifdef GMX_SSE3
1554 pot1 = _mm_hadd_pd(pot1,pot2);
1555 #else
1556 /* SSE2 */
1557 GMX_MM_TRANSPOSE2_PD(pot1,pot2);
1558 pot1 = _mm_add_pd(pot1,pot2);
1559 #endif
1560 pot2 = _mm_unpackhi_pd(pot1,pot1);
1562 _mm_store_sd(ptr1,_mm_add_sd(pot1,_mm_load_sd(ptr1)));
1563 _mm_store_sd(ptr2,_mm_add_sd(pot2,_mm_load_sd(ptr2)));