Better selection stdin handling.
[gromacs/qmmm-gamess-us.git] / include / gmx_sse2_single.h
blob597a0b8643abe34e8ae9097b5217cb6170eaab74
1 /*
2 * This source code is part of
4 * G R O M A C S
6 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
7 * Copyright (c) 2001-2009, The GROMACS Development Team
9 * Gromacs is a library for molecular simulation and trajectory analysis,
10 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
11 * a full list of developers and information, check out http://www.gromacs.org
13 * This program is free software; you can redistribute it and/or modify it under
14 * the terms of the GNU Lesser General Public License as published by the Free
15 * Software Foundation; either version 2 of the License, or (at your option) any
16 * later version.
17 * As a special exception, you may use this file as part of a free software
18 * library without restriction. Specifically, if other files instantiate
19 * templates or use macros or inline functions from this file, or you compile
20 * this file and link it with other files to produce an executable, this
21 * file does not by itself cause the resulting executable to be covered by
22 * the GNU Lesser General Public License.
24 * In plain-speak: do not worry about classes/macros/templates either - only
25 * changes to the library have to be LGPL, not an application linking with it.
27 * To help fund GROMACS development, we humbly ask that you cite
28 * the papers people have written on it - you can find them on the website!
30 #ifdef HAVE_CONFIG_H
31 #include <config.h>
32 #endif
34 /* We require SSE2 now! */
36 #include <math.h>
39 #include <xmmintrin.h> /* SSE */
40 #include <emmintrin.h> /* SSE2 */
42 #ifdef GMX_SSE3
43 # include <pmmintrin.h> /* SSE3 */
44 #endif
45 #ifdef GMX_SSE4
46 # include <smmintrin.h> /* SSE4.1 */
47 #endif
49 #include <stdio.h>
51 /***************************************************
52 * *
53 * COMPILER RANT WARNING: *
54 * *
55 * Ideally, this header would be filled with *
56 * simple static inline functions. Unfortunately, *
57 * many vendors provide really braindead compilers *
58 * that either cannot handle more than 1-2 SSE *
59 * function parameters, and some cannot handle *
60 * pointers to SSE __m128 datatypes as parameters *
61 * at all. Thus, for portability we have had to *
62 * implement all but the simplest routines as *
63 * macros instead... *
64 * *
65 ***************************************************/
68 /***************************************************
69 * *
70 * Wrappers/replacements for some instructions *
71 * not available in all SSE versions. *
72 * *
73 ***************************************************/
75 #ifdef GMX_SSE4
76 # define gmx_mm_extract_epi32(x, imm) _mm_extract_epi32(x,imm)
77 #else
78 # define gmx_mm_extract_epi32(x, imm) _mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm)))
79 #endif
82 * Some compilers require a cast to change the interpretation
83 * of a register from FP to Int and vice versa, and not all of
84 * the provide instructions to do this. Roll our own wrappers...
87 #if (defined (_MSC_VER) || defined(__INTEL_COMPILER))
88 # define gmx_mm_castsi128_ps(a) _mm_castsi128_ps(a)
89 # define gmx_mm_castps_si128(a) _mm_castps_si128(a)
90 #elif defined(__GNUC__)
91 # define gmx_mm_castsi128_ps(a) ((__m128)(a))
92 # define gmx_mm_castps_si128(a) ((__m128i)(a))
93 #else
94 static __m128 gmx_mm_castsi128_ps(__m128i a) { return *(__m128 *) &a; }
95 static __m128i gmx_mm_castps_si128(__m128 a) { return *(__m128i *) &a; }
96 #endif
100 /* IO functions, just for debugging */
102 static void
103 printxmm(const char *s,__m128 xmm)
105 float f[4];
107 _mm_storeu_ps(f,xmm);
108 printf("%s: %8.5g %8.5g %8.5g %8.5g\n",s,f[0],f[1],f[2],f[3]);
112 static void
113 printxmmsum(const char *s,__m128 xmm)
115 float f[4];
117 _mm_storeu_ps(f,xmm);
118 printf("%s (sum): %15.10g\n",s,f[0]+f[1]+f[2]+f[3]);
122 static void
123 printxmmi(const char *s,__m128i xmmi)
125 int i[4];
127 _mm_storeu_si128((__m128i *)i,xmmi);
128 printf("%10s: %2d %2d %2d %2d\n",s,i[0],i[1],i[2],i[3]);
132 /************************
134 * Simple math routines *
136 ************************/
138 static inline __m128
139 gmx_mm_invsqrt_ps(__m128 x)
141 const __m128 half = {0.5,0.5,0.5,0.5};
142 const __m128 three = {3.0,3.0,3.0,3.0};
144 __m128 lu = _mm_rsqrt_ps(x);
146 return _mm_mul_ps(half,_mm_mul_ps(_mm_sub_ps(three,_mm_mul_ps(_mm_mul_ps(lu,lu),x)),lu));
149 static inline __m128
150 gmx_mm_inv_ps(__m128 x)
152 const __m128 two = {2.0f,2.0f,2.0f,2.0f};
154 __m128 lu = _mm_rcp_ps(x);
156 return _mm_mul_ps(lu,_mm_sub_ps(two,_mm_mul_ps(lu,x)));
160 static inline __m128
161 gmx_mm_calc_rsq_ps(__m128 dx, __m128 dy, __m128 dz)
163 return _mm_add_ps( _mm_add_ps( _mm_mul_ps(dx,dx), _mm_mul_ps(dy,dy) ), _mm_mul_ps(dz,dz) );
166 /* Normal sum of four xmm registers */
167 static inline __m128
168 gmx_mm_sum4_ps(__m128 t0, __m128 t1, __m128 t2, __m128 t3)
170 t0 = _mm_add_ps(t0,t1);
171 t2 = _mm_add_ps(t2,t3);
172 return _mm_add_ps(t0,t2);
176 static __m128
177 gmx_mm_log_ps(__m128 x)
179 const __m128 exp_ps = gmx_mm_castsi128_ps( _mm_set_epi32(0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000) );
180 const __m128 one_ps = gmx_mm_castsi128_ps( _mm_set_epi32(0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000) );
181 const __m128 off_ps = gmx_mm_castsi128_ps( _mm_set_epi32(0x3FBF8000, 0x3FBF8000, 0x3FBF8000, 0x3FBF8000) );
182 const __m128 mant_ps = gmx_mm_castsi128_ps( _mm_set_epi32(0x007FFFFF, 0x007FFFFF, 0x007FFFFF, 0x007FFFFF) );
183 const __m128 base_ps = gmx_mm_castsi128_ps( _mm_set_epi32(0x43800000, 0x43800000, 0x43800000, 0x43800000) );
184 const __m128 loge_ps = gmx_mm_castsi128_ps( _mm_set_epi32(0x3F317218, 0x3F317218, 0x3F317218, 0x3F317218) );
186 const __m128 D5 = gmx_mm_castsi128_ps( _mm_set_epi32(0xBD0D0CC5, 0xBD0D0CC5, 0xBD0D0CC5, 0xBD0D0CC5) );
187 const __m128 D4 = gmx_mm_castsi128_ps( _mm_set_epi32(0x3EA2ECDD, 0x3EA2ECDD, 0x3EA2ECDD, 0x3EA2ECDD) );
188 const __m128 D3 = gmx_mm_castsi128_ps( _mm_set_epi32(0xBF9dA2C9, 0xBF9dA2C9, 0xBF9dA2C9, 0xBF9dA2C9) );
189 const __m128 D2 = gmx_mm_castsi128_ps( _mm_set_epi32(0x4026537B, 0x4026537B, 0x4026537B, 0x4026537B) );
190 const __m128 D1 = gmx_mm_castsi128_ps( _mm_set_epi32(0xC054bFAD, 0xC054bFAD, 0xC054bFAD, 0xC054bFAD) );
191 const __m128 D0 = gmx_mm_castsi128_ps( _mm_set_epi32(0x4047691A, 0x4047691A, 0x4047691A, 0x4047691A) );
193 __m128 xmm0,xmm1,xmm2;
195 xmm0 = x;
196 xmm1 = xmm0;
197 xmm1 = _mm_and_ps(xmm1, exp_ps);
198 xmm1 = gmx_mm_castsi128_ps( _mm_srli_epi32( gmx_mm_castps_si128(xmm1),8) );
200 xmm1 = _mm_or_ps(xmm1, one_ps);
201 xmm1 = _mm_sub_ps(xmm1, off_ps);
203 xmm1 = _mm_mul_ps(xmm1, base_ps);
204 xmm0 = _mm_and_ps(xmm0, mant_ps);
205 xmm0 = _mm_or_ps(xmm0, one_ps);
207 xmm2 = _mm_mul_ps(xmm0, D5);
208 xmm2 = _mm_add_ps(xmm2, D4);
209 xmm2 = _mm_mul_ps(xmm2,xmm0);
210 xmm2 = _mm_add_ps(xmm2, D3);
211 xmm2 = _mm_mul_ps(xmm2,xmm0);
212 xmm2 = _mm_add_ps(xmm2, D2);
213 xmm2 = _mm_mul_ps(xmm2,xmm0);
214 xmm2 = _mm_add_ps(xmm2, D1);
215 xmm2 = _mm_mul_ps(xmm2,xmm0);
216 xmm2 = _mm_add_ps(xmm2, D0);
217 xmm0 = _mm_sub_ps(xmm0, one_ps);
218 xmm0 = _mm_mul_ps(xmm0,xmm2);
219 xmm1 = _mm_add_ps(xmm1,xmm0);
221 x = xmm1;
222 x = _mm_mul_ps(x, loge_ps);
224 return x;
228 /* This exp-routine has a relative precision of 2^-22.33 bits (essentially single precision :-) ) */
229 static __m128
230 gmx_mm_exp_ps(__m128 x)
232 const __m128i half = _mm_set_epi32(0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000); // 0.5e+0f
233 const __m128i base = _mm_set_epi32(0x0000007F, 0x0000007F, 0x0000007F, 0x0000007F); // 127
234 const __m128i CC = _mm_set_epi32(0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B); // log2(e)
236 const __m128i D5 = _mm_set_epi32(0x3AF61905, 0x3AF61905, 0x3AF61905, 0x3AF61905); // 1.8775767e-3f
237 const __m128i D4 = _mm_set_epi32(0x3C134806, 0x3C134806, 0x3C134806, 0x3C134806); // 8.9893397e-3f
238 const __m128i D3 = _mm_set_epi32(0x3D64AA23, 0x3D64AA23, 0x3D64AA23, 0x3D64AA23); // 5.5826318e-2f
239 const __m128i D2 = _mm_set_epi32(0x3E75EAD4, 0x3E75EAD4, 0x3E75EAD4, 0x3E75EAD4); // 2.4015361e-1f
240 const __m128i D1 = _mm_set_epi32(0x3F31727B, 0x3F31727B, 0x3F31727B, 0x3F31727B); // 6.9315308e-1f
241 const __m128i D0 = _mm_set_epi32(0x3F7FFFFF, 0x3F7FFFFF, 0x3F7FFFFF, 0x3F7FFFFF); // 9.9999994e-1f
243 __m128 xmm0,xmm1;
244 __m128i xmm2;
246 xmm0 = _mm_mul_ps(x,(__m128) CC);
247 xmm1 = _mm_sub_ps(xmm0, (__m128) half);
248 xmm2 = _mm_cvtps_epi32(xmm1);
249 xmm1 = _mm_cvtepi32_ps(xmm2);
251 xmm2 = _mm_add_epi32(xmm2,(__m128i) base);
252 xmm2 = _mm_slli_epi32(xmm2,23);
254 xmm0 = _mm_sub_ps(xmm0,xmm1);
255 xmm1 = _mm_mul_ps(xmm0,(__m128) D5);
256 xmm1 = _mm_add_ps(xmm1,(__m128) D4);
257 xmm1 = _mm_mul_ps(xmm1,xmm0);
258 xmm1 = _mm_add_ps(xmm1,(__m128) D3);
259 xmm1 = _mm_mul_ps(xmm1,xmm0);
260 xmm1 = _mm_add_ps(xmm1,(__m128) D2);
261 xmm1 = _mm_mul_ps(xmm1,xmm0);
262 xmm1 = _mm_add_ps(xmm1,(__m128) D1);
263 xmm1 = _mm_mul_ps(xmm1,xmm0);
264 xmm1 = _mm_add_ps(xmm1,(__m128) D0);
265 xmm1 = _mm_mul_ps(xmm1,(__m128) xmm2);
267 /* 18 instructions currently */
268 return xmm1;
273 #define GMX_MM_SINCOS_PS(x,sinval,cosval) \
275 const __m128 _sincosf_two_over_pi = {2.0/M_PI,2.0/M_PI,2.0/M_PI,2.0/M_PI}; \
276 const __m128 _sincosf_half = {0.5,0.5,0.5,0.5}; \
277 const __m128 _sincosf_one = {1.0,1.0,1.0,1.0}; \
279 const __m128i _sincosf_izero = _mm_set1_epi32(0); \
280 const __m128i _sincosf_ione = _mm_set1_epi32(1); \
281 const __m128i _sincosf_itwo = _mm_set1_epi32(2); \
282 const __m128i _sincosf_ithree = _mm_set1_epi32(3); \
284 const __m128 _sincosf_kc1 = {1.57079625129,1.57079625129,1.57079625129,1.57079625129}; \
285 const __m128 _sincosf_kc2 = {7.54978995489e-8,7.54978995489e-8,7.54978995489e-8,7.54978995489e-8}; \
286 const __m128 _sincosf_cc0 = {-0.0013602249,-0.0013602249,-0.0013602249,-0.0013602249}; \
287 const __m128 _sincosf_cc1 = {0.0416566950,0.0416566950,0.0416566950,0.0416566950}; \
288 const __m128 _sincosf_cc2 = {-0.4999990225,-0.4999990225,-0.4999990225,-0.4999990225}; \
289 const __m128 _sincosf_sc0 = {-0.0001950727,-0.0001950727,-0.0001950727,-0.0001950727}; \
290 const __m128 _sincosf_sc1 = {0.0083320758,0.0083320758,0.0083320758,0.0083320758}; \
291 const __m128 _sincosf_sc2 = {-0.1666665247,-0.1666665247,-0.1666665247,-0.1666665247}; \
293 __m128 _sincosf_signbit = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) ); \
294 __m128 _sincosf_tiny = gmx_mm_castsi128_ps( _mm_set1_epi32(0x3e400000) ); \
296 __m128 _sincosf_xl; \
297 __m128 _sincosf_xl2; \
298 __m128 _sincosf_xl3; \
299 __m128 _sincosf_qf; \
300 __m128 _sincosf_absxl; \
301 __m128 _sincosf_p1; \
302 __m128 _sincosf_cx; \
303 __m128 _sincosf_sx; \
304 __m128 _sincosf_ts; \
305 __m128 _sincosf_tc; \
306 __m128 _sincosf_tsn; \
307 __m128 _sincosf_tcn; \
308 __m128i _sincosf_q; \
309 __m128i _sincosf_offsetSin; \
310 __m128i _sincosf_offsetCos; \
311 __m128 _sincosf_sinMask; \
312 __m128 _sincosf_cosMask; \
313 __m128 _sincosf_isTiny; \
314 __m128 _sincosf_ct0; \
315 __m128 _sincosf_ct1; \
316 __m128 _sincosf_ct2; \
317 __m128 _sincosf_st1; \
318 __m128 _sincosf_st2; \
320 _sincosf_xl = _mm_mul_ps(x,_sincosf_two_over_pi); \
322 _sincosf_xl = _mm_add_ps(_sincosf_xl,_mm_or_ps(_mm_and_ps(_sincosf_xl,_sincosf_signbit),_sincosf_half)); \
324 _sincosf_q = _mm_cvttps_epi32(_sincosf_xl); \
325 _sincosf_qf = _mm_cvtepi32_ps(_sincosf_q); \
327 _sincosf_offsetSin = _mm_and_si128(_sincosf_q,_sincosf_ithree); \
328 _sincosf_offsetCos = _mm_add_epi32(_sincosf_offsetSin,_sincosf_ione); \
330 _sincosf_p1 = _mm_mul_ps(_sincosf_qf,_sincosf_kc1); \
331 _sincosf_xl = _mm_mul_ps(_sincosf_qf,_sincosf_kc2); \
332 _sincosf_p1 = _mm_sub_ps(x,_sincosf_p1); \
333 _sincosf_xl = _mm_sub_ps(_sincosf_p1,_sincosf_xl); \
335 _sincosf_absxl = _mm_andnot_ps(_sincosf_signbit,_sincosf_xl); \
336 _sincosf_isTiny = _mm_cmpgt_ps(_sincosf_tiny,_sincosf_absxl); \
338 _sincosf_xl2 = _mm_mul_ps(_sincosf_xl,_sincosf_xl); \
339 _sincosf_xl3 = _mm_mul_ps(_sincosf_xl2,_sincosf_xl); \
341 _sincosf_ct1 = _mm_mul_ps(_sincosf_cc0,_sincosf_xl2); \
342 _sincosf_ct1 = _mm_add_ps(_sincosf_ct1,_sincosf_cc1); \
343 _sincosf_st1 = _mm_mul_ps(_sincosf_sc0,_sincosf_xl2); \
344 _sincosf_st1 = _mm_add_ps(_sincosf_st1,_sincosf_sc1); \
345 _sincosf_ct2 = _mm_mul_ps(_sincosf_ct1,_sincosf_xl2); \
346 _sincosf_ct2 = _mm_add_ps(_sincosf_ct2,_sincosf_cc2); \
347 _sincosf_st2 = _mm_mul_ps(_sincosf_st1,_sincosf_xl2); \
348 _sincosf_st2 = _mm_add_ps(_sincosf_st2,_sincosf_sc2); \
350 _sincosf_cx = _mm_mul_ps(_sincosf_ct2,_sincosf_xl2); \
351 _sincosf_cx = _mm_add_ps(_sincosf_cx,_sincosf_one); \
353 _sincosf_sx = _mm_mul_ps(_sincosf_st2,_sincosf_xl3); \
354 _sincosf_sx = _mm_add_ps(_sincosf_sx,_sincosf_xl); \
356 _sincosf_sinMask = gmx_mm_castsi128_ps( _mm_cmpeq_epi32( _mm_and_si128(_sincosf_offsetSin,_sincosf_ione), _sincosf_izero) ); \
357 _sincosf_cosMask = gmx_mm_castsi128_ps( _mm_cmpeq_epi32( _mm_and_si128(_sincosf_offsetCos,_sincosf_ione), _sincosf_izero) ); \
359 _sincosf_ts = _mm_or_ps( _mm_and_ps(_sincosf_sinMask,_sincosf_sx) , _mm_andnot_ps(_sincosf_sinMask,_sincosf_cx) ); \
360 _sincosf_tc = _mm_or_ps( _mm_and_ps(_sincosf_cosMask,_sincosf_sx) , _mm_andnot_ps(_sincosf_cosMask,_sincosf_cx) ); \
362 _sincosf_sinMask = gmx_mm_castsi128_ps( _mm_cmpeq_epi32( _mm_and_si128(_sincosf_offsetSin,_sincosf_itwo), _sincosf_izero) );\
363 _sincosf_tsn = _mm_xor_ps(_sincosf_signbit,_sincosf_ts); \
364 _sincosf_ts = _mm_or_ps( _mm_and_ps(_sincosf_sinMask,_sincosf_ts) , _mm_andnot_ps(_sincosf_sinMask,_sincosf_tsn) ); \
366 _sincosf_cosMask = gmx_mm_castsi128_ps( _mm_cmpeq_epi32( _mm_and_si128(_sincosf_offsetCos,_sincosf_itwo), _sincosf_izero) ); \
367 _sincosf_tcn = _mm_xor_ps(_sincosf_signbit,_sincosf_tc); \
368 _sincosf_tc = _mm_or_ps( _mm_and_ps(_sincosf_cosMask,_sincosf_tc) , _mm_andnot_ps(_sincosf_cosMask,_sincosf_tcn) ); \
370 sinval = _sincosf_ts; \
371 cosval = _sincosf_tc; \
376 /* Load a single value from 1-4 places, merge into xmm register */
378 #define GMX_MM_LOAD_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,xmm1) \
380 __m128 _txmm2,_txmm3,_txmm4; \
381 xmm1 = _mm_load_ss(ptr1); \
382 _txmm2 = _mm_load_ss(ptr2); \
383 _txmm3 = _mm_load_ss(ptr3); \
384 _txmm4 = _mm_load_ss(ptr4); \
385 xmm1 = _mm_unpacklo_ps(xmm1,_txmm3); \
386 _txmm2 = _mm_unpacklo_ps(_txmm2,_txmm4); \
387 xmm1 = _mm_unpacklo_ps(xmm1,_txmm2); \
391 #define GMX_MM_LOAD_3VALUES_PS(ptr1,ptr2,ptr3,xmm1) \
393 __m128 _txmm2,_txmm3; \
394 xmm1 = _mm_load_ss(ptr1); \
395 _txmm2 = _mm_load_ss(ptr2); \
396 _txmm3 = _mm_load_ss(ptr3); \
397 xmm1 = _mm_unpacklo_ps(xmm1,_txmm3); \
398 xmm1 = _mm_unpacklo_ps(xmm1,_txmm2); \
402 #define GMX_MM_LOAD_2VALUES_PS(ptr1,ptr2,xmm1) \
404 __m128 _txmm2; \
405 xmm1 = _mm_load_ss(ptr1); \
406 _txmm2 = _mm_load_ss(ptr2); \
407 xmm1 = _mm_unpacklo_ps(xmm1,_txmm2); \
411 #define GMX_MM_LOAD_1VALUE_PS(ptr1,xmm1) \
413 xmm1 = _mm_load_ss(ptr1); \
416 /* Store data in an xmm register into 1-4 different places */
417 #define GMX_MM_STORE_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,xmm1) \
419 __m128 _txmm2,_txmm3,_txmm4; \
420 _txmm3 = _mm_movehl_ps(_mm_setzero_ps(),xmm1); \
421 _txmm2 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(1,1,1,1)); \
422 _txmm4 = _mm_shuffle_ps(_txmm3,_txmm3,_MM_SHUFFLE(1,1,1,1)); \
423 _mm_store_ss(ptr1,xmm1); \
424 _mm_store_ss(ptr2,_txmm2); \
425 _mm_store_ss(ptr3,_txmm3); \
426 _mm_store_ss(ptr4,_txmm4); \
430 #define GMX_MM_STORE_3VALUES_PS(ptr1,ptr2,ptr3,xmm1) \
432 __m128 _txmm2,_txmm3; \
433 _txmm3 = _mm_movehl_ps(_mm_setzero_ps(),xmm1); \
434 _txmm2 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(1,1,1,1)); \
435 _mm_store_ss(ptr1,xmm1); \
436 _mm_store_ss(ptr2,_txmm2); \
437 _mm_store_ss(ptr3,_txmm3); \
441 #define GMX_MM_STORE_2VALUES_PS(ptr1,ptr2,xmm1) \
443 __m128 _txmm2; \
444 _txmm2 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(1,1,1,1)); \
445 _mm_store_ss(ptr1,xmm1); \
446 _mm_store_ss(ptr2,_txmm2); \
450 #define GMX_MM_STORE_1VALUE_PS(ptr1,xmm1) \
452 _mm_store_ss(ptr1,xmm1); \
456 /* Similar to store, but increments value in memory */
457 #define GMX_MM_INCREMENT_8VALUES_PS(ptr1,ptr2,ptr3,ptr4,ptr5,ptr6,ptr7,ptr8,xmm1,xmm2) \
459 __m128 _tincr1,_tincr2; \
460 GMX_MM_LOAD_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,_tincr1); \
461 GMX_MM_LOAD_4VALUES_PS(ptr5,ptr6,ptr7,ptr8,_tincr2); \
462 _tincr1 = _mm_add_ps(_tincr1,xmm1); \
463 _tincr2 = _mm_add_ps(_tincr2,xmm2); \
464 GMX_MM_STORE_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,_tincr1); \
465 GMX_MM_STORE_4VALUES_PS(ptr5,ptr6,ptr7,ptr8,_tincr2); \
468 #define GMX_MM_INCREMENT_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,xmm1) \
470 __m128 _tincr; \
471 GMX_MM_LOAD_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,_tincr); \
472 _tincr = _mm_add_ps(_tincr,xmm1); \
473 GMX_MM_STORE_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,_tincr); \
476 #define GMX_MM_INCREMENT_3VALUES_PS(ptr1,ptr2,ptr3,xmm1) \
478 __m128 _tincr; \
479 GMX_MM_LOAD_3VALUES_PS(ptr1,ptr2,ptr3,_tincr); \
480 _tincr = _mm_add_ps(_tincr,xmm1); \
481 GMX_MM_STORE_3VALUES_PS(ptr1,ptr2,ptr3,_tincr); \
484 #define GMX_MM_INCREMENT_2VALUES_PS(ptr1,ptr2,xmm1) \
486 __m128 _tincr; \
487 GMX_MM_LOAD_2VALUES_PS(ptr1,ptr2,_tincr); \
488 _tincr = _mm_add_ps(_tincr,xmm1); \
489 GMX_MM_STORE_2VALUES_PS(ptr1,ptr2,_tincr); \
492 #define GMX_MM_INCREMENT_1VALUE_PS(ptr1,xmm1) \
494 __m128 _tincr; \
495 GMX_MM_LOAD_1VALUE_PS(ptr1,_tincr); \
496 _tincr = _mm_add_ss(_tincr,xmm1); \
497 GMX_MM_STORE_1VALUE_PS(ptr1,_tincr); \
502 /* Routines to load pairs from 1-4 places, put in two separate xmm registers. Useful to load LJ parameters! */
503 #define GMX_MM_LOAD_4PAIRS_PS(ptr1,ptr2,ptr3,ptr4,c6,c12) \
505 __m128 _tmp1,_tmp2,_tmp3,_tmp4; \
506 _tmp1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1)); \
507 _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr2)); \
508 _tmp3 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr3)); \
509 _tmp4 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr4)); \
510 _tmp1 = _mm_unpacklo_ps(_tmp1,_tmp3); \
511 _tmp2 = _mm_unpacklo_ps(_tmp2,_tmp4); \
512 c6 = _mm_unpacklo_ps(_tmp1,_tmp2); \
513 c12 = _mm_unpackhi_ps(_tmp1,_tmp2); \
516 #define GMX_MM_LOAD_3PAIRS_PS(ptr1,ptr2,ptr3,c6,c12) \
518 __m128 _tmp1,_tmp2,_tmp3; \
519 _tmp1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1)); \
520 _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr2)); \
521 _tmp3 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr3)); \
522 _tmp1 = _mm_unpacklo_ps(_tmp1,_tmp3); \
523 _tmp2 = _mm_unpacklo_ps(_tmp2,_mm_setzero_ps()); \
524 c6 = _mm_unpacklo_ps(_tmp1,_tmp2); \
525 c12 = _mm_unpackhi_ps(_tmp1,_tmp2); \
529 #define GMX_MM_LOAD_2PAIRS_PS(ptr1,ptr2,c6,c12) \
531 __m128 _tmp1,_tmp2; \
532 _tmp1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1)); \
533 _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr2)); \
534 c6 = _mm_unpacklo_ps(_tmp1,_tmp2); \
535 c12 = _mm_movehl_ps(c12,c6); \
538 #define GMX_MM_LOAD_1PAIR_PS(ptr1,c6,c12) \
540 c6 = _mm_load_ss(ptr1); \
541 c12 = _mm_load_ss(ptr1+1); \
545 /* Routines to load 1-4 rvecs from 1-4 places.
546 * We mainly use these to load coordinates. The extra routines
547 * are very efficient for the water-water loops, since we e.g.
548 * know that a TIP4p water has 4 atoms, so we should load 12 floats+shuffle.
550 #define GMX_MM_LOAD_1RVEC_1POINTER_PS(ptr1,jx1,jy1,jz1) { \
551 jx1 = _mm_load_ss(ptr1); \
552 jy1 = _mm_load_ss((ptr1)+1); \
553 jz1 = _mm_load_ss((ptr1)+2); \
556 #define GMX_MM_LOAD_2RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2) { \
557 jx1 = _mm_load_ss(ptr1); \
558 jy1 = _mm_load_ss((ptr1)+1); \
559 jz1 = _mm_load_ss((ptr1)+2); \
560 jx2 = _mm_load_ss((ptr1)+3); \
561 jy2 = _mm_load_ss((ptr1)+4); \
562 jz2 = _mm_load_ss((ptr1)+5); \
566 #define GMX_MM_LOAD_3RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
567 jx1 = _mm_load_ss(ptr1); \
568 jy1 = _mm_load_ss((ptr1)+1); \
569 jz1 = _mm_load_ss((ptr1)+2); \
570 jx2 = _mm_load_ss((ptr1)+3); \
571 jy2 = _mm_load_ss((ptr1)+4); \
572 jz2 = _mm_load_ss((ptr1)+5); \
573 jx3 = _mm_load_ss((ptr1)+6); \
574 jy3 = _mm_load_ss((ptr1)+7); \
575 jz3 = _mm_load_ss((ptr1)+8); \
579 #define GMX_MM_LOAD_4RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
580 jx1 = _mm_load_ss(ptr1); \
581 jy1 = _mm_load_ss((ptr1)+1); \
582 jz1 = _mm_load_ss((ptr1)+2); \
583 jx2 = _mm_load_ss((ptr1)+3); \
584 jy2 = _mm_load_ss((ptr1)+4); \
585 jz2 = _mm_load_ss((ptr1)+5); \
586 jx3 = _mm_load_ss((ptr1)+6); \
587 jy3 = _mm_load_ss((ptr1)+7); \
588 jz3 = _mm_load_ss((ptr1)+8); \
589 jx4 = _mm_load_ss((ptr1)+9); \
590 jy4 = _mm_load_ss((ptr1)+10); \
591 jz4 = _mm_load_ss((ptr1)+11); \
595 #define GMX_MM_LOAD_1RVEC_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1) { \
596 __m128 _tmp1,_tmp2; \
597 _tmp1 = _mm_load_ss(ptr1); \
598 _tmp2 = _mm_load_ss(ptr2); \
599 _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)(ptr1+1)); \
600 _tmp2 = _mm_loadh_pi(_tmp2,(__m64 *)(ptr2+1)); \
601 jx1 = _mm_unpacklo_ps(_tmp1,_tmp2); \
602 jy1 = _mm_unpackhi_ps(_tmp1,_tmp2); \
603 jx1 = _mm_unpacklo_ps(_tmp1,_tmp2); \
604 jz1 = _mm_movehl_ps(jz1,jy1); \
607 #define GMX_MM_LOAD_2RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2) { \
608 __m128 _tmp1, _tmp2; \
609 _tmp1 = _mm_loadu_ps(ptr1); \
610 jy1 = _mm_loadu_ps(ptr2); \
611 jy2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4)); \
612 _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr2+4)); \
613 jx1 = _mm_unpacklo_ps(_tmp1,jy1); \
614 jz1 = _mm_unpackhi_ps(_tmp1,jy1); \
615 jy2 = _mm_unpacklo_ps(jy2,_tmp2); \
616 jy1 = _mm_movehl_ps(jx1,jx1); \
617 jx2 = _mm_movehl_ps(jz1,jz1); \
618 jz2 = _mm_movehl_ps(jy2,jy2); \
622 #define GMX_MM_LOAD_3RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
623 __m128 _tmp1, _tmp2, _tmp3; \
624 _tmp1 = _mm_loadu_ps(ptr1); \
625 jy1 = _mm_loadu_ps(ptr2); \
626 _tmp2 = _mm_loadu_ps(ptr1+4); \
627 jz2 = _mm_loadu_ps(ptr2+4); \
628 jz3 = _mm_load_ss(ptr1+8); \
629 _tmp3 = _mm_load_ss(ptr2+8); \
630 jx1 = _mm_unpacklo_ps(_tmp1,jy1); \
631 jz1 = _mm_unpackhi_ps(_tmp1,jy1); \
632 jy2 = _mm_unpacklo_ps(_tmp2,jz2); \
633 jx3 = _mm_unpackhi_ps(_tmp2,jz2); \
634 jy1 = _mm_movehl_ps(jx1,jx1); \
635 jx2 = _mm_movehl_ps(jz1,jz1); \
636 jz2 = _mm_movehl_ps(jy2,jy2); \
637 jy3 = _mm_movehl_ps(jx3,jx3); \
638 jz3 = _mm_unpacklo_ps(jz3,_tmp3); \
642 #define GMX_MM_LOAD_4RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
643 __m128 _tmp1, _tmp2, _tmp3,_tmp4; \
644 _tmp1 = _mm_loadu_ps(ptr1); \
645 jy1 = _mm_loadu_ps(ptr2); \
646 _tmp2 = _mm_loadu_ps(ptr1+4); \
647 jz2 = _mm_loadu_ps(ptr2+4); \
648 _tmp3 = _mm_loadu_ps(ptr1+8); \
649 _tmp4 = _mm_loadu_ps(ptr2+8); \
650 jx1 = _mm_unpacklo_ps(_tmp1,jy1); \
651 jz1 = _mm_unpackhi_ps(_tmp1,jy1); \
652 jy2 = _mm_unpacklo_ps(_tmp2,jz2); \
653 jx3 = _mm_unpackhi_ps(_tmp2,jz2); \
654 jz3 = _mm_unpacklo_ps(_tmp3,_tmp4); \
655 jy4 = _mm_unpackhi_ps(_tmp3,_tmp4); \
656 jy1 = _mm_movehl_ps(jx1,jx1); \
657 jx2 = _mm_movehl_ps(jz1,jz1); \
658 jz2 = _mm_movehl_ps(jy2,jy2); \
659 jy3 = _mm_movehl_ps(jx3,jx3); \
660 jx4 = _mm_movehl_ps(jz3,jz3); \
661 jz4 = _mm_movehl_ps(jy4,jy4); \
665 #define GMX_MM_LOAD_1RVEC_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1) { \
666 __m128 _tmp1,_tmp3,_tmp4; \
667 jx1 = _mm_load_ss(ptr1); \
668 jy1 = _mm_load_ss(ptr2); \
669 jz1 = _mm_load_ss(ptr3); \
670 jx1 = _mm_loadh_pi(jx1,(__m64 *)(ptr1+1)); \
671 jy1 = _mm_loadh_pi(jy1,(__m64 *)(ptr2+1)); \
672 jz1 = _mm_loadh_pi(jz1,(__m64 *)(ptr3+1)); \
673 _tmp1 = _mm_unpacklo_ps(jx1,jy1); \
674 _tmp3 = _mm_unpackhi_ps(jx1,jy1); \
675 _tmp4 = _mm_unpackhi_ps(jz1,jz1); \
676 jx1 = _mm_movelh_ps(_tmp1,jz1); \
677 jy1 = _mm_movelh_ps(_tmp3,_tmp4); \
678 jz1 = _mm_movehl_ps(_tmp4,_tmp3); \
682 #define GMX_MM_LOAD_2RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2) { \
683 __m128 _tmp1, _tmp2; \
684 jx1 = _mm_loadu_ps(ptr1); \
685 jy1 = _mm_loadu_ps(ptr2); \
686 jz1 = _mm_loadu_ps(ptr3); \
687 jx2 = _mm_setzero_ps(); \
688 _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2); \
689 _tmp1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4)); \
690 jz2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr2+4)); \
691 _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr3+4)); \
692 _tmp1 = _mm_unpacklo_ps(_tmp1,_tmp2); \
693 jz2 = _mm_unpacklo_ps(jz2,_mm_setzero_ps()); \
694 jy2 = _mm_unpacklo_ps(_tmp1,jz2); \
695 jz2 = _mm_unpackhi_ps(_tmp1,jz2); \
699 #define GMX_MM_LOAD_3RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
700 __m128 _tmp1, _tmp2; \
701 jx1 = _mm_loadu_ps(ptr1); \
702 jy1 = _mm_loadu_ps(ptr2); \
703 jz1 = _mm_loadu_ps(ptr3); \
704 jx2 = _mm_setzero_ps(); \
705 _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2); \
706 jy2 = _mm_loadu_ps(ptr1+4); \
707 jz2 = _mm_loadu_ps(ptr2+4); \
708 jx3 = _mm_loadu_ps(ptr3+4); \
709 jy3 = _mm_setzero_ps(); \
710 _MM_TRANSPOSE4_PS(jy2,jz2,jx3,jy3); \
711 jz3 = _mm_load_ss(ptr1+8); \
712 _tmp1 = _mm_load_ss(ptr2+8); \
713 _tmp2 = _mm_load_ss(ptr3+8); \
714 jz3 = _mm_unpacklo_ps(jz3,_tmp2); \
715 _tmp1 = _mm_unpacklo_ps(_tmp1,_mm_setzero_ps()); \
716 jz3 = _mm_unpacklo_ps(jz3,_tmp1); \
720 #define GMX_MM_LOAD_4RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
721 jx1 = _mm_loadu_ps(ptr1); \
722 jy1 = _mm_loadu_ps(ptr2); \
723 jz1 = _mm_loadu_ps(ptr3); \
724 jx2 = _mm_setzero_ps(); \
725 _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2); \
726 jy2 = _mm_loadu_ps(ptr1+4); \
727 jz2 = _mm_loadu_ps(ptr2+4); \
728 jx3 = _mm_loadu_ps(ptr3+4); \
729 jy3 = _mm_setzero_ps(); \
730 _MM_TRANSPOSE4_PS(jy2,jz2,jx3,jy3); \
731 jz3 = _mm_loadu_ps(ptr1+8); \
732 jx4 = _mm_loadu_ps(ptr2+8); \
733 jy4 = _mm_loadu_ps(ptr3+8); \
734 jz4 = _mm_setzero_ps(); \
735 _MM_TRANSPOSE4_PS(jz3,jx4,jy4,jz4); \
740 #define GMX_MM_LOAD_1RVEC_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1) { \
741 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5; \
742 jx1 = _mm_load_ss(ptr1); \
743 _tmp1 = _mm_load_ss(ptr2); \
744 jy1 = _mm_load_ss(ptr3); \
745 jz1 = _mm_load_ss(ptr4); \
746 jx1 = _mm_loadh_pi(jx1,(__m64 *)(ptr1+1)); \
747 _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)(ptr2+1)); \
748 jy1 = _mm_loadh_pi(jy1,(__m64 *)(ptr3+1)); \
749 jz1 = _mm_loadh_pi(jz1,(__m64 *)(ptr4+1)); \
750 _tmp2 = _mm_unpacklo_ps(jx1,_tmp1); \
751 _tmp3 = _mm_unpacklo_ps(jy1,jz1); \
752 _tmp4 = _mm_unpackhi_ps(jx1,_tmp1); \
753 _tmp5 = _mm_unpackhi_ps(jy1,jz1); \
754 jx1 = _mm_movelh_ps(_tmp2,_tmp3); \
755 jy1 = _mm_movelh_ps(_tmp4,_tmp5); \
756 jz1 = _mm_movehl_ps(_tmp5,_tmp4); \
760 #define GMX_MM_LOAD_2RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2) { \
761 __m128 _tmp1, _tmp2; \
762 jx1 = _mm_loadu_ps(ptr1); \
763 jy1 = _mm_loadu_ps(ptr2); \
764 jz1 = _mm_loadu_ps(ptr3); \
765 jx2 = _mm_loadu_ps(ptr4); \
766 _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2); \
767 jy2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4)); \
768 jz2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr2+4)); \
769 _tmp1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr3+4)); \
770 _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr4+4)); \
771 _tmp1 = _mm_unpacklo_ps(jy2,_tmp1); \
772 _tmp2 = _mm_unpacklo_ps(jz2,_tmp2); \
773 jy2 = _mm_unpacklo_ps(_tmp1,_tmp2); \
774 jz2 = _mm_unpackhi_ps(_tmp1,_tmp2); \
778 #define GMX_MM_LOAD_3RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
779 __m128 _tmp1, _tmp2, _tmp3; \
780 jx1 = _mm_loadu_ps(ptr1); \
781 jy1 = _mm_loadu_ps(ptr2); \
782 jz1 = _mm_loadu_ps(ptr3); \
783 jx2 = _mm_loadu_ps(ptr4); \
784 _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2); \
785 jy2 = _mm_loadu_ps(ptr1+4); \
786 jz2 = _mm_loadu_ps(ptr2+4); \
787 jx3 = _mm_loadu_ps(ptr3+4); \
788 jy3 = _mm_loadu_ps(ptr4+4); \
789 _MM_TRANSPOSE4_PS(jy2,jz2,jx3,jy3); \
790 jz3 = _mm_load_ss(ptr1+8); \
791 _tmp1 = _mm_load_ss(ptr2+8); \
792 _tmp2 = _mm_load_ss(ptr3+8); \
793 _tmp3 = _mm_load_ss(ptr4+8); \
794 jz3 = _mm_unpacklo_ps(jz3,_tmp2); \
795 _tmp1 = _mm_unpacklo_ps(_tmp1,_tmp3); \
796 jz3 = _mm_unpacklo_ps(jz3,_tmp1); \
800 #define GMX_MM_LOAD_4RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
801 jx1 = _mm_loadu_ps(ptr1); \
802 jy1 = _mm_loadu_ps(ptr2); \
803 jz1 = _mm_loadu_ps(ptr3); \
804 jx2 = _mm_loadu_ps(ptr4); \
805 _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2); \
806 jy2 = _mm_loadu_ps(ptr1+4); \
807 jz2 = _mm_loadu_ps(ptr2+4); \
808 jx3 = _mm_loadu_ps(ptr3+4); \
809 jy3 = _mm_loadu_ps(ptr4+4); \
810 _MM_TRANSPOSE4_PS(jy2,jz2,jx3,jy3); \
811 jz3 = _mm_loadu_ps(ptr1+8); \
812 jx4 = _mm_loadu_ps(ptr2+8); \
813 jy4 = _mm_loadu_ps(ptr3+8); \
814 jz4 = _mm_loadu_ps(ptr4+8); \
815 _MM_TRANSPOSE4_PS(jz3,jx4,jy4,jz4); \
819 /* Routines to increment rvecs in memory, typically use for j particle force updates */
820 #define GMX_MM_INCREMENT_1RVEC_1POINTER_PS(ptr1,jx1,jy1,jz1) { \
821 __m128 _tmp1; \
822 jy1 = _mm_unpacklo_ps(jy1,jz1); \
823 jx1 = _mm_movelh_ps(jx1,jy1); \
824 _tmp1 = _mm_load_ss(ptr1); \
825 _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)(ptr1+1)); \
826 _tmp1 = _mm_add_ps(_tmp1,jx1); \
827 _mm_store_ss(ptr1,_tmp1); \
828 _mm_storeh_pi((__m64 *)(ptr1+1),_tmp1); \
832 #define GMX_MM_INCREMENT_2RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2) { \
833 __m128 _tmp1, _tmp2; \
834 _tmp1 = _mm_loadu_ps(ptr1); \
835 _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4)); \
836 jx1 = _mm_unpacklo_ps(jx1,jy1); \
837 jz1 = _mm_unpacklo_ps(jz1,jx2); \
838 jy2 = _mm_unpacklo_ps(jy2,jz2); \
839 jx1 = _mm_movelh_ps(jx1,jz1); \
840 _tmp1 = _mm_add_ps(_tmp1,jx1); \
841 _tmp2 = _mm_add_ps(_tmp2,jy2); \
842 _mm_storeu_ps(ptr1,_tmp1); \
843 _mm_storel_pi((__m64 *)(ptr1+4),_tmp2); \
847 #define GMX_MM_INCREMENT_3RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
848 __m128 _tmp1, _tmp2, _tmp3; \
849 _tmp1 = _mm_loadu_ps(ptr1); \
850 _tmp2 = _mm_loadu_ps(ptr1+4); \
851 _tmp3 = _mm_load_ss(ptr1+8); \
852 jx1 = _mm_unpacklo_ps(jx1,jy1); \
853 jz1 = _mm_unpacklo_ps(jz1,jx2); \
854 jy2 = _mm_unpacklo_ps(jy2,jz2); \
855 jx3 = _mm_unpacklo_ps(jx3,jy3); \
856 jx1 = _mm_movelh_ps(jx1,jz1); \
857 jy2 = _mm_movelh_ps(jy2,jx3); \
858 _tmp1 = _mm_add_ps(_tmp1,jx1); \
859 _tmp2 = _mm_add_ps(_tmp2,jy2); \
860 _tmp3 = _mm_add_ss(_tmp3,jz3); \
861 _mm_storeu_ps(ptr1,_tmp1); \
862 _mm_storeu_ps(ptr1+4,_tmp2); \
863 _mm_store_ss(ptr1+8,_tmp3); \
867 #define GMX_MM_INCREMENT_4RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
868 __m128 _tmp1, _tmp2, _tmp3; \
869 _tmp1 = _mm_loadu_ps(ptr1); \
870 _tmp2 = _mm_loadu_ps(ptr1+4); \
871 _tmp3 = _mm_loadu_ps(ptr1+8); \
872 jx1 = _mm_unpacklo_ps(jx1,jy1); \
873 jz1 = _mm_unpacklo_ps(jz1,jx2); \
874 jy2 = _mm_unpacklo_ps(jy2,jz2); \
875 jx3 = _mm_unpacklo_ps(jx3,jy3); \
876 jz3 = _mm_unpacklo_ps(jz3,jx4); \
877 jy4 = _mm_unpacklo_ps(jy4,jz4); \
878 jx1 = _mm_movelh_ps(jx1,jz1); \
879 jy2 = _mm_movelh_ps(jy2,jx3); \
880 jz3 = _mm_movelh_ps(jz3,jy4); \
881 _tmp1 = _mm_add_ps(_tmp1,jx1); \
882 _tmp2 = _mm_add_ps(_tmp2,jy2); \
883 _tmp3 = _mm_add_ps(_tmp3,jz3); \
884 _mm_storeu_ps(ptr1,_tmp1); \
885 _mm_storeu_ps(ptr1+4,_tmp2); \
886 _mm_storeu_ps(ptr1+8,_tmp3); \
890 #define GMX_MM_INCREMENT_1RVEC_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1) { \
891 __m128 _tmp1,_tmp2,_tmp3,_tmp4; \
892 _tmp1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1)); \
893 _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)(ptr2)); \
894 _tmp2 = _mm_load_ss(ptr1+2); \
895 _tmp3 = _mm_load_ss(ptr2+2); \
896 jx1 = _mm_unpacklo_ps(jx1,jy1); \
897 _tmp4 = _mm_shuffle_ps(jz1,jz1,_MM_SHUFFLE(0,0,0,1)); \
898 _tmp1 = _mm_add_ps(_tmp1,jx1); \
899 _mm_storel_pi((__m64 *)(ptr1),_tmp1); \
900 _mm_storeh_pi((__m64 *)(ptr2),_tmp1); \
901 _mm_store_ss(ptr1+2,_mm_add_ss(_tmp2,jz1)); \
902 _mm_store_ss(ptr2+2,_mm_add_ss(_tmp3,_tmp4)); \
906 #define GMX_MM_INCREMENT_2RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2) { \
907 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5; \
908 _tmp1 = _mm_loadu_ps(ptr1); \
909 _tmp2 = _mm_loadu_ps(ptr2); \
910 _tmp3 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4)); \
911 _tmp3 = _mm_loadh_pi(_tmp3,(__m64 *)(ptr2+4)); \
912 jx1 = _mm_unpacklo_ps(jx1,jy1); \
913 jz1 = _mm_unpacklo_ps(jz1,jx2); \
914 jy2 = _mm_unpacklo_ps(jy2,jz2); \
915 _tmp4 = _mm_movelh_ps(jx1,jz1); \
916 _tmp5 = _mm_movehl_ps(jz1,jx1); \
917 _tmp1 = _mm_add_ps(_tmp1,_tmp4); \
918 _tmp2 = _mm_add_ps(_tmp2,_tmp5); \
919 _tmp3 = _mm_add_ps(_tmp3,jy2); \
920 _mm_storeu_ps(ptr1,_tmp1); \
921 _mm_storeu_ps(ptr2,_tmp2); \
922 _mm_storel_pi((__m64 *)(ptr1+4),_tmp3); \
923 _mm_storeh_pi((__m64 *)(ptr2+4),_tmp3); \
927 #define GMX_MM_INCREMENT_3RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
928 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11; \
929 _tmp1 = _mm_loadu_ps(ptr1); \
930 _tmp2 = _mm_loadu_ps(ptr1+4); \
931 _tmp3 = _mm_load_ss(ptr1+8); \
932 _tmp4 = _mm_loadu_ps(ptr2); \
933 _tmp5 = _mm_loadu_ps(ptr2+4); \
934 _tmp6 = _mm_load_ss(ptr2+8); \
935 jx1 = _mm_unpacklo_ps(jx1,jy1); \
936 jz1 = _mm_unpacklo_ps(jz1,jx2); \
937 jy2 = _mm_unpacklo_ps(jy2,jz2); \
938 jx3 = _mm_unpacklo_ps(jx3,jy3); \
939 _tmp7 = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1)); \
940 _tmp8 = _mm_movelh_ps(jx1,jz1); \
941 _tmp9 = _mm_movehl_ps(jz1,jx1); \
942 _tmp10 = _mm_movelh_ps(jy2,jx3); \
943 _tmp11 = _mm_movehl_ps(jx3,jy2); \
944 _tmp1 = _mm_add_ps(_tmp1,_tmp8); \
945 _tmp2 = _mm_add_ps(_tmp2,_tmp10); \
946 _tmp3 = _mm_add_ss(_tmp3,jz3); \
947 _tmp4 = _mm_add_ps(_tmp4,_tmp9); \
948 _tmp5 = _mm_add_ps(_tmp5,_tmp11); \
949 _tmp6 = _mm_add_ss(_tmp6,_tmp7); \
950 _mm_storeu_ps(ptr1,_tmp1); \
951 _mm_storeu_ps(ptr1+4,_tmp2); \
952 _mm_store_ss(ptr1+8,_tmp3); \
953 _mm_storeu_ps(ptr2,_tmp4); \
954 _mm_storeu_ps(ptr2+4,_tmp5); \
955 _mm_store_ss(ptr2+8,_tmp6); \
959 #define GMX_MM_INCREMENT_4RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
960 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11,_tmp12,_tmp13; \
961 _tmp1 = _mm_loadu_ps(ptr1); \
962 _tmp2 = _mm_loadu_ps(ptr1+4); \
963 _tmp3 = _mm_loadu_ps(ptr1+8); \
964 _tmp4 = _mm_loadu_ps(ptr2); \
965 _tmp5 = _mm_loadu_ps(ptr2+4); \
966 _tmp6 = _mm_loadu_ps(ptr2+8); \
967 jx1 = _mm_unpacklo_ps(jx1,jy1); \
968 jz1 = _mm_unpacklo_ps(jz1,jx2); \
969 jy2 = _mm_unpacklo_ps(jy2,jz2); \
970 jx3 = _mm_unpacklo_ps(jx3,jy3); \
971 jz3 = _mm_unpacklo_ps(jz3,jx4); \
972 jy4 = _mm_unpacklo_ps(jy4,jz4); \
973 _tmp8 = _mm_movelh_ps(jx1,jz1); \
974 _tmp9 = _mm_movehl_ps(jz1,jx1); \
975 _tmp10 = _mm_movelh_ps(jy2,jx3); \
976 _tmp11 = _mm_movehl_ps(jx3,jy2); \
977 _tmp12 = _mm_movelh_ps(jz3,jy4); \
978 _tmp13 = _mm_movehl_ps(jy4,jz3); \
979 _tmp1 = _mm_add_ps(_tmp1,_tmp8); \
980 _tmp2 = _mm_add_ps(_tmp2,_tmp10); \
981 _tmp3 = _mm_add_ps(_tmp3,_tmp12); \
982 _tmp4 = _mm_add_ps(_tmp4,_tmp9); \
983 _tmp5 = _mm_add_ps(_tmp5,_tmp11); \
984 _tmp6 = _mm_add_ps(_tmp6,_tmp13); \
985 _mm_storeu_ps(ptr1,_tmp1); \
986 _mm_storeu_ps(ptr1+4,_tmp2); \
987 _mm_storeu_ps(ptr1+8,_tmp3); \
988 _mm_storeu_ps(ptr2,_tmp4); \
989 _mm_storeu_ps(ptr2+4,_tmp5); \
990 _mm_storeu_ps(ptr2+8,_tmp6); \
994 #define GMX_MM_INCREMENT_1RVEC_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1) { \
995 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7; \
996 _tmp1 = _mm_load_ss(ptr1); \
997 _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)(ptr1+1)); \
998 _tmp2 = _mm_load_ss(ptr2); \
999 _tmp2 = _mm_loadh_pi(_tmp2,(__m64 *)(ptr2+1)); \
1000 _tmp3 = _mm_load_ss(ptr3); \
1001 _tmp3 = _mm_loadh_pi(_tmp3,(__m64 *)(ptr3+1)); \
1002 _tmp4 = _mm_unpacklo_ps(jy1,jz1); \
1003 _tmp5 = _mm_unpackhi_ps(jy1,jz1); \
1004 _tmp6 = _mm_shuffle_ps(jx1,_tmp4,_MM_SHUFFLE(3,2,0,1)); \
1005 _tmp7 = _mm_shuffle_ps(jx1,jx1,_MM_SHUFFLE(0,0,0,2)); \
1006 jx1 = _mm_movelh_ps(jx1,_tmp4); \
1007 _tmp7 = _mm_movelh_ps(_tmp7,_tmp5); \
1008 _tmp1 = _mm_add_ps(_tmp1,jx1); \
1009 _tmp2 = _mm_add_ps(_tmp2,_tmp6); \
1010 _tmp3 = _mm_add_ps(_tmp3,_tmp7); \
1011 _mm_store_ss(ptr1,_tmp1); \
1012 _mm_storeh_pi((__m64 *)(ptr1+1),_tmp1); \
1013 _mm_store_ss(ptr2,_tmp2); \
1014 _mm_storeh_pi((__m64 *)(ptr2+1),_tmp2); \
1015 _mm_store_ss(ptr3,_tmp3); \
1016 _mm_storeh_pi((__m64 *)(ptr3+1),_tmp3); \
1020 #define GMX_MM_INCREMENT_2RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2) { \
1021 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
1022 _tmp1 = _mm_loadu_ps(ptr1); \
1023 _tmp2 = _mm_loadu_ps(ptr2); \
1024 _tmp3 = _mm_loadu_ps(ptr3); \
1025 _tmp4 = _mm_loadl_pi(_tmp4,(__m64 *)(ptr1+4)); \
1026 _tmp4 = _mm_loadh_pi(_tmp4,(__m64 *)(ptr2+4)); \
1027 _tmp5 = _mm_loadl_pi(_tmp5,(__m64 *)(ptr3+4)); \
1028 _tmp6 = _mm_unpackhi_ps(jx1,jy1); \
1029 jx1 = _mm_unpacklo_ps(jx1,jy1); \
1030 _tmp7 = _mm_unpackhi_ps(jz1,jx2); \
1031 jz1 = _mm_unpacklo_ps(jz1,jx2); \
1032 _tmp8 = _mm_unpackhi_ps(jy2,jz2); \
1033 jy2 = _mm_unpacklo_ps(jy2,jz2); \
1034 _tmp9 = _mm_movelh_ps(jx1,jz1); \
1035 _tmp10 = _mm_movehl_ps(jz1,jx1); \
1036 _tmp6 = _mm_movelh_ps(_tmp6,_tmp7); \
1037 _tmp1 = _mm_add_ps(_tmp1,_tmp9); \
1038 _tmp2 = _mm_add_ps(_tmp2,_tmp10); \
1039 _tmp3 = _mm_add_ps(_tmp3,_tmp6); \
1040 _tmp4 = _mm_add_ps(_tmp4,jy2); \
1041 _tmp5 = _mm_add_ps(_tmp5,_tmp8); \
1042 _mm_storeu_ps(ptr1,_tmp1); \
1043 _mm_storeu_ps(ptr2,_tmp2); \
1044 _mm_storeu_ps(ptr3,_tmp3); \
1045 _mm_storel_pi((__m64 *)(ptr1+4),_tmp4); \
1046 _mm_storeh_pi((__m64 *)(ptr2+4),_tmp4); \
1047 _mm_storel_pi((__m64 *)(ptr3+4),_tmp5); \
1051 #define GMX_MM_INCREMENT_3RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
1052 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
1053 __m128 _tmp11,_tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19; \
1054 _tmp1 = _mm_loadu_ps(ptr1); \
1055 _tmp2 = _mm_loadu_ps(ptr1+4); \
1056 _tmp3 = _mm_load_ss(ptr1+8); \
1057 _tmp4 = _mm_loadu_ps(ptr2); \
1058 _tmp5 = _mm_loadu_ps(ptr2+4); \
1059 _tmp6 = _mm_load_ss(ptr2+8); \
1060 _tmp7 = _mm_loadu_ps(ptr3); \
1061 _tmp8 = _mm_loadu_ps(ptr3+4); \
1062 _tmp9 = _mm_load_ss(ptr3+8); \
1063 _tmp10 = _mm_unpackhi_ps(jx1,jy1); \
1064 jx1 = _mm_unpacklo_ps(jx1,jy1); \
1065 _tmp11 = _mm_unpackhi_ps(jz1,jx2); \
1066 jz1 = _mm_unpacklo_ps(jz1,jx2); \
1067 _tmp12 = _mm_unpackhi_ps(jy2,jz2); \
1068 jy2 = _mm_unpacklo_ps(jy2,jz2); \
1069 _tmp13 = _mm_unpackhi_ps(jx3,jy3); \
1070 jx3 = _mm_unpacklo_ps(jx3,jy3); \
1071 _tmp14 = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1)); \
1072 _tmp15 = _mm_movehl_ps(jz3,jz3); \
1073 _tmp16 = _mm_movelh_ps(jx1,jz1); \
1074 _tmp17 = _mm_movehl_ps(jz1,jx1); \
1075 _tmp10 = _mm_movelh_ps(_tmp10,_tmp11); \
1076 _tmp18 = _mm_movelh_ps(jy2,jx3); \
1077 _tmp19 = _mm_movehl_ps(jx3,jy2); \
1078 _tmp12 = _mm_movelh_ps(_tmp12,_tmp13); \
1079 _tmp1 = _mm_add_ps(_tmp1,_tmp16); \
1080 _tmp2 = _mm_add_ps(_tmp2,_tmp18); \
1081 _tmp3 = _mm_add_ss(_tmp3,jz3); \
1082 _tmp4 = _mm_add_ps(_tmp4,_tmp17); \
1083 _tmp5 = _mm_add_ps(_tmp5,_tmp19); \
1084 _tmp6 = _mm_add_ss(_tmp6,_tmp14); \
1085 _tmp7 = _mm_add_ps(_tmp7,_tmp10); \
1086 _tmp8 = _mm_add_ps(_tmp8,_tmp12); \
1087 _tmp9 = _mm_add_ss(_tmp9,_tmp15); \
1088 _mm_storeu_ps(ptr1,_tmp1); \
1089 _mm_storeu_ps(ptr1+4,_tmp2); \
1090 _mm_store_ss(ptr1+8,_tmp3); \
1091 _mm_storeu_ps(ptr2,_tmp4); \
1092 _mm_storeu_ps(ptr2+4,_tmp5); \
1093 _mm_store_ss(ptr2+8,_tmp6); \
1094 _mm_storeu_ps(ptr3,_tmp7); \
1095 _mm_storeu_ps(ptr3+4,_tmp8); \
1096 _mm_store_ss(ptr3+8,_tmp9); \
1100 #define GMX_MM_INCREMENT_4RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
1101 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11; \
1102 __m128 _tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19,_tmp20,_tmp21; \
1103 _tmp1 = _mm_loadu_ps(ptr1); \
1104 _tmp2 = _mm_loadu_ps(ptr1+4); \
1105 _tmp3 = _mm_loadu_ps(ptr1+8); \
1106 _tmp4 = _mm_loadu_ps(ptr2); \
1107 _tmp5 = _mm_loadu_ps(ptr2+4); \
1108 _tmp6 = _mm_loadu_ps(ptr2+8); \
1109 _tmp7 = _mm_loadu_ps(ptr3); \
1110 _tmp8 = _mm_loadu_ps(ptr3+4); \
1111 _tmp9 = _mm_loadu_ps(ptr3+8); \
1112 _tmp10 = _mm_unpackhi_ps(jx1,jy1); \
1113 jx1 = _mm_unpacklo_ps(jx1,jy1); \
1114 _tmp11 = _mm_unpackhi_ps(jz1,jx2); \
1115 jz1 = _mm_unpacklo_ps(jz1,jx2); \
1116 _tmp12 = _mm_unpackhi_ps(jy2,jz2); \
1117 jy2 = _mm_unpacklo_ps(jy2,jz2); \
1118 _tmp13 = _mm_unpackhi_ps(jx3,jy3); \
1119 jx3 = _mm_unpacklo_ps(jx3,jy3); \
1120 _tmp14 = _mm_unpackhi_ps(jz3,jx4); \
1121 jz3 = _mm_unpacklo_ps(jz3,jx4); \
1122 _tmp15 = _mm_unpackhi_ps(jy4,jz4); \
1123 jy4 = _mm_unpacklo_ps(jy4,jz4); \
1124 _tmp16 = _mm_movelh_ps(jx1,jz1); \
1125 _tmp17 = _mm_movehl_ps(jz1,jx1); \
1126 _tmp10 = _mm_movelh_ps(_tmp10,_tmp11); \
1127 _tmp18 = _mm_movelh_ps(jy2,jx3); \
1128 _tmp19 = _mm_movehl_ps(jx3,jy2); \
1129 _tmp12 = _mm_movelh_ps(_tmp12,_tmp13); \
1130 _tmp20 = _mm_movelh_ps(jz3,jy4); \
1131 _tmp21 = _mm_movehl_ps(jy4,jz3); \
1132 _tmp14 = _mm_movelh_ps(_tmp14,_tmp15); \
1133 _tmp1 = _mm_add_ps(_tmp1,_tmp16); \
1134 _tmp2 = _mm_add_ps(_tmp2,_tmp18); \
1135 _tmp3 = _mm_add_ps(_tmp3,_tmp20); \
1136 _tmp4 = _mm_add_ps(_tmp4,_tmp17); \
1137 _tmp5 = _mm_add_ps(_tmp5,_tmp19); \
1138 _tmp6 = _mm_add_ps(_tmp6,_tmp21); \
1139 _tmp7 = _mm_add_ps(_tmp7,_tmp10); \
1140 _tmp8 = _mm_add_ps(_tmp8,_tmp12); \
1141 _tmp9 = _mm_add_ps(_tmp9,_tmp14); \
1142 _mm_storeu_ps(ptr1,_tmp1); \
1143 _mm_storeu_ps(ptr1+4,_tmp2); \
1144 _mm_storeu_ps(ptr1+8,_tmp3); \
1145 _mm_storeu_ps(ptr2,_tmp4); \
1146 _mm_storeu_ps(ptr2+4,_tmp5); \
1147 _mm_storeu_ps(ptr2+8,_tmp6); \
1148 _mm_storeu_ps(ptr3,_tmp7); \
1149 _mm_storeu_ps(ptr3+4,_tmp8); \
1150 _mm_storeu_ps(ptr3+8,_tmp9); \
1155 #define GMX_MM_INCREMENT_1RVEC_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1) { \
1156 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
1157 _tmp1 = _mm_load_ss(ptr1); \
1158 _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)(ptr1+1)); \
1159 _tmp2 = _mm_load_ss(ptr2); \
1160 _tmp2 = _mm_loadh_pi(_tmp2,(__m64 *)(ptr2+1)); \
1161 _tmp3 = _mm_load_ss(ptr3); \
1162 _tmp3 = _mm_loadh_pi(_tmp3,(__m64 *)(ptr3+1)); \
1163 _tmp4 = _mm_load_ss(ptr4); \
1164 _tmp4 = _mm_loadh_pi(_tmp4,(__m64 *)(ptr4+1)); \
1165 _tmp5 = _mm_unpacklo_ps(jy1,jz1); \
1166 _tmp6 = _mm_unpackhi_ps(jy1,jz1); \
1167 _tmp7 = _mm_shuffle_ps(jx1,_tmp5,_MM_SHUFFLE(1,0,0,0)); \
1168 _tmp8 = _mm_shuffle_ps(jx1,_tmp5,_MM_SHUFFLE(3,2,0,1)); \
1169 _tmp9 = _mm_shuffle_ps(jx1,_tmp6,_MM_SHUFFLE(1,0,0,2)); \
1170 _tmp10 = _mm_shuffle_ps(jx1,_tmp6,_MM_SHUFFLE(3,2,0,3)); \
1171 _tmp1 = _mm_add_ps(_tmp1,_tmp7); \
1172 _tmp2 = _mm_add_ps(_tmp2,_tmp8); \
1173 _tmp3 = _mm_add_ps(_tmp3,_tmp9); \
1174 _tmp4 = _mm_add_ps(_tmp4,_tmp10); \
1175 _mm_store_ss(ptr1,_tmp1); \
1176 _mm_storeh_pi((__m64 *)(ptr1+1),_tmp1); \
1177 _mm_store_ss(ptr2,_tmp2); \
1178 _mm_storeh_pi((__m64 *)(ptr2+1),_tmp2); \
1179 _mm_store_ss(ptr3,_tmp3); \
1180 _mm_storeh_pi((__m64 *)(ptr3+1),_tmp3); \
1181 _mm_store_ss(ptr4,_tmp4); \
1182 _mm_storeh_pi((__m64 *)(ptr4+1),_tmp4); \
1186 #define GMX_MM_INCREMENT_2RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2) { \
1187 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11,_tmp12,_tmp13; \
1188 _tmp1 = _mm_loadu_ps(ptr1); \
1189 _tmp2 = _mm_loadu_ps(ptr2); \
1190 _tmp3 = _mm_loadu_ps(ptr3); \
1191 _tmp4 = _mm_loadu_ps(ptr4); \
1192 _tmp5 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4)); \
1193 _tmp5 = _mm_loadh_pi(_tmp5,(__m64 *)(ptr2+4)); \
1194 _tmp6 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr3+4)); \
1195 _tmp6 = _mm_loadh_pi(_tmp6,(__m64 *)(ptr4+4)); \
1196 _tmp7 = _mm_unpackhi_ps(jx1,jy1); \
1197 jx1 = _mm_unpacklo_ps(jx1,jy1); \
1198 _tmp8 = _mm_unpackhi_ps(jz1,jx2); \
1199 jz1 = _mm_unpacklo_ps(jz1,jx2); \
1200 _tmp9 = _mm_unpackhi_ps(jy2,jz2); \
1201 jy2 = _mm_unpacklo_ps(jy2,jz2); \
1202 _tmp10 = _mm_movelh_ps(jx1,jz1); \
1203 _tmp11 = _mm_movehl_ps(jz1,jx1); \
1204 _tmp12 = _mm_movelh_ps(_tmp7,_tmp8); \
1205 _tmp13 = _mm_movehl_ps(_tmp8,_tmp7); \
1206 _tmp1 = _mm_add_ps(_tmp1,_tmp10); \
1207 _tmp2 = _mm_add_ps(_tmp2,_tmp11); \
1208 _tmp3 = _mm_add_ps(_tmp3,_tmp12); \
1209 _tmp4 = _mm_add_ps(_tmp4,_tmp13); \
1210 _tmp5 = _mm_add_ps(_tmp5,jy2); \
1211 _tmp6 = _mm_add_ps(_tmp6,_tmp9); \
1212 _mm_storeu_ps(ptr1,_tmp1); \
1213 _mm_storeu_ps(ptr2,_tmp2); \
1214 _mm_storeu_ps(ptr3,_tmp3); \
1215 _mm_storeu_ps(ptr4,_tmp4); \
1216 _mm_storel_pi((__m64 *)(ptr1+4),_tmp5); \
1217 _mm_storeh_pi((__m64 *)(ptr2+4),_tmp5); \
1218 _mm_storel_pi((__m64 *)(ptr3+4),_tmp6); \
1219 _mm_storeh_pi((__m64 *)(ptr4+4),_tmp6); \
1223 #define GMX_MM_INCREMENT_3RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
1224 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
1225 __m128 _tmp11,_tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19; \
1226 __m128 _tmp20,_tmp21,_tmp22,_tmp23,_tmp24,_tmp25; \
1227 _tmp1 = _mm_loadu_ps(ptr1); \
1228 _tmp2 = _mm_loadu_ps(ptr1+4); \
1229 _tmp3 = _mm_load_ss(ptr1+8); \
1230 _tmp4 = _mm_loadu_ps(ptr2); \
1231 _tmp5 = _mm_loadu_ps(ptr2+4); \
1232 _tmp6 = _mm_load_ss(ptr2+8); \
1233 _tmp7 = _mm_loadu_ps(ptr3); \
1234 _tmp8 = _mm_loadu_ps(ptr3+4); \
1235 _tmp9 = _mm_load_ss(ptr3+8); \
1236 _tmp10 = _mm_loadu_ps(ptr4); \
1237 _tmp11 = _mm_loadu_ps(ptr4+4); \
1238 _tmp12 = _mm_load_ss(ptr4+8); \
1239 _tmp13 = _mm_unpackhi_ps(jx1,jy1); \
1240 jx1 = _mm_unpacklo_ps(jx1,jy1); \
1241 _tmp14 = _mm_unpackhi_ps(jz1,jx2); \
1242 jz1 = _mm_unpacklo_ps(jz1,jx2); \
1243 _tmp15 = _mm_unpackhi_ps(jy2,jz2); \
1244 jy2 = _mm_unpacklo_ps(jy2,jz2); \
1245 _tmp16 = _mm_unpackhi_ps(jx3,jy3); \
1246 jx3 = _mm_unpacklo_ps(jx3,jy3); \
1247 _tmp17 = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1)); \
1248 _tmp18 = _mm_movehl_ps(jz3,jz3); \
1249 _tmp19 = _mm_shuffle_ps(_tmp18,_tmp18,_MM_SHUFFLE(0,0,0,1)); \
1250 _tmp20 = _mm_movelh_ps(jx1,jz1); \
1251 _tmp21 = _mm_movehl_ps(jz1,jx1); \
1252 _tmp22 = _mm_movelh_ps(_tmp13,_tmp14); \
1253 _tmp14 = _mm_movehl_ps(_tmp14,_tmp13); \
1254 _tmp23 = _mm_movelh_ps(jy2,jx3); \
1255 _tmp24 = _mm_movehl_ps(jx3,jy2); \
1256 _tmp25 = _mm_movelh_ps(_tmp15,_tmp16); \
1257 _tmp16 = _mm_movehl_ps(_tmp16,_tmp15); \
1258 _tmp1 = _mm_add_ps(_tmp1,_tmp20); \
1259 _tmp2 = _mm_add_ps(_tmp2,_tmp23); \
1260 _tmp3 = _mm_add_ss(_tmp3,jz3); \
1261 _tmp4 = _mm_add_ps(_tmp4,_tmp21); \
1262 _tmp5 = _mm_add_ps(_tmp5,_tmp24); \
1263 _tmp6 = _mm_add_ss(_tmp6,_tmp17); \
1264 _tmp7 = _mm_add_ps(_tmp7,_tmp22); \
1265 _tmp8 = _mm_add_ps(_tmp8,_tmp25); \
1266 _tmp9 = _mm_add_ss(_tmp9,_tmp18); \
1267 _tmp10 = _mm_add_ps(_tmp10,_tmp14); \
1268 _tmp11 = _mm_add_ps(_tmp11,_tmp16); \
1269 _tmp12 = _mm_add_ss(_tmp12,_tmp19); \
1270 _mm_storeu_ps(ptr1,_tmp1); \
1271 _mm_storeu_ps(ptr1+4,_tmp2); \
1272 _mm_store_ss(ptr1+8,_tmp3); \
1273 _mm_storeu_ps(ptr2,_tmp4); \
1274 _mm_storeu_ps(ptr2+4,_tmp5); \
1275 _mm_store_ss(ptr2+8,_tmp6); \
1276 _mm_storeu_ps(ptr3,_tmp7); \
1277 _mm_storeu_ps(ptr3+4,_tmp8); \
1278 _mm_store_ss(ptr3+8,_tmp9); \
1279 _mm_storeu_ps(ptr4,_tmp10); \
1280 _mm_storeu_ps(ptr4+4,_tmp11); \
1281 _mm_store_ss(ptr4+8,_tmp12); \
1285 #define GMX_MM_INCREMENT_4RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
1286 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11; \
1287 __m128 _tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19,_tmp20,_tmp21,_tmp22; \
1288 __m128 _tmp23,_tmp24; \
1289 _tmp1 = _mm_loadu_ps(ptr1); \
1290 _tmp2 = _mm_loadu_ps(ptr1+4); \
1291 _tmp3 = _mm_loadu_ps(ptr1+8); \
1292 _tmp4 = _mm_loadu_ps(ptr2); \
1293 _tmp5 = _mm_loadu_ps(ptr2+4); \
1294 _tmp6 = _mm_loadu_ps(ptr2+8); \
1295 _tmp7 = _mm_loadu_ps(ptr3); \
1296 _tmp8 = _mm_loadu_ps(ptr3+4); \
1297 _tmp9 = _mm_loadu_ps(ptr3+8); \
1298 _tmp10 = _mm_loadu_ps(ptr4); \
1299 _tmp11 = _mm_loadu_ps(ptr4+4); \
1300 _tmp12 = _mm_loadu_ps(ptr4+8); \
1301 _tmp13 = _mm_unpackhi_ps(jx1,jy1); \
1302 jx1 = _mm_unpacklo_ps(jx1,jy1); \
1303 _tmp14 = _mm_unpackhi_ps(jz1,jx2); \
1304 jz1 = _mm_unpacklo_ps(jz1,jx2); \
1305 _tmp15 = _mm_unpackhi_ps(jy2,jz2); \
1306 jy2 = _mm_unpacklo_ps(jy2,jz2); \
1307 _tmp16 = _mm_unpackhi_ps(jx3,jy3); \
1308 jx3 = _mm_unpacklo_ps(jx3,jy3); \
1309 _tmp17 = _mm_unpackhi_ps(jz3,jx4); \
1310 jz3 = _mm_unpacklo_ps(jz3,jx4); \
1311 _tmp18 = _mm_unpackhi_ps(jy4,jz4); \
1312 jy4 = _mm_unpacklo_ps(jy4,jz4); \
1313 _tmp19 = _mm_movelh_ps(jx1,jz1); \
1314 jz1 = _mm_movehl_ps(jz1,jx1); \
1315 _tmp20 = _mm_movelh_ps(_tmp13,_tmp14); \
1316 _tmp14 = _mm_movehl_ps(_tmp14,_tmp13); \
1317 _tmp21 = _mm_movelh_ps(jy2,jx3); \
1318 jx3 = _mm_movehl_ps(jx3,jy2); \
1319 _tmp22 = _mm_movelh_ps(_tmp15,_tmp16); \
1320 _tmp16 = _mm_movehl_ps(_tmp16,_tmp15); \
1321 _tmp23 = _mm_movelh_ps(jz3,jy4); \
1322 jy4 = _mm_movehl_ps(jy4,jz3); \
1323 _tmp24 = _mm_movelh_ps(_tmp17,_tmp18); \
1324 _tmp18 = _mm_movehl_ps(_tmp18,_tmp17); \
1325 _tmp1 = _mm_add_ps(_tmp1,_tmp19); \
1326 _tmp2 = _mm_add_ps(_tmp2,_tmp21); \
1327 _tmp3 = _mm_add_ps(_tmp3,_tmp23); \
1328 _tmp4 = _mm_add_ps(_tmp4,jz1); \
1329 _tmp5 = _mm_add_ps(_tmp5,jx3); \
1330 _tmp6 = _mm_add_ps(_tmp6,jy4); \
1331 _tmp7 = _mm_add_ps(_tmp7,_tmp20); \
1332 _tmp8 = _mm_add_ps(_tmp8,_tmp22); \
1333 _tmp9 = _mm_add_ps(_tmp9,_tmp24); \
1334 _tmp10 = _mm_add_ps(_tmp10,_tmp14); \
1335 _tmp11 = _mm_add_ps(_tmp11,_tmp16); \
1336 _tmp12 = _mm_add_ps(_tmp12,_tmp18); \
1337 _mm_storeu_ps(ptr1,_tmp1); \
1338 _mm_storeu_ps(ptr1+4,_tmp2); \
1339 _mm_storeu_ps(ptr1+8,_tmp3); \
1340 _mm_storeu_ps(ptr2,_tmp4); \
1341 _mm_storeu_ps(ptr2+4,_tmp5); \
1342 _mm_storeu_ps(ptr2+8,_tmp6); \
1343 _mm_storeu_ps(ptr3,_tmp7); \
1344 _mm_storeu_ps(ptr3+4,_tmp8); \
1345 _mm_storeu_ps(ptr3+8,_tmp9); \
1346 _mm_storeu_ps(ptr4,_tmp10); \
1347 _mm_storeu_ps(ptr4+4,_tmp11); \
1348 _mm_storeu_ps(ptr4+8,_tmp12); \
1353 #define GMX_MM_DECREMENT_1RVEC_1POINTER_PS(ptr1,jx1,jy1,jz1) { \
1354 __m128 _tmp1; \
1355 jy1 = _mm_unpacklo_ps(jy1,jz1); \
1356 jx1 = _mm_movelh_ps(jx1,jy1); \
1357 _tmp1 = _mm_load_ss(ptr1); \
1358 _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)(ptr1+1)); \
1359 _tmp1 = _mm_sub_ps(_tmp1,jx1); \
1360 _mm_store_ss(ptr1,_tmp1); \
1361 _mm_storeh_pi((__m64 *)(ptr1+1),_tmp1); \
1365 #define GMX_MM_DECREMENT_2RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2) { \
1366 __m128 _tmp1, _tmp2; \
1367 _tmp1 = _mm_loadu_ps(ptr1); \
1368 _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4)); \
1369 jx1 = _mm_unpacklo_ps(jx1,jy1); \
1370 jz1 = _mm_unpacklo_ps(jz1,jx2); \
1371 jy2 = _mm_unpacklo_ps(jy2,jz2); \
1372 jx1 = _mm_movelh_ps(jx1,jz1); \
1373 _tmp1 = _mm_sub_ps(_tmp1,jx1); \
1374 _tmp2 = _mm_sub_ps(_tmp2,jy2); \
1375 _mm_storeu_ps(ptr1,_tmp1); \
1376 _mm_storel_pi((__m64 *)(ptr1+4),_tmp2); \
1380 #define GMX_MM_DECREMENT_3RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
1381 __m128 _tmp1, _tmp2, _tmp3; \
1382 _tmp1 = _mm_loadu_ps(ptr1); \
1383 _tmp2 = _mm_loadu_ps(ptr1+4); \
1384 _tmp3 = _mm_load_ss(ptr1+8); \
1385 jx1 = _mm_unpacklo_ps(jx1,jy1); \
1386 jz1 = _mm_unpacklo_ps(jz1,jx2); \
1387 jy2 = _mm_unpacklo_ps(jy2,jz2); \
1388 jx3 = _mm_unpacklo_ps(jx3,jy3); \
1389 jx1 = _mm_movelh_ps(jx1,jz1); \
1390 jy2 = _mm_movelh_ps(jy2,jx3); \
1391 _tmp1 = _mm_sub_ps(_tmp1,jx1); \
1392 _tmp2 = _mm_sub_ps(_tmp2,jy2); \
1393 _tmp3 = _mm_sub_ss(_tmp3,jz3); \
1394 _mm_storeu_ps(ptr1,_tmp1); \
1395 _mm_storeu_ps(ptr1+4,_tmp2); \
1396 _mm_store_ss(ptr1+8,_tmp3); \
1400 #define GMX_MM_DECREMENT_4RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
1401 __m128 _tmp1, _tmp2, _tmp3; \
1402 _tmp1 = _mm_loadu_ps(ptr1); \
1403 _tmp2 = _mm_loadu_ps(ptr1+4); \
1404 _tmp3 = _mm_loadu_ps(ptr1+8); \
1405 jx1 = _mm_unpacklo_ps(jx1,jy1); \
1406 jz1 = _mm_unpacklo_ps(jz1,jx2); \
1407 jy2 = _mm_unpacklo_ps(jy2,jz2); \
1408 jx3 = _mm_unpacklo_ps(jx3,jy3); \
1409 jz3 = _mm_unpacklo_ps(jz3,jx4); \
1410 jy4 = _mm_unpacklo_ps(jy4,jz4); \
1411 jx1 = _mm_movelh_ps(jx1,jz1); \
1412 jy2 = _mm_movelh_ps(jy2,jx3); \
1413 jz3 = _mm_movelh_ps(jz3,jy4); \
1414 _tmp1 = _mm_sub_ps(_tmp1,jx1); \
1415 _tmp2 = _mm_sub_ps(_tmp2,jy2); \
1416 _tmp3 = _mm_sub_ps(_tmp3,jz3); \
1417 _mm_storeu_ps(ptr1,_tmp1); \
1418 _mm_storeu_ps(ptr1+4,_tmp2); \
1419 _mm_storeu_ps(ptr1+8,_tmp3); \
1423 #define GMX_MM_DECREMENT_1RVEC_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1) { \
1424 __m128 _tmp1,_tmp2,_tmp3,_tmp4; \
1425 _tmp1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1)); \
1426 _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)(ptr2)); \
1427 _tmp2 = _mm_load_ss(ptr1+2); \
1428 _tmp3 = _mm_load_ss(ptr2+2); \
1429 jx1 = _mm_unpacklo_ps(jx1,jy1); \
1430 _tmp4 = _mm_shuffle_ps(jz1,jz1,_MM_SHUFFLE(0,0,0,1)); \
1431 _tmp1 = _mm_sub_ps(_tmp1,jx1); \
1432 _mm_storel_pi((__m64 *)(ptr1),_tmp1); \
1433 _mm_storeh_pi((__m64 *)(ptr2),_tmp1); \
1434 _mm_store_ss(ptr1+2,_mm_sub_ss(_tmp2,jz1)); \
1435 _mm_store_ss(ptr2+2,_mm_sub_ss(_tmp3,_tmp4)); \
1439 #define GMX_MM_DECREMENT_2RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2) { \
1440 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5; \
1441 _tmp1 = _mm_loadu_ps(ptr1); \
1442 _tmp2 = _mm_loadu_ps(ptr2); \
1443 _tmp3 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4)); \
1444 _tmp3 = _mm_loadh_pi(_tmp3,(__m64 *)(ptr2+4)); \
1445 jx1 = _mm_unpacklo_ps(jx1,jy1); \
1446 jz1 = _mm_unpacklo_ps(jz1,jx2); \
1447 jy2 = _mm_unpacklo_ps(jy2,jz2); \
1448 _tmp4 = _mm_movelh_ps(jx1,jz1); \
1449 _tmp5 = _mm_movehl_ps(jz1,jx1); \
1450 _tmp1 = _mm_sub_ps(_tmp1,_tmp4); \
1451 _tmp2 = _mm_sub_ps(_tmp2,_tmp5); \
1452 _tmp3 = _mm_sub_ps(_tmp3,jy2); \
1453 _mm_storeu_ps(ptr1,_tmp1); \
1454 _mm_storeu_ps(ptr2,_tmp2); \
1455 _mm_storel_pi((__m64 *)(ptr1+4),_tmp3); \
1456 _mm_storeh_pi((__m64 *)(ptr2+4),_tmp3); \
1460 #define GMX_MM_DECREMENT_3RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) {\
1461 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11; \
1462 _tmp1 = _mm_loadu_ps(ptr1); \
1463 _tmp2 = _mm_loadu_ps(ptr1+4); \
1464 _tmp3 = _mm_load_ss(ptr1+8); \
1465 _tmp4 = _mm_loadu_ps(ptr2); \
1466 _tmp5 = _mm_loadu_ps(ptr2+4); \
1467 _tmp6 = _mm_load_ss(ptr2+8); \
1468 jx1 = _mm_unpacklo_ps(jx1,jy1); \
1469 jz1 = _mm_unpacklo_ps(jz1,jx2); \
1470 jy2 = _mm_unpacklo_ps(jy2,jz2); \
1471 jx3 = _mm_unpacklo_ps(jx3,jy3); \
1472 _tmp7 = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1)); \
1473 _tmp8 = _mm_movelh_ps(jx1,jz1); \
1474 _tmp9 = _mm_movehl_ps(jz1,jx1); \
1475 _tmp10 = _mm_movelh_ps(jy2,jx3); \
1476 _tmp11 = _mm_movehl_ps(jx3,jy2); \
1477 _tmp1 = _mm_sub_ps(_tmp1,_tmp8); \
1478 _tmp2 = _mm_sub_ps(_tmp2,_tmp10); \
1479 _tmp3 = _mm_sub_ss(_tmp3,jz3); \
1480 _tmp4 = _mm_sub_ps(_tmp4,_tmp9); \
1481 _tmp5 = _mm_sub_ps(_tmp5,_tmp11); \
1482 _tmp6 = _mm_sub_ss(_tmp6,_tmp7); \
1483 _mm_storeu_ps(ptr1,_tmp1); \
1484 _mm_storeu_ps(ptr1+4,_tmp2); \
1485 _mm_store_ss(ptr1+8,_tmp3); \
1486 _mm_storeu_ps(ptr2,_tmp4); \
1487 _mm_storeu_ps(ptr2+4,_tmp5); \
1488 _mm_store_ss(ptr2+8,_tmp6); \
1492 #define GMX_MM_DECREMENT_4RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) {\
1493 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11,_tmp12,_tmp13; \
1494 _tmp1 = _mm_loadu_ps(ptr1); \
1495 _tmp2 = _mm_loadu_ps(ptr1+4); \
1496 _tmp3 = _mm_loadu_ps(ptr1+8); \
1497 _tmp4 = _mm_loadu_ps(ptr2); \
1498 _tmp5 = _mm_loadu_ps(ptr2+4); \
1499 _tmp6 = _mm_loadu_ps(ptr2+8); \
1500 jx1 = _mm_unpacklo_ps(jx1,jy1); \
1501 jz1 = _mm_unpacklo_ps(jz1,jx2); \
1502 jy2 = _mm_unpacklo_ps(jy2,jz2); \
1503 jx3 = _mm_unpacklo_ps(jx3,jy3); \
1504 jz3 = _mm_unpacklo_ps(jz3,jx4); \
1505 jy4 = _mm_unpacklo_ps(jy4,jz4); \
1506 _tmp8 = _mm_movelh_ps(jx1,jz1); \
1507 _tmp9 = _mm_movehl_ps(jz1,jx1); \
1508 _tmp10 = _mm_movelh_ps(jy2,jx3); \
1509 _tmp11 = _mm_movehl_ps(jx3,jy2); \
1510 _tmp12 = _mm_movelh_ps(jz3,jy4); \
1511 _tmp13 = _mm_movehl_ps(jy4,jz3); \
1512 _tmp1 = _mm_sub_ps(_tmp1,_tmp8); \
1513 _tmp2 = _mm_sub_ps(_tmp2,_tmp10); \
1514 _tmp3 = _mm_sub_ps(_tmp3,_tmp12); \
1515 _tmp4 = _mm_sub_ps(_tmp4,_tmp9); \
1516 _tmp5 = _mm_sub_ps(_tmp5,_tmp11); \
1517 _tmp6 = _mm_sub_ps(_tmp6,_tmp13); \
1518 _mm_storeu_ps(ptr1,_tmp1); \
1519 _mm_storeu_ps(ptr1+4,_tmp2); \
1520 _mm_storeu_ps(ptr1+8,_tmp3); \
1521 _mm_storeu_ps(ptr2,_tmp4); \
1522 _mm_storeu_ps(ptr2+4,_tmp5); \
1523 _mm_storeu_ps(ptr2+8,_tmp6); \
1527 #define GMX_MM_DECREMENT_1RVEC_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1) { \
1528 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7; \
1529 _tmp1 = _mm_load_ss(ptr1); \
1530 _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)(ptr1+1)); \
1531 _tmp2 = _mm_load_ss(ptr2); \
1532 _tmp2 = _mm_loadh_pi(_tmp2,(__m64 *)(ptr2+1)); \
1533 _tmp3 = _mm_load_ss(ptr3); \
1534 _tmp3 = _mm_loadh_pi(_tmp3,(__m64 *)(ptr3+1)); \
1535 _tmp4 = _mm_unpacklo_ps(jy1,jz1); \
1536 _tmp5 = _mm_unpackhi_ps(jy1,jz1); \
1537 _tmp6 = _mm_shuffle_ps(jx1,_tmp4,_MM_SHUFFLE(3,2,0,1)); \
1538 _tmp7 = _mm_shuffle_ps(jx1,jx1,_MM_SHUFFLE(0,0,0,2)); \
1539 jx1 = _mm_movelh_ps(jx1,_tmp4); \
1540 _tmp7 = _mm_movelh_ps(_tmp7,_tmp5); \
1541 _tmp1 = _mm_sub_ps(_tmp1,jx1); \
1542 _tmp2 = _mm_sub_ps(_tmp2,_tmp6); \
1543 _tmp3 = _mm_sub_ps(_tmp3,_tmp7); \
1544 _mm_store_ss(ptr1,_tmp1); \
1545 _mm_storeh_pi((__m64 *)(ptr1+1),_tmp1); \
1546 _mm_store_ss(ptr2,_tmp2); \
1547 _mm_storeh_pi((__m64 *)(ptr2+1),_tmp2); \
1548 _mm_store_ss(ptr3,_tmp3); \
1549 _mm_storeh_pi((__m64 *)(ptr3+1),_tmp3); \
1553 #define GMX_MM_DECREMENT_2RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2) { \
1554 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
1555 _tmp1 = _mm_loadu_ps(ptr1); \
1556 _tmp2 = _mm_loadu_ps(ptr2); \
1557 _tmp3 = _mm_loadu_ps(ptr3); \
1558 _tmp4 = _mm_loadl_pi(_tmp4,(__m64 *)(ptr1+4)); \
1559 _tmp4 = _mm_loadh_pi(_tmp4,(__m64 *)(ptr2+4)); \
1560 _tmp5 = _mm_loadl_pi(_tmp5,(__m64 *)(ptr3+4)); \
1561 _tmp6 = _mm_unpackhi_ps(jx1,jy1); \
1562 jx1 = _mm_unpacklo_ps(jx1,jy1); \
1563 _tmp7 = _mm_unpackhi_ps(jz1,jx2); \
1564 jz1 = _mm_unpacklo_ps(jz1,jx2); \
1565 _tmp8 = _mm_unpackhi_ps(jy2,jz2); \
1566 jy2 = _mm_unpacklo_ps(jy2,jz2); \
1567 _tmp9 = _mm_movelh_ps(jx1,jz1); \
1568 _tmp10 = _mm_movehl_ps(jz1,jx1); \
1569 _tmp6 = _mm_movelh_ps(_tmp6,_tmp7); \
1570 _tmp1 = _mm_sub_ps(_tmp1,_tmp9); \
1571 _tmp2 = _mm_sub_ps(_tmp2,_tmp10); \
1572 _tmp3 = _mm_sub_ps(_tmp3,_tmp6); \
1573 _tmp4 = _mm_sub_ps(_tmp4,jy2); \
1574 _tmp5 = _mm_sub_ps(_tmp5,_tmp8); \
1575 _mm_storeu_ps(ptr1,_tmp1); \
1576 _mm_storeu_ps(ptr2,_tmp2); \
1577 _mm_storeu_ps(ptr3,_tmp3); \
1578 _mm_storel_pi((__m64 *)(ptr1+4),_tmp4); \
1579 _mm_storeh_pi((__m64 *)(ptr2+4),_tmp4); \
1580 _mm_storel_pi((__m64 *)(ptr3+4),_tmp5); \
1584 #define GMX_MM_DECREMENT_3RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
1585 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
1586 __m128 _tmp11,_tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19; \
1587 _tmp1 = _mm_loadu_ps(ptr1); \
1588 _tmp2 = _mm_loadu_ps(ptr1+4); \
1589 _tmp3 = _mm_load_ss(ptr1+8); \
1590 _tmp4 = _mm_loadu_ps(ptr2); \
1591 _tmp5 = _mm_loadu_ps(ptr2+4); \
1592 _tmp6 = _mm_load_ss(ptr2+8); \
1593 _tmp7 = _mm_loadu_ps(ptr3); \
1594 _tmp8 = _mm_loadu_ps(ptr3+4); \
1595 _tmp9 = _mm_load_ss(ptr3+8); \
1596 _tmp10 = _mm_unpackhi_ps(jx1,jy1); \
1597 jx1 = _mm_unpacklo_ps(jx1,jy1); \
1598 _tmp11 = _mm_unpackhi_ps(jz1,jx2); \
1599 jz1 = _mm_unpacklo_ps(jz1,jx2); \
1600 _tmp12 = _mm_unpackhi_ps(jy2,jz2); \
1601 jy2 = _mm_unpacklo_ps(jy2,jz2); \
1602 _tmp13 = _mm_unpackhi_ps(jx3,jy3); \
1603 jx3 = _mm_unpacklo_ps(jx3,jy3); \
1604 _tmp14 = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1)); \
1605 _tmp15 = _mm_movehl_ps(jz3,jz3); \
1606 _tmp16 = _mm_movelh_ps(jx1,jz1); \
1607 _tmp17 = _mm_movehl_ps(jz1,jx1); \
1608 _tmp10 = _mm_movelh_ps(_tmp10,_tmp11); \
1609 _tmp18 = _mm_movelh_ps(jy2,jx3); \
1610 _tmp19 = _mm_movehl_ps(jx3,jy2); \
1611 _tmp12 = _mm_movelh_ps(_tmp12,_tmp13); \
1612 _tmp1 = _mm_sub_ps(_tmp1,_tmp16); \
1613 _tmp2 = _mm_sub_ps(_tmp2,_tmp18); \
1614 _tmp3 = _mm_sub_ss(_tmp3,jz3); \
1615 _tmp4 = _mm_sub_ps(_tmp4,_tmp17); \
1616 _tmp5 = _mm_sub_ps(_tmp5,_tmp19); \
1617 _tmp6 = _mm_sub_ss(_tmp6,_tmp14); \
1618 _tmp7 = _mm_sub_ps(_tmp7,_tmp10); \
1619 _tmp8 = _mm_sub_ps(_tmp8,_tmp12); \
1620 _tmp9 = _mm_sub_ss(_tmp9,_tmp15); \
1621 _mm_storeu_ps(ptr1,_tmp1); \
1622 _mm_storeu_ps(ptr1+4,_tmp2); \
1623 _mm_store_ss(ptr1+8,_tmp3); \
1624 _mm_storeu_ps(ptr2,_tmp4); \
1625 _mm_storeu_ps(ptr2+4,_tmp5); \
1626 _mm_store_ss(ptr2+8,_tmp6); \
1627 _mm_storeu_ps(ptr3,_tmp7); \
1628 _mm_storeu_ps(ptr3+4,_tmp8); \
1629 _mm_store_ss(ptr3+8,_tmp9); \
1633 #define GMX_MM_DECREMENT_4RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
1634 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11; \
1635 __m128 _tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19,_tmp20,_tmp21; \
1636 _tmp1 = _mm_loadu_ps(ptr1); \
1637 _tmp2 = _mm_loadu_ps(ptr1+4); \
1638 _tmp3 = _mm_loadu_ps(ptr1+8); \
1639 _tmp4 = _mm_loadu_ps(ptr2); \
1640 _tmp5 = _mm_loadu_ps(ptr2+4); \
1641 _tmp6 = _mm_loadu_ps(ptr2+8); \
1642 _tmp7 = _mm_loadu_ps(ptr3); \
1643 _tmp8 = _mm_loadu_ps(ptr3+4); \
1644 _tmp9 = _mm_loadu_ps(ptr3+8); \
1645 _tmp10 = _mm_unpackhi_ps(jx1,jy1); \
1646 jx1 = _mm_unpacklo_ps(jx1,jy1); \
1647 _tmp11 = _mm_unpackhi_ps(jz1,jx2); \
1648 jz1 = _mm_unpacklo_ps(jz1,jx2); \
1649 _tmp12 = _mm_unpackhi_ps(jy2,jz2); \
1650 jy2 = _mm_unpacklo_ps(jy2,jz2); \
1651 _tmp13 = _mm_unpackhi_ps(jx3,jy3); \
1652 jx3 = _mm_unpacklo_ps(jx3,jy3); \
1653 _tmp14 = _mm_unpackhi_ps(jz3,jx4); \
1654 jz3 = _mm_unpacklo_ps(jz3,jx4); \
1655 _tmp15 = _mm_unpackhi_ps(jy4,jz4); \
1656 jy4 = _mm_unpacklo_ps(jy4,jz4); \
1657 _tmp16 = _mm_movelh_ps(jx1,jz1); \
1658 _tmp17 = _mm_movehl_ps(jz1,jx1); \
1659 _tmp10 = _mm_movelh_ps(_tmp10,_tmp11); \
1660 _tmp18 = _mm_movelh_ps(jy2,jx3); \
1661 _tmp19 = _mm_movehl_ps(jx3,jy2); \
1662 _tmp12 = _mm_movelh_ps(_tmp12,_tmp13); \
1663 _tmp20 = _mm_movelh_ps(jz3,jy4); \
1664 _tmp21 = _mm_movehl_ps(jy4,jz3); \
1665 _tmp14 = _mm_movelh_ps(_tmp14,_tmp15); \
1666 _tmp1 = _mm_sub_ps(_tmp1,_tmp16); \
1667 _tmp2 = _mm_sub_ps(_tmp2,_tmp18); \
1668 _tmp3 = _mm_sub_ps(_tmp3,_tmp20); \
1669 _tmp4 = _mm_sub_ps(_tmp4,_tmp17); \
1670 _tmp5 = _mm_sub_ps(_tmp5,_tmp19); \
1671 _tmp6 = _mm_sub_ps(_tmp6,_tmp21); \
1672 _tmp7 = _mm_sub_ps(_tmp7,_tmp10); \
1673 _tmp8 = _mm_sub_ps(_tmp8,_tmp12); \
1674 _tmp9 = _mm_sub_ps(_tmp9,_tmp14); \
1675 _mm_storeu_ps(ptr1,_tmp1); \
1676 _mm_storeu_ps(ptr1+4,_tmp2); \
1677 _mm_storeu_ps(ptr1+8,_tmp3); \
1678 _mm_storeu_ps(ptr2,_tmp4); \
1679 _mm_storeu_ps(ptr2+4,_tmp5); \
1680 _mm_storeu_ps(ptr2+8,_tmp6); \
1681 _mm_storeu_ps(ptr3,_tmp7); \
1682 _mm_storeu_ps(ptr3+4,_tmp8); \
1683 _mm_storeu_ps(ptr3+8,_tmp9); \
1689 #define GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1) { \
1690 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
1691 _tmp1 = _mm_load_ss(ptr1); \
1692 _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)(ptr1+1)); \
1693 _tmp2 = _mm_load_ss(ptr2); \
1694 _tmp2 = _mm_loadh_pi(_tmp2,(__m64 *)(ptr2+1)); \
1695 _tmp3 = _mm_load_ss(ptr3); \
1696 _tmp3 = _mm_loadh_pi(_tmp3,(__m64 *)(ptr3+1)); \
1697 _tmp4 = _mm_load_ss(ptr4); \
1698 _tmp4 = _mm_loadh_pi(_tmp4,(__m64 *)(ptr4+1)); \
1699 _tmp5 = _mm_unpacklo_ps(jy1,jz1); \
1700 _tmp6 = _mm_unpackhi_ps(jy1,jz1); \
1701 _tmp7 = _mm_shuffle_ps(jx1,_tmp5,_MM_SHUFFLE(1,0,0,0)); \
1702 _tmp8 = _mm_shuffle_ps(jx1,_tmp5,_MM_SHUFFLE(3,2,0,1)); \
1703 _tmp9 = _mm_shuffle_ps(jx1,_tmp6,_MM_SHUFFLE(1,0,0,2)); \
1704 _tmp10 = _mm_shuffle_ps(jx1,_tmp6,_MM_SHUFFLE(3,2,0,3)); \
1705 _tmp1 = _mm_sub_ps(_tmp1,_tmp7); \
1706 _tmp2 = _mm_sub_ps(_tmp2,_tmp8); \
1707 _tmp3 = _mm_sub_ps(_tmp3,_tmp9); \
1708 _tmp4 = _mm_sub_ps(_tmp4,_tmp10); \
1709 _mm_store_ss(ptr1,_tmp1); \
1710 _mm_storeh_pi((__m64 *)(ptr1+1),_tmp1); \
1711 _mm_store_ss(ptr2,_tmp2); \
1712 _mm_storeh_pi((__m64 *)(ptr2+1),_tmp2); \
1713 _mm_store_ss(ptr3,_tmp3); \
1714 _mm_storeh_pi((__m64 *)(ptr3+1),_tmp3); \
1715 _mm_store_ss(ptr4,_tmp4); \
1716 _mm_storeh_pi((__m64 *)(ptr4+1),_tmp4); \
1721 #define GMX_MM_DECREMENT_2RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2) { \
1722 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11,_tmp12,_tmp13; \
1723 _tmp1 = _mm_loadu_ps(ptr1); \
1724 _tmp2 = _mm_loadu_ps(ptr2); \
1725 _tmp3 = _mm_loadu_ps(ptr3); \
1726 _tmp4 = _mm_loadu_ps(ptr4); \
1727 _tmp5 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4)); \
1728 _tmp5 = _mm_loadh_pi(_tmp5,(__m64 *)(ptr2+4)); \
1729 _tmp6 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr3+4)); \
1730 _tmp6 = _mm_loadh_pi(_tmp6,(__m64 *)(ptr4+4)); \
1731 _tmp7 = _mm_unpackhi_ps(jx1,jy1); \
1732 jx1 = _mm_unpacklo_ps(jx1,jy1); \
1733 _tmp8 = _mm_unpackhi_ps(jz1,jx2); \
1734 jz1 = _mm_unpacklo_ps(jz1,jx2); \
1735 _tmp9 = _mm_unpackhi_ps(jy2,jz2); \
1736 jy2 = _mm_unpacklo_ps(jy2,jz2); \
1737 _tmp10 = _mm_movelh_ps(jx1,jz1); \
1738 _tmp11 = _mm_movehl_ps(jz1,jx1); \
1739 _tmp12 = _mm_movelh_ps(_tmp7,_tmp8); \
1740 _tmp13 = _mm_movehl_ps(_tmp8,_tmp7); \
1741 _tmp1 = _mm_sub_ps(_tmp1,_tmp10); \
1742 _tmp2 = _mm_sub_ps(_tmp2,_tmp11); \
1743 _tmp3 = _mm_sub_ps(_tmp3,_tmp12); \
1744 _tmp4 = _mm_sub_ps(_tmp4,_tmp13); \
1745 _tmp5 = _mm_sub_ps(_tmp5,jy2); \
1746 _tmp6 = _mm_sub_ps(_tmp6,_tmp9); \
1747 _mm_storeu_ps(ptr1,_tmp1); \
1748 _mm_storeu_ps(ptr2,_tmp2); \
1749 _mm_storeu_ps(ptr3,_tmp3); \
1750 _mm_storeu_ps(ptr4,_tmp4); \
1751 _mm_storel_pi((__m64 *)(ptr1+4),_tmp5); \
1752 _mm_storeh_pi((__m64 *)(ptr2+4),_tmp5); \
1753 _mm_storel_pi((__m64 *)(ptr3+4),_tmp6); \
1754 _mm_storeh_pi((__m64 *)(ptr4+4),_tmp6); \
1758 #define GMX_MM_DECREMENT_3RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
1759 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
1760 __m128 _tmp11,_tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19; \
1761 __m128 _tmp20,_tmp21,_tmp22,_tmp23,_tmp24,_tmp25; \
1762 _tmp1 = _mm_loadu_ps(ptr1); \
1763 _tmp2 = _mm_loadu_ps(ptr1+4); \
1764 _tmp3 = _mm_load_ss(ptr1+8); \
1765 _tmp4 = _mm_loadu_ps(ptr2); \
1766 _tmp5 = _mm_loadu_ps(ptr2+4); \
1767 _tmp6 = _mm_load_ss(ptr2+8); \
1768 _tmp7 = _mm_loadu_ps(ptr3); \
1769 _tmp8 = _mm_loadu_ps(ptr3+4); \
1770 _tmp9 = _mm_load_ss(ptr3+8); \
1771 _tmp10 = _mm_loadu_ps(ptr4); \
1772 _tmp11 = _mm_loadu_ps(ptr4+4); \
1773 _tmp12 = _mm_load_ss(ptr4+8); \
1774 _tmp13 = _mm_unpackhi_ps(jx1,jy1); \
1775 jx1 = _mm_unpacklo_ps(jx1,jy1); \
1776 _tmp14 = _mm_unpackhi_ps(jz1,jx2); \
1777 jz1 = _mm_unpacklo_ps(jz1,jx2); \
1778 _tmp15 = _mm_unpackhi_ps(jy2,jz2); \
1779 jy2 = _mm_unpacklo_ps(jy2,jz2); \
1780 _tmp16 = _mm_unpackhi_ps(jx3,jy3); \
1781 jx3 = _mm_unpacklo_ps(jx3,jy3); \
1782 _tmp17 = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1)); \
1783 _tmp18 = _mm_movehl_ps(jz3,jz3); \
1784 _tmp19 = _mm_shuffle_ps(_tmp18,_tmp18,_MM_SHUFFLE(0,0,0,1)); \
1785 _tmp20 = _mm_movelh_ps(jx1,jz1); \
1786 _tmp21 = _mm_movehl_ps(jz1,jx1); \
1787 _tmp22 = _mm_movelh_ps(_tmp13,_tmp14); \
1788 _tmp14 = _mm_movehl_ps(_tmp14,_tmp13); \
1789 _tmp23 = _mm_movelh_ps(jy2,jx3); \
1790 _tmp24 = _mm_movehl_ps(jx3,jy2); \
1791 _tmp25 = _mm_movelh_ps(_tmp15,_tmp16); \
1792 _tmp16 = _mm_movehl_ps(_tmp16,_tmp15); \
1793 _tmp1 = _mm_sub_ps(_tmp1,_tmp20); \
1794 _tmp2 = _mm_sub_ps(_tmp2,_tmp23); \
1795 _tmp3 = _mm_sub_ss(_tmp3,jz3); \
1796 _tmp4 = _mm_sub_ps(_tmp4,_tmp21); \
1797 _tmp5 = _mm_sub_ps(_tmp5,_tmp24); \
1798 _tmp6 = _mm_sub_ss(_tmp6,_tmp17); \
1799 _tmp7 = _mm_sub_ps(_tmp7,_tmp22); \
1800 _tmp8 = _mm_sub_ps(_tmp8,_tmp25); \
1801 _tmp9 = _mm_sub_ss(_tmp9,_tmp18); \
1802 _tmp10 = _mm_sub_ps(_tmp10,_tmp14); \
1803 _tmp11 = _mm_sub_ps(_tmp11,_tmp16); \
1804 _tmp12 = _mm_sub_ss(_tmp12,_tmp19); \
1805 _mm_storeu_ps(ptr1,_tmp1); \
1806 _mm_storeu_ps(ptr1+4,_tmp2); \
1807 _mm_store_ss(ptr1+8,_tmp3); \
1808 _mm_storeu_ps(ptr2,_tmp4); \
1809 _mm_storeu_ps(ptr2+4,_tmp5); \
1810 _mm_store_ss(ptr2+8,_tmp6); \
1811 _mm_storeu_ps(ptr3,_tmp7); \
1812 _mm_storeu_ps(ptr3+4,_tmp8); \
1813 _mm_store_ss(ptr3+8,_tmp9); \
1814 _mm_storeu_ps(ptr4,_tmp10); \
1815 _mm_storeu_ps(ptr4+4,_tmp11); \
1816 _mm_store_ss(ptr4+8,_tmp12); \
1820 #define GMX_MM_DECREMENT_4RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
1821 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11; \
1822 __m128 _tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19,_tmp20,_tmp21,_tmp22;\
1823 __m128 _tmp23,_tmp24; \
1824 _tmp1 = _mm_loadu_ps(ptr1); \
1825 _tmp2 = _mm_loadu_ps(ptr1+4); \
1826 _tmp3 = _mm_loadu_ps(ptr1+8); \
1827 _tmp4 = _mm_loadu_ps(ptr2); \
1828 _tmp5 = _mm_loadu_ps(ptr2+4); \
1829 _tmp6 = _mm_loadu_ps(ptr2+8); \
1830 _tmp7 = _mm_loadu_ps(ptr3); \
1831 _tmp8 = _mm_loadu_ps(ptr3+4); \
1832 _tmp9 = _mm_loadu_ps(ptr3+8); \
1833 _tmp10 = _mm_loadu_ps(ptr4); \
1834 _tmp11 = _mm_loadu_ps(ptr4+4); \
1835 _tmp12 = _mm_loadu_ps(ptr4+8); \
1836 _tmp13 = _mm_unpackhi_ps(jx1,jy1); \
1837 jx1 = _mm_unpacklo_ps(jx1,jy1); \
1838 _tmp14 = _mm_unpackhi_ps(jz1,jx2); \
1839 jz1 = _mm_unpacklo_ps(jz1,jx2); \
1840 _tmp15 = _mm_unpackhi_ps(jy2,jz2); \
1841 jy2 = _mm_unpacklo_ps(jy2,jz2); \
1842 _tmp16 = _mm_unpackhi_ps(jx3,jy3); \
1843 jx3 = _mm_unpacklo_ps(jx3,jy3); \
1844 _tmp17 = _mm_unpackhi_ps(jz3,jx4); \
1845 jz3 = _mm_unpacklo_ps(jz3,jx4); \
1846 _tmp18 = _mm_unpackhi_ps(jy4,jz4); \
1847 jy4 = _mm_unpacklo_ps(jy4,jz4); \
1848 _tmp19 = _mm_movelh_ps(jx1,jz1); \
1849 jz1 = _mm_movehl_ps(jz1,jx1); \
1850 _tmp20 = _mm_movelh_ps(_tmp13,_tmp14); \
1851 _tmp14 = _mm_movehl_ps(_tmp14,_tmp13); \
1852 _tmp21 = _mm_movelh_ps(jy2,jx3); \
1853 jx3 = _mm_movehl_ps(jx3,jy2); \
1854 _tmp22 = _mm_movelh_ps(_tmp15,_tmp16); \
1855 _tmp16 = _mm_movehl_ps(_tmp16,_tmp15); \
1856 _tmp23 = _mm_movelh_ps(jz3,jy4); \
1857 jy4 = _mm_movehl_ps(jy4,jz3); \
1858 _tmp24 = _mm_movelh_ps(_tmp17,_tmp18); \
1859 _tmp18 = _mm_movehl_ps(_tmp18,_tmp17); \
1860 _tmp1 = _mm_sub_ps(_tmp1,_tmp19); \
1861 _tmp2 = _mm_sub_ps(_tmp2,_tmp21); \
1862 _tmp3 = _mm_sub_ps(_tmp3,_tmp23); \
1863 _tmp4 = _mm_sub_ps(_tmp4,jz1); \
1864 _tmp5 = _mm_sub_ps(_tmp5,jx3); \
1865 _tmp6 = _mm_sub_ps(_tmp6,jy4); \
1866 _tmp7 = _mm_sub_ps(_tmp7,_tmp20); \
1867 _tmp8 = _mm_sub_ps(_tmp8,_tmp22); \
1868 _tmp9 = _mm_sub_ps(_tmp9,_tmp24); \
1869 _tmp10 = _mm_sub_ps(_tmp10,_tmp14); \
1870 _tmp11 = _mm_sub_ps(_tmp11,_tmp16); \
1871 _tmp12 = _mm_sub_ps(_tmp12,_tmp18); \
1872 _mm_storeu_ps(ptr1,_tmp1); \
1873 _mm_storeu_ps(ptr1+4,_tmp2); \
1874 _mm_storeu_ps(ptr1+8,_tmp3); \
1875 _mm_storeu_ps(ptr2,_tmp4); \
1876 _mm_storeu_ps(ptr2+4,_tmp5); \
1877 _mm_storeu_ps(ptr2+8,_tmp6); \
1878 _mm_storeu_ps(ptr3,_tmp7); \
1879 _mm_storeu_ps(ptr3+4,_tmp8); \
1880 _mm_storeu_ps(ptr3+8,_tmp9); \
1881 _mm_storeu_ps(ptr4,_tmp10); \
1882 _mm_storeu_ps(ptr4+4,_tmp11); \
1883 _mm_storeu_ps(ptr4+8,_tmp12); \
1891 /* Routine to be called with rswitch/rcut at the beginning of a kernel
1892 * to set up the 7 constants used for analytic 5th order switch calculations.
1894 #define GMX_MM_SETUP_SWITCH5_PS(rswitch,rcut,switch_C3,switch_C4,switch_C5,switch_D2,switch_D3,switch_D4) { \
1895 const __m128 _swsetup_cm6 = { -6.0, -6.0, -6.0, -6.0}; \
1896 const __m128 _swsetup_cm10 = {-10.0,-10.0,-10.0,-10.0}; \
1897 const __m128 _swsetup_c15 = { 15.0, 15.0, 15.0, 15.0}; \
1898 const __m128 _swsetup_cm30 = {-30.0,-30.0,-30.0,-30.0}; \
1899 const __m128 _swsetup_c60 = { 60.0, 60.0, 60.0, 60.0}; \
1901 __m128 d,dinv,dinv2,dinv3,dinv4,dinv5; \
1903 d = _mm_sub_ps(rcut,rswitch); \
1904 dinv = gmx_mm_inv_ps(d); \
1905 dinv2 = _mm_mul_ps(dinv,dinv); \
1906 dinv3 = _mm_mul_ps(dinv2,dinv); \
1907 dinv4 = _mm_mul_ps(dinv2,dinv2); \
1908 dinv5 = _mm_mul_ps(dinv3,dinv2); \
1910 switch_C3 = _mm_mul_ps(_swsetup_cm10,dinv3); \
1911 switch_C4 = _mm_mul_ps(_swsetup_c15,dinv4); \
1912 switch_C5 = _mm_mul_ps(_swsetup_cm6,dinv5); \
1913 switch_D2 = _mm_mul_ps(_swsetup_cm30,dinv3); \
1914 switch_D3 = _mm_mul_ps(_swsetup_c60,dinv4); \
1915 switch_D4 = _mm_mul_ps(_swsetup_cm30,dinv5); \
1919 #define GMX_MM_EVALUATE_SWITCH5_PS(r,rswitch,rcut,sw,dsw,sw_C3,sw_C4,sw_C5,sw_D2,sw_D3,sw_D4) { \
1920 const __m128 _sw_one = { 1.0, 1.0, 1.0, 1.0}; \
1921 __m128 d,d2; \
1922 d = _mm_max_ps(r,rswitch); \
1923 d = _mm_min_ps(d,rcut); \
1924 d = _mm_sub_ps(d,rswitch); \
1925 d2 = _mm_mul_ps(d,d); \
1926 sw = _mm_mul_ps(d,sw_C5); \
1927 dsw = _mm_mul_ps(d,sw_D4); \
1928 sw = _mm_add_ps(sw,sw_C4); \
1929 dsw = _mm_add_ps(dsw,sw_D3); \
1930 sw = _mm_mul_ps(sw,d); \
1931 dsw = _mm_mul_ps(dsw,d); \
1932 sw = _mm_add_ps(sw,sw_C3); \
1933 dsw = _mm_add_ps(dsw,sw_D2); \
1934 sw = _mm_mul_ps(sw,_mm_mul_ps(d,d2)); \
1935 dsw = _mm_mul_ps(dsw,d2); \
1936 sw = _mm_add_ps(sw,_sw_one); \
1940 /* Returns fscaltmp, multiply with rinvsq to get fscal! */
1941 static inline __m128
1942 gmx_mm_interaction_coulomb_ps(__m128 rinv, __m128 qq,__m128 *vctot)
1944 __m128 vcoul = _mm_mul_ps(qq,rinv);
1945 *vctot = _mm_add_ps(*vctot,vcoul);
1946 return vcoul;
1950 static inline void
1951 gmx_mm_interaction_coulomb_noforce_ps(__m128 rinv, __m128 qq,__m128 *vctot)
1953 __m128 vcoul = _mm_mul_ps(qq,rinv);
1954 *vctot = _mm_add_ps(*vctot,vcoul);
1955 return;
1958 /* Returns fscaltmp, multiply with rinvsq to get fscal! */
1959 static inline __m128
1960 gmx_mm_interaction_coulombrf_ps(const __m128 rinv, const __m128 rsq, const __m128 krf, const __m128 crf, const __m128 qq,__m128 *vctot)
1962 const __m128 two = {2.0,2.0,2.0,2.0};
1963 __m128 vcoul,krsq;
1965 krsq = _mm_mul_ps(krf,rsq);
1966 vcoul = _mm_mul_ps(qq, _mm_sub_ps(_mm_add_ps(rinv,krsq),crf));
1967 *vctot = _mm_add_ps(*vctot,vcoul);
1969 return _mm_mul_ps(qq, _mm_sub_ps(rinv, _mm_mul_ps(two,krsq)));
1973 static inline void
1974 gmx_mm_interaction_coulombrf_noforce_ps(__m128 rinv, __m128 rsq, __m128 krf, __m128 crf, __m128 qq,__m128 *vctot)
1976 __m128 vcoul,krsq;
1978 krsq = _mm_mul_ps(krf,rsq);
1979 vcoul = _mm_mul_ps(qq, _mm_sub_ps(_mm_add_ps(rinv,krsq),crf));
1980 *vctot = _mm_add_ps(*vctot,vcoul);
1981 return;
1985 /* GB */
1990 /* GB + RF */
1993 /* Returns fscaltmp, multiply with rinvsq to get fscal! */
1994 static inline __m128
1995 gmx_mm_int_lj_ps(__m128 rinvsq, __m128 c6, __m128 c12, __m128 *vvdwtot)
1997 const __m128 six = {6.0,6.0,6.0,6.0};
1998 const __m128 twelve = {12.0,12.0,12.0,12.0};
2000 __m128 rinvsix,vvdw6,vvdw12;
2002 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq,rinvsq),rinvsq);
2003 vvdw6 = _mm_mul_ps(c6,rinvsix);
2004 vvdw12 = _mm_mul_ps(c12, _mm_mul_ps(rinvsix,rinvsix));
2005 *vvdwtot = _mm_add_ps(*vvdwtot , _mm_sub_ps(vvdw12,vvdw6));
2007 return _mm_sub_ps( _mm_mul_ps(twelve,vvdw12),_mm_mul_ps(six,vvdw6));
2011 static inline void
2012 gmx_mm_int_lj_potonly_ps(__m128 rinvsq, __m128 c6, __m128 c12, __m128 *vvdwtot)
2014 __m128 rinvsix,vvdw6,vvdw12;
2016 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq,rinvsq),rinvsq);
2017 vvdw6 = _mm_mul_ps(c6,rinvsix);
2018 vvdw12 = _mm_mul_ps(c12, _mm_mul_ps(rinvsix,rinvsix));
2019 *vvdwtot = _mm_add_ps(*vvdwtot , _mm_sub_ps(vvdw12,vvdw6));
2021 return;
2026 /* Return force should be multiplied by -rinv to get fscal */
2027 static inline __m128
2028 gmx_mm_int_4_table_coulomb_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 *vctot)
2030 __m128 rt,eps,eps2,Y,F,G,H,vcoul;
2031 __m128i n0;
2032 int n_a,n_b,n_c,n_d;
2034 rt = _mm_mul_ps(r,tabscale);
2035 n0 = _mm_cvttps_epi32(rt);
2036 eps = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
2037 eps2 = _mm_mul_ps(eps,eps);
2039 /* Extract indices from n0 */
2040 n_a = gmx_mm_extract_epi32(n0,0);
2041 n_b = gmx_mm_extract_epi32(n0,1);
2042 n_c = gmx_mm_extract_epi32(n0,2);
2043 n_d = gmx_mm_extract_epi32(n0,3);
2044 Y = _mm_load_ps(VFtab + 4* n_a);
2045 F = _mm_load_ps(VFtab + 4* n_b);
2046 G = _mm_load_ps(VFtab + 4* n_c);
2047 H = _mm_load_ps(VFtab + 4* n_d);
2048 _MM_TRANSPOSE4_PS(Y,F,G,H);
2049 H = _mm_mul_ps(H,eps2); /* Heps2 */
2050 G = _mm_mul_ps(G,eps); /* Geps */
2051 F = _mm_add_ps(F, _mm_add_ps(G,H)); /* Fp */
2052 vcoul = _mm_mul_ps(qq, _mm_add_ps(Y, _mm_mul_ps(eps,F)));
2053 *vctot = _mm_add_ps(*vctot,vcoul);
2055 F = _mm_mul_ps(qq, _mm_add_ps(F, _mm_add_ps(G, _mm_add_ps(H,H))));
2057 return _mm_mul_ps(F,tabscale);
2062 /* Return force should be multiplied by -rinv to get fscal */
2063 static inline __m128
2064 gmx_mm_int_4_table_lj_ps(__m128 r, __m128 tabscale, float * VFtab, int offset, __m128 c6, __m128 c12, __m128 *vvdwtot)
2066 __m128 rt,eps,eps2,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
2067 __m128i n0;
2068 int n_a,n_b,n_c,n_d;
2070 rt = _mm_mul_ps(r,tabscale);
2071 n0 = _mm_cvttps_epi32(rt);
2072 eps = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
2073 eps2 = _mm_mul_ps(eps,eps);
2075 /* Extract indices from n0 */
2076 n_a = gmx_mm_extract_epi32(n0,0);
2077 n_b = gmx_mm_extract_epi32(n0,1);
2078 n_c = gmx_mm_extract_epi32(n0,2);
2079 n_d = gmx_mm_extract_epi32(n0,3);
2081 /* For a few cases, like TIP4p waters, there are particles with LJ-only interactions in a loop where
2082 * the table data might contain both coulomb and LJ. To handle this case, we use an offset value of 0
2083 * if the data is an LJ-only table, and 1 if it is actually a mixed coul+lj table.
2085 Yd = _mm_load_ps(VFtab + 4*(offset+2)* n_a + 4*offset);
2086 Fd = _mm_load_ps(VFtab + 4*(offset+2)* n_b + 4*offset);
2087 Gd = _mm_load_ps(VFtab + 4*(offset+2)* n_c + 4*offset);
2088 Hd = _mm_load_ps(VFtab + 4*(offset+2)* n_d + 4*offset);
2089 Yr = _mm_load_ps(VFtab + 4*(offset+2)* n_a + 4*offset + 4);
2090 Fr = _mm_load_ps(VFtab + 4*(offset+2)* n_b + 4*offset + 4);
2091 Gr = _mm_load_ps(VFtab + 4*(offset+2)* n_c + 4*offset + 4);
2092 Hr = _mm_load_ps(VFtab + 4*(offset+2)* n_d + 4*offset + 4);
2093 _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
2094 _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);
2095 Hd = _mm_mul_ps(Hd,eps2); /* Heps2 */
2096 Gd = _mm_mul_ps(Gd,eps); /* Geps */
2097 Fd = _mm_add_ps(Fd, _mm_add_ps(Gd,Hd)); /* Fp */
2098 Hr = _mm_mul_ps(Hr,eps2); /* Heps2 */
2099 Gr = _mm_mul_ps(Gr,eps); /* Geps */
2100 Fr = _mm_add_ps(Fr, _mm_add_ps(Gr,Hr)); /* Fp */
2101 vvdw6 = _mm_mul_ps(c6, _mm_add_ps(Yd, _mm_mul_ps(eps,Fd)));
2102 vvdw12 = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps,Fr)));
2103 *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6,vvdw12));
2105 Fd = _mm_mul_ps(c6, _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd,Hd))));
2106 Fr = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr,Hr))));
2108 return _mm_mul_ps( _mm_add_ps(Fd,Fr),tabscale);
2112 /* Return force should be multiplied by -rinv to get fscal */
2113 static inline __m128
2114 gmx_mm_int_4_table_coulomb_and_lj_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 c6, __m128 c12,
2115 __m128 *vctot, __m128 *vvdwtot)
2117 __m128 rt,eps,eps2,vcoul,Yc,Fc,Gc,Hc,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
2118 __m128i n0;
2119 int n_a,n_b,n_c,n_d;
2121 rt = _mm_mul_ps(r,tabscale);
2122 n0 = _mm_cvttps_epi32(rt);
2123 eps = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
2124 eps2 = _mm_mul_ps(eps,eps);
2126 /* Extract indices from n0 */
2127 n_a = gmx_mm_extract_epi32(n0,0);
2128 n_b = gmx_mm_extract_epi32(n0,1);
2129 n_c = gmx_mm_extract_epi32(n0,2);
2130 n_d = gmx_mm_extract_epi32(n0,3);
2133 Yc = _mm_load_ps(VFtab + 12* n_a);
2134 Fc = _mm_load_ps(VFtab + 12* n_b);
2135 Gc = _mm_load_ps(VFtab + 12* n_c);
2136 Hc = _mm_load_ps(VFtab + 12* n_d);
2137 Yd = _mm_load_ps(VFtab + 12* n_a + 4);
2138 Fd = _mm_load_ps(VFtab + 12* n_b + 4);
2139 Gd = _mm_load_ps(VFtab + 12* n_c + 4);
2140 Hd = _mm_load_ps(VFtab + 12* n_d + 4);
2141 Yr = _mm_load_ps(VFtab + 12* n_a + 8);
2142 Fr = _mm_load_ps(VFtab + 12* n_b + 8);
2143 Gr = _mm_load_ps(VFtab + 12* n_c + 8);
2144 Hr = _mm_load_ps(VFtab + 12* n_d + 8);
2145 _MM_TRANSPOSE4_PS(Yc,Fc,Gc,Hc);
2146 _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
2147 _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);
2148 Hc = _mm_mul_ps(Hc,eps2); /* Heps2 */
2149 Gc = _mm_mul_ps(Gc,eps); /* Geps */
2150 Fc = _mm_add_ps(Fc, _mm_add_ps(Gc,Hc)); /* Fp */
2151 Hd = _mm_mul_ps(Hd,eps2); /* Heps2 */
2152 Gd = _mm_mul_ps(Gd,eps); /* Geps */
2153 Fd = _mm_add_ps(Fd, _mm_add_ps(Gd,Hd)); /* Fp */
2154 Hr = _mm_mul_ps(Hr,eps2); /* Heps2 */
2155 Gr = _mm_mul_ps(Gr,eps); /* Geps */
2156 Fr = _mm_add_ps(Fr, _mm_add_ps(Gr,Hr)); /* Fp */
2158 vcoul = _mm_mul_ps(qq, _mm_add_ps(Yc, _mm_mul_ps(eps,Fc)));
2159 *vctot = _mm_add_ps(*vctot,vcoul);
2161 vvdw6 = _mm_mul_ps(c6, _mm_add_ps(Yd, _mm_mul_ps(eps,Fd)));
2162 vvdw12 = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps,Fr)));
2163 *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6,vvdw12));
2165 Fc = _mm_mul_ps(qq, _mm_add_ps(Fc, _mm_add_ps(Gc, _mm_add_ps(Hc,Hc))));
2166 Fd = _mm_mul_ps(c6, _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd,Hd))));
2167 Fr = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr,Hr))));
2169 return _mm_mul_ps( _mm_add_ps(Fc,_mm_add_ps(Fd,Fr)),tabscale);
2174 /* Return force should be multiplied by -rinv to get fscal */
2175 static inline __m128
2176 gmx_mm_int_3_table_coulomb_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 *vctot)
2178 __m128 rt,eps,eps2,Y,F,G,H,vcoul;
2179 __m128i n0;
2180 int n_a,n_b,n_c;
2182 rt = _mm_mul_ps(r,tabscale);
2183 n0 = _mm_cvttps_epi32(rt);
2184 eps = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
2185 eps2 = _mm_mul_ps(eps,eps);
2187 /* Extract indices from n0 */
2188 n_a = gmx_mm_extract_epi32(n0,0);
2189 n_b = gmx_mm_extract_epi32(n0,1);
2190 n_c = gmx_mm_extract_epi32(n0,2);
2191 Y = _mm_load_ps(VFtab + 4* n_a);
2192 F = _mm_load_ps(VFtab + 4* n_b);
2193 G = _mm_load_ps(VFtab + 4* n_c);
2194 H = _mm_setzero_ps();
2195 _MM_TRANSPOSE4_PS(Y,F,G,H);
2196 H = _mm_mul_ps(H,eps2); /* Heps2 */
2197 G = _mm_mul_ps(G,eps); /* Geps */
2198 F = _mm_add_ps(F, _mm_add_ps(G,H)); /* Fp */
2199 vcoul = _mm_mul_ps(qq, _mm_add_ps(Y, _mm_mul_ps(eps,F)));
2200 *vctot = _mm_add_ps(*vctot,vcoul);
2202 F = _mm_mul_ps(qq, _mm_add_ps(F, _mm_add_ps(G, _mm_add_ps(H,H))));
2204 return _mm_mul_ps(F,tabscale);
2209 /* Return force should be multiplied by -rinv to get fscal */
2210 static inline __m128
2211 gmx_mm_int_3_table_lj_ps(__m128 r, __m128 tabscale, float * VFtab, int offset, __m128 c6, __m128 c12, __m128 *vvdwtot)
2213 __m128 rt,eps,eps2,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
2214 __m128i n0;
2215 int n_a,n_b,n_c;
2217 rt = _mm_mul_ps(r,tabscale);
2218 n0 = _mm_cvttps_epi32(rt);
2219 eps = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
2220 eps2 = _mm_mul_ps(eps,eps);
2222 /* Extract indices from n0 */
2223 n_a = gmx_mm_extract_epi32(n0,0);
2224 n_b = gmx_mm_extract_epi32(n0,1);
2225 n_c = gmx_mm_extract_epi32(n0,2);
2227 /* For a few cases, like TIP4p waters, there are particles with LJ-only interactions in a loop where
2228 * the table data might contain both coulomb and LJ. To handle this case, we use an offset value of 0
2229 * if the data is an LJ-only table, and 1 if it is actually a mixed coul+lj table.
2231 Yd = _mm_load_ps(VFtab + 4*(offset+2)* n_a + offset);
2232 Fd = _mm_load_ps(VFtab + 4*(offset+2)* n_b + offset);
2233 Gd = _mm_load_ps(VFtab + 4*(offset+2)* n_c + offset);
2234 Hd = _mm_setzero_ps();
2235 Yr = _mm_load_ps(VFtab + 4*(offset+2)* n_a + offset + 4);
2236 Fr = _mm_load_ps(VFtab + 4*(offset+2)* n_b + offset + 4);
2237 Gr = _mm_load_ps(VFtab + 4*(offset+2)* n_c + offset + 4);
2238 Hr = _mm_setzero_ps();
2239 _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
2240 _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);
2241 Hd = _mm_mul_ps(Hd,eps2); /* Heps2 */
2242 Gd = _mm_mul_ps(Gd,eps); /* Geps */
2243 Fd = _mm_add_ps(Fd, _mm_add_ps(Gd,Hd)); /* Fp */
2244 Hr = _mm_mul_ps(Hr,eps2); /* Heps2 */
2245 Gr = _mm_mul_ps(Gr,eps); /* Geps */
2246 Fr = _mm_add_ps(Fr, _mm_add_ps(Gr,Hr)); /* Fp */
2247 vvdw6 = _mm_mul_ps(c6, _mm_add_ps(Yd, _mm_mul_ps(eps,Fd)));
2248 vvdw12 = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps,Fr)));
2249 *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6,vvdw12));
2251 Fd = _mm_mul_ps(c6, _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd,Hd))));
2252 Fr = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr,Hr))));
2254 return _mm_mul_ps( _mm_add_ps(Fd,Fr),tabscale);
2258 /* Return force should be multiplied by -rinv to get fscal */
2259 static inline __m128
2260 gmx_mm_int_3_table_coulomb_and_lj_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 c6, __m128 c12,
2261 __m128 *vctot, __m128 *vvdwtot)
2263 __m128 rt,eps,eps2,vcoul,Yc,Fc,Gc,Hc,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
2264 __m128i n0;
2265 int n_a,n_b,n_c;
2267 rt = _mm_mul_ps(r,tabscale);
2268 n0 = _mm_cvttps_epi32(rt);
2269 eps = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
2270 eps2 = _mm_mul_ps(eps,eps);
2272 /* Extract indices from n0 */
2273 n_a = gmx_mm_extract_epi32(n0,0);
2274 n_b = gmx_mm_extract_epi32(n0,1);
2275 n_c = gmx_mm_extract_epi32(n0,2);
2278 Yc = _mm_load_ps(VFtab + 12* n_a);
2279 Fc = _mm_load_ps(VFtab + 12* n_b);
2280 Gc = _mm_load_ps(VFtab + 12* n_c);
2281 Hc = _mm_setzero_ps();
2282 Yd = _mm_load_ps(VFtab + 12* n_a + 4);
2283 Fd = _mm_load_ps(VFtab + 12* n_b + 4);
2284 Gd = _mm_load_ps(VFtab + 12* n_c + 4);
2285 Hd = _mm_setzero_ps();
2286 Yr = _mm_load_ps(VFtab + 12* n_a + 8);
2287 Fr = _mm_load_ps(VFtab + 12* n_b + 8);
2288 Gr = _mm_load_ps(VFtab + 12* n_c + 8);
2289 Hr = _mm_setzero_ps();
2290 _MM_TRANSPOSE4_PS(Yc,Fc,Gc,Hc);
2291 _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
2292 _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);
2293 Hc = _mm_mul_ps(Hc,eps2); /* Heps2 */
2294 Gc = _mm_mul_ps(Gc,eps); /* Geps */
2295 Fc = _mm_add_ps(Fc, _mm_add_ps(Gc,Hc)); /* Fp */
2296 Hd = _mm_mul_ps(Hd,eps2); /* Heps2 */
2297 Gd = _mm_mul_ps(Gd,eps); /* Geps */
2298 Fd = _mm_add_ps(Fd, _mm_add_ps(Gd,Hd)); /* Fp */
2299 Hr = _mm_mul_ps(Hr,eps2); /* Heps2 */
2300 Gr = _mm_mul_ps(Gr,eps); /* Geps */
2301 Fr = _mm_add_ps(Fr, _mm_add_ps(Gr,Hr)); /* Fp */
2303 vcoul = _mm_mul_ps(qq, _mm_add_ps(Yc, _mm_mul_ps(eps,Fc)));
2304 *vctot = _mm_add_ps(*vctot,vcoul);
2306 vvdw6 = _mm_mul_ps(c6, _mm_add_ps(Yd, _mm_mul_ps(eps,Fd)));
2307 vvdw12 = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps,Fr)));
2308 *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6,vvdw12));
2310 Fc = _mm_mul_ps(qq, _mm_add_ps(Fc, _mm_add_ps(Gc, _mm_add_ps(Hc,Hc))));
2311 Fd = _mm_mul_ps(c6, _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd,Hd))));
2312 Fr = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr,Hr))));
2314 return _mm_mul_ps( _mm_add_ps(Fc,_mm_add_ps(Fd,Fr)),tabscale);
2321 /* Return force should be multiplied by -rinv to get fscal */
2322 static inline __m128
2323 gmx_mm_int_2_table_coulomb_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 *vctot)
2325 __m128 rt,eps,eps2,Y,F,G,H,vcoul;
2326 __m128i n0;
2327 int n_a,n_b;
2329 rt = _mm_mul_ps(r,tabscale);
2330 n0 = _mm_cvttps_epi32(rt);
2331 eps = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
2332 eps2 = _mm_mul_ps(eps,eps);
2334 /* Extract indices from n0 */
2335 n_a = gmx_mm_extract_epi32(n0,0);
2336 n_b = gmx_mm_extract_epi32(n0,1);
2337 Y = _mm_load_ps(VFtab + 4* n_a);
2338 F = _mm_load_ps(VFtab + 4* n_b);
2339 G = _mm_setzero_ps();
2340 H = _mm_setzero_ps();
2341 _MM_TRANSPOSE4_PS(Y,F,G,H);
2342 H = _mm_mul_ps(H,eps2); /* Heps2 */
2343 G = _mm_mul_ps(G,eps); /* Geps */
2344 F = _mm_add_ps(F, _mm_add_ps(G,H)); /* Fp */
2345 vcoul = _mm_mul_ps(qq, _mm_add_ps(Y, _mm_mul_ps(eps,F)));
2346 *vctot = _mm_add_ps(*vctot,vcoul);
2348 F = _mm_mul_ps(qq, _mm_add_ps(F, _mm_add_ps(G, _mm_add_ps(H,H))));
2350 return _mm_mul_ps(F,tabscale);
2355 /* Return force should be multiplied by -rinv to get fscal */
2356 static inline __m128
2357 gmx_mm_int_2_table_lj_ps(__m128 r, __m128 tabscale, float * VFtab, int offset, __m128 c6, __m128 c12, __m128 *vvdwtot)
2359 __m128 rt,eps,eps2,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
2360 __m128i n0;
2361 int n_a,n_b;
2363 rt = _mm_mul_ps(r,tabscale);
2364 n0 = _mm_cvttps_epi32(rt);
2365 eps = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
2366 eps2 = _mm_mul_ps(eps,eps);
2368 /* Extract indices from n0 */
2369 n_a = gmx_mm_extract_epi32(n0,0);
2370 n_b = gmx_mm_extract_epi32(n0,1);
2372 /* For a few cases, like TIP4p waters, there are particles with LJ-only interactions in a loop where
2373 * the table data might contain both coulomb and LJ. To handle this case, we use an offset value of 0
2374 * if the data is an LJ-only table, and 1 if it is actually a mixed coul+lj table.
2376 Yd = _mm_load_ps(VFtab + 4*(offset+2)* n_a + offset);
2377 Fd = _mm_load_ps(VFtab + 4*(offset+2)* n_b + offset);
2378 Gd = _mm_setzero_ps();
2379 Hd = _mm_setzero_ps();
2380 Yr = _mm_load_ps(VFtab + 4*(offset+2)* n_a + offset + 4);
2381 Fr = _mm_load_ps(VFtab + 4*(offset+2)* n_b + offset + 4);
2382 Gr = _mm_setzero_ps();
2383 Hr = _mm_setzero_ps();
2384 _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
2385 _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);
2386 Hd = _mm_mul_ps(Hd,eps2); /* Heps2 */
2387 Gd = _mm_mul_ps(Gd,eps); /* Geps */
2388 Fd = _mm_add_ps(Fd, _mm_add_ps(Gd,Hd)); /* Fp */
2389 Hr = _mm_mul_ps(Hr,eps2); /* Heps2 */
2390 Gr = _mm_mul_ps(Gr,eps); /* Geps */
2391 Fr = _mm_add_ps(Fr, _mm_add_ps(Gr,Hr)); /* Fp */
2392 vvdw6 = _mm_mul_ps(c6, _mm_add_ps(Yd, _mm_mul_ps(eps,Fd)));
2393 vvdw12 = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps,Fr)));
2394 *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6,vvdw12));
2396 Fd = _mm_mul_ps(c6, _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd,Hd))));
2397 Fr = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr,Hr))));
2399 return _mm_mul_ps( _mm_add_ps(Fd,Fr),tabscale);
2403 /* Return force should be multiplied by -rinv to get fscal */
2404 static inline __m128
2405 gmx_mm_int_2_table_coulomb_and_lj_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 c6, __m128 c12,
2406 __m128 *vctot, __m128 *vvdwtot)
2408 __m128 rt,eps,eps2,vcoul,Yc,Fc,Gc,Hc,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
2409 __m128i n0;
2410 int n_a,n_b;
2412 rt = _mm_mul_ps(r,tabscale);
2413 n0 = _mm_cvttps_epi32(rt);
2414 eps = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
2415 eps2 = _mm_mul_ps(eps,eps);
2417 /* Extract indices from n0 */
2418 n_a = gmx_mm_extract_epi32(n0,0);
2419 n_b = gmx_mm_extract_epi32(n0,1);
2421 Yc = _mm_load_ps(VFtab + 12* n_a);
2422 Fc = _mm_load_ps(VFtab + 12* n_b);
2423 Gc = _mm_setzero_ps();
2424 Hc = _mm_setzero_ps();
2425 Yd = _mm_load_ps(VFtab + 12* n_a + 4);
2426 Fd = _mm_load_ps(VFtab + 12* n_b + 4);
2427 Gd = _mm_setzero_ps();
2428 Hd = _mm_setzero_ps();
2429 Yr = _mm_load_ps(VFtab + 12* n_a + 8);
2430 Fr = _mm_load_ps(VFtab + 12* n_b + 8);
2431 Gr = _mm_setzero_ps();
2432 Hr = _mm_setzero_ps();
2433 _MM_TRANSPOSE4_PS(Yc,Fc,Gc,Hc);
2434 _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
2435 _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);
2436 Hc = _mm_mul_ps(Hc,eps2); /* Heps2 */
2437 Gc = _mm_mul_ps(Gc,eps); /* Geps */
2438 Fc = _mm_add_ps(Fc, _mm_add_ps(Gc,Hc)); /* Fp */
2439 Hd = _mm_mul_ps(Hd,eps2); /* Heps2 */
2440 Gd = _mm_mul_ps(Gd,eps); /* Geps */
2441 Fd = _mm_add_ps(Fd, _mm_add_ps(Gd,Hd)); /* Fp */
2442 Hr = _mm_mul_ps(Hr,eps2); /* Heps2 */
2443 Gr = _mm_mul_ps(Gr,eps); /* Geps */
2444 Fr = _mm_add_ps(Fr, _mm_add_ps(Gr,Hr)); /* Fp */
2446 vcoul = _mm_mul_ps(qq, _mm_add_ps(Yc, _mm_mul_ps(eps,Fc)));
2447 *vctot = _mm_add_ps(*vctot,vcoul);
2449 vvdw6 = _mm_mul_ps(c6, _mm_add_ps(Yd, _mm_mul_ps(eps,Fd)));
2450 vvdw12 = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps,Fr)));
2451 *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6,vvdw12));
2453 Fc = _mm_mul_ps(qq, _mm_add_ps(Fc, _mm_add_ps(Gc, _mm_add_ps(Hc,Hc))));
2454 Fd = _mm_mul_ps(c6, _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd,Hd))));
2455 Fr = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr,Hr))));
2457 return _mm_mul_ps( _mm_add_ps(Fc,_mm_add_ps(Fd,Fr)),tabscale);
2463 /* Return force should be multiplied by -rinv to get fscal */
2464 static inline __m128
2465 gmx_mm_int_1_table_coulomb_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 *vctot)
2467 __m128 rt,eps,eps2,Y,F,G,H,vcoul;
2468 __m128i n0;
2469 int n_a;
2471 rt = _mm_mul_ps(r,tabscale);
2472 n0 = _mm_cvttps_epi32(rt);
2473 eps = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
2474 eps2 = _mm_mul_ps(eps,eps);
2476 /* Extract indices from n0 */
2477 n_a = gmx_mm_extract_epi32(n0,0);
2478 Y = _mm_load_ps(VFtab + 4* n_a);
2479 F = _mm_setzero_ps();
2480 G = _mm_setzero_ps();
2481 H = _mm_setzero_ps();
2482 _MM_TRANSPOSE4_PS(Y,F,G,H);
2483 H = _mm_mul_ps(H,eps2); /* Heps2 */
2484 G = _mm_mul_ps(G,eps); /* Geps */
2485 F = _mm_add_ps(F, _mm_add_ps(G,H)); /* Fp */
2486 vcoul = _mm_mul_ps(qq, _mm_add_ps(Y, _mm_mul_ps(eps,F)));
2487 *vctot = _mm_add_ps(*vctot,vcoul);
2489 F = _mm_mul_ps(qq, _mm_add_ps(F, _mm_add_ps(G, _mm_add_ps(H,H))));
2491 return _mm_mul_ps(F,tabscale);
2496 /* Return force should be multiplied by -rinv to get fscal */
2497 static inline __m128
2498 gmx_mm_int_1_table_lj_ps(__m128 r, __m128 tabscale, float * VFtab, int offset, __m128 c6, __m128 c12, __m128 *vvdwtot)
2500 __m128 rt,eps,eps2,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
2501 __m128i n0;
2502 int n_a;
2504 rt = _mm_mul_ps(r,tabscale);
2505 n0 = _mm_cvttps_epi32(rt);
2506 eps = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
2507 eps2 = _mm_mul_ps(eps,eps);
2509 /* Extract indices from n0 */
2510 n_a = gmx_mm_extract_epi32(n0,0);
2512 /* For a few cases, like TIP4p waters, there are particles with LJ-only interactions in a loop where
2513 * the table data might contain both coulomb and LJ. To handle this case, we use an offset value of 0
2514 * if the data is an LJ-only table, and 1 if it is actually a mixed coul+lj table.
2516 Yd = _mm_load_ps(VFtab + 4*(offset+2)* n_a + offset);
2517 Fd = _mm_setzero_ps();
2518 Gd = _mm_setzero_ps();
2519 Hd = _mm_setzero_ps();
2520 Yr = _mm_load_ps(VFtab + 4*(offset+2)* n_a + offset + 4);
2521 Fr = _mm_setzero_ps();
2522 Gr = _mm_setzero_ps();
2523 Hr = _mm_setzero_ps();
2524 _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
2525 _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);
2526 Hd = _mm_mul_ps(Hd,eps2); /* Heps2 */
2527 Gd = _mm_mul_ps(Gd,eps); /* Geps */
2528 Fd = _mm_add_ps(Fd, _mm_add_ps(Gd,Hd)); /* Fp */
2529 Hr = _mm_mul_ps(Hr,eps2); /* Heps2 */
2530 Gr = _mm_mul_ps(Gr,eps); /* Geps */
2531 Fr = _mm_add_ps(Fr, _mm_add_ps(Gr,Hr)); /* Fp */
2532 vvdw6 = _mm_mul_ps(c6, _mm_add_ps(Yd, _mm_mul_ps(eps,Fd)));
2533 vvdw12 = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps,Fr)));
2534 *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6,vvdw12));
2536 Fd = _mm_mul_ps(c6, _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd,Hd))));
2537 Fr = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr,Hr))));
2539 return _mm_mul_ps( _mm_add_ps(Fd,Fr),tabscale);
2543 /* Return force should be multiplied by -rinv to get fscal */
2544 static inline __m128
2545 gmx_mm_int_1_table_coulomb_and_lj_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 c6, __m128 c12,
2546 __m128 *vctot, __m128 *vvdwtot)
2548 __m128 rt,eps,eps2,vcoul,Yc,Fc,Gc,Hc,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
2549 __m128i n0;
2550 int n_a;
2552 rt = _mm_mul_ps(r,tabscale);
2553 n0 = _mm_cvttps_epi32(rt);
2554 eps = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
2555 eps2 = _mm_mul_ps(eps,eps);
2557 /* Extract indices from n0 */
2558 n_a = gmx_mm_extract_epi32(n0,0);
2560 Yc = _mm_load_ps(VFtab + 12* n_a);
2561 Fc = _mm_setzero_ps();
2562 Gc = _mm_setzero_ps();
2563 Hc = _mm_setzero_ps();
2564 Yd = _mm_load_ps(VFtab + 12* n_a + 4);
2565 Fd = _mm_setzero_ps();
2566 Gd = _mm_setzero_ps();
2567 Hd = _mm_setzero_ps();
2568 Yr = _mm_load_ps(VFtab + 12* n_a + 8);
2569 Fr = _mm_setzero_ps();
2570 Gr = _mm_setzero_ps();
2571 Hr = _mm_setzero_ps();
2572 _MM_TRANSPOSE4_PS(Yc,Fc,Gc,Hc);
2573 _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
2574 _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);
2575 Hc = _mm_mul_ps(Hc,eps2); /* Heps2 */
2576 Gc = _mm_mul_ps(Gc,eps); /* Geps */
2577 Fc = _mm_add_ps(Fc, _mm_add_ps(Gc,Hc)); /* Fp */
2578 Hd = _mm_mul_ps(Hd,eps2); /* Heps2 */
2579 Gd = _mm_mul_ps(Gd,eps); /* Geps */
2580 Fd = _mm_add_ps(Fd, _mm_add_ps(Gd,Hd)); /* Fp */
2581 Hr = _mm_mul_ps(Hr,eps2); /* Heps2 */
2582 Gr = _mm_mul_ps(Gr,eps); /* Geps */
2583 Fr = _mm_add_ps(Fr, _mm_add_ps(Gr,Hr)); /* Fp */
2585 vcoul = _mm_mul_ps(qq, _mm_add_ps(Yc, _mm_mul_ps(eps,Fc)));
2586 *vctot = _mm_add_ps(*vctot,vcoul);
2588 vvdw6 = _mm_mul_ps(c6, _mm_add_ps(Yd, _mm_mul_ps(eps,Fd)));
2589 vvdw12 = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps,Fr)));
2590 *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6,vvdw12));
2592 Fc = _mm_mul_ps(qq, _mm_add_ps(Fc, _mm_add_ps(Gc, _mm_add_ps(Hc,Hc))));
2593 Fd = _mm_mul_ps(c6, _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd,Hd))));
2594 Fr = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr,Hr))));
2596 return _mm_mul_ps( _mm_add_ps(Fc,_mm_add_ps(Fd,Fr)),tabscale);
2603 /* Return force should be multiplied by +rinv to get fscal */
2604 static inline __m128
2605 gmx_mm_int_4_genborn_ps(__m128 r, __m128 isai,
2606 float * isaj1, float *isaj2, float *isaj3, float *isaj4,
2607 __m128 gbtabscale, float * GBtab, __m128 qq, __m128 *dvdasum,
2608 float *dvdaj1, float *dvdaj2, float *dvdaj3, float *dvdaj4,
2609 __m128 *vgbtot)
2611 const __m128 half = {0.5,0.5,0.5,0.5};
2613 __m128 rt,eps,eps2,Y,F,G,H,VV,FF,ftmp,isaprod,t2,t3,t4,isaj,vgb,dvdatmp;
2614 __m128i n0;
2615 int n_a,n_b,n_c,n_d;
2617 /* Assemble isaj */
2618 isaj = _mm_load_ss(isaj1);
2619 t2 = _mm_load_ss(isaj2);
2620 t3 = _mm_load_ss(isaj3);
2621 t4 = _mm_load_ss(isaj4);
2622 isaj = _mm_unpacklo_ps(isaj,t2); /* - - t2 t1 */
2623 t3 = _mm_unpacklo_ps(t3,t4); /* - - t4 t3 */
2624 isaj = _mm_movelh_ps(isaj,t3); /* t4 t3 t2 t1 */
2626 isaprod = _mm_mul_ps(isai,isaj);
2627 qq = _mm_mul_ps(qq,isaprod);
2628 gbtabscale = _mm_mul_ps( isaprod, gbtabscale );
2630 rt = _mm_mul_ps(r,gbtabscale);
2631 n0 = _mm_cvttps_epi32(rt);
2632 eps = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
2633 eps2 = _mm_mul_ps(eps,eps);
2635 /* Extract indices from n0 */
2636 n_a = gmx_mm_extract_epi32(n0,0);
2637 n_b = gmx_mm_extract_epi32(n0,1);
2638 n_c = gmx_mm_extract_epi32(n0,2);
2639 n_d = gmx_mm_extract_epi32(n0,3);
2640 Y = _mm_load_ps(GBtab + 4* n_a);
2641 F = _mm_load_ps(GBtab + 4* n_b);
2642 G = _mm_load_ps(GBtab + 4* n_c);
2643 H = _mm_load_ps(GBtab + 4* n_d);
2644 _MM_TRANSPOSE4_PS(Y,F,G,H);
2645 G = _mm_mul_ps(G,eps); /* Geps */
2646 H = _mm_mul_ps(H,eps2); /* Heps2 */
2647 F = _mm_add_ps(_mm_add_ps(F,G),H); /* Fp */
2649 VV = _mm_add_ps(Y, _mm_mul_ps(eps,F));
2650 FF = _mm_add_ps(_mm_add_ps(F,G), _mm_add_ps(H,H));
2652 vgb = _mm_mul_ps(qq, VV);
2653 *vgbtot = _mm_sub_ps(*vgbtot,vgb); /* Yes, the sign is correct */
2655 ftmp = _mm_mul_ps(_mm_mul_ps(qq, FF), gbtabscale);
2657 dvdatmp = _mm_mul_ps(half, _mm_add_ps(vgb,_mm_mul_ps(ftmp,r)));
2659 *dvdasum = _mm_add_ps(*dvdasum,dvdatmp);
2661 dvdatmp = _mm_mul_ps(_mm_mul_ps(dvdatmp,isaj), isaj);
2663 /* Update 4 dada[j] values */
2664 Y = _mm_load_ss(dvdaj1);
2665 F = _mm_load_ss(dvdaj2);
2666 G = _mm_load_ss(dvdaj3);
2667 H = _mm_load_ss(dvdaj4);
2668 t3 = _mm_movehl_ps(_mm_setzero_ps(),dvdatmp);
2669 t2 = _mm_shuffle_ps(dvdatmp,dvdatmp,_MM_SHUFFLE(0,0,0,1));
2670 t4 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(0,0,0,1));
2672 _mm_store_ss( dvdaj1 , _mm_add_ss( Y, dvdatmp ) );
2673 _mm_store_ss( dvdaj2 , _mm_add_ss( F, t2 ) );
2674 _mm_store_ss( dvdaj3 , _mm_add_ss( G, t3 ) );
2675 _mm_store_ss( dvdaj4 , _mm_add_ss( H, t4 ) );
2677 return ftmp;
2682 /* Return force should be multiplied by +rinv to get fscal */
2683 static inline __m128
2684 gmx_mm_int_3_genborn_ps(__m128 r, __m128 isai,
2685 float * isaj1, float *isaj2, float *isaj3,
2686 __m128 gbtabscale, float * GBtab, __m128 qq, __m128 *dvdasum,
2687 float *dvdaj1, float *dvdaj2, float *dvdaj3,
2688 __m128 *vgbtot)
2690 const __m128 half = {0.5,0.5,0.5,0.5};
2692 __m128 rt,eps,eps2,Y,F,G,H,VV,FF,ftmp,isaprod,t2,t3,t4,isaj,vgb,dvdatmp;
2693 __m128i n0;
2694 int n_a,n_b,n_c,n_d;
2696 /* Assemble isaj */
2697 isaj = _mm_load_ss(isaj1);
2698 t2 = _mm_load_ss(isaj2);
2699 t3 = _mm_load_ss(isaj3);
2700 isaj = _mm_unpacklo_ps(isaj,t2); /* - - t2 t1 */
2701 t3 = _mm_unpacklo_ps(t3,t3); /* - - t3 t3 */
2702 isaj = _mm_movelh_ps(isaj,t3); /* t3 t3 t2 t1 */
2704 isaprod = _mm_mul_ps(isai,isaj);
2705 qq = _mm_mul_ps(qq,isaprod);
2706 gbtabscale = _mm_mul_ps( isaprod, gbtabscale );
2708 rt = _mm_mul_ps(r,gbtabscale);
2709 n0 = _mm_cvttps_epi32(rt);
2710 eps = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
2711 eps2 = _mm_mul_ps(eps,eps);
2713 /* Extract indices from n0 */
2714 n_a = gmx_mm_extract_epi32(n0,0);
2715 n_b = gmx_mm_extract_epi32(n0,1);
2716 n_c = gmx_mm_extract_epi32(n0,2);
2717 Y = _mm_load_ps(GBtab + 4* n_a);
2718 F = _mm_load_ps(GBtab + 4* n_b);
2719 G = _mm_load_ps(GBtab + 4* n_c);
2720 H = _mm_setzero_ps();
2721 _MM_TRANSPOSE4_PS(Y,F,G,H);
2722 G = _mm_mul_ps(G,eps); /* Geps */
2723 H = _mm_mul_ps(H,eps2); /* Heps2 */
2724 F = _mm_add_ps(_mm_add_ps(F,G),H); /* Fp */
2726 VV = _mm_add_ps(Y, _mm_mul_ps(eps,F));
2727 FF = _mm_add_ps(_mm_add_ps(F,G), _mm_add_ps(H,H));
2729 vgb = _mm_mul_ps(qq, VV);
2730 *vgbtot = _mm_sub_ps(*vgbtot,vgb); /* Yes, the sign is correct */
2732 ftmp = _mm_mul_ps(_mm_mul_ps(qq, FF), gbtabscale);
2734 dvdatmp = _mm_mul_ps(half, _mm_add_ps(vgb,_mm_mul_ps(ftmp,r)));
2736 *dvdasum = _mm_add_ps(*dvdasum,dvdatmp);
2738 dvdatmp = _mm_mul_ps(_mm_mul_ps(dvdatmp,isaj), isaj);
2740 /* Update 3 dada[j] values */
2741 Y = _mm_load_ss(dvdaj1);
2742 F = _mm_load_ss(dvdaj2);
2743 G = _mm_load_ss(dvdaj3);
2744 t3 = _mm_movehl_ps(_mm_setzero_ps(),dvdatmp);
2745 t2 = _mm_shuffle_ps(dvdatmp,dvdatmp,_MM_SHUFFLE(0,0,0,1));
2747 _mm_store_ss( dvdaj1 , _mm_add_ss( Y, dvdatmp ) );
2748 _mm_store_ss( dvdaj2 , _mm_add_ss( F, t2 ) );
2749 _mm_store_ss( dvdaj3 , _mm_add_ss( G, t3 ) );
2751 return ftmp;
2757 /* Return force should be multiplied by +rinv to get fscal */
2758 static inline __m128
2759 gmx_mm_int_2_genborn_ps(__m128 r, __m128 isai,
2760 float * isaj1, float *isaj2,
2761 __m128 gbtabscale, float * GBtab, __m128 qq, __m128 *dvdasum,
2762 float *dvdaj1, float *dvdaj2,
2763 __m128 *vgbtot)
2765 const __m128 half = {0.5,0.5,0.5,0.5};
2767 __m128 rt,eps,eps2,Y,F,G,H,VV,FF,ftmp,isaprod,t2,t3,t4,isaj,vgb,dvdatmp;
2768 __m128i n0;
2769 int n_a,n_b,n_c,n_d;
2771 /* Assemble isaj */
2772 isaj = _mm_load_ss(isaj1);
2773 t2 = _mm_load_ss(isaj2);
2774 isaj = _mm_unpacklo_ps(isaj,t2); /* - - t2 t1 */
2776 isaprod = _mm_mul_ps(isai,isaj);
2777 qq = _mm_mul_ps(qq,isaprod);
2778 gbtabscale = _mm_mul_ps( isaprod, gbtabscale );
2780 rt = _mm_mul_ps(r,gbtabscale);
2781 n0 = _mm_cvttps_epi32(rt);
2782 eps = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
2783 eps2 = _mm_mul_ps(eps,eps);
2785 /* Extract indices from n0 */
2786 n_a = gmx_mm_extract_epi32(n0,0);
2787 n_b = gmx_mm_extract_epi32(n0,1);
2788 Y = _mm_load_ps(GBtab + 4* n_a);
2789 F = _mm_load_ps(GBtab + 4* n_b);
2790 G = _mm_setzero_ps();
2791 H = _mm_setzero_ps();
2792 _MM_TRANSPOSE4_PS(Y,F,G,H);
2793 G = _mm_mul_ps(G,eps); /* Geps */
2794 H = _mm_mul_ps(H,eps2); /* Heps2 */
2795 F = _mm_add_ps(_mm_add_ps(F,G),H); /* Fp */
2797 VV = _mm_add_ps(Y, _mm_mul_ps(eps,F));
2798 FF = _mm_add_ps(_mm_add_ps(F,G), _mm_add_ps(H,H));
2800 vgb = _mm_mul_ps(qq, VV);
2801 *vgbtot = _mm_sub_ps(*vgbtot,vgb); /* Yes, the sign is correct */
2803 ftmp = _mm_mul_ps(_mm_mul_ps(qq, FF), gbtabscale);
2805 dvdatmp = _mm_mul_ps(half, _mm_add_ps(vgb,_mm_mul_ps(ftmp,r)));
2807 *dvdasum = _mm_add_ps(*dvdasum,dvdatmp);
2809 dvdatmp = _mm_mul_ps(_mm_mul_ps(dvdatmp,isaj), isaj);
2811 /* Update 2 dada[j] values */
2812 Y = _mm_load_ss(dvdaj1);
2813 F = _mm_load_ss(dvdaj2);
2814 t2 = _mm_shuffle_ps(dvdatmp,dvdatmp,_MM_SHUFFLE(0,0,0,1));
2816 _mm_store_ss( dvdaj1 , _mm_add_ss( Y, dvdatmp ) );
2817 _mm_store_ss( dvdaj2 , _mm_add_ss( F, t2 ) );
2819 return ftmp;
2822 /* Return force should be multiplied by +rinv to get fscal */
2823 static inline __m128
2824 gmx_mm_int_1_genborn_ps(__m128 r, __m128 isai,
2825 float * isaj1,
2826 __m128 gbtabscale, float * GBtab, __m128 qq, __m128 *dvdasum,
2827 float *dvdaj1,
2828 __m128 *vgbtot)
2830 const __m128 half = {0.5,0.5,0.5,0.5};
2832 __m128 rt,eps,eps2,Y,F,G,H,VV,FF,ftmp,isaprod,t2,t3,t4,isaj,vgb,dvdatmp;
2833 __m128i n0;
2834 int n_a,n_b,n_c,n_d;
2836 /* Assemble isaj */
2837 isaj = _mm_load_ss(isaj1);
2839 isaprod = _mm_mul_ps(isai,isaj);
2840 qq = _mm_mul_ps(qq,isaprod);
2841 gbtabscale = _mm_mul_ps( isaprod, gbtabscale );
2843 rt = _mm_mul_ps(r,gbtabscale);
2844 n0 = _mm_cvttps_epi32(rt);
2845 eps = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
2846 eps2 = _mm_mul_ps(eps,eps);
2848 /* Extract indices from n0 */
2849 n_a = gmx_mm_extract_epi32(n0,0);
2850 Y = _mm_load_ps(GBtab + 4* n_a);
2851 F = _mm_setzero_ps();
2852 G = _mm_setzero_ps();
2853 H = _mm_setzero_ps();
2854 _MM_TRANSPOSE4_PS(Y,F,G,H);
2855 G = _mm_mul_ps(G,eps); /* Geps */
2856 H = _mm_mul_ps(H,eps2); /* Heps2 */
2857 F = _mm_add_ps(_mm_add_ps(F,G),H); /* Fp */
2859 VV = _mm_add_ps(Y, _mm_mul_ps(eps,F));
2860 FF = _mm_add_ps(_mm_add_ps(F,G), _mm_add_ps(H,H));
2862 vgb = _mm_mul_ps(qq, VV);
2863 *vgbtot = _mm_sub_ps(*vgbtot,vgb); /* Yes, the sign is correct */
2865 ftmp = _mm_mul_ps(_mm_mul_ps(qq, FF), gbtabscale);
2867 dvdatmp = _mm_mul_ps(half, _mm_add_ps(vgb,_mm_mul_ps(ftmp,r)));
2869 *dvdasum = _mm_add_ps(*dvdasum,dvdatmp);
2871 dvdatmp = _mm_mul_ps(_mm_mul_ps(dvdatmp,isaj), isaj);
2873 /* Update 1 dada[j] values */
2874 Y = _mm_load_ss(dvdaj1);
2876 _mm_store_ss( dvdaj1 , _mm_add_ss( Y, dvdatmp ) );
2878 return ftmp;
2885 static inline void
2886 gmx_mm_update_iforce_1atom_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
2887 float *fptr,
2888 float *fshiftptr)
2890 __m128 t1,t2,t3;
2892 #ifdef GMX_SSE3
2893 fix1 = _mm_hadd_ps(fix1,fix1);
2894 fiy1 = _mm_hadd_ps(fiy1,fiz1);
2896 fix1 = _mm_hadd_ps(fix1,fiy1); /* fiz1 fiy1 fix1 fix1 */
2897 #else
2898 /* SSE2 */
2899 /* transpose data */
2900 t1 = fix1;
2901 _MM_TRANSPOSE4_PS(fix1,t1,fiy1,fiz1);
2902 fix1 = _mm_add_ps(_mm_add_ps(fix1,t1), _mm_add_ps(fiy1,fiz1));
2903 #endif
2904 t2 = _mm_load_ss(fptr);
2905 t2 = _mm_loadh_pi(t2,(__m64 *)(fptr+1));
2906 t3 = _mm_load_ss(fshiftptr);
2907 t3 = _mm_loadh_pi(t3,(__m64 *)(fshiftptr+1));
2909 t2 = _mm_add_ps(t2,fix1);
2910 t3 = _mm_add_ps(t3,fix1);
2912 _mm_store_ss(fptr,t2);
2913 _mm_storeh_pi((__m64 *)(fptr+1),t2);
2914 _mm_store_ss(fshiftptr,t3);
2915 _mm_storeh_pi((__m64 *)(fshiftptr+1),t3);
2918 static inline void
2919 gmx_mm_update_iforce_2atoms_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
2920 __m128 fix2, __m128 fiy2, __m128 fiz2,
2921 float *fptr,
2922 float *fshiftptr)
2924 __m128 t1,t2,t4;
2926 #ifdef GMX_SSE3
2927 fix1 = _mm_hadd_ps(fix1,fiy1);
2928 fiz1 = _mm_hadd_ps(fiz1,fix2);
2929 fiy2 = _mm_hadd_ps(fiy2,fiz2);
2931 fix1 = _mm_hadd_ps(fix1,fiz1); /* fix2 fiz1 fiy1 fix1 */
2932 fiy2 = _mm_hadd_ps(fiy2,fiy2); /* - - fiz2 fiy2 */
2933 #else
2934 /* SSE2 */
2935 /* transpose data */
2936 _MM_TRANSPOSE4_PS(fix1,fiy1,fiz1,fix2);
2937 t1 = _mm_unpacklo_ps(fiy2,fiz2);
2938 t2 = _mm_unpackhi_ps(fiy2,fiz2);
2940 fix1 = _mm_add_ps(_mm_add_ps(fix1,fiy1), _mm_add_ps(fiz1,fix2));
2941 t1 = _mm_add_ps(t1,t2);
2942 t2 = _mm_movehl_ps(t2,t1);
2943 fiy2 = _mm_add_ps(t1,t2);
2944 #endif
2945 _mm_storeu_ps(fptr, _mm_add_ps(fix1,_mm_loadu_ps(fptr) ));
2946 t1 = _mm_loadl_pi(t1,(__m64 *)(fptr+4));
2947 _mm_storel_pi((__m64 *)(fptr+4), _mm_add_ps(fiy2,t1));
2949 t4 = _mm_load_ss(fshiftptr+2);
2950 t4 = _mm_loadh_pi(t4,(__m64 *)(fshiftptr));
2952 t1 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,2)); /* fiy2 - fix2 fiz1 */
2953 t1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(3,1,0,0)); /* fiy2 fix2 - fiz1 */
2954 t2 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(1,0,0,1)); /* fiy1 fix1 - fiz2 */
2956 t1 = _mm_add_ps(t1,t2);
2957 t1 = _mm_add_ps(t1,t4); /* y x - z */
2959 _mm_store_ss(fshiftptr+2,t1);
2960 _mm_storeh_pi((__m64 *)(fshiftptr),t1);
2965 static inline void
2966 gmx_mm_update_iforce_3atoms_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
2967 __m128 fix2, __m128 fiy2, __m128 fiz2,
2968 __m128 fix3, __m128 fiy3, __m128 fiz3,
2969 float *fptr,
2970 float *fshiftptr)
2972 __m128 t1,t2,t3,t4;
2974 #ifdef GMX_SSE3
2975 fix1 = _mm_hadd_ps(fix1,fiy1);
2976 fiz1 = _mm_hadd_ps(fiz1,fix2);
2977 fiy2 = _mm_hadd_ps(fiy2,fiz2);
2978 fix3 = _mm_hadd_ps(fix3,fiy3);
2979 fiz3 = _mm_hadd_ps(fiz3,fiz3);
2981 fix1 = _mm_hadd_ps(fix1,fiz1); /* fix2 fiz1 fiy1 fix1 */
2982 fiy2 = _mm_hadd_ps(fiy2,fix3); /* fiy3 fix3 fiz2 fiy2 */
2983 fiz3 = _mm_hadd_ps(fiz3,fiz3); /* - - - fiz3 */
2984 #else
2985 /* SSE2 */
2986 /* transpose data */
2987 _MM_TRANSPOSE4_PS(fix1,fiy1,fiz1,fix2);
2988 _MM_TRANSPOSE4_PS(fiy2,fiz2,fix3,fiy3);
2989 t2 = _mm_movehl_ps(_mm_setzero_ps(),fiz3);
2990 t1 = _mm_shuffle_ps(fiz3,fiz3,_MM_SHUFFLE(0,0,0,1));
2991 t3 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(0,0,0,1));
2993 fix1 = _mm_add_ps(_mm_add_ps(fix1,fiy1), _mm_add_ps(fiz1,fix2));
2994 fiy2 = _mm_add_ps(_mm_add_ps(fiy2,fiz2), _mm_add_ps(fix3,fiy3));
2995 fiz3 = _mm_add_ss(_mm_add_ps(fiz3,t1) , _mm_add_ps(t2,t3));
2996 #endif
2997 _mm_storeu_ps(fptr, _mm_add_ps(fix1,_mm_loadu_ps(fptr) ));
2998 _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));
2999 _mm_store_ss (fptr+8,_mm_add_ss(fiz3,_mm_load_ss(fptr+8) ));
3001 t4 = _mm_load_ss(fshiftptr+2);
3002 t4 = _mm_loadh_pi(t4,(__m64 *)(fshiftptr));
3004 t1 = _mm_shuffle_ps(fiz3,fix1,_MM_SHUFFLE(1,0,0,0)); /* fiy1 fix1 - fiz3 */
3005 t2 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(3,2,2,2)); /* fiy3 fix3 - fiz1 */
3006 t3 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(3,3,0,1)); /* fix2 fix2 fiy2 fiz2 */
3007 t3 = _mm_shuffle_ps(t3 ,t3 ,_MM_SHUFFLE(1,2,0,0)); /* fiy2 fix2 - fiz2 */
3009 t1 = _mm_add_ps(t1,t2);
3010 t3 = _mm_add_ps(t3,t4);
3011 t1 = _mm_add_ps(t1,t3); /* y x - z */
3013 _mm_store_ss(fshiftptr+2,t1);
3014 _mm_storeh_pi((__m64 *)(fshiftptr),t1);
3018 static inline void
3019 gmx_mm_update_iforce_4atoms_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
3020 __m128 fix2, __m128 fiy2, __m128 fiz2,
3021 __m128 fix3, __m128 fiy3, __m128 fiz3,
3022 __m128 fix4, __m128 fiy4, __m128 fiz4,
3023 float *fptr,
3024 float *fshiftptr)
3026 __m128 t1,t2,t3,t4,t5;
3028 #ifdef GMX_SSE3
3029 fix1 = _mm_hadd_ps(fix1,fiy1);
3030 fiz1 = _mm_hadd_ps(fiz1,fix2);
3031 fiy2 = _mm_hadd_ps(fiy2,fiz2);
3032 fix3 = _mm_hadd_ps(fix3,fiy3);
3033 fiz3 = _mm_hadd_ps(fiz3,fix4);
3034 fiy4 = _mm_hadd_ps(fiy4,fiz4);
3036 fix1 = _mm_hadd_ps(fix1,fiz1); /* fix2 fiz1 fiy1 fix1 */
3037 fiy2 = _mm_hadd_ps(fiy2,fix3); /* fiy3 fix3 fiz2 fiy2 */
3038 fiz3 = _mm_hadd_ps(fiz3,fiy4); /* fiz4 fiy4 fix4 fiz3 */
3039 #else
3040 /* SSE2 */
3041 /* transpose data */
3042 _MM_TRANSPOSE4_PS(fix1,fiy1,fiz1,fix2);
3043 _MM_TRANSPOSE4_PS(fiy2,fiz2,fix3,fiy3);
3044 _MM_TRANSPOSE4_PS(fiz3,fix4,fiy4,fiz4);
3046 fix1 = _mm_add_ps(_mm_add_ps(fix1,fiy1), _mm_add_ps(fiz1,fix2));
3047 fiy2 = _mm_add_ps(_mm_add_ps(fiy2,fiz2), _mm_add_ps(fix3,fiy3));
3048 fiz3 = _mm_add_ps(_mm_add_ps(fiz3,fix4), _mm_add_ps(fiy4,fiz4));
3049 #endif
3050 _mm_storeu_ps(fptr, _mm_add_ps(fix1,_mm_loadu_ps(fptr) ));
3051 _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));
3052 _mm_storeu_ps(fptr+8,_mm_add_ps(fiz3,_mm_loadu_ps(fptr+8)));
3054 t5 = _mm_load_ss(fshiftptr+2);
3055 t5 = _mm_loadh_pi(t5,(__m64 *)(fshiftptr));
3057 t1 = _mm_shuffle_ps(fix1,fix1,_MM_SHUFFLE(1,0,2,2)); /* fiy1 fix1 - fiz1 */
3058 t2 = _mm_shuffle_ps(fiy2,fiy2,_MM_SHUFFLE(3,2,1,1)); /* fiy3 fix3 - fiz2 */
3059 t3 = _mm_shuffle_ps(fiz3,fiz3,_MM_SHUFFLE(2,1,0,0)); /* fiy4 fix4 - fiz3 */
3060 t4 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,3)); /* fiy2 fiy2 fix2 fix2 */
3061 t4 = _mm_shuffle_ps(fiz3,t4 ,_MM_SHUFFLE(2,0,3,3)); /* fiy2 fix2 - fiz4 */
3063 t1 = _mm_add_ps(t1,t2);
3064 t3 = _mm_add_ps(t3,t4);
3065 t1 = _mm_add_ps(t1,t3); /* y x - z */
3066 t5 = _mm_add_ps(t5,t1);
3068 _mm_store_ss(fshiftptr+2,t5);
3069 _mm_storeh_pi((__m64 *)(fshiftptr),t5);
3073 static inline void
3074 gmx_mm_update_1pot_ps(__m128 pot1, float *ptr1)
3076 #ifdef GMX_SSE3
3077 pot1 = _mm_hadd_ps(pot1,pot1);
3078 pot1 = _mm_hadd_ps(pot1,pot1);
3079 #else
3080 /* SSE2 */
3081 pot1 = _mm_add_ps(pot1,_mm_movehl_ps(pot1,pot1));
3082 pot1 = _mm_add_ps(pot1,_mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(0,0,0,1)));
3083 #endif
3084 _mm_store_ss(ptr1,_mm_add_ss(pot1,_mm_load_ss(ptr1)));
3088 static inline void
3089 gmx_mm_update_2pot_ps(__m128 pot1, float *ptr1, __m128 pot2, float *ptr2)
3091 #ifdef GMX_SSE3
3092 pot1 = _mm_hadd_ps(pot1,pot2);
3093 pot1 = _mm_hadd_ps(pot1,pot1);
3094 pot2 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(0,0,0,1));
3095 #else
3096 /* SSE2 */
3097 __m128 t1,t2;
3098 t1 = _mm_movehl_ps(pot2,pot1); /* 2d 2c 1d 1c */
3099 t2 = _mm_movelh_ps(pot1,pot2); /* 2b 2a 1b 1a */
3100 t1 = _mm_add_ps(t1,t2); /* 2 2 1 1 */
3101 t2 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(3,3,1,1));
3102 pot1 = _mm_add_ps(t1,t2); /* - 2 - 1 */
3103 pot2 = _mm_movehl_ps(t2,pot1); /* - - - 2 */
3104 #endif
3106 _mm_store_ss(ptr1,_mm_add_ss(pot1,_mm_load_ss(ptr1)));
3107 _mm_store_ss(ptr2,_mm_add_ss(pot2,_mm_load_ss(ptr2)));
3111 static inline void
3112 gmx_mm_update_4pot_ps(__m128 pot1, float *ptr1, __m128 pot2, float *ptr2, __m128 pot3, float *ptr3, __m128 pot4, float *ptr4)
3114 _MM_TRANSPOSE4_PS(pot1,pot2,pot3,pot4);
3116 pot1 = _mm_add_ps(_mm_add_ps(pot1,pot2),_mm_add_ps(pot3,pot4));
3117 pot2 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(1,1,1,1));
3118 pot3 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(2,2,2,2));
3119 pot4 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(3,3,3,3));
3121 _mm_store_ss(ptr1,_mm_add_ss(pot1,_mm_load_ss(ptr1)));
3122 _mm_store_ss(ptr2,_mm_add_ss(pot2,_mm_load_ss(ptr2)));
3123 _mm_store_ss(ptr3,_mm_add_ss(pot3,_mm_load_ss(ptr3)));
3124 _mm_store_ss(ptr4,_mm_add_ss(pot4,_mm_load_ss(ptr4)));