/*
 * This source code is part of
 *
 *                 G   R   O   M   A   C   S
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2009, The GROMACS Development Team
 *
 * Gromacs is a library for molecular simulation and trajectory analysis,
 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 * a full list of developers and information, check out http://www.gromacs.org
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option) any
 * later version.
 *
 * As a special exception, you may use this file as part of a free software
 * library without restriction. Specifically, if other files instantiate
 * templates or use macros or inline functions from this file, or you compile
 * this file and link it with other files to produce an executable, this
 * file does not by itself cause the resulting executable to be covered by
 * the GNU Lesser General Public License.
 *
 * In plain-speak: do not worry about classes/macros/templates either - only
 * changes to the library have to be LGPL, not an application linking with it.
 *
 * To help fund GROMACS development, we humbly ask that you cite
 * the papers people have written on it - you can find them on the website!
 */
34 /* We require SSE2 now! */
39 #include <xmmintrin.h> /* SSE */
40 #include <emmintrin.h> /* SSE2 */
43 # include <pmmintrin.h> /* SSE3 */
46 # include <smmintrin.h> /* SSE4.1 */
/***************************************************
 *                                                 *
 * COMPILER RANT WARNING:                          *
 *                                                 *
 * Ideally, this header would be filled with       *
 * simple static inline functions. Unfortunately,  *
 * many vendors provide really braindead compilers *
 * that either cannot handle more than 1-2 SSE     *
 * function parameters, or cannot handle pointers  *
 * to SSE __m128 datatypes as parameters at all.   *
 * Thus, for portability we have had to implement  *
 * all but the simplest routines as macros.        *
 *                                                 *
 ***************************************************/
/***************************************************
 *                                                 *
 * Wrappers/replacements for some instructions     *
 * not available in all SSE versions.              *
 *                                                 *
 ***************************************************/
/* The first definition uses the SSE4.1 intrinsic; the second is the SSE2 fallback. */
#  define gmx_mm_extract_epi32(x, imm) _mm_extract_epi32(x,imm)
#  define gmx_mm_extract_epi32(x, imm) _mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm)))
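/*
 * Usage sketch (illustrative only, not part of the original header; the function
 * name is hypothetical): extract the 32-bit integer in lane 2 of an SSE register,
 * independent of whether the SSE4.1 intrinsic or the SSE2 fallback is in effect.
 */
static inline int
gmx_mm_example_extract_lane2(__m128i v)
{
    return gmx_mm_extract_epi32(v, 2);
}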
/*
 * Some compilers require a cast to change the interpretation
 * of a register from FP to int and vice versa, and not all of
 * them provide instructions to do this. Roll our own wrappers...
 */
#if (defined (_MSC_VER) || defined(__INTEL_COMPILER))
#  define gmx_mm_castsi128_ps(a) _mm_castsi128_ps(a)
#  define gmx_mm_castps_si128(a) _mm_castps_si128(a)
#elif defined(__GNUC__)
#  define gmx_mm_castsi128_ps(a) ((__m128)(a))
#  define gmx_mm_castps_si128(a) ((__m128i)(a))
#else
static __m128  gmx_mm_castsi128_ps(__m128i a) { return *(__m128 *)  &a; }
static __m128i gmx_mm_castps_si128(__m128 a)  { return *(__m128i *) &a; }
#endif
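/*
 * Usage sketch (illustrative only; the function name is hypothetical): the cast
 * wrappers are typically used to materialize float constants directly from their
 * IEEE-754 bit patterns, as done for the polynomial coefficients further down.
 * 0x3F800000 is the single-precision bit pattern of 1.0f.
 */
static inline __m128
gmx_mm_example_one_ps(void)
{
    return gmx_mm_castsi128_ps( _mm_set_epi32(0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000) );
}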
/* IO functions, just for debugging */
static void
printxmm(const char *s, __m128 xmm)
{
    float f[4];

    _mm_storeu_ps(f, xmm);
    printf("%s: %8.5g %8.5g %8.5g %8.5g\n", s, f[0], f[1], f[2], f[3]);
}
static void
printxmmsum(const char *s, __m128 xmm)
{
    float f[4];

    _mm_storeu_ps(f, xmm);
    printf("%s (sum): %15.10g\n", s, f[0] + f[1] + f[2] + f[3]);
}
static void
printxmmi(const char *s, __m128i xmmi)
{
    int i[4];

    _mm_storeu_si128((__m128i *)i, xmmi);
    printf("%10s: %2d %2d %2d %2d\n", s, i[0], i[1], i[2], i[3]);
}
/************************
 *                      *
 * Simple math routines *
 *                      *
 ************************/

/* 1.0/sqrt(x): hardware rsqrt estimate refined with one Newton-Raphson step */
static inline __m128
gmx_mm_invsqrt_ps(__m128 x)
{
    const __m128 half  = {0.5, 0.5, 0.5, 0.5};
    const __m128 three = {3.0, 3.0, 3.0, 3.0};

    __m128 lu = _mm_rsqrt_ps(x);

    return _mm_mul_ps(half, _mm_mul_ps(_mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(lu, lu), x)), lu));
}
/* 1.0/x: hardware reciprocal estimate refined with one Newton-Raphson step */
static inline __m128
gmx_mm_inv_ps(__m128 x)
{
    const __m128 two = {2.0f, 2.0f, 2.0f, 2.0f};

    __m128 lu = _mm_rcp_ps(x);

    return _mm_mul_ps(lu, _mm_sub_ps(two, _mm_mul_ps(lu, x)));
}
static inline __m128
gmx_mm_calc_rsq_ps(__m128 dx, __m128 dy, __m128 dz)
{
    return _mm_add_ps( _mm_add_ps( _mm_mul_ps(dx, dx), _mm_mul_ps(dy, dy) ), _mm_mul_ps(dz, dz) );
}
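/*
 * Usage sketch (illustrative only; the function and variable names are hypothetical):
 * the typical nonbonded inner-loop pattern, computing 1/r for four particle pairs
 * from their coordinate differences.
 */
static inline __m128
gmx_mm_example_rinv_ps(__m128 dx, __m128 dy, __m128 dz)
{
    __m128 rsq = gmx_mm_calc_rsq_ps(dx, dy, dz);

    return gmx_mm_invsqrt_ps(rsq);
}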
/* Normal sum of four xmm registers */
static inline __m128
gmx_mm_sum4_ps(__m128 t0, __m128 t1, __m128 t2, __m128 t3)
{
    t0 = _mm_add_ps(t0, t1);
    t2 = _mm_add_ps(t2, t3);
    return _mm_add_ps(t0, t2);
}
/* Natural log: split x into exponent and mantissa, evaluate a polynomial on the
 * mantissa, and scale the combined result by ln(2). */
static inline __m128
gmx_mm_log_ps(__m128 x)
{
    const __m128 exp_ps  = gmx_mm_castsi128_ps( _mm_set_epi32(0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000) );
    const __m128 one_ps  = gmx_mm_castsi128_ps( _mm_set_epi32(0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000) );
    const __m128 off_ps  = gmx_mm_castsi128_ps( _mm_set_epi32(0x3FBF8000, 0x3FBF8000, 0x3FBF8000, 0x3FBF8000) );
    const __m128 mant_ps = gmx_mm_castsi128_ps( _mm_set_epi32(0x007FFFFF, 0x007FFFFF, 0x007FFFFF, 0x007FFFFF) );
    const __m128 base_ps = gmx_mm_castsi128_ps( _mm_set_epi32(0x43800000, 0x43800000, 0x43800000, 0x43800000) );
    const __m128 loge_ps = gmx_mm_castsi128_ps( _mm_set_epi32(0x3F317218, 0x3F317218, 0x3F317218, 0x3F317218) );

    const __m128 D5      = gmx_mm_castsi128_ps( _mm_set_epi32(0xBD0D0CC5, 0xBD0D0CC5, 0xBD0D0CC5, 0xBD0D0CC5) );
    const __m128 D4      = gmx_mm_castsi128_ps( _mm_set_epi32(0x3EA2ECDD, 0x3EA2ECDD, 0x3EA2ECDD, 0x3EA2ECDD) );
    const __m128 D3      = gmx_mm_castsi128_ps( _mm_set_epi32(0xBF9dA2C9, 0xBF9dA2C9, 0xBF9dA2C9, 0xBF9dA2C9) );
    const __m128 D2      = gmx_mm_castsi128_ps( _mm_set_epi32(0x4026537B, 0x4026537B, 0x4026537B, 0x4026537B) );
    const __m128 D1      = gmx_mm_castsi128_ps( _mm_set_epi32(0xC054bFAD, 0xC054bFAD, 0xC054bFAD, 0xC054bFAD) );
    const __m128 D0      = gmx_mm_castsi128_ps( _mm_set_epi32(0x4047691A, 0x4047691A, 0x4047691A, 0x4047691A) );

    __m128 xmm0, xmm1, xmm2;

    xmm0 = x;
    xmm1 = xmm0;
    xmm1 = _mm_and_ps(xmm1, exp_ps);
    xmm1 = gmx_mm_castsi128_ps( _mm_srli_epi32( gmx_mm_castps_si128(xmm1), 8) );

    xmm1 = _mm_or_ps(xmm1, one_ps);
    xmm1 = _mm_sub_ps(xmm1, off_ps);

    xmm1 = _mm_mul_ps(xmm1, base_ps);
    xmm0 = _mm_and_ps(xmm0, mant_ps);
    xmm0 = _mm_or_ps(xmm0, one_ps);

    xmm2 = _mm_mul_ps(xmm0, D5);
    xmm2 = _mm_add_ps(xmm2, D4);
    xmm2 = _mm_mul_ps(xmm2, xmm0);
    xmm2 = _mm_add_ps(xmm2, D3);
    xmm2 = _mm_mul_ps(xmm2, xmm0);
    xmm2 = _mm_add_ps(xmm2, D2);
    xmm2 = _mm_mul_ps(xmm2, xmm0);
    xmm2 = _mm_add_ps(xmm2, D1);
    xmm2 = _mm_mul_ps(xmm2, xmm0);
    xmm2 = _mm_add_ps(xmm2, D0);
    xmm0 = _mm_sub_ps(xmm0, one_ps);
    xmm0 = _mm_mul_ps(xmm0, xmm2);
    xmm1 = _mm_add_ps(xmm1, xmm0);

    x = xmm1;
    x = _mm_mul_ps(x, loge_ps);

    return x;
}
/* This exp-routine has a relative precision of 2^-22.33 bits (essentially single precision :-) ) */
static inline __m128
gmx_mm_exp_ps(__m128 x)
{
    const __m128i half = _mm_set_epi32(0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000);   // 0.5e+0f
    const __m128i base = _mm_set_epi32(0x0000007F, 0x0000007F, 0x0000007F, 0x0000007F);   // 127
    const __m128i CC   = _mm_set_epi32(0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B);   // log2(e)

    const __m128i D5   = _mm_set_epi32(0x3AF61905, 0x3AF61905, 0x3AF61905, 0x3AF61905);   // 1.8775767e-3f
    const __m128i D4   = _mm_set_epi32(0x3C134806, 0x3C134806, 0x3C134806, 0x3C134806);   // 8.9893397e-3f
    const __m128i D3   = _mm_set_epi32(0x3D64AA23, 0x3D64AA23, 0x3D64AA23, 0x3D64AA23);   // 5.5826318e-2f
    const __m128i D2   = _mm_set_epi32(0x3E75EAD4, 0x3E75EAD4, 0x3E75EAD4, 0x3E75EAD4);   // 2.4015361e-1f
    const __m128i D1   = _mm_set_epi32(0x3F31727B, 0x3F31727B, 0x3F31727B, 0x3F31727B);   // 6.9315308e-1f
    const __m128i D0   = _mm_set_epi32(0x3F7FFFFF, 0x3F7FFFFF, 0x3F7FFFFF, 0x3F7FFFFF);   // 9.9999994e-1f

    __m128  xmm0, xmm1;
    __m128i xmm2;

    xmm0 = _mm_mul_ps(x, gmx_mm_castsi128_ps(CC));
    xmm1 = _mm_sub_ps(xmm0, gmx_mm_castsi128_ps(half));
    xmm2 = _mm_cvtps_epi32(xmm1);
    xmm1 = _mm_cvtepi32_ps(xmm2);

    xmm2 = _mm_add_epi32(xmm2, base);
    xmm2 = _mm_slli_epi32(xmm2, 23);

    xmm0 = _mm_sub_ps(xmm0, xmm1);
    xmm1 = _mm_mul_ps(xmm0, gmx_mm_castsi128_ps(D5));
    xmm1 = _mm_add_ps(xmm1, gmx_mm_castsi128_ps(D4));
    xmm1 = _mm_mul_ps(xmm1, xmm0);
    xmm1 = _mm_add_ps(xmm1, gmx_mm_castsi128_ps(D3));
    xmm1 = _mm_mul_ps(xmm1, xmm0);
    xmm1 = _mm_add_ps(xmm1, gmx_mm_castsi128_ps(D2));
    xmm1 = _mm_mul_ps(xmm1, xmm0);
    xmm1 = _mm_add_ps(xmm1, gmx_mm_castsi128_ps(D1));
    xmm1 = _mm_mul_ps(xmm1, xmm0);
    xmm1 = _mm_add_ps(xmm1, gmx_mm_castsi128_ps(D0));
    xmm1 = _mm_mul_ps(xmm1, gmx_mm_castsi128_ps(xmm2));

    /* 18 instructions currently */
    return xmm1;
}
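/*
 * Usage sketch (illustrative only, not part of the original header; assumes x > 0):
 * a vectorized power function built from the two approximations above,
 * x^y = exp(y*log(x)).
 */
static inline __m128
gmx_mm_example_pow_ps(__m128 x, __m128 y)
{
    return gmx_mm_exp_ps( _mm_mul_ps(y, gmx_mm_log_ps(x)) );
}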
#define GMX_MM_SINCOS_PS(x,sinval,cosval) { \
275 const __m128 _sincosf_two_over_pi = {2.0/M_PI,2.0/M_PI,2.0/M_PI,2.0/M_PI}; \
276 const __m128 _sincosf_half = {0.5,0.5,0.5,0.5}; \
277 const __m128 _sincosf_one = {1.0,1.0,1.0,1.0}; \
279 const __m128i _sincosf_izero = _mm_set1_epi32(0); \
280 const __m128i _sincosf_ione = _mm_set1_epi32(1); \
281 const __m128i _sincosf_itwo = _mm_set1_epi32(2); \
282 const __m128i _sincosf_ithree = _mm_set1_epi32(3); \
284 const __m128 _sincosf_kc1 = {1.57079625129,1.57079625129,1.57079625129,1.57079625129}; \
285 const __m128 _sincosf_kc2 = {7.54978995489e-8,7.54978995489e-8,7.54978995489e-8,7.54978995489e-8}; \
286 const __m128 _sincosf_cc0 = {-0.0013602249,-0.0013602249,-0.0013602249,-0.0013602249}; \
287 const __m128 _sincosf_cc1 = {0.0416566950,0.0416566950,0.0416566950,0.0416566950}; \
288 const __m128 _sincosf_cc2 = {-0.4999990225,-0.4999990225,-0.4999990225,-0.4999990225}; \
289 const __m128 _sincosf_sc0 = {-0.0001950727,-0.0001950727,-0.0001950727,-0.0001950727}; \
290 const __m128 _sincosf_sc1 = {0.0083320758,0.0083320758,0.0083320758,0.0083320758}; \
291 const __m128 _sincosf_sc2 = {-0.1666665247,-0.1666665247,-0.1666665247,-0.1666665247}; \
293 __m128 _sincosf_signbit = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) ); \
294 __m128 _sincosf_tiny = gmx_mm_castsi128_ps( _mm_set1_epi32(0x3e400000) ); \
296 __m128 _sincosf_xl; \
297 __m128 _sincosf_xl2; \
298 __m128 _sincosf_xl3; \
299 __m128 _sincosf_qf; \
300 __m128 _sincosf_absxl; \
301 __m128 _sincosf_p1; \
302 __m128 _sincosf_cx; \
303 __m128 _sincosf_sx; \
304 __m128 _sincosf_ts; \
305 __m128 _sincosf_tc; \
306 __m128 _sincosf_tsn; \
307 __m128 _sincosf_tcn; \
308 __m128i _sincosf_q; \
309 __m128i _sincosf_offsetSin; \
310 __m128i _sincosf_offsetCos; \
311 __m128 _sincosf_sinMask; \
312 __m128 _sincosf_cosMask; \
313 __m128 _sincosf_isTiny; \
314 __m128 _sincosf_ct0; \
315 __m128 _sincosf_ct1; \
316 __m128 _sincosf_ct2; \
317 __m128 _sincosf_st1; \
318 __m128 _sincosf_st2; \
320 _sincosf_xl = _mm_mul_ps(x,_sincosf_two_over_pi); \
322 _sincosf_xl = _mm_add_ps(_sincosf_xl,_mm_or_ps(_mm_and_ps(_sincosf_xl,_sincosf_signbit),_sincosf_half)); \
324 _sincosf_q = _mm_cvttps_epi32(_sincosf_xl); \
325 _sincosf_qf = _mm_cvtepi32_ps(_sincosf_q); \
327 _sincosf_offsetSin = _mm_and_si128(_sincosf_q,_sincosf_ithree); \
328 _sincosf_offsetCos = _mm_add_epi32(_sincosf_offsetSin,_sincosf_ione); \
330 _sincosf_p1 = _mm_mul_ps(_sincosf_qf,_sincosf_kc1); \
331 _sincosf_xl = _mm_mul_ps(_sincosf_qf,_sincosf_kc2); \
332 _sincosf_p1 = _mm_sub_ps(x,_sincosf_p1); \
333 _sincosf_xl = _mm_sub_ps(_sincosf_p1,_sincosf_xl); \
335 _sincosf_absxl = _mm_andnot_ps(_sincosf_signbit,_sincosf_xl); \
336 _sincosf_isTiny = _mm_cmpgt_ps(_sincosf_tiny,_sincosf_absxl); \
338 _sincosf_xl2 = _mm_mul_ps(_sincosf_xl,_sincosf_xl); \
339 _sincosf_xl3 = _mm_mul_ps(_sincosf_xl2,_sincosf_xl); \
341 _sincosf_ct1 = _mm_mul_ps(_sincosf_cc0,_sincosf_xl2); \
342 _sincosf_ct1 = _mm_add_ps(_sincosf_ct1,_sincosf_cc1); \
343 _sincosf_st1 = _mm_mul_ps(_sincosf_sc0,_sincosf_xl2); \
344 _sincosf_st1 = _mm_add_ps(_sincosf_st1,_sincosf_sc1); \
345 _sincosf_ct2 = _mm_mul_ps(_sincosf_ct1,_sincosf_xl2); \
346 _sincosf_ct2 = _mm_add_ps(_sincosf_ct2,_sincosf_cc2); \
347 _sincosf_st2 = _mm_mul_ps(_sincosf_st1,_sincosf_xl2); \
348 _sincosf_st2 = _mm_add_ps(_sincosf_st2,_sincosf_sc2); \
350 _sincosf_cx = _mm_mul_ps(_sincosf_ct2,_sincosf_xl2); \
351 _sincosf_cx = _mm_add_ps(_sincosf_cx,_sincosf_one); \
353 _sincosf_sx = _mm_mul_ps(_sincosf_st2,_sincosf_xl3); \
354 _sincosf_sx = _mm_add_ps(_sincosf_sx,_sincosf_xl); \
356 _sincosf_sinMask = gmx_mm_castsi128_ps( _mm_cmpeq_epi32( _mm_and_si128(_sincosf_offsetSin,_sincosf_ione), _sincosf_izero) ); \
357 _sincosf_cosMask = gmx_mm_castsi128_ps( _mm_cmpeq_epi32( _mm_and_si128(_sincosf_offsetCos,_sincosf_ione), _sincosf_izero) ); \
359 _sincosf_ts = _mm_or_ps( _mm_and_ps(_sincosf_sinMask,_sincosf_sx) , _mm_andnot_ps(_sincosf_sinMask,_sincosf_cx) ); \
360 _sincosf_tc = _mm_or_ps( _mm_and_ps(_sincosf_cosMask,_sincosf_sx) , _mm_andnot_ps(_sincosf_cosMask,_sincosf_cx) ); \
362 _sincosf_sinMask = gmx_mm_castsi128_ps( _mm_cmpeq_epi32( _mm_and_si128(_sincosf_offsetSin,_sincosf_itwo), _sincosf_izero) );\
363 _sincosf_tsn = _mm_xor_ps(_sincosf_signbit,_sincosf_ts); \
364 _sincosf_ts = _mm_or_ps( _mm_and_ps(_sincosf_sinMask,_sincosf_ts) , _mm_andnot_ps(_sincosf_sinMask,_sincosf_tsn) ); \
366 _sincosf_cosMask = gmx_mm_castsi128_ps( _mm_cmpeq_epi32( _mm_and_si128(_sincosf_offsetCos,_sincosf_itwo), _sincosf_izero) ); \
367 _sincosf_tcn = _mm_xor_ps(_sincosf_signbit,_sincosf_tc); \
368 _sincosf_tc = _mm_or_ps( _mm_and_ps(_sincosf_cosMask,_sincosf_tc) , _mm_andnot_ps(_sincosf_cosMask,_sincosf_tcn) ); \
370 sinval = _sincosf_ts; \
    cosval = _sincosf_tc; \
}
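/*
 * Usage sketch (illustrative only; the function name is hypothetical): evaluate the
 * sine and cosine of four angles at once. The macro arguments receiving the results
 * must be assignable __m128 lvalues.
 */
static inline void
gmx_mm_example_sincos4(__m128 angles, __m128 *sines, __m128 *cosines)
{
    __m128 s, c;

    GMX_MM_SINCOS_PS(angles, s, c);

    *sines   = s;
    *cosines = c;
}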
/* Load a single value from 1-4 places, merge into xmm register */

#define GMX_MM_LOAD_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,xmm1) { \
380 __m128 _txmm2,_txmm3,_txmm4; \
381 xmm1 = _mm_load_ss(ptr1); \
382 _txmm2 = _mm_load_ss(ptr2); \
383 _txmm3 = _mm_load_ss(ptr3); \
384 _txmm4 = _mm_load_ss(ptr4); \
385 xmm1 = _mm_unpacklo_ps(xmm1,_txmm3); \
386 _txmm2 = _mm_unpacklo_ps(_txmm2,_txmm4); \
    xmm1   = _mm_unpacklo_ps(xmm1,_txmm2); \
}
391 #define GMX_MM_LOAD_3VALUES_PS(ptr1,ptr2,ptr3,xmm1) \
393 __m128 _txmm2,_txmm3; \
394 xmm1 = _mm_load_ss(ptr1); \
395 _txmm2 = _mm_load_ss(ptr2); \
396 _txmm3 = _mm_load_ss(ptr3); \
397 xmm1 = _mm_unpacklo_ps(xmm1,_txmm3); \
398 xmm1 = _mm_unpacklo_ps(xmm1,_txmm2); \
402 #define GMX_MM_LOAD_2VALUES_PS(ptr1,ptr2,xmm1) \
405 xmm1 = _mm_load_ss(ptr1); \
406 _txmm2 = _mm_load_ss(ptr2); \
407 xmm1 = _mm_unpacklo_ps(xmm1,_txmm2); \
#define GMX_MM_LOAD_1VALUE_PS(ptr1,xmm1) { \
    xmm1 = _mm_load_ss(ptr1); \
}
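/*
 * Usage sketch (illustrative only; the function name is hypothetical): gather four
 * scattered single-precision values into one register.
 */
static inline __m128
gmx_mm_example_gather4(const float *p1, const float *p2, const float *p3, const float *p4)
{
    __m128 v;

    GMX_MM_LOAD_4VALUES_PS(p1, p2, p3, p4, v);

    return v;
}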
/* Store data from an xmm register into 1-4 different places */
#define GMX_MM_STORE_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,xmm1) { \
419 __m128 _txmm2,_txmm3,_txmm4; \
420 _txmm3 = _mm_movehl_ps(_mm_setzero_ps(),xmm1); \
421 _txmm2 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(1,1,1,1)); \
422 _txmm4 = _mm_shuffle_ps(_txmm3,_txmm3,_MM_SHUFFLE(1,1,1,1)); \
423 _mm_store_ss(ptr1,xmm1); \
424 _mm_store_ss(ptr2,_txmm2); \
425 _mm_store_ss(ptr3,_txmm3); \
    _mm_store_ss(ptr4,_txmm4); \
}
430 #define GMX_MM_STORE_3VALUES_PS(ptr1,ptr2,ptr3,xmm1) \
432 __m128 _txmm2,_txmm3; \
433 _txmm3 = _mm_movehl_ps(_mm_setzero_ps(),xmm1); \
434 _txmm2 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(1,1,1,1)); \
435 _mm_store_ss(ptr1,xmm1); \
436 _mm_store_ss(ptr2,_txmm2); \
437 _mm_store_ss(ptr3,_txmm3); \
441 #define GMX_MM_STORE_2VALUES_PS(ptr1,ptr2,xmm1) \
444 _txmm2 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(1,1,1,1)); \
445 _mm_store_ss(ptr1,xmm1); \
446 _mm_store_ss(ptr2,_txmm2); \
#define GMX_MM_STORE_1VALUE_PS(ptr1,xmm1) { \
    _mm_store_ss(ptr1,xmm1); \
}
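/*
 * Usage sketch (illustrative only; the function name is hypothetical): scatter the
 * four elements of one register back to four separate memory locations.
 */
static inline void
gmx_mm_example_scatter4(__m128 v, float *p1, float *p2, float *p3, float *p4)
{
    GMX_MM_STORE_4VALUES_PS(p1, p2, p3, p4, v);
}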
/* Similar to store, but increments the value already in memory */
#define GMX_MM_INCREMENT_8VALUES_PS(ptr1,ptr2,ptr3,ptr4,ptr5,ptr6,ptr7,ptr8,xmm1,xmm2) { \
459 __m128 _tincr1,_tincr2; \
460 GMX_MM_LOAD_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,_tincr1); \
461 GMX_MM_LOAD_4VALUES_PS(ptr5,ptr6,ptr7,ptr8,_tincr2); \
462 _tincr1 = _mm_add_ps(_tincr1,xmm1); \
463 _tincr2 = _mm_add_ps(_tincr2,xmm2); \
464 GMX_MM_STORE_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,_tincr1); \
    GMX_MM_STORE_4VALUES_PS(ptr5,ptr6,ptr7,ptr8,_tincr2); \
}
#define GMX_MM_INCREMENT_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,xmm1) { \
    __m128 _tincr; \
    GMX_MM_LOAD_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,_tincr); \
472 _tincr = _mm_add_ps(_tincr,xmm1); \
    GMX_MM_STORE_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,_tincr); \
}
476 #define GMX_MM_INCREMENT_3VALUES_PS(ptr1,ptr2,ptr3,xmm1) \
479 GMX_MM_LOAD_3VALUES_PS(ptr1,ptr2,ptr3,_tincr); \
480 _tincr = _mm_add_ps(_tincr,xmm1); \
481 GMX_MM_STORE_3VALUES_PS(ptr1,ptr2,ptr3,_tincr); \
484 #define GMX_MM_INCREMENT_2VALUES_PS(ptr1,ptr2,xmm1) \
487 GMX_MM_LOAD_2VALUES_PS(ptr1,ptr2,_tincr); \
488 _tincr = _mm_add_ps(_tincr,xmm1); \
489 GMX_MM_STORE_2VALUES_PS(ptr1,ptr2,_tincr); \
#define GMX_MM_INCREMENT_1VALUE_PS(ptr1,xmm1) { \
    __m128 _tincr; \
    GMX_MM_LOAD_1VALUE_PS(ptr1,_tincr); \
    _tincr = _mm_add_ss(_tincr,xmm1); \
    GMX_MM_STORE_1VALUE_PS(ptr1,_tincr); \
}
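/*
 * Usage sketch (illustrative only; the function and variable names are hypothetical):
 * accumulate four partial energies, held in one register, into four scattered
 * memory locations.
 */
static inline void
gmx_mm_example_increment4(float *e1, float *e2, float *e3, float *e4, __m128 venergy)
{
    GMX_MM_INCREMENT_4VALUES_PS(e1, e2, e3, e4, venergy);
}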
/* Routines to load pairs from 1-4 places, put in two separate xmm registers. Useful to load LJ parameters! */
#define GMX_MM_LOAD_4PAIRS_PS(ptr1,ptr2,ptr3,ptr4,c6,c12) { \
505 __m128 _tmp1,_tmp2,_tmp3,_tmp4; \
506 _tmp1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1)); \
507 _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr2)); \
508 _tmp3 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr3)); \
509 _tmp4 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr4)); \
510 _tmp1 = _mm_unpacklo_ps(_tmp1,_tmp3); \
511 _tmp2 = _mm_unpacklo_ps(_tmp2,_tmp4); \
512 c6 = _mm_unpacklo_ps(_tmp1,_tmp2); \
    c12  = _mm_unpackhi_ps(_tmp1,_tmp2); \
}
516 #define GMX_MM_LOAD_3PAIRS_PS(ptr1,ptr2,ptr3,c6,c12) \
518 __m128 _tmp1,_tmp2,_tmp3; \
519 _tmp1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1)); \
520 _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr2)); \
521 _tmp3 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr3)); \
522 _tmp1 = _mm_unpacklo_ps(_tmp1,_tmp3); \
523 _tmp2 = _mm_unpacklo_ps(_tmp2,_mm_setzero_ps()); \
524 c6 = _mm_unpacklo_ps(_tmp1,_tmp2); \
525 c12 = _mm_unpackhi_ps(_tmp1,_tmp2); \
528 #define GMX_MM_LOAD_2PAIRS_PS(ptr1,ptr2,c6,c12) \
530 __m128 _tmp1,_tmp2; \
531 _tmp1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1)); \
532 _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr2)); \
533 c6 = _mm_unpacklo_ps(_tmp1,_tmp2); \
534 c12 = _mm_movehl_ps(c12,c6); \
535 c6 = _mm_movelh_ps(c6,_mm_setzero_ps()); \
536 c12 = _mm_movelh_ps(c12,_mm_setzero_ps()); \
#define GMX_MM_LOAD_1PAIR_PS(ptr1,c6,c12) { \
    c6  = _mm_load_ss(ptr1); \
    c12 = _mm_load_ss(ptr1+1); \
}
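/*
 * Usage sketch (illustrative only; the function and parameter names are hypothetical):
 * load the {c6,c12} Lennard-Jones parameter pairs of four atoms, each pointer
 * addressing two consecutive floats, into separate c6 and c12 registers.
 */
static inline void
gmx_mm_example_load_lj(float *pair1, float *pair2, float *pair3, float *pair4,
                       __m128 *c6, __m128 *c12)
{
    __m128 tc6, tc12;

    GMX_MM_LOAD_4PAIRS_PS(pair1, pair2, pair3, pair4, tc6, tc12);

    *c6  = tc6;
    *c12 = tc12;
}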
/* Routines to load 1-4 rvecs from 1-4 places.
 * We mainly use these to load coordinates. The extra routines
 * are very efficient for the water-water loops, since we e.g.
 * know that a TIP4P water has 4 atoms, so we should load 12 floats+shuffle.
 */
551 #define GMX_MM_LOAD_1RVEC_1POINTER_PS(ptr1,jx1,jy1,jz1) { \
552 jx1 = _mm_load_ss(ptr1); \
553 jy1 = _mm_load_ss((ptr1)+1); \
    jz1 = _mm_load_ss((ptr1)+2); \
}
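/*
 * Usage sketch (illustrative only; the function name is hypothetical): load the
 * x/y/z coordinates of one atom (an rvec of three consecutive floats) into the
 * lowest element of three registers.
 */
static inline void
gmx_mm_example_load_coord(const float *xyz, __m128 *jx, __m128 *jy, __m128 *jz)
{
    __m128 tx, ty, tz;

    GMX_MM_LOAD_1RVEC_1POINTER_PS(xyz, tx, ty, tz);

    *jx = tx;
    *jy = ty;
    *jz = tz;
}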
558 #define GMX_MM_LOAD_2RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2) { \
559 jx1 = _mm_load_ss(ptr1); \
560 jy1 = _mm_load_ss((ptr1)+1); \
561 jz1 = _mm_load_ss((ptr1)+2); \
562 jx2 = _mm_load_ss((ptr1)+3); \
563 jy2 = _mm_load_ss((ptr1)+4); \
564 jz2 = _mm_load_ss((ptr1)+5); \
568 #define GMX_MM_LOAD_3RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
569 jx1 = _mm_load_ss(ptr1); \
570 jy1 = _mm_load_ss((ptr1)+1); \
571 jz1 = _mm_load_ss((ptr1)+2); \
572 jx2 = _mm_load_ss((ptr1)+3); \
573 jy2 = _mm_load_ss((ptr1)+4); \
574 jz2 = _mm_load_ss((ptr1)+5); \
575 jx3 = _mm_load_ss((ptr1)+6); \
576 jy3 = _mm_load_ss((ptr1)+7); \
577 jz3 = _mm_load_ss((ptr1)+8); \
581 #define GMX_MM_LOAD_4RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
582 jx1 = _mm_load_ss(ptr1); \
583 jy1 = _mm_load_ss((ptr1)+1); \
584 jz1 = _mm_load_ss((ptr1)+2); \
585 jx2 = _mm_load_ss((ptr1)+3); \
586 jy2 = _mm_load_ss((ptr1)+4); \
587 jz2 = _mm_load_ss((ptr1)+5); \
588 jx3 = _mm_load_ss((ptr1)+6); \
589 jy3 = _mm_load_ss((ptr1)+7); \
590 jz3 = _mm_load_ss((ptr1)+8); \
591 jx4 = _mm_load_ss((ptr1)+9); \
592 jy4 = _mm_load_ss((ptr1)+10); \
593 jz4 = _mm_load_ss((ptr1)+11); \
597 #define GMX_MM_LOAD_1RVEC_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1) { \
598 __m128 _tmp1,_tmp2; \
599 _tmp1 = _mm_load_ss(ptr1); \
600 _tmp2 = _mm_load_ss(ptr2); \
601 _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)(ptr1+1)); \
602 _tmp2 = _mm_loadh_pi(_tmp2,(__m64 *)(ptr2+1)); \
603 jx1 = _mm_unpacklo_ps(_tmp1,_tmp2); \
604 jy1 = _mm_unpackhi_ps(_tmp1,_tmp2); \
605 jx1 = _mm_unpacklo_ps(_tmp1,_tmp2); \
606 jz1 = _mm_movehl_ps(jz1,jy1); \
607 jx1 = _mm_movelh_ps(jx1,_mm_setzero_ps()); \
608 jy1 = _mm_movelh_ps(jy1,_mm_setzero_ps()); \
609 jz1 = _mm_movelh_ps(jz1,_mm_setzero_ps()); \
613 #define GMX_MM_LOAD_2RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2) { \
614 __m128 _tmp1, _tmp2; \
615 _tmp1 = _mm_loadu_ps(ptr1); \
616 jy1 = _mm_loadu_ps(ptr2); \
617 jy2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4)); \
618 _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr2+4)); \
619 jx1 = _mm_unpacklo_ps(_tmp1,jy1); \
620 jz1 = _mm_unpackhi_ps(_tmp1,jy1); \
621 jy2 = _mm_unpacklo_ps(jy2,_tmp2); \
622 jy1 = _mm_movehl_ps(jx1,jx1); \
623 jx2 = _mm_movehl_ps(jz1,jz1); \
624 jz2 = _mm_movehl_ps(jy2,jy2); \
625 jx1 = _mm_movelh_ps(jx1,_mm_setzero_ps()); \
626 jy1 = _mm_movelh_ps(jy1,_mm_setzero_ps()); \
627 jz1 = _mm_movelh_ps(jz1,_mm_setzero_ps()); \
628 jx2 = _mm_movelh_ps(jx2,_mm_setzero_ps()); \
629 jy2 = _mm_movelh_ps(jy2,_mm_setzero_ps()); \
630 jz2 = _mm_movelh_ps(jz2,_mm_setzero_ps()); \
634 #define GMX_MM_LOAD_3RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
635 __m128 _tmp1, _tmp2, _tmp3; \
636 _tmp1 = _mm_loadu_ps(ptr1); \
637 jy1 = _mm_loadu_ps(ptr2); \
638 _tmp2 = _mm_loadu_ps(ptr1+4); \
639 jz2 = _mm_loadu_ps(ptr2+4); \
640 jz3 = _mm_load_ss(ptr1+8); \
641 _tmp3 = _mm_load_ss(ptr2+8); \
642 jx1 = _mm_unpacklo_ps(_tmp1,jy1); \
643 jz1 = _mm_unpackhi_ps(_tmp1,jy1); \
644 jy2 = _mm_unpacklo_ps(_tmp2,jz2); \
645 jx3 = _mm_unpackhi_ps(_tmp2,jz2); \
646 jy1 = _mm_movehl_ps(jx1,jx1); \
647 jx2 = _mm_movehl_ps(jz1,jz1); \
648 jz2 = _mm_movehl_ps(jy2,jy2); \
649 jy3 = _mm_movehl_ps(jx3,jx3); \
650 jz3 = _mm_unpacklo_ps(jz3,_tmp3); \
651 jx1 = _mm_movelh_ps(jx1,_mm_setzero_ps()); \
652 jy1 = _mm_movelh_ps(jy1,_mm_setzero_ps()); \
653 jz1 = _mm_movelh_ps(jz1,_mm_setzero_ps()); \
654 jx2 = _mm_movelh_ps(jx2,_mm_setzero_ps()); \
655 jy2 = _mm_movelh_ps(jy2,_mm_setzero_ps()); \
656 jz2 = _mm_movelh_ps(jz2,_mm_setzero_ps()); \
657 jx3 = _mm_movelh_ps(jx3,_mm_setzero_ps()); \
658 jy3 = _mm_movelh_ps(jy3,_mm_setzero_ps()); \
659 jz3 = _mm_movelh_ps(jz3,_mm_setzero_ps()); \
663 #define GMX_MM_LOAD_4RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
664 __m128 _tmp1, _tmp2, _tmp3,_tmp4; \
665 _tmp1 = _mm_loadu_ps(ptr1); \
666 jy1 = _mm_loadu_ps(ptr2); \
667 _tmp2 = _mm_loadu_ps(ptr1+4); \
668 jz2 = _mm_loadu_ps(ptr2+4); \
669 _tmp3 = _mm_loadu_ps(ptr1+8); \
670 _tmp4 = _mm_loadu_ps(ptr2+8); \
671 jx1 = _mm_unpacklo_ps(_tmp1,jy1); \
672 jz1 = _mm_unpackhi_ps(_tmp1,jy1); \
673 jy2 = _mm_unpacklo_ps(_tmp2,jz2); \
674 jx3 = _mm_unpackhi_ps(_tmp2,jz2); \
675 jz3 = _mm_unpacklo_ps(_tmp3,_tmp4); \
676 jy4 = _mm_unpackhi_ps(_tmp3,_tmp4); \
677 jy1 = _mm_movehl_ps(jx1,jx1); \
678 jx2 = _mm_movehl_ps(jz1,jz1); \
679 jz2 = _mm_movehl_ps(jy2,jy2); \
680 jy3 = _mm_movehl_ps(jx3,jx3); \
681 jx4 = _mm_movehl_ps(jz3,jz3); \
682 jz4 = _mm_movehl_ps(jy4,jy4); \
683 jx1 = _mm_movelh_ps(jx1,_mm_setzero_ps()); \
684 jy1 = _mm_movelh_ps(jy1,_mm_setzero_ps()); \
685 jz1 = _mm_movelh_ps(jz1,_mm_setzero_ps()); \
686 jx2 = _mm_movelh_ps(jx2,_mm_setzero_ps()); \
687 jy2 = _mm_movelh_ps(jy2,_mm_setzero_ps()); \
688 jz2 = _mm_movelh_ps(jz2,_mm_setzero_ps()); \
689 jx3 = _mm_movelh_ps(jx3,_mm_setzero_ps()); \
690 jy3 = _mm_movelh_ps(jy3,_mm_setzero_ps()); \
691 jz3 = _mm_movelh_ps(jz3,_mm_setzero_ps()); \
692 jx4 = _mm_movelh_ps(jx4,_mm_setzero_ps()); \
693 jy4 = _mm_movelh_ps(jy4,_mm_setzero_ps()); \
694 jz4 = _mm_movelh_ps(jz4,_mm_setzero_ps()); \
698 #define GMX_MM_LOAD_1RVEC_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1) { \
699 __m128 _tmp1,_tmp3,_tmp4; \
700 jx1 = _mm_load_ss(ptr1); \
701 jy1 = _mm_load_ss(ptr2); \
702 jz1 = _mm_load_ss(ptr3); \
703 jx1 = _mm_loadh_pi(jx1,(__m64 *)(ptr1+1)); \
704 jy1 = _mm_loadh_pi(jy1,(__m64 *)(ptr2+1)); \
705 jz1 = _mm_loadh_pi(jz1,(__m64 *)(ptr3+1)); \
706 _tmp1 = _mm_unpacklo_ps(jx1,jy1); \
707 _tmp3 = _mm_unpackhi_ps(jx1,jy1); \
708 _tmp4 = _mm_unpackhi_ps(jz1,jz1); \
709 jx1 = _mm_movelh_ps(_tmp1,jz1); \
710 jy1 = _mm_movelh_ps(_tmp3,_tmp4); \
711 jz1 = _mm_movehl_ps(_tmp4,_tmp3); \
715 #define GMX_MM_LOAD_2RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2) { \
716 __m128 _tmp1, _tmp2; \
717 jx1 = _mm_loadu_ps(ptr1); \
718 jy1 = _mm_loadu_ps(ptr2); \
719 jz1 = _mm_loadu_ps(ptr3); \
720 jx2 = _mm_setzero_ps(); \
721 _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2); \
722 _tmp1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4)); \
723 jz2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr2+4)); \
724 _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr3+4)); \
725 _tmp1 = _mm_unpacklo_ps(_tmp1,_tmp2); \
726 jz2 = _mm_unpacklo_ps(jz2,_mm_setzero_ps()); \
727 jy2 = _mm_unpacklo_ps(_tmp1,jz2); \
728 jz2 = _mm_unpackhi_ps(_tmp1,jz2); \
732 #define GMX_MM_LOAD_3RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
733 __m128 _tmp1, _tmp2; \
734 jx1 = _mm_loadu_ps(ptr1); \
735 jy1 = _mm_loadu_ps(ptr2); \
736 jz1 = _mm_loadu_ps(ptr3); \
737 jx2 = _mm_setzero_ps(); \
738 _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2); \
739 jy2 = _mm_loadu_ps(ptr1+4); \
740 jz2 = _mm_loadu_ps(ptr2+4); \
741 jx3 = _mm_loadu_ps(ptr3+4); \
742 jy3 = _mm_setzero_ps(); \
743 _MM_TRANSPOSE4_PS(jy2,jz2,jx3,jy3); \
744 jz3 = _mm_load_ss(ptr1+8); \
745 _tmp1 = _mm_load_ss(ptr2+8); \
746 _tmp2 = _mm_load_ss(ptr3+8); \
747 jz3 = _mm_unpacklo_ps(jz3,_tmp2); \
748 _tmp1 = _mm_unpacklo_ps(_tmp1,_mm_setzero_ps()); \
749 jz3 = _mm_unpacklo_ps(jz3,_tmp1); \
753 #define GMX_MM_LOAD_4RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
754 jx1 = _mm_loadu_ps(ptr1); \
755 jy1 = _mm_loadu_ps(ptr2); \
756 jz1 = _mm_loadu_ps(ptr3); \
757 jx2 = _mm_setzero_ps(); \
758 _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2); \
759 jy2 = _mm_loadu_ps(ptr1+4); \
760 jz2 = _mm_loadu_ps(ptr2+4); \
761 jx3 = _mm_loadu_ps(ptr3+4); \
762 jy3 = _mm_setzero_ps(); \
763 _MM_TRANSPOSE4_PS(jy2,jz2,jx3,jy3); \
764 jz3 = _mm_loadu_ps(ptr1+8); \
765 jx4 = _mm_loadu_ps(ptr2+8); \
766 jy4 = _mm_loadu_ps(ptr3+8); \
767 jz4 = _mm_setzero_ps(); \
768 _MM_TRANSPOSE4_PS(jz3,jx4,jy4,jz4); \
773 #define GMX_MM_LOAD_1RVEC_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1) { \
774 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5; \
775 jx1 = _mm_load_ss(ptr1); \
776 _tmp1 = _mm_load_ss(ptr2); \
777 jy1 = _mm_load_ss(ptr3); \
778 jz1 = _mm_load_ss(ptr4); \
779 jx1 = _mm_loadh_pi(jx1,(__m64 *)(ptr1+1)); \
780 _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)(ptr2+1)); \
781 jy1 = _mm_loadh_pi(jy1,(__m64 *)(ptr3+1)); \
782 jz1 = _mm_loadh_pi(jz1,(__m64 *)(ptr4+1)); \
783 _tmp2 = _mm_unpacklo_ps(jx1,_tmp1); \
784 _tmp3 = _mm_unpacklo_ps(jy1,jz1); \
785 _tmp4 = _mm_unpackhi_ps(jx1,_tmp1); \
786 _tmp5 = _mm_unpackhi_ps(jy1,jz1); \
787 jx1 = _mm_movelh_ps(_tmp2,_tmp3); \
788 jy1 = _mm_movelh_ps(_tmp4,_tmp5); \
789 jz1 = _mm_movehl_ps(_tmp5,_tmp4); \
793 #define GMX_MM_LOAD_2RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2) { \
794 __m128 _tmp1, _tmp2; \
795 jx1 = _mm_loadu_ps(ptr1); \
796 jy1 = _mm_loadu_ps(ptr2); \
797 jz1 = _mm_loadu_ps(ptr3); \
798 jx2 = _mm_loadu_ps(ptr4); \
799 _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2); \
800 jy2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4)); \
801 jz2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr2+4)); \
802 _tmp1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr3+4)); \
803 _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr4+4)); \
804 _tmp1 = _mm_unpacklo_ps(jy2,_tmp1); \
805 _tmp2 = _mm_unpacklo_ps(jz2,_tmp2); \
806 jy2 = _mm_unpacklo_ps(_tmp1,_tmp2); \
807 jz2 = _mm_unpackhi_ps(_tmp1,_tmp2); \
811 #define GMX_MM_LOAD_3RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
812 __m128 _tmp1, _tmp2, _tmp3; \
813 jx1 = _mm_loadu_ps(ptr1); \
814 jy1 = _mm_loadu_ps(ptr2); \
815 jz1 = _mm_loadu_ps(ptr3); \
816 jx2 = _mm_loadu_ps(ptr4); \
817 _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2); \
818 jy2 = _mm_loadu_ps(ptr1+4); \
819 jz2 = _mm_loadu_ps(ptr2+4); \
820 jx3 = _mm_loadu_ps(ptr3+4); \
821 jy3 = _mm_loadu_ps(ptr4+4); \
822 _MM_TRANSPOSE4_PS(jy2,jz2,jx3,jy3); \
823 jz3 = _mm_load_ss(ptr1+8); \
824 _tmp1 = _mm_load_ss(ptr2+8); \
825 _tmp2 = _mm_load_ss(ptr3+8); \
826 _tmp3 = _mm_load_ss(ptr4+8); \
827 jz3 = _mm_unpacklo_ps(jz3,_tmp2); \
828 _tmp1 = _mm_unpacklo_ps(_tmp1,_tmp3); \
829 jz3 = _mm_unpacklo_ps(jz3,_tmp1); \
833 #define GMX_MM_LOAD_4RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
834 jx1 = _mm_loadu_ps(ptr1); \
835 jy1 = _mm_loadu_ps(ptr2); \
836 jz1 = _mm_loadu_ps(ptr3); \
837 jx2 = _mm_loadu_ps(ptr4); \
838 _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2); \
839 jy2 = _mm_loadu_ps(ptr1+4); \
840 jz2 = _mm_loadu_ps(ptr2+4); \
841 jx3 = _mm_loadu_ps(ptr3+4); \
842 jy3 = _mm_loadu_ps(ptr4+4); \
843 _MM_TRANSPOSE4_PS(jy2,jz2,jx3,jy3); \
844 jz3 = _mm_loadu_ps(ptr1+8); \
845 jx4 = _mm_loadu_ps(ptr2+8); \
846 jy4 = _mm_loadu_ps(ptr3+8); \
847 jz4 = _mm_loadu_ps(ptr4+8); \
848 _MM_TRANSPOSE4_PS(jz3,jx4,jy4,jz4); \
/* Routines to increment rvecs in memory, typically used for j particle force updates */
#define GMX_MM_INCREMENT_1RVEC_1POINTER_PS(ptr1,jx1,jy1,jz1) { \
    __m128 _tmp1; \
    jy1 = _mm_unpacklo_ps(jy1,jz1); \
856 jx1 = _mm_movelh_ps(jx1,jy1); \
857 _tmp1 = _mm_load_ss(ptr1); \
858 _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)(ptr1+1)); \
859 _tmp1 = _mm_add_ps(_tmp1,jx1); \
860 _mm_store_ss(ptr1,_tmp1); \
    _mm_storeh_pi((__m64 *)(ptr1+1),_tmp1); \
}
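/*
 * Usage sketch (illustrative only; the function name is hypothetical): add a force
 * contribution (fx,fy,fz), held in the lowest element of three registers, to one
 * j-particle force stored as three consecutive floats.
 */
static inline void
gmx_mm_example_update_jforce(float *f, __m128 fx, __m128 fy, __m128 fz)
{
    GMX_MM_INCREMENT_1RVEC_1POINTER_PS(f, fx, fy, fz);
}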
865 #define GMX_MM_INCREMENT_2RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2) { \
866 __m128 _tmp1, _tmp2; \
867 _tmp1 = _mm_loadu_ps(ptr1); \
868 _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4)); \
869 jx1 = _mm_unpacklo_ps(jx1,jy1); \
870 jz1 = _mm_unpacklo_ps(jz1,jx2); \
871 jy2 = _mm_unpacklo_ps(jy2,jz2); \
872 jx1 = _mm_movelh_ps(jx1,jz1); \
873 _tmp1 = _mm_add_ps(_tmp1,jx1); \
874 _tmp2 = _mm_add_ps(_tmp2,jy2); \
875 _mm_storeu_ps(ptr1,_tmp1); \
876 _mm_storel_pi((__m64 *)(ptr1+4),_tmp2); \
880 #define GMX_MM_INCREMENT_3RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
881 __m128 _tmp1, _tmp2, _tmp3; \
882 _tmp1 = _mm_loadu_ps(ptr1); \
883 _tmp2 = _mm_loadu_ps(ptr1+4); \
884 _tmp3 = _mm_load_ss(ptr1+8); \
885 jx1 = _mm_unpacklo_ps(jx1,jy1); \
886 jz1 = _mm_unpacklo_ps(jz1,jx2); \
887 jy2 = _mm_unpacklo_ps(jy2,jz2); \
888 jx3 = _mm_unpacklo_ps(jx3,jy3); \
889 jx1 = _mm_movelh_ps(jx1,jz1); \
890 jy2 = _mm_movelh_ps(jy2,jx3); \
891 _tmp1 = _mm_add_ps(_tmp1,jx1); \
892 _tmp2 = _mm_add_ps(_tmp2,jy2); \
893 _tmp3 = _mm_add_ss(_tmp3,jz3); \
894 _mm_storeu_ps(ptr1,_tmp1); \
895 _mm_storeu_ps(ptr1+4,_tmp2); \
896 _mm_store_ss(ptr1+8,_tmp3); \
900 #define GMX_MM_INCREMENT_4RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
901 __m128 _tmp1, _tmp2, _tmp3; \
902 _tmp1 = _mm_loadu_ps(ptr1); \
903 _tmp2 = _mm_loadu_ps(ptr1+4); \
904 _tmp3 = _mm_loadu_ps(ptr1+8); \
905 jx1 = _mm_unpacklo_ps(jx1,jy1); \
906 jz1 = _mm_unpacklo_ps(jz1,jx2); \
907 jy2 = _mm_unpacklo_ps(jy2,jz2); \
908 jx3 = _mm_unpacklo_ps(jx3,jy3); \
909 jz3 = _mm_unpacklo_ps(jz3,jx4); \
910 jy4 = _mm_unpacklo_ps(jy4,jz4); \
911 jx1 = _mm_movelh_ps(jx1,jz1); \
912 jy2 = _mm_movelh_ps(jy2,jx3); \
913 jz3 = _mm_movelh_ps(jz3,jy4); \
914 _tmp1 = _mm_add_ps(_tmp1,jx1); \
915 _tmp2 = _mm_add_ps(_tmp2,jy2); \
916 _tmp3 = _mm_add_ps(_tmp3,jz3); \
917 _mm_storeu_ps(ptr1,_tmp1); \
918 _mm_storeu_ps(ptr1+4,_tmp2); \
919 _mm_storeu_ps(ptr1+8,_tmp3); \
923 #define GMX_MM_INCREMENT_1RVEC_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1) { \
924 __m128 _tmp1,_tmp2,_tmp3,_tmp4; \
925 _tmp1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1)); \
926 _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)(ptr2)); \
927 _tmp2 = _mm_load_ss(ptr1+2); \
928 _tmp3 = _mm_load_ss(ptr2+2); \
929 jx1 = _mm_unpacklo_ps(jx1,jy1); \
930 _tmp4 = _mm_shuffle_ps(jz1,jz1,_MM_SHUFFLE(0,0,0,1)); \
931 _tmp1 = _mm_add_ps(_tmp1,jx1); \
932 _mm_storel_pi((__m64 *)(ptr1),_tmp1); \
933 _mm_storeh_pi((__m64 *)(ptr2),_tmp1); \
934 _mm_store_ss(ptr1+2,_mm_add_ss(_tmp2,jz1)); \
935 _mm_store_ss(ptr2+2,_mm_add_ss(_tmp3,_tmp4)); \
939 #define GMX_MM_INCREMENT_2RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2) { \
940 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5; \
941 _tmp1 = _mm_loadu_ps(ptr1); \
942 _tmp2 = _mm_loadu_ps(ptr2); \
943 _tmp3 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4)); \
944 _tmp3 = _mm_loadh_pi(_tmp3,(__m64 *)(ptr2+4)); \
945 jx1 = _mm_unpacklo_ps(jx1,jy1); \
946 jz1 = _mm_unpacklo_ps(jz1,jx2); \
947 jy2 = _mm_unpacklo_ps(jy2,jz2); \
948 _tmp4 = _mm_movelh_ps(jx1,jz1); \
949 _tmp5 = _mm_movehl_ps(jz1,jx1); \
950 _tmp1 = _mm_add_ps(_tmp1,_tmp4); \
951 _tmp2 = _mm_add_ps(_tmp2,_tmp5); \
952 _tmp3 = _mm_add_ps(_tmp3,jy2); \
953 _mm_storeu_ps(ptr1,_tmp1); \
954 _mm_storeu_ps(ptr2,_tmp2); \
955 _mm_storel_pi((__m64 *)(ptr1+4),_tmp3); \
956 _mm_storeh_pi((__m64 *)(ptr2+4),_tmp3); \
960 #define GMX_MM_INCREMENT_3RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
961 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11; \
962 _tmp1 = _mm_loadu_ps(ptr1); \
963 _tmp2 = _mm_loadu_ps(ptr1+4); \
964 _tmp3 = _mm_load_ss(ptr1+8); \
965 _tmp4 = _mm_loadu_ps(ptr2); \
966 _tmp5 = _mm_loadu_ps(ptr2+4); \
967 _tmp6 = _mm_load_ss(ptr2+8); \
968 jx1 = _mm_unpacklo_ps(jx1,jy1); \
969 jz1 = _mm_unpacklo_ps(jz1,jx2); \
970 jy2 = _mm_unpacklo_ps(jy2,jz2); \
971 jx3 = _mm_unpacklo_ps(jx3,jy3); \
972 _tmp7 = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1)); \
973 _tmp8 = _mm_movelh_ps(jx1,jz1); \
974 _tmp9 = _mm_movehl_ps(jz1,jx1); \
975 _tmp10 = _mm_movelh_ps(jy2,jx3); \
976 _tmp11 = _mm_movehl_ps(jx3,jy2); \
977 _tmp1 = _mm_add_ps(_tmp1,_tmp8); \
978 _tmp2 = _mm_add_ps(_tmp2,_tmp10); \
979 _tmp3 = _mm_add_ss(_tmp3,jz3); \
980 _tmp4 = _mm_add_ps(_tmp4,_tmp9); \
981 _tmp5 = _mm_add_ps(_tmp5,_tmp11); \
982 _tmp6 = _mm_add_ss(_tmp6,_tmp7); \
983 _mm_storeu_ps(ptr1,_tmp1); \
984 _mm_storeu_ps(ptr1+4,_tmp2); \
985 _mm_store_ss(ptr1+8,_tmp3); \
986 _mm_storeu_ps(ptr2,_tmp4); \
987 _mm_storeu_ps(ptr2+4,_tmp5); \
988 _mm_store_ss(ptr2+8,_tmp6); \
992 #define GMX_MM_INCREMENT_4RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
993 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11,_tmp12,_tmp13; \
994 _tmp1 = _mm_loadu_ps(ptr1); \
995 _tmp2 = _mm_loadu_ps(ptr1+4); \
996 _tmp3 = _mm_loadu_ps(ptr1+8); \
997 _tmp4 = _mm_loadu_ps(ptr2); \
998 _tmp5 = _mm_loadu_ps(ptr2+4); \
999 _tmp6 = _mm_loadu_ps(ptr2+8); \
1000 jx1 = _mm_unpacklo_ps(jx1,jy1); \
1001 jz1 = _mm_unpacklo_ps(jz1,jx2); \
1002 jy2 = _mm_unpacklo_ps(jy2,jz2); \
1003 jx3 = _mm_unpacklo_ps(jx3,jy3); \
1004 jz3 = _mm_unpacklo_ps(jz3,jx4); \
1005 jy4 = _mm_unpacklo_ps(jy4,jz4); \
1006 _tmp8 = _mm_movelh_ps(jx1,jz1); \
1007 _tmp9 = _mm_movehl_ps(jz1,jx1); \
1008 _tmp10 = _mm_movelh_ps(jy2,jx3); \
1009 _tmp11 = _mm_movehl_ps(jx3,jy2); \
1010 _tmp12 = _mm_movelh_ps(jz3,jy4); \
1011 _tmp13 = _mm_movehl_ps(jy4,jz3); \
1012 _tmp1 = _mm_add_ps(_tmp1,_tmp8); \
1013 _tmp2 = _mm_add_ps(_tmp2,_tmp10); \
1014 _tmp3 = _mm_add_ps(_tmp3,_tmp12); \
1015 _tmp4 = _mm_add_ps(_tmp4,_tmp9); \
1016 _tmp5 = _mm_add_ps(_tmp5,_tmp11); \
1017 _tmp6 = _mm_add_ps(_tmp6,_tmp13); \
1018 _mm_storeu_ps(ptr1,_tmp1); \
1019 _mm_storeu_ps(ptr1+4,_tmp2); \
1020 _mm_storeu_ps(ptr1+8,_tmp3); \
1021 _mm_storeu_ps(ptr2,_tmp4); \
1022 _mm_storeu_ps(ptr2+4,_tmp5); \
1023 _mm_storeu_ps(ptr2+8,_tmp6); \
1027 #define GMX_MM_INCREMENT_1RVEC_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1) { \
1028 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7; \
1029 _tmp1 = _mm_load_ss(ptr1); \
1030 _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)(ptr1+1)); \
1031 _tmp2 = _mm_load_ss(ptr2); \
1032 _tmp2 = _mm_loadh_pi(_tmp2,(__m64 *)(ptr2+1)); \
1033 _tmp3 = _mm_load_ss(ptr3); \
1034 _tmp3 = _mm_loadh_pi(_tmp3,(__m64 *)(ptr3+1)); \
1035 _tmp4 = _mm_unpacklo_ps(jy1,jz1); \
1036 _tmp5 = _mm_unpackhi_ps(jy1,jz1); \
1037 _tmp6 = _mm_shuffle_ps(jx1,_tmp4,_MM_SHUFFLE(3,2,0,1)); \
1038 _tmp7 = _mm_shuffle_ps(jx1,jx1,_MM_SHUFFLE(0,0,0,2)); \
1039 jx1 = _mm_movelh_ps(jx1,_tmp4); \
1040 _tmp7 = _mm_movelh_ps(_tmp7,_tmp5); \
1041 _tmp1 = _mm_add_ps(_tmp1,jx1); \
1042 _tmp2 = _mm_add_ps(_tmp2,_tmp6); \
1043 _tmp3 = _mm_add_ps(_tmp3,_tmp7); \
1044 _mm_store_ss(ptr1,_tmp1); \
1045 _mm_storeh_pi((__m64 *)(ptr1+1),_tmp1); \
1046 _mm_store_ss(ptr2,_tmp2); \
1047 _mm_storeh_pi((__m64 *)(ptr2+1),_tmp2); \
1048 _mm_store_ss(ptr3,_tmp3); \
1049 _mm_storeh_pi((__m64 *)(ptr3+1),_tmp3); \
1053 #define GMX_MM_INCREMENT_2RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2) { \
1054 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
1055 _tmp1 = _mm_loadu_ps(ptr1); \
1056 _tmp2 = _mm_loadu_ps(ptr2); \
1057 _tmp3 = _mm_loadu_ps(ptr3); \
1058 _tmp4 = _mm_loadl_pi(_tmp4,(__m64 *)(ptr1+4)); \
1059 _tmp4 = _mm_loadh_pi(_tmp4,(__m64 *)(ptr2+4)); \
1060 _tmp5 = _mm_loadl_pi(_tmp5,(__m64 *)(ptr3+4)); \
1061 _tmp6 = _mm_unpackhi_ps(jx1,jy1); \
1062 jx1 = _mm_unpacklo_ps(jx1,jy1); \
1063 _tmp7 = _mm_unpackhi_ps(jz1,jx2); \
1064 jz1 = _mm_unpacklo_ps(jz1,jx2); \
1065 _tmp8 = _mm_unpackhi_ps(jy2,jz2); \
1066 jy2 = _mm_unpacklo_ps(jy2,jz2); \
1067 _tmp9 = _mm_movelh_ps(jx1,jz1); \
1068 _tmp10 = _mm_movehl_ps(jz1,jx1); \
1069 _tmp6 = _mm_movelh_ps(_tmp6,_tmp7); \
1070 _tmp1 = _mm_add_ps(_tmp1,_tmp9); \
1071 _tmp2 = _mm_add_ps(_tmp2,_tmp10); \
1072 _tmp3 = _mm_add_ps(_tmp3,_tmp6); \
1073 _tmp4 = _mm_add_ps(_tmp4,jy2); \
1074 _tmp5 = _mm_add_ps(_tmp5,_tmp8); \
1075 _mm_storeu_ps(ptr1,_tmp1); \
1076 _mm_storeu_ps(ptr2,_tmp2); \
1077 _mm_storeu_ps(ptr3,_tmp3); \
1078 _mm_storel_pi((__m64 *)(ptr1+4),_tmp4); \
1079 _mm_storeh_pi((__m64 *)(ptr2+4),_tmp4); \
1080 _mm_storel_pi((__m64 *)(ptr3+4),_tmp5); \
1084 #define GMX_MM_INCREMENT_3RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
1085 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
1086 __m128 _tmp11,_tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19; \
1087 _tmp1 = _mm_loadu_ps(ptr1); \
1088 _tmp2 = _mm_loadu_ps(ptr1+4); \
1089 _tmp3 = _mm_load_ss(ptr1+8); \
1090 _tmp4 = _mm_loadu_ps(ptr2); \
1091 _tmp5 = _mm_loadu_ps(ptr2+4); \
1092 _tmp6 = _mm_load_ss(ptr2+8); \
1093 _tmp7 = _mm_loadu_ps(ptr3); \
1094 _tmp8 = _mm_loadu_ps(ptr3+4); \
1095 _tmp9 = _mm_load_ss(ptr3+8); \
1096 _tmp10 = _mm_unpackhi_ps(jx1,jy1); \
1097 jx1 = _mm_unpacklo_ps(jx1,jy1); \
1098 _tmp11 = _mm_unpackhi_ps(jz1,jx2); \
1099 jz1 = _mm_unpacklo_ps(jz1,jx2); \
1100 _tmp12 = _mm_unpackhi_ps(jy2,jz2); \
1101 jy2 = _mm_unpacklo_ps(jy2,jz2); \
1102 _tmp13 = _mm_unpackhi_ps(jx3,jy3); \
1103 jx3 = _mm_unpacklo_ps(jx3,jy3); \
1104 _tmp14 = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1)); \
1105 _tmp15 = _mm_movehl_ps(jz3,jz3); \
1106 _tmp16 = _mm_movelh_ps(jx1,jz1); \
1107 _tmp17 = _mm_movehl_ps(jz1,jx1); \
1108 _tmp10 = _mm_movelh_ps(_tmp10,_tmp11); \
1109 _tmp18 = _mm_movelh_ps(jy2,jx3); \
1110 _tmp19 = _mm_movehl_ps(jx3,jy2); \
1111 _tmp12 = _mm_movelh_ps(_tmp12,_tmp13); \
1112 _tmp1 = _mm_add_ps(_tmp1,_tmp16); \
1113 _tmp2 = _mm_add_ps(_tmp2,_tmp18); \
1114 _tmp3 = _mm_add_ss(_tmp3,jz3); \
1115 _tmp4 = _mm_add_ps(_tmp4,_tmp17); \
1116 _tmp5 = _mm_add_ps(_tmp5,_tmp19); \
1117 _tmp6 = _mm_add_ss(_tmp6,_tmp14); \
1118 _tmp7 = _mm_add_ps(_tmp7,_tmp10); \
1119 _tmp8 = _mm_add_ps(_tmp8,_tmp12); \
1120 _tmp9 = _mm_add_ss(_tmp9,_tmp15); \
1121 _mm_storeu_ps(ptr1,_tmp1); \
1122 _mm_storeu_ps(ptr1+4,_tmp2); \
1123 _mm_store_ss(ptr1+8,_tmp3); \
1124 _mm_storeu_ps(ptr2,_tmp4); \
1125 _mm_storeu_ps(ptr2+4,_tmp5); \
1126 _mm_store_ss(ptr2+8,_tmp6); \
1127 _mm_storeu_ps(ptr3,_tmp7); \
1128 _mm_storeu_ps(ptr3+4,_tmp8); \
1129 _mm_store_ss(ptr3+8,_tmp9); \
1133 #define GMX_MM_INCREMENT_4RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
1134 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11; \
1135 __m128 _tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19,_tmp20,_tmp21; \
1136 _tmp1 = _mm_loadu_ps(ptr1); \
1137 _tmp2 = _mm_loadu_ps(ptr1+4); \
1138 _tmp3 = _mm_loadu_ps(ptr1+8); \
1139 _tmp4 = _mm_loadu_ps(ptr2); \
1140 _tmp5 = _mm_loadu_ps(ptr2+4); \
1141 _tmp6 = _mm_loadu_ps(ptr2+8); \
1142 _tmp7 = _mm_loadu_ps(ptr3); \
1143 _tmp8 = _mm_loadu_ps(ptr3+4); \
1144 _tmp9 = _mm_loadu_ps(ptr3+8); \
1145 _tmp10 = _mm_unpackhi_ps(jx1,jy1); \
1146 jx1 = _mm_unpacklo_ps(jx1,jy1); \
1147 _tmp11 = _mm_unpackhi_ps(jz1,jx2); \
1148 jz1 = _mm_unpacklo_ps(jz1,jx2); \
1149 _tmp12 = _mm_unpackhi_ps(jy2,jz2); \
1150 jy2 = _mm_unpacklo_ps(jy2,jz2); \
1151 _tmp13 = _mm_unpackhi_ps(jx3,jy3); \
1152 jx3 = _mm_unpacklo_ps(jx3,jy3); \
1153 _tmp14 = _mm_unpackhi_ps(jz3,jx4); \
1154 jz3 = _mm_unpacklo_ps(jz3,jx4); \
1155 _tmp15 = _mm_unpackhi_ps(jy4,jz4); \
1156 jy4 = _mm_unpacklo_ps(jy4,jz4); \
1157 _tmp16 = _mm_movelh_ps(jx1,jz1); \
1158 _tmp17 = _mm_movehl_ps(jz1,jx1); \
1159 _tmp10 = _mm_movelh_ps(_tmp10,_tmp11); \
1160 _tmp18 = _mm_movelh_ps(jy2,jx3); \
1161 _tmp19 = _mm_movehl_ps(jx3,jy2); \
1162 _tmp12 = _mm_movelh_ps(_tmp12,_tmp13); \
1163 _tmp20 = _mm_movelh_ps(jz3,jy4); \
1164 _tmp21 = _mm_movehl_ps(jy4,jz3); \
1165 _tmp14 = _mm_movelh_ps(_tmp14,_tmp15); \
1166 _tmp1 = _mm_add_ps(_tmp1,_tmp16); \
1167 _tmp2 = _mm_add_ps(_tmp2,_tmp18); \
1168 _tmp3 = _mm_add_ps(_tmp3,_tmp20); \
1169 _tmp4 = _mm_add_ps(_tmp4,_tmp17); \
1170 _tmp5 = _mm_add_ps(_tmp5,_tmp19); \
1171 _tmp6 = _mm_add_ps(_tmp6,_tmp21); \
1172 _tmp7 = _mm_add_ps(_tmp7,_tmp10); \
1173 _tmp8 = _mm_add_ps(_tmp8,_tmp12); \
1174 _tmp9 = _mm_add_ps(_tmp9,_tmp14); \
1175 _mm_storeu_ps(ptr1,_tmp1); \
1176 _mm_storeu_ps(ptr1+4,_tmp2); \
1177 _mm_storeu_ps(ptr1+8,_tmp3); \
1178 _mm_storeu_ps(ptr2,_tmp4); \
1179 _mm_storeu_ps(ptr2+4,_tmp5); \
1180 _mm_storeu_ps(ptr2+8,_tmp6); \
1181 _mm_storeu_ps(ptr3,_tmp7); \
1182 _mm_storeu_ps(ptr3+4,_tmp8); \
1183 _mm_storeu_ps(ptr3+8,_tmp9); \
1188 #define GMX_MM_INCREMENT_1RVEC_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1) { \
1189 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
1190 _tmp1 = _mm_load_ss(ptr1); \
1191 _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)(ptr1+1)); \
1192 _tmp2 = _mm_load_ss(ptr2); \
1193 _tmp2 = _mm_loadh_pi(_tmp2,(__m64 *)(ptr2+1)); \
1194 _tmp3 = _mm_load_ss(ptr3); \
1195 _tmp3 = _mm_loadh_pi(_tmp3,(__m64 *)(ptr3+1)); \
1196 _tmp4 = _mm_load_ss(ptr4); \
1197 _tmp4 = _mm_loadh_pi(_tmp4,(__m64 *)(ptr4+1)); \
1198 _tmp5 = _mm_unpacklo_ps(jy1,jz1); \
1199 _tmp6 = _mm_unpackhi_ps(jy1,jz1); \
1200 _tmp7 = _mm_shuffle_ps(jx1,_tmp5,_MM_SHUFFLE(1,0,0,0)); \
1201 _tmp8 = _mm_shuffle_ps(jx1,_tmp5,_MM_SHUFFLE(3,2,0,1)); \
1202 _tmp9 = _mm_shuffle_ps(jx1,_tmp6,_MM_SHUFFLE(1,0,0,2)); \
1203 _tmp10 = _mm_shuffle_ps(jx1,_tmp6,_MM_SHUFFLE(3,2,0,3)); \
1204 _tmp1 = _mm_add_ps(_tmp1,_tmp7); \
1205 _tmp2 = _mm_add_ps(_tmp2,_tmp8); \
1206 _tmp3 = _mm_add_ps(_tmp3,_tmp9); \
1207 _tmp4 = _mm_add_ps(_tmp4,_tmp10); \
1208 _mm_store_ss(ptr1,_tmp1); \
1209 _mm_storeh_pi((__m64 *)(ptr1+1),_tmp1); \
1210 _mm_store_ss(ptr2,_tmp2); \
1211 _mm_storeh_pi((__m64 *)(ptr2+1),_tmp2); \
1212 _mm_store_ss(ptr3,_tmp3); \
1213 _mm_storeh_pi((__m64 *)(ptr3+1),_tmp3); \
1214 _mm_store_ss(ptr4,_tmp4); \
1215 _mm_storeh_pi((__m64 *)(ptr4+1),_tmp4); \
1219 #define GMX_MM_INCREMENT_2RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2) { \
1220 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11,_tmp12,_tmp13; \
1221 _tmp1 = _mm_loadu_ps(ptr1); \
1222 _tmp2 = _mm_loadu_ps(ptr2); \
1223 _tmp3 = _mm_loadu_ps(ptr3); \
1224 _tmp4 = _mm_loadu_ps(ptr4); \
1225 _tmp5 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4)); \
1226 _tmp5 = _mm_loadh_pi(_tmp5,(__m64 *)(ptr2+4)); \
1227 _tmp6 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr3+4)); \
1228 _tmp6 = _mm_loadh_pi(_tmp6,(__m64 *)(ptr4+4)); \
1229 _tmp7 = _mm_unpackhi_ps(jx1,jy1); \
1230 jx1 = _mm_unpacklo_ps(jx1,jy1); \
1231 _tmp8 = _mm_unpackhi_ps(jz1,jx2); \
1232 jz1 = _mm_unpacklo_ps(jz1,jx2); \
1233 _tmp9 = _mm_unpackhi_ps(jy2,jz2); \
1234 jy2 = _mm_unpacklo_ps(jy2,jz2); \
1235 _tmp10 = _mm_movelh_ps(jx1,jz1); \
1236 _tmp11 = _mm_movehl_ps(jz1,jx1); \
1237 _tmp12 = _mm_movelh_ps(_tmp7,_tmp8); \
1238 _tmp13 = _mm_movehl_ps(_tmp8,_tmp7); \
1239 _tmp1 = _mm_add_ps(_tmp1,_tmp10); \
1240 _tmp2 = _mm_add_ps(_tmp2,_tmp11); \
1241 _tmp3 = _mm_add_ps(_tmp3,_tmp12); \
1242 _tmp4 = _mm_add_ps(_tmp4,_tmp13); \
1243 _tmp5 = _mm_add_ps(_tmp5,jy2); \
1244 _tmp6 = _mm_add_ps(_tmp6,_tmp9); \
1245 _mm_storeu_ps(ptr1,_tmp1); \
1246 _mm_storeu_ps(ptr2,_tmp2); \
1247 _mm_storeu_ps(ptr3,_tmp3); \
1248 _mm_storeu_ps(ptr4,_tmp4); \
1249 _mm_storel_pi((__m64 *)(ptr1+4),_tmp5); \
1250 _mm_storeh_pi((__m64 *)(ptr2+4),_tmp5); \
1251 _mm_storel_pi((__m64 *)(ptr3+4),_tmp6); \
1252 _mm_storeh_pi((__m64 *)(ptr4+4),_tmp6); \
1256 #define GMX_MM_INCREMENT_3RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
1257 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
1258 __m128 _tmp11,_tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19; \
1259 __m128 _tmp20,_tmp21,_tmp22,_tmp23,_tmp24,_tmp25; \
1260 _tmp1 = _mm_loadu_ps(ptr1); \
1261 _tmp2 = _mm_loadu_ps(ptr1+4); \
1262 _tmp3 = _mm_load_ss(ptr1+8); \
1263 _tmp4 = _mm_loadu_ps(ptr2); \
1264 _tmp5 = _mm_loadu_ps(ptr2+4); \
1265 _tmp6 = _mm_load_ss(ptr2+8); \
1266 _tmp7 = _mm_loadu_ps(ptr3); \
1267 _tmp8 = _mm_loadu_ps(ptr3+4); \
1268 _tmp9 = _mm_load_ss(ptr3+8); \
1269 _tmp10 = _mm_loadu_ps(ptr4); \
1270 _tmp11 = _mm_loadu_ps(ptr4+4); \
1271 _tmp12 = _mm_load_ss(ptr4+8); \
1272 _tmp13 = _mm_unpackhi_ps(jx1,jy1); \
1273 jx1 = _mm_unpacklo_ps(jx1,jy1); \
1274 _tmp14 = _mm_unpackhi_ps(jz1,jx2); \
1275 jz1 = _mm_unpacklo_ps(jz1,jx2); \
1276 _tmp15 = _mm_unpackhi_ps(jy2,jz2); \
1277 jy2 = _mm_unpacklo_ps(jy2,jz2); \
1278 _tmp16 = _mm_unpackhi_ps(jx3,jy3); \
1279 jx3 = _mm_unpacklo_ps(jx3,jy3); \
1280 _tmp17 = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1)); \
1281 _tmp18 = _mm_movehl_ps(jz3,jz3); \
1282 _tmp19 = _mm_shuffle_ps(_tmp18,_tmp18,_MM_SHUFFLE(0,0,0,1)); \
1283 _tmp20 = _mm_movelh_ps(jx1,jz1); \
1284 _tmp21 = _mm_movehl_ps(jz1,jx1); \
1285 _tmp22 = _mm_movelh_ps(_tmp13,_tmp14); \
1286 _tmp14 = _mm_movehl_ps(_tmp14,_tmp13); \
1287 _tmp23 = _mm_movelh_ps(jy2,jx3); \
1288 _tmp24 = _mm_movehl_ps(jx3,jy2); \
1289 _tmp25 = _mm_movelh_ps(_tmp15,_tmp16); \
1290 _tmp16 = _mm_movehl_ps(_tmp16,_tmp15); \
1291 _tmp1 = _mm_add_ps(_tmp1,_tmp20); \
1292 _tmp2 = _mm_add_ps(_tmp2,_tmp23); \
1293 _tmp3 = _mm_add_ss(_tmp3,jz3); \
1294 _tmp4 = _mm_add_ps(_tmp4,_tmp21); \
1295 _tmp5 = _mm_add_ps(_tmp5,_tmp24); \
1296 _tmp6 = _mm_add_ss(_tmp6,_tmp17); \
1297 _tmp7 = _mm_add_ps(_tmp7,_tmp22); \
1298 _tmp8 = _mm_add_ps(_tmp8,_tmp25); \
1299 _tmp9 = _mm_add_ss(_tmp9,_tmp18); \
1300 _tmp10 = _mm_add_ps(_tmp10,_tmp14); \
1301 _tmp11 = _mm_add_ps(_tmp11,_tmp16); \
1302 _tmp12 = _mm_add_ss(_tmp12,_tmp19); \
1303 _mm_storeu_ps(ptr1,_tmp1); \
1304 _mm_storeu_ps(ptr1+4,_tmp2); \
1305 _mm_store_ss(ptr1+8,_tmp3); \
1306 _mm_storeu_ps(ptr2,_tmp4); \
1307 _mm_storeu_ps(ptr2+4,_tmp5); \
1308 _mm_store_ss(ptr2+8,_tmp6); \
1309 _mm_storeu_ps(ptr3,_tmp7); \
1310 _mm_storeu_ps(ptr3+4,_tmp8); \
1311 _mm_store_ss(ptr3+8,_tmp9); \
1312 _mm_storeu_ps(ptr4,_tmp10); \
1313 _mm_storeu_ps(ptr4+4,_tmp11); \
1314 _mm_store_ss(ptr4+8,_tmp12); \
1318 #define GMX_MM_INCREMENT_4RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
1319 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11; \
1320 __m128 _tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19,_tmp20,_tmp21,_tmp22; \
1321 __m128 _tmp23,_tmp24; \
1322 _tmp1 = _mm_loadu_ps(ptr1); \
1323 _tmp2 = _mm_loadu_ps(ptr1+4); \
1324 _tmp3 = _mm_loadu_ps(ptr1+8); \
1325 _tmp4 = _mm_loadu_ps(ptr2); \
1326 _tmp5 = _mm_loadu_ps(ptr2+4); \
1327 _tmp6 = _mm_loadu_ps(ptr2+8); \
1328 _tmp7 = _mm_loadu_ps(ptr3); \
1329 _tmp8 = _mm_loadu_ps(ptr3+4); \
1330 _tmp9 = _mm_loadu_ps(ptr3+8); \
1331 _tmp10 = _mm_loadu_ps(ptr4); \
1332 _tmp11 = _mm_loadu_ps(ptr4+4); \
1333 _tmp12 = _mm_loadu_ps(ptr4+8); \
1334 _tmp13 = _mm_unpackhi_ps(jx1,jy1); \
1335 jx1 = _mm_unpacklo_ps(jx1,jy1); \
1336 _tmp14 = _mm_unpackhi_ps(jz1,jx2); \
1337 jz1 = _mm_unpacklo_ps(jz1,jx2); \
1338 _tmp15 = _mm_unpackhi_ps(jy2,jz2); \
1339 jy2 = _mm_unpacklo_ps(jy2,jz2); \
1340 _tmp16 = _mm_unpackhi_ps(jx3,jy3); \
1341 jx3 = _mm_unpacklo_ps(jx3,jy3); \
1342 _tmp17 = _mm_unpackhi_ps(jz3,jx4); \
1343 jz3 = _mm_unpacklo_ps(jz3,jx4); \
1344 _tmp18 = _mm_unpackhi_ps(jy4,jz4); \
1345 jy4 = _mm_unpacklo_ps(jy4,jz4); \
1346 _tmp19 = _mm_movelh_ps(jx1,jz1); \
1347 jz1 = _mm_movehl_ps(jz1,jx1); \
1348 _tmp20 = _mm_movelh_ps(_tmp13,_tmp14); \
1349 _tmp14 = _mm_movehl_ps(_tmp14,_tmp13); \
1350 _tmp21 = _mm_movelh_ps(jy2,jx3); \
1351 jx3 = _mm_movehl_ps(jx3,jy2); \
1352 _tmp22 = _mm_movelh_ps(_tmp15,_tmp16); \
1353 _tmp16 = _mm_movehl_ps(_tmp16,_tmp15); \
1354 _tmp23 = _mm_movelh_ps(jz3,jy4); \
1355 jy4 = _mm_movehl_ps(jy4,jz3); \
1356 _tmp24 = _mm_movelh_ps(_tmp17,_tmp18); \
1357 _tmp18 = _mm_movehl_ps(_tmp18,_tmp17); \
1358 _tmp1 = _mm_add_ps(_tmp1,_tmp19); \
1359 _tmp2 = _mm_add_ps(_tmp2,_tmp21); \
1360 _tmp3 = _mm_add_ps(_tmp3,_tmp23); \
1361 _tmp4 = _mm_add_ps(_tmp4,jz1); \
1362 _tmp5 = _mm_add_ps(_tmp5,jx3); \
1363 _tmp6 = _mm_add_ps(_tmp6,jy4); \
1364 _tmp7 = _mm_add_ps(_tmp7,_tmp20); \
1365 _tmp8 = _mm_add_ps(_tmp8,_tmp22); \
1366 _tmp9 = _mm_add_ps(_tmp9,_tmp24); \
1367 _tmp10 = _mm_add_ps(_tmp10,_tmp14); \
1368 _tmp11 = _mm_add_ps(_tmp11,_tmp16); \
1369 _tmp12 = _mm_add_ps(_tmp12,_tmp18); \
1370 _mm_storeu_ps(ptr1,_tmp1); \
1371 _mm_storeu_ps(ptr1+4,_tmp2); \
1372 _mm_storeu_ps(ptr1+8,_tmp3); \
1373 _mm_storeu_ps(ptr2,_tmp4); \
1374 _mm_storeu_ps(ptr2+4,_tmp5); \
1375 _mm_storeu_ps(ptr2+8,_tmp6); \
1376 _mm_storeu_ps(ptr3,_tmp7); \
1377 _mm_storeu_ps(ptr3+4,_tmp8); \
1378 _mm_storeu_ps(ptr3+8,_tmp9); \
1379 _mm_storeu_ps(ptr4,_tmp10); \
1380 _mm_storeu_ps(ptr4+4,_tmp11); \
1381 _mm_storeu_ps(ptr4+8,_tmp12); \
#define GMX_MM_DECREMENT_1RVEC_1POINTER_PS(ptr1,jx1,jy1,jz1) { \
    __m128 _tmp1; \
    jy1 = _mm_unpacklo_ps(jy1,jz1); \
1389 jx1 = _mm_movelh_ps(jx1,jy1); \
1390 _tmp1 = _mm_load_ss(ptr1); \
1391 _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)(ptr1+1)); \
1392 _tmp1 = _mm_sub_ps(_tmp1,jx1); \
1393 _mm_store_ss(ptr1,_tmp1); \
    _mm_storeh_pi((__m64 *)(ptr1+1),_tmp1); \
}
1398 #define GMX_MM_DECREMENT_2RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2) { \
1399 __m128 _tmp1, _tmp2; \
1400 _tmp1 = _mm_loadu_ps(ptr1); \
1401 _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4)); \
1402 jx1 = _mm_unpacklo_ps(jx1,jy1); \
1403 jz1 = _mm_unpacklo_ps(jz1,jx2); \
1404 jy2 = _mm_unpacklo_ps(jy2,jz2); \
1405 jx1 = _mm_movelh_ps(jx1,jz1); \
1406 _tmp1 = _mm_sub_ps(_tmp1,jx1); \
1407 _tmp2 = _mm_sub_ps(_tmp2,jy2); \
1408 _mm_storeu_ps(ptr1,_tmp1); \
1409 _mm_storel_pi((__m64 *)(ptr1+4),_tmp2); \
1413 #define GMX_MM_DECREMENT_3RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
1414 __m128 _tmp1, _tmp2, _tmp3; \
1415 _tmp1 = _mm_loadu_ps(ptr1); \
1416 _tmp2 = _mm_loadu_ps(ptr1+4); \
1417 _tmp3 = _mm_load_ss(ptr1+8); \
1418 jx1 = _mm_unpacklo_ps(jx1,jy1); \
1419 jz1 = _mm_unpacklo_ps(jz1,jx2); \
1420 jy2 = _mm_unpacklo_ps(jy2,jz2); \
1421 jx3 = _mm_unpacklo_ps(jx3,jy3); \
1422 jx1 = _mm_movelh_ps(jx1,jz1); \
1423 jy2 = _mm_movelh_ps(jy2,jx3); \
1424 _tmp1 = _mm_sub_ps(_tmp1,jx1); \
1425 _tmp2 = _mm_sub_ps(_tmp2,jy2); \
1426 _tmp3 = _mm_sub_ss(_tmp3,jz3); \
1427 _mm_storeu_ps(ptr1,_tmp1); \
1428 _mm_storeu_ps(ptr1+4,_tmp2); \
1429 _mm_store_ss(ptr1+8,_tmp3); \
1433 #define GMX_MM_DECREMENT_4RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
1434 __m128 _tmp1, _tmp2, _tmp3; \
1435 _tmp1 = _mm_loadu_ps(ptr1); \
1436 _tmp2 = _mm_loadu_ps(ptr1+4); \
1437 _tmp3 = _mm_loadu_ps(ptr1+8); \
1438 jx1 = _mm_unpacklo_ps(jx1,jy1); \
1439 jz1 = _mm_unpacklo_ps(jz1,jx2); \
1440 jy2 = _mm_unpacklo_ps(jy2,jz2); \
1441 jx3 = _mm_unpacklo_ps(jx3,jy3); \
1442 jz3 = _mm_unpacklo_ps(jz3,jx4); \
1443 jy4 = _mm_unpacklo_ps(jy4,jz4); \
1444 jx1 = _mm_movelh_ps(jx1,jz1); \
1445 jy2 = _mm_movelh_ps(jy2,jx3); \
1446 jz3 = _mm_movelh_ps(jz3,jy4); \
1447 _tmp1 = _mm_sub_ps(_tmp1,jx1); \
1448 _tmp2 = _mm_sub_ps(_tmp2,jy2); \
1449 _tmp3 = _mm_sub_ps(_tmp3,jz3); \
1450 _mm_storeu_ps(ptr1,_tmp1); \
1451 _mm_storeu_ps(ptr1+4,_tmp2); \
1452 _mm_storeu_ps(ptr1+8,_tmp3); \
1456 #define GMX_MM_DECREMENT_1RVEC_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1) { \
1457 __m128 _tmp1,_tmp2,_tmp3,_tmp4; \
1458 _tmp1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1)); \
1459 _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)(ptr2)); \
1460 _tmp2 = _mm_load_ss(ptr1+2); \
1461 _tmp3 = _mm_load_ss(ptr2+2); \
1462 jx1 = _mm_unpacklo_ps(jx1,jy1); \
1463 _tmp4 = _mm_shuffle_ps(jz1,jz1,_MM_SHUFFLE(0,0,0,1)); \
1464 _tmp1 = _mm_sub_ps(_tmp1,jx1); \
1465 _mm_storel_pi((__m64 *)(ptr1),_tmp1); \
1466 _mm_storeh_pi((__m64 *)(ptr2),_tmp1); \
1467 _mm_store_ss(ptr1+2,_mm_sub_ss(_tmp2,jz1)); \
1468 _mm_store_ss(ptr2+2,_mm_sub_ss(_tmp3,_tmp4)); \
1472 #define GMX_MM_DECREMENT_2RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2) { \
1473 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5; \
1474 _tmp1 = _mm_loadu_ps(ptr1); \
1475 _tmp2 = _mm_loadu_ps(ptr2); \
1476 _tmp3 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4)); \
1477 _tmp3 = _mm_loadh_pi(_tmp3,(__m64 *)(ptr2+4)); \
1478 jx1 = _mm_unpacklo_ps(jx1,jy1); \
1479 jz1 = _mm_unpacklo_ps(jz1,jx2); \
1480 jy2 = _mm_unpacklo_ps(jy2,jz2); \
1481 _tmp4 = _mm_movelh_ps(jx1,jz1); \
1482 _tmp5 = _mm_movehl_ps(jz1,jx1); \
1483 _tmp1 = _mm_sub_ps(_tmp1,_tmp4); \
1484 _tmp2 = _mm_sub_ps(_tmp2,_tmp5); \
1485 _tmp3 = _mm_sub_ps(_tmp3,jy2); \
1486 _mm_storeu_ps(ptr1,_tmp1); \
1487 _mm_storeu_ps(ptr2,_tmp2); \
1488 _mm_storel_pi((__m64 *)(ptr1+4),_tmp3); \
1489 _mm_storeh_pi((__m64 *)(ptr2+4),_tmp3); \
1493 #define GMX_MM_DECREMENT_3RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) {\
1494 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11; \
1495 _tmp1 = _mm_loadu_ps(ptr1); \
1496 _tmp2 = _mm_loadu_ps(ptr1+4); \
1497 _tmp3 = _mm_load_ss(ptr1+8); \
1498 _tmp4 = _mm_loadu_ps(ptr2); \
1499 _tmp5 = _mm_loadu_ps(ptr2+4); \
1500 _tmp6 = _mm_load_ss(ptr2+8); \
1501 jx1 = _mm_unpacklo_ps(jx1,jy1); \
1502 jz1 = _mm_unpacklo_ps(jz1,jx2); \
1503 jy2 = _mm_unpacklo_ps(jy2,jz2); \
1504 jx3 = _mm_unpacklo_ps(jx3,jy3); \
1505 _tmp7 = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1)); \
1506 _tmp8 = _mm_movelh_ps(jx1,jz1); \
1507 _tmp9 = _mm_movehl_ps(jz1,jx1); \
1508 _tmp10 = _mm_movelh_ps(jy2,jx3); \
1509 _tmp11 = _mm_movehl_ps(jx3,jy2); \
1510 _tmp1 = _mm_sub_ps(_tmp1,_tmp8); \
1511 _tmp2 = _mm_sub_ps(_tmp2,_tmp10); \
1512 _tmp3 = _mm_sub_ss(_tmp3,jz3); \
1513 _tmp4 = _mm_sub_ps(_tmp4,_tmp9); \
1514 _tmp5 = _mm_sub_ps(_tmp5,_tmp11); \
1515 _tmp6 = _mm_sub_ss(_tmp6,_tmp7); \
1516 _mm_storeu_ps(ptr1,_tmp1); \
1517 _mm_storeu_ps(ptr1+4,_tmp2); \
1518 _mm_store_ss(ptr1+8,_tmp3); \
1519 _mm_storeu_ps(ptr2,_tmp4); \
1520 _mm_storeu_ps(ptr2+4,_tmp5); \
1521 _mm_store_ss(ptr2+8,_tmp6); \
1525 #define GMX_MM_DECREMENT_4RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) {\
1526 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11,_tmp12,_tmp13; \
1527 _tmp1 = _mm_loadu_ps(ptr1); \
1528 _tmp2 = _mm_loadu_ps(ptr1+4); \
1529 _tmp3 = _mm_loadu_ps(ptr1+8); \
1530 _tmp4 = _mm_loadu_ps(ptr2); \
1531 _tmp5 = _mm_loadu_ps(ptr2+4); \
1532 _tmp6 = _mm_loadu_ps(ptr2+8); \
1533 jx1 = _mm_unpacklo_ps(jx1,jy1); \
1534 jz1 = _mm_unpacklo_ps(jz1,jx2); \
1535 jy2 = _mm_unpacklo_ps(jy2,jz2); \
1536 jx3 = _mm_unpacklo_ps(jx3,jy3); \
1537 jz3 = _mm_unpacklo_ps(jz3,jx4); \
1538 jy4 = _mm_unpacklo_ps(jy4,jz4); \
1539 _tmp8 = _mm_movelh_ps(jx1,jz1); \
1540 _tmp9 = _mm_movehl_ps(jz1,jx1); \
1541 _tmp10 = _mm_movelh_ps(jy2,jx3); \
1542 _tmp11 = _mm_movehl_ps(jx3,jy2); \
1543 _tmp12 = _mm_movelh_ps(jz3,jy4); \
1544 _tmp13 = _mm_movehl_ps(jy4,jz3); \
1545 _tmp1 = _mm_sub_ps(_tmp1,_tmp8); \
1546 _tmp2 = _mm_sub_ps(_tmp2,_tmp10); \
1547 _tmp3 = _mm_sub_ps(_tmp3,_tmp12); \
1548 _tmp4 = _mm_sub_ps(_tmp4,_tmp9); \
1549 _tmp5 = _mm_sub_ps(_tmp5,_tmp11); \
1550 _tmp6 = _mm_sub_ps(_tmp6,_tmp13); \
1551 _mm_storeu_ps(ptr1,_tmp1); \
1552 _mm_storeu_ps(ptr1+4,_tmp2); \
1553 _mm_storeu_ps(ptr1+8,_tmp3); \
1554 _mm_storeu_ps(ptr2,_tmp4); \
1555 _mm_storeu_ps(ptr2+4,_tmp5); \
1556 _mm_storeu_ps(ptr2+8,_tmp6); \
1560 #define GMX_MM_DECREMENT_1RVEC_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1) { \
1561 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7; \
1562 _tmp1 = _mm_load_ss(ptr1); \
1563 _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)(ptr1+1)); \
1564 _tmp2 = _mm_load_ss(ptr2); \
1565 _tmp2 = _mm_loadh_pi(_tmp2,(__m64 *)(ptr2+1)); \
1566 _tmp3 = _mm_load_ss(ptr3); \
1567 _tmp3 = _mm_loadh_pi(_tmp3,(__m64 *)(ptr3+1)); \
1568 _tmp4 = _mm_unpacklo_ps(jy1,jz1); \
1569 _tmp5 = _mm_unpackhi_ps(jy1,jz1); \
1570 _tmp6 = _mm_shuffle_ps(jx1,_tmp4,_MM_SHUFFLE(3,2,0,1)); \
1571 _tmp7 = _mm_shuffle_ps(jx1,jx1,_MM_SHUFFLE(0,0,0,2)); \
1572 jx1 = _mm_movelh_ps(jx1,_tmp4); \
1573 _tmp7 = _mm_movelh_ps(_tmp7,_tmp5); \
1574 _tmp1 = _mm_sub_ps(_tmp1,jx1); \
1575 _tmp2 = _mm_sub_ps(_tmp2,_tmp6); \
1576 _tmp3 = _mm_sub_ps(_tmp3,_tmp7); \
1577 _mm_store_ss(ptr1,_tmp1); \
1578 _mm_storeh_pi((__m64 *)(ptr1+1),_tmp1); \
1579 _mm_store_ss(ptr2,_tmp2); \
1580 _mm_storeh_pi((__m64 *)(ptr2+1),_tmp2); \
1581 _mm_store_ss(ptr3,_tmp3); \
    _mm_storeh_pi((__m64 *)(ptr3+1),_tmp3); \
}
1586 #define GMX_MM_DECREMENT_2RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2) { \
1587 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
1588 _tmp1 = _mm_loadu_ps(ptr1); \
1589 _tmp2 = _mm_loadu_ps(ptr2); \
1590 _tmp3 = _mm_loadu_ps(ptr3); \
    _tmp4 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4)); \
    _tmp4 = _mm_loadh_pi(_tmp4,(__m64 *)(ptr2+4)); \
    _tmp5 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr3+4)); \
1594 _tmp6 = _mm_unpackhi_ps(jx1,jy1); \
1595 jx1 = _mm_unpacklo_ps(jx1,jy1); \
1596 _tmp7 = _mm_unpackhi_ps(jz1,jx2); \
1597 jz1 = _mm_unpacklo_ps(jz1,jx2); \
1598 _tmp8 = _mm_unpackhi_ps(jy2,jz2); \
1599 jy2 = _mm_unpacklo_ps(jy2,jz2); \
1600 _tmp9 = _mm_movelh_ps(jx1,jz1); \
1601 _tmp10 = _mm_movehl_ps(jz1,jx1); \
1602 _tmp6 = _mm_movelh_ps(_tmp6,_tmp7); \
1603 _tmp1 = _mm_sub_ps(_tmp1,_tmp9); \
1604 _tmp2 = _mm_sub_ps(_tmp2,_tmp10); \
1605 _tmp3 = _mm_sub_ps(_tmp3,_tmp6); \
1606 _tmp4 = _mm_sub_ps(_tmp4,jy2); \
1607 _tmp5 = _mm_sub_ps(_tmp5,_tmp8); \
1608 _mm_storeu_ps(ptr1,_tmp1); \
1609 _mm_storeu_ps(ptr2,_tmp2); \
1610 _mm_storeu_ps(ptr3,_tmp3); \
1611 _mm_storel_pi((__m64 *)(ptr1+4),_tmp4); \
1612 _mm_storeh_pi((__m64 *)(ptr2+4),_tmp4); \
    _mm_storel_pi((__m64 *)(ptr3+4),_tmp5); \
}
1617 #define GMX_MM_DECREMENT_3RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
1618 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
1619 __m128 _tmp11,_tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19; \
1620 _tmp1 = _mm_loadu_ps(ptr1); \
1621 _tmp2 = _mm_loadu_ps(ptr1+4); \
1622 _tmp3 = _mm_load_ss(ptr1+8); \
1623 _tmp4 = _mm_loadu_ps(ptr2); \
1624 _tmp5 = _mm_loadu_ps(ptr2+4); \
1625 _tmp6 = _mm_load_ss(ptr2+8); \
1626 _tmp7 = _mm_loadu_ps(ptr3); \
1627 _tmp8 = _mm_loadu_ps(ptr3+4); \
1628 _tmp9 = _mm_load_ss(ptr3+8); \
1629 _tmp10 = _mm_unpackhi_ps(jx1,jy1); \
1630 jx1 = _mm_unpacklo_ps(jx1,jy1); \
1631 _tmp11 = _mm_unpackhi_ps(jz1,jx2); \
1632 jz1 = _mm_unpacklo_ps(jz1,jx2); \
1633 _tmp12 = _mm_unpackhi_ps(jy2,jz2); \
1634 jy2 = _mm_unpacklo_ps(jy2,jz2); \
1635 _tmp13 = _mm_unpackhi_ps(jx3,jy3); \
1636 jx3 = _mm_unpacklo_ps(jx3,jy3); \
1637 _tmp14 = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1)); \
1638 _tmp15 = _mm_movehl_ps(jz3,jz3); \
1639 _tmp16 = _mm_movelh_ps(jx1,jz1); \
1640 _tmp17 = _mm_movehl_ps(jz1,jx1); \
1641 _tmp10 = _mm_movelh_ps(_tmp10,_tmp11); \
1642 _tmp18 = _mm_movelh_ps(jy2,jx3); \
1643 _tmp19 = _mm_movehl_ps(jx3,jy2); \
1644 _tmp12 = _mm_movelh_ps(_tmp12,_tmp13); \
1645 _tmp1 = _mm_sub_ps(_tmp1,_tmp16); \
1646 _tmp2 = _mm_sub_ps(_tmp2,_tmp18); \
1647 _tmp3 = _mm_sub_ss(_tmp3,jz3); \
1648 _tmp4 = _mm_sub_ps(_tmp4,_tmp17); \
1649 _tmp5 = _mm_sub_ps(_tmp5,_tmp19); \
1650 _tmp6 = _mm_sub_ss(_tmp6,_tmp14); \
1651 _tmp7 = _mm_sub_ps(_tmp7,_tmp10); \
1652 _tmp8 = _mm_sub_ps(_tmp8,_tmp12); \
1653 _tmp9 = _mm_sub_ss(_tmp9,_tmp15); \
1654 _mm_storeu_ps(ptr1,_tmp1); \
1655 _mm_storeu_ps(ptr1+4,_tmp2); \
1656 _mm_store_ss(ptr1+8,_tmp3); \
1657 _mm_storeu_ps(ptr2,_tmp4); \
1658 _mm_storeu_ps(ptr2+4,_tmp5); \
1659 _mm_store_ss(ptr2+8,_tmp6); \
1660 _mm_storeu_ps(ptr3,_tmp7); \
1661 _mm_storeu_ps(ptr3+4,_tmp8); \
    _mm_store_ss(ptr3+8,_tmp9); \
}
1666 #define GMX_MM_DECREMENT_4RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
1667 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11; \
1668 __m128 _tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19,_tmp20,_tmp21; \
1669 _tmp1 = _mm_loadu_ps(ptr1); \
1670 _tmp2 = _mm_loadu_ps(ptr1+4); \
1671 _tmp3 = _mm_loadu_ps(ptr1+8); \
1672 _tmp4 = _mm_loadu_ps(ptr2); \
1673 _tmp5 = _mm_loadu_ps(ptr2+4); \
1674 _tmp6 = _mm_loadu_ps(ptr2+8); \
1675 _tmp7 = _mm_loadu_ps(ptr3); \
1676 _tmp8 = _mm_loadu_ps(ptr3+4); \
1677 _tmp9 = _mm_loadu_ps(ptr3+8); \
1678 _tmp10 = _mm_unpackhi_ps(jx1,jy1); \
1679 jx1 = _mm_unpacklo_ps(jx1,jy1); \
1680 _tmp11 = _mm_unpackhi_ps(jz1,jx2); \
1681 jz1 = _mm_unpacklo_ps(jz1,jx2); \
1682 _tmp12 = _mm_unpackhi_ps(jy2,jz2); \
1683 jy2 = _mm_unpacklo_ps(jy2,jz2); \
1684 _tmp13 = _mm_unpackhi_ps(jx3,jy3); \
1685 jx3 = _mm_unpacklo_ps(jx3,jy3); \
1686 _tmp14 = _mm_unpackhi_ps(jz3,jx4); \
1687 jz3 = _mm_unpacklo_ps(jz3,jx4); \
1688 _tmp15 = _mm_unpackhi_ps(jy4,jz4); \
1689 jy4 = _mm_unpacklo_ps(jy4,jz4); \
1690 _tmp16 = _mm_movelh_ps(jx1,jz1); \
1691 _tmp17 = _mm_movehl_ps(jz1,jx1); \
1692 _tmp10 = _mm_movelh_ps(_tmp10,_tmp11); \
1693 _tmp18 = _mm_movelh_ps(jy2,jx3); \
1694 _tmp19 = _mm_movehl_ps(jx3,jy2); \
1695 _tmp12 = _mm_movelh_ps(_tmp12,_tmp13); \
1696 _tmp20 = _mm_movelh_ps(jz3,jy4); \
1697 _tmp21 = _mm_movehl_ps(jy4,jz3); \
1698 _tmp14 = _mm_movelh_ps(_tmp14,_tmp15); \
1699 _tmp1 = _mm_sub_ps(_tmp1,_tmp16); \
1700 _tmp2 = _mm_sub_ps(_tmp2,_tmp18); \
1701 _tmp3 = _mm_sub_ps(_tmp3,_tmp20); \
1702 _tmp4 = _mm_sub_ps(_tmp4,_tmp17); \
1703 _tmp5 = _mm_sub_ps(_tmp5,_tmp19); \
1704 _tmp6 = _mm_sub_ps(_tmp6,_tmp21); \
1705 _tmp7 = _mm_sub_ps(_tmp7,_tmp10); \
1706 _tmp8 = _mm_sub_ps(_tmp8,_tmp12); \
1707 _tmp9 = _mm_sub_ps(_tmp9,_tmp14); \
1708 _mm_storeu_ps(ptr1,_tmp1); \
1709 _mm_storeu_ps(ptr1+4,_tmp2); \
1710 _mm_storeu_ps(ptr1+8,_tmp3); \
1711 _mm_storeu_ps(ptr2,_tmp4); \
1712 _mm_storeu_ps(ptr2+4,_tmp5); \
1713 _mm_storeu_ps(ptr2+8,_tmp6); \
1714 _mm_storeu_ps(ptr3,_tmp7); \
1715 _mm_storeu_ps(ptr3+4,_tmp8); \
    _mm_storeu_ps(ptr3+8,_tmp9); \
}
1722 #define GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1) { \
1723 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
1724 _tmp1 = _mm_load_ss(ptr1); \
1725 _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)(ptr1+1)); \
1726 _tmp2 = _mm_load_ss(ptr2); \
1727 _tmp2 = _mm_loadh_pi(_tmp2,(__m64 *)(ptr2+1)); \
1728 _tmp3 = _mm_load_ss(ptr3); \
1729 _tmp3 = _mm_loadh_pi(_tmp3,(__m64 *)(ptr3+1)); \
1730 _tmp4 = _mm_load_ss(ptr4); \
1731 _tmp4 = _mm_loadh_pi(_tmp4,(__m64 *)(ptr4+1)); \
1732 _tmp5 = _mm_unpacklo_ps(jy1,jz1); \
1733 _tmp6 = _mm_unpackhi_ps(jy1,jz1); \
1734 _tmp7 = _mm_shuffle_ps(jx1,_tmp5,_MM_SHUFFLE(1,0,0,0)); \
1735 _tmp8 = _mm_shuffle_ps(jx1,_tmp5,_MM_SHUFFLE(3,2,0,1)); \
1736 _tmp9 = _mm_shuffle_ps(jx1,_tmp6,_MM_SHUFFLE(1,0,0,2)); \
1737 _tmp10 = _mm_shuffle_ps(jx1,_tmp6,_MM_SHUFFLE(3,2,0,3)); \
1738 _tmp1 = _mm_sub_ps(_tmp1,_tmp7); \
1739 _tmp2 = _mm_sub_ps(_tmp2,_tmp8); \
1740 _tmp3 = _mm_sub_ps(_tmp3,_tmp9); \
1741 _tmp4 = _mm_sub_ps(_tmp4,_tmp10); \
1742 _mm_store_ss(ptr1,_tmp1); \
1743 _mm_storeh_pi((__m64 *)(ptr1+1),_tmp1); \
1744 _mm_store_ss(ptr2,_tmp2); \
1745 _mm_storeh_pi((__m64 *)(ptr2+1),_tmp2); \
1746 _mm_store_ss(ptr3,_tmp3); \
1747 _mm_storeh_pi((__m64 *)(ptr3+1),_tmp3); \
1748 _mm_store_ss(ptr4,_tmp4); \
    _mm_storeh_pi((__m64 *)(ptr4+1),_tmp4); \
}
1754 #define GMX_MM_DECREMENT_2RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2) { \
1755 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11,_tmp12,_tmp13; \
1756 _tmp1 = _mm_loadu_ps(ptr1); \
1757 _tmp2 = _mm_loadu_ps(ptr2); \
1758 _tmp3 = _mm_loadu_ps(ptr3); \
1759 _tmp4 = _mm_loadu_ps(ptr4); \
1760 _tmp5 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4)); \
1761 _tmp5 = _mm_loadh_pi(_tmp5,(__m64 *)(ptr2+4)); \
1762 _tmp6 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr3+4)); \
1763 _tmp6 = _mm_loadh_pi(_tmp6,(__m64 *)(ptr4+4)); \
1764 _tmp7 = _mm_unpackhi_ps(jx1,jy1); \
1765 jx1 = _mm_unpacklo_ps(jx1,jy1); \
1766 _tmp8 = _mm_unpackhi_ps(jz1,jx2); \
1767 jz1 = _mm_unpacklo_ps(jz1,jx2); \
1768 _tmp9 = _mm_unpackhi_ps(jy2,jz2); \
1769 jy2 = _mm_unpacklo_ps(jy2,jz2); \
1770 _tmp10 = _mm_movelh_ps(jx1,jz1); \
1771 _tmp11 = _mm_movehl_ps(jz1,jx1); \
1772 _tmp12 = _mm_movelh_ps(_tmp7,_tmp8); \
1773 _tmp13 = _mm_movehl_ps(_tmp8,_tmp7); \
1774 _tmp1 = _mm_sub_ps(_tmp1,_tmp10); \
1775 _tmp2 = _mm_sub_ps(_tmp2,_tmp11); \
1776 _tmp3 = _mm_sub_ps(_tmp3,_tmp12); \
1777 _tmp4 = _mm_sub_ps(_tmp4,_tmp13); \
1778 _tmp5 = _mm_sub_ps(_tmp5,jy2); \
1779 _tmp6 = _mm_sub_ps(_tmp6,_tmp9); \
1780 _mm_storeu_ps(ptr1,_tmp1); \
1781 _mm_storeu_ps(ptr2,_tmp2); \
1782 _mm_storeu_ps(ptr3,_tmp3); \
1783 _mm_storeu_ps(ptr4,_tmp4); \
1784 _mm_storel_pi((__m64 *)(ptr1+4),_tmp5); \
1785 _mm_storeh_pi((__m64 *)(ptr2+4),_tmp5); \
1786 _mm_storel_pi((__m64 *)(ptr3+4),_tmp6); \
    _mm_storeh_pi((__m64 *)(ptr4+4),_tmp6); \
}
1791 #define GMX_MM_DECREMENT_3RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
1792 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
1793 __m128 _tmp11,_tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19; \
1794 __m128 _tmp20,_tmp21,_tmp22,_tmp23,_tmp24,_tmp25; \
1795 _tmp1 = _mm_loadu_ps(ptr1); \
1796 _tmp2 = _mm_loadu_ps(ptr1+4); \
1797 _tmp3 = _mm_load_ss(ptr1+8); \
1798 _tmp4 = _mm_loadu_ps(ptr2); \
1799 _tmp5 = _mm_loadu_ps(ptr2+4); \
1800 _tmp6 = _mm_load_ss(ptr2+8); \
1801 _tmp7 = _mm_loadu_ps(ptr3); \
1802 _tmp8 = _mm_loadu_ps(ptr3+4); \
1803 _tmp9 = _mm_load_ss(ptr3+8); \
1804 _tmp10 = _mm_loadu_ps(ptr4); \
1805 _tmp11 = _mm_loadu_ps(ptr4+4); \
1806 _tmp12 = _mm_load_ss(ptr4+8); \
1807 _tmp13 = _mm_unpackhi_ps(jx1,jy1); \
1808 jx1 = _mm_unpacklo_ps(jx1,jy1); \
1809 _tmp14 = _mm_unpackhi_ps(jz1,jx2); \
1810 jz1 = _mm_unpacklo_ps(jz1,jx2); \
1811 _tmp15 = _mm_unpackhi_ps(jy2,jz2); \
1812 jy2 = _mm_unpacklo_ps(jy2,jz2); \
1813 _tmp16 = _mm_unpackhi_ps(jx3,jy3); \
1814 jx3 = _mm_unpacklo_ps(jx3,jy3); \
1815 _tmp17 = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1)); \
1816 _tmp18 = _mm_movehl_ps(jz3,jz3); \
1817 _tmp19 = _mm_shuffle_ps(_tmp18,_tmp18,_MM_SHUFFLE(0,0,0,1)); \
1818 _tmp20 = _mm_movelh_ps(jx1,jz1); \
1819 _tmp21 = _mm_movehl_ps(jz1,jx1); \
1820 _tmp22 = _mm_movelh_ps(_tmp13,_tmp14); \
1821 _tmp14 = _mm_movehl_ps(_tmp14,_tmp13); \
1822 _tmp23 = _mm_movelh_ps(jy2,jx3); \
1823 _tmp24 = _mm_movehl_ps(jx3,jy2); \
1824 _tmp25 = _mm_movelh_ps(_tmp15,_tmp16); \
1825 _tmp16 = _mm_movehl_ps(_tmp16,_tmp15); \
1826 _tmp1 = _mm_sub_ps(_tmp1,_tmp20); \
1827 _tmp2 = _mm_sub_ps(_tmp2,_tmp23); \
1828 _tmp3 = _mm_sub_ss(_tmp3,jz3); \
1829 _tmp4 = _mm_sub_ps(_tmp4,_tmp21); \
1830 _tmp5 = _mm_sub_ps(_tmp5,_tmp24); \
1831 _tmp6 = _mm_sub_ss(_tmp6,_tmp17); \
1832 _tmp7 = _mm_sub_ps(_tmp7,_tmp22); \
1833 _tmp8 = _mm_sub_ps(_tmp8,_tmp25); \
1834 _tmp9 = _mm_sub_ss(_tmp9,_tmp18); \
1835 _tmp10 = _mm_sub_ps(_tmp10,_tmp14); \
1836 _tmp11 = _mm_sub_ps(_tmp11,_tmp16); \
1837 _tmp12 = _mm_sub_ss(_tmp12,_tmp19); \
1838 _mm_storeu_ps(ptr1,_tmp1); \
1839 _mm_storeu_ps(ptr1+4,_tmp2); \
1840 _mm_store_ss(ptr1+8,_tmp3); \
1841 _mm_storeu_ps(ptr2,_tmp4); \
1842 _mm_storeu_ps(ptr2+4,_tmp5); \
1843 _mm_store_ss(ptr2+8,_tmp6); \
1844 _mm_storeu_ps(ptr3,_tmp7); \
1845 _mm_storeu_ps(ptr3+4,_tmp8); \
1846 _mm_store_ss(ptr3+8,_tmp9); \
1847 _mm_storeu_ps(ptr4,_tmp10); \
1848 _mm_storeu_ps(ptr4+4,_tmp11); \
    _mm_store_ss(ptr4+8,_tmp12); \
}
1853 #define GMX_MM_DECREMENT_4RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
1854 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11; \
1855 __m128 _tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19,_tmp20,_tmp21,_tmp22;\
1856 __m128 _tmp23,_tmp24; \
1857 _tmp1 = _mm_loadu_ps(ptr1); \
1858 _tmp2 = _mm_loadu_ps(ptr1+4); \
1859 _tmp3 = _mm_loadu_ps(ptr1+8); \
1860 _tmp4 = _mm_loadu_ps(ptr2); \
1861 _tmp5 = _mm_loadu_ps(ptr2+4); \
1862 _tmp6 = _mm_loadu_ps(ptr2+8); \
1863 _tmp7 = _mm_loadu_ps(ptr3); \
1864 _tmp8 = _mm_loadu_ps(ptr3+4); \
1865 _tmp9 = _mm_loadu_ps(ptr3+8); \
1866 _tmp10 = _mm_loadu_ps(ptr4); \
1867 _tmp11 = _mm_loadu_ps(ptr4+4); \
1868 _tmp12 = _mm_loadu_ps(ptr4+8); \
1869 _tmp13 = _mm_unpackhi_ps(jx1,jy1); \
1870 jx1 = _mm_unpacklo_ps(jx1,jy1); \
1871 _tmp14 = _mm_unpackhi_ps(jz1,jx2); \
1872 jz1 = _mm_unpacklo_ps(jz1,jx2); \
1873 _tmp15 = _mm_unpackhi_ps(jy2,jz2); \
1874 jy2 = _mm_unpacklo_ps(jy2,jz2); \
1875 _tmp16 = _mm_unpackhi_ps(jx3,jy3); \
1876 jx3 = _mm_unpacklo_ps(jx3,jy3); \
1877 _tmp17 = _mm_unpackhi_ps(jz3,jx4); \
1878 jz3 = _mm_unpacklo_ps(jz3,jx4); \
1879 _tmp18 = _mm_unpackhi_ps(jy4,jz4); \
1880 jy4 = _mm_unpacklo_ps(jy4,jz4); \
1881 _tmp19 = _mm_movelh_ps(jx1,jz1); \
1882 jz1 = _mm_movehl_ps(jz1,jx1); \
1883 _tmp20 = _mm_movelh_ps(_tmp13,_tmp14); \
1884 _tmp14 = _mm_movehl_ps(_tmp14,_tmp13); \
1885 _tmp21 = _mm_movelh_ps(jy2,jx3); \
1886 jx3 = _mm_movehl_ps(jx3,jy2); \
1887 _tmp22 = _mm_movelh_ps(_tmp15,_tmp16); \
1888 _tmp16 = _mm_movehl_ps(_tmp16,_tmp15); \
1889 _tmp23 = _mm_movelh_ps(jz3,jy4); \
1890 jy4 = _mm_movehl_ps(jy4,jz3); \
1891 _tmp24 = _mm_movelh_ps(_tmp17,_tmp18); \
1892 _tmp18 = _mm_movehl_ps(_tmp18,_tmp17); \
1893 _tmp1 = _mm_sub_ps(_tmp1,_tmp19); \
1894 _tmp2 = _mm_sub_ps(_tmp2,_tmp21); \
1895 _tmp3 = _mm_sub_ps(_tmp3,_tmp23); \
1896 _tmp4 = _mm_sub_ps(_tmp4,jz1); \
1897 _tmp5 = _mm_sub_ps(_tmp5,jx3); \
1898 _tmp6 = _mm_sub_ps(_tmp6,jy4); \
1899 _tmp7 = _mm_sub_ps(_tmp7,_tmp20); \
1900 _tmp8 = _mm_sub_ps(_tmp8,_tmp22); \
1901 _tmp9 = _mm_sub_ps(_tmp9,_tmp24); \
1902 _tmp10 = _mm_sub_ps(_tmp10,_tmp14); \
1903 _tmp11 = _mm_sub_ps(_tmp11,_tmp16); \
1904 _tmp12 = _mm_sub_ps(_tmp12,_tmp18); \
1905 _mm_storeu_ps(ptr1,_tmp1); \
1906 _mm_storeu_ps(ptr1+4,_tmp2); \
1907 _mm_storeu_ps(ptr1+8,_tmp3); \
1908 _mm_storeu_ps(ptr2,_tmp4); \
1909 _mm_storeu_ps(ptr2+4,_tmp5); \
1910 _mm_storeu_ps(ptr2+8,_tmp6); \
1911 _mm_storeu_ps(ptr3,_tmp7); \
1912 _mm_storeu_ps(ptr3+4,_tmp8); \
1913 _mm_storeu_ps(ptr3+8,_tmp9); \
1914 _mm_storeu_ps(ptr4,_tmp10); \
1915 _mm_storeu_ps(ptr4+4,_tmp11); \
    _mm_storeu_ps(ptr4+8,_tmp12); \
}
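
/* Usage note: the GMX_MM_DECREMENT_*RVECS_*POINTERS_PS macros above subtract
 * the transposed j-forces from 1-4 force arrays in memory, e.g.
 *   GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(f+j3A,f+j3B,f+j3C,f+j3D,fjx,fjy,fjz);
 * (f and j3A..j3D are hypothetical names for the force array and 3*index of
 * each j atom). The pointers may be unaligned; only unaligned or 64-bit
 * loads/stores are used.
 */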
/* Routine to be called with rswitch/rcut at the beginning of a kernel
 * to set up the constants used for the analytic 5th-order switch calculations.
 */
#define GMX_MM_SETUP_SWITCH5_PS(rswitch,rcut,switch_C3,switch_C4,switch_C5,switch_D2,switch_D3,switch_D4) { \
    const __m128 _swsetup_cm6  = { -6.0, -6.0, -6.0, -6.0}; \
    const __m128 _swsetup_cm10 = {-10.0,-10.0,-10.0,-10.0}; \
    const __m128 _swsetup_c15  = { 15.0, 15.0, 15.0, 15.0}; \
    const __m128 _swsetup_cm30 = {-30.0,-30.0,-30.0,-30.0}; \
    const __m128 _swsetup_c60  = { 60.0, 60.0, 60.0, 60.0}; \
    __m128 d,dinv,dinv2,dinv3,dinv4,dinv5; \
    d     = _mm_sub_ps(rcut,rswitch); \
    dinv  = gmx_mm_inv_ps(d); \
    dinv2 = _mm_mul_ps(dinv,dinv); \
    dinv3 = _mm_mul_ps(dinv2,dinv); \
    dinv4 = _mm_mul_ps(dinv2,dinv2); \
    dinv5 = _mm_mul_ps(dinv3,dinv2); \
    switch_C3 = _mm_mul_ps(_swsetup_cm10,dinv3); \
    switch_C4 = _mm_mul_ps(_swsetup_c15,dinv4); \
    switch_C5 = _mm_mul_ps(_swsetup_cm6,dinv5); \
    switch_D2 = _mm_mul_ps(_swsetup_cm30,dinv3); \
    switch_D3 = _mm_mul_ps(_swsetup_c60,dinv4); \
    switch_D4 = _mm_mul_ps(_swsetup_cm30,dinv5); \
}

#define GMX_MM_EVALUATE_SWITCH5_PS(r,rswitch,rcut,sw,dsw,sw_C3,sw_C4,sw_C5,sw_D2,sw_D3,sw_D4) { \
    const __m128 _sw_one = { 1.0, 1.0, 1.0, 1.0}; \
    __m128 d,d2; \
    d   = _mm_max_ps(r,rswitch); \
    d   = _mm_min_ps(d,rcut); \
    d   = _mm_sub_ps(d,rswitch); \
    d2  = _mm_mul_ps(d,d); \
    sw  = _mm_mul_ps(d,sw_C5); \
    dsw = _mm_mul_ps(d,sw_D4); \
    sw  = _mm_add_ps(sw,sw_C4); \
    dsw = _mm_add_ps(dsw,sw_D3); \
    sw  = _mm_mul_ps(sw,d); \
    dsw = _mm_mul_ps(dsw,d); \
    sw  = _mm_add_ps(sw,sw_C3); \
    dsw = _mm_add_ps(dsw,sw_D2); \
    sw  = _mm_mul_ps(sw,_mm_mul_ps(d,d2)); \
    dsw = _mm_mul_ps(dsw,d2); \
    sw  = _mm_add_ps(sw,_sw_one); \
}
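
/* Illustrative sketch (not part of the original kernels, function name is
 * hypothetical): how the two switch macros are typically combined. With
 * d = r - rswitch the macros evaluate
 *   S(r)  = 1 + C3*d^3 + C4*d^4 + C5*d^5   (sw)
 *   dS/dr =     D2*d^2 + D3*d^3 + D4*d^4   (dsw)
 * so a switched potential is V(r)*S(r) and the force picks up -V(r)*dS/dr.
 */
static inline void
gmx_mm_example_switch5_ps(__m128 r, __m128 rswitch, __m128 rcut,
                          __m128 v, __m128 *vtot)
{
    __m128 C3,C4,C5,D2,D3,D4,sw,dsw;

    GMX_MM_SETUP_SWITCH5_PS(rswitch,rcut,C3,C4,C5,D2,D3,D4);
    GMX_MM_EVALUATE_SWITCH5_PS(r,rswitch,rcut,sw,dsw,C3,C4,C5,D2,D3,D4);

    *vtot = _mm_add_ps(*vtot,_mm_mul_ps(v,sw)); /* accumulate V(r)*S(r) */
    (void) dsw;                                 /* dsw would enter the force term */
}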
/* Returns fscaltmp; multiply by rinvsq to get fscal! */
static inline __m128
gmx_mm_interaction_coulomb_ps(__m128 rinv, __m128 qq, __m128 *vctot)
{
    __m128 vcoul = _mm_mul_ps(qq,rinv);

    *vctot = _mm_add_ps(*vctot,vcoul);

    return vcoul;
}
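
/* Minimal usage sketch (illustrative, hypothetical function name): the value
 * returned above is qq*rinv, so the scalar force follows after one multiply.
 */
static inline __m128
gmx_mm_example_coulomb_fscal_ps(__m128 rinv, __m128 rinvsq, __m128 qq, __m128 *vctot)
{
    __m128 fscaltmp = gmx_mm_interaction_coulomb_ps(rinv,qq,vctot);

    return _mm_mul_ps(fscaltmp,rinvsq); /* fscal = qq*rinv^3 */
}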
static inline void
gmx_mm_interaction_coulomb_noforce_ps(__m128 rinv, __m128 qq, __m128 *vctot)
{
    __m128 vcoul = _mm_mul_ps(qq,rinv);

    *vctot = _mm_add_ps(*vctot,vcoul);
}
/* Returns fscaltmp; multiply by rinvsq to get fscal! */
static inline __m128
gmx_mm_interaction_coulombrf_ps(const __m128 rinv, const __m128 rsq, const __m128 krf,
                                const __m128 crf, const __m128 qq, __m128 *vctot)
{
    const __m128 two = {2.0,2.0,2.0,2.0};
    __m128 vcoul,krsq;

    krsq   = _mm_mul_ps(krf,rsq);
    vcoul  = _mm_mul_ps(qq, _mm_sub_ps(_mm_add_ps(rinv,krsq),crf));
    *vctot = _mm_add_ps(*vctot,vcoul);

    return _mm_mul_ps(qq, _mm_sub_ps(rinv, _mm_mul_ps(two,krsq)));
}
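
/* Reaction-field note: the routine above evaluates
 *   vcoul = qq*(rinv + krf*rsq - crf)
 * and returns qq*(rinv - 2*krf*rsq), which is -dV/dr scaled by r for the
 * standard reaction-field Coulomb term, i.e. fscaltmp in the sense used above.
 */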
static inline void
gmx_mm_interaction_coulombrf_noforce_ps(__m128 rinv, __m128 rsq, __m128 krf,
                                        __m128 crf, __m128 qq, __m128 *vctot)
{
    __m128 vcoul,krsq;

    krsq   = _mm_mul_ps(krf,rsq);
    vcoul  = _mm_mul_ps(qq, _mm_sub_ps(_mm_add_ps(rinv,krsq),crf));
    *vctot = _mm_add_ps(*vctot,vcoul);
}
/* Returns fscaltmp; multiply by rinvsq to get fscal! */
static inline __m128
gmx_mm_int_lj_ps(__m128 rinvsq, __m128 c6, __m128 c12, __m128 *vvdwtot)
{
    const __m128 six    = {6.0,6.0,6.0,6.0};
    const __m128 twelve = {12.0,12.0,12.0,12.0};

    __m128 rinvsix,vvdw6,vvdw12;

    rinvsix  = _mm_mul_ps(_mm_mul_ps(rinvsq,rinvsq),rinvsq);
    vvdw6    = _mm_mul_ps(c6,rinvsix);
    vvdw12   = _mm_mul_ps(c12, _mm_mul_ps(rinvsix,rinvsix));
    *vvdwtot = _mm_add_ps(*vvdwtot, _mm_sub_ps(vvdw12,vvdw6));

    return _mm_sub_ps( _mm_mul_ps(twelve,vvdw12), _mm_mul_ps(six,vvdw6));
}
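
/* Illustrative combination (hypothetical helper, not from the original kernels):
 * both gmx_mm_interaction_coulomb_ps() and gmx_mm_int_lj_ps() return values
 * that become the scalar force after a single multiplication by rinvsq.
 */
static inline __m128
gmx_mm_example_coul_lj_fscal_ps(__m128 rinv, __m128 rinvsq, __m128 qq,
                                __m128 c6, __m128 c12,
                                __m128 *vctot, __m128 *vvdwtot)
{
    __m128 fstmp = _mm_add_ps(gmx_mm_interaction_coulomb_ps(rinv,qq,vctot),
                              gmx_mm_int_lj_ps(rinvsq,c6,c12,vvdwtot));

    return _mm_mul_ps(fstmp,rinvsq);
}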
static inline void
gmx_mm_int_lj_potonly_ps(__m128 rinvsq, __m128 c6, __m128 c12, __m128 *vvdwtot)
{
    __m128 rinvsix,vvdw6,vvdw12;

    rinvsix  = _mm_mul_ps(_mm_mul_ps(rinvsq,rinvsq),rinvsq);
    vvdw6    = _mm_mul_ps(c6,rinvsix);
    vvdw12   = _mm_mul_ps(c12, _mm_mul_ps(rinvsix,rinvsix));
    *vvdwtot = _mm_add_ps(*vvdwtot, _mm_sub_ps(vvdw12,vvdw6));
}
/* The returned force should be multiplied by -rinv to get fscal */
static inline __m128
gmx_mm_int_4_table_coulomb_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 *vctot)
{
    __m128  rt,eps,eps2,Y,F,G,H,vcoul;
    __m128i n0;
    int     n_a,n_b,n_c,n_d;

    rt    = _mm_mul_ps(r,tabscale);
    n0    = _mm_cvttps_epi32(rt);
    eps   = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
    eps2  = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a   = gmx_mm_extract_epi32(n0,0);
    n_b   = gmx_mm_extract_epi32(n0,1);
    n_c   = gmx_mm_extract_epi32(n0,2);
    n_d   = gmx_mm_extract_epi32(n0,3);
    Y     = _mm_load_ps(VFtab + 4*n_a);
    F     = _mm_load_ps(VFtab + 4*n_b);
    G     = _mm_load_ps(VFtab + 4*n_c);
    H     = _mm_load_ps(VFtab + 4*n_d);
    _MM_TRANSPOSE4_PS(Y,F,G,H);
    H     = _mm_mul_ps(H,eps2);              /* Heps2 */
    G     = _mm_mul_ps(G,eps);               /* Geps  */
    F     = _mm_add_ps(F, _mm_add_ps(G,H));  /* Fp    */
    vcoul = _mm_mul_ps(qq, _mm_add_ps(Y, _mm_mul_ps(eps,F)));
    *vctot = _mm_add_ps(*vctot,vcoul);

    F = _mm_mul_ps(qq, _mm_add_ps(F, _mm_add_ps(G, _mm_add_ps(H,H))));

    return _mm_mul_ps(F,tabscale);
}
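
/* Table layout note: each table point stores four floats Y,F,G,H so that,
 * with eps the fractional distance into the table interval,
 *   VV = Y + eps*(F + eps*(G + eps*H))   (potential)
 *   FF = F + 2*eps*G + 3*eps^2*H         (scaled derivative)
 * which is exactly what the Geps/Heps2/Fp arithmetic above evaluates.
 */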
/* The returned force should be multiplied by -rinv to get fscal */
static inline __m128
gmx_mm_int_4_table_lj_ps(__m128 r, __m128 tabscale, float * VFtab, int offset,
                         __m128 c6, __m128 c12, __m128 *vvdwtot)
{
    __m128  rt,eps,eps2,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
    __m128i n0;
    int     n_a,n_b,n_c,n_d;

    rt   = _mm_mul_ps(r,tabscale);
    n0   = _mm_cvttps_epi32(rt);
    eps  = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
    eps2 = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a  = gmx_mm_extract_epi32(n0,0);
    n_b  = gmx_mm_extract_epi32(n0,1);
    n_c  = gmx_mm_extract_epi32(n0,2);
    n_d  = gmx_mm_extract_epi32(n0,3);

    /* For a few cases, like TIP4P water, there are particles with LJ-only
     * interactions in a loop where the table data might contain both coulomb
     * and LJ. To handle this we use an offset value of 0 if the data is an
     * LJ-only table, and 1 if it is actually a mixed coul+lj table.
     */
    Yd   = _mm_load_ps(VFtab + 4*(offset+2)*n_a + 4*offset);
    Fd   = _mm_load_ps(VFtab + 4*(offset+2)*n_b + 4*offset);
    Gd   = _mm_load_ps(VFtab + 4*(offset+2)*n_c + 4*offset);
    Hd   = _mm_load_ps(VFtab + 4*(offset+2)*n_d + 4*offset);
    Yr   = _mm_load_ps(VFtab + 4*(offset+2)*n_a + 4*offset + 4);
    Fr   = _mm_load_ps(VFtab + 4*(offset+2)*n_b + 4*offset + 4);
    Gr   = _mm_load_ps(VFtab + 4*(offset+2)*n_c + 4*offset + 4);
    Hr   = _mm_load_ps(VFtab + 4*(offset+2)*n_d + 4*offset + 4);
    _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
    _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);
    Hd   = _mm_mul_ps(Hd,eps2);               /* Heps2 */
    Gd   = _mm_mul_ps(Gd,eps);                /* Geps  */
    Fd   = _mm_add_ps(Fd, _mm_add_ps(Gd,Hd)); /* Fp    */
    Hr   = _mm_mul_ps(Hr,eps2);               /* Heps2 */
    Gr   = _mm_mul_ps(Gr,eps);                /* Geps  */
    Fr   = _mm_add_ps(Fr, _mm_add_ps(Gr,Hr)); /* Fp    */
    vvdw6  = _mm_mul_ps(c6, _mm_add_ps(Yd, _mm_mul_ps(eps,Fd)));
    vvdw12 = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps,Fr)));
    *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6,vvdw12));

    Fd = _mm_mul_ps(c6, _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd,Hd))));
    Fr = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr,Hr))));

    return _mm_mul_ps( _mm_add_ps(Fd,Fr), tabscale);
}
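
/* Offset example: with offset==1 each table point holds 12 floats (4 coulomb,
 * 4 dispersion, 4 repulsion), so the dispersion quadruplet of point n starts
 * at VFtab + 12*n + 4; with offset==0 the stride is 8 and it starts at
 * VFtab + 8*n. Both cases follow from the 4*(offset+2)*n + 4*offset indexing.
 */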
/* The returned force should be multiplied by -rinv to get fscal */
static inline __m128
gmx_mm_int_4_table_coulomb_and_lj_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq,
                                     __m128 c6, __m128 c12, __m128 *vctot, __m128 *vvdwtot)
{
    __m128  rt,eps,eps2,vcoul,Yc,Fc,Gc,Hc,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
    __m128i n0;
    int     n_a,n_b,n_c,n_d;

    rt   = _mm_mul_ps(r,tabscale);
    n0   = _mm_cvttps_epi32(rt);
    eps  = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
    eps2 = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a  = gmx_mm_extract_epi32(n0,0);
    n_b  = gmx_mm_extract_epi32(n0,1);
    n_c  = gmx_mm_extract_epi32(n0,2);
    n_d  = gmx_mm_extract_epi32(n0,3);

    Yc   = _mm_load_ps(VFtab + 12*n_a);
    Fc   = _mm_load_ps(VFtab + 12*n_b);
    Gc   = _mm_load_ps(VFtab + 12*n_c);
    Hc   = _mm_load_ps(VFtab + 12*n_d);
    Yd   = _mm_load_ps(VFtab + 12*n_a + 4);
    Fd   = _mm_load_ps(VFtab + 12*n_b + 4);
    Gd   = _mm_load_ps(VFtab + 12*n_c + 4);
    Hd   = _mm_load_ps(VFtab + 12*n_d + 4);
    Yr   = _mm_load_ps(VFtab + 12*n_a + 8);
    Fr   = _mm_load_ps(VFtab + 12*n_b + 8);
    Gr   = _mm_load_ps(VFtab + 12*n_c + 8);
    Hr   = _mm_load_ps(VFtab + 12*n_d + 8);
    _MM_TRANSPOSE4_PS(Yc,Fc,Gc,Hc);
    _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
    _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);
    Hc   = _mm_mul_ps(Hc,eps2);               /* Heps2 */
    Gc   = _mm_mul_ps(Gc,eps);                /* Geps  */
    Fc   = _mm_add_ps(Fc, _mm_add_ps(Gc,Hc)); /* Fp    */
    Hd   = _mm_mul_ps(Hd,eps2);               /* Heps2 */
    Gd   = _mm_mul_ps(Gd,eps);                /* Geps  */
    Fd   = _mm_add_ps(Fd, _mm_add_ps(Gd,Hd)); /* Fp    */
    Hr   = _mm_mul_ps(Hr,eps2);               /* Heps2 */
    Gr   = _mm_mul_ps(Gr,eps);                /* Geps  */
    Fr   = _mm_add_ps(Fr, _mm_add_ps(Gr,Hr)); /* Fp    */

    vcoul  = _mm_mul_ps(qq, _mm_add_ps(Yc, _mm_mul_ps(eps,Fc)));
    *vctot = _mm_add_ps(*vctot,vcoul);

    vvdw6  = _mm_mul_ps(c6, _mm_add_ps(Yd, _mm_mul_ps(eps,Fd)));
    vvdw12 = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps,Fr)));
    *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6,vvdw12));

    Fc = _mm_mul_ps(qq, _mm_add_ps(Fc, _mm_add_ps(Gc, _mm_add_ps(Hc,Hc))));
    Fd = _mm_mul_ps(c6, _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd,Hd))));
    Fr = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr,Hr))));

    return _mm_mul_ps( _mm_add_ps(Fc, _mm_add_ps(Fd,Fr)), tabscale);
}
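
/* The _3_, _2_ and _1_ variants below do the same work as the four-wide
 * routines above, but fill the unused table lanes with zeros. They are meant
 * for the 1-3 leftover j-particles at the end of a neighbor list, so no reads
 * are issued past the valid part of VFtab.
 */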
/* The returned force should be multiplied by -rinv to get fscal */
static inline __m128
gmx_mm_int_3_table_coulomb_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 *vctot)
{
    __m128  rt,eps,eps2,Y,F,G,H,vcoul;
    __m128i n0;
    int     n_a,n_b,n_c;

    rt    = _mm_mul_ps(r,tabscale);
    n0    = _mm_cvttps_epi32(rt);
    eps   = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
    eps2  = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a   = gmx_mm_extract_epi32(n0,0);
    n_b   = gmx_mm_extract_epi32(n0,1);
    n_c   = gmx_mm_extract_epi32(n0,2);
    Y     = _mm_load_ps(VFtab + 4*n_a);
    F     = _mm_load_ps(VFtab + 4*n_b);
    G     = _mm_load_ps(VFtab + 4*n_c);
    H     = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Y,F,G,H);
    H     = _mm_mul_ps(H,eps2);              /* Heps2 */
    G     = _mm_mul_ps(G,eps);               /* Geps  */
    F     = _mm_add_ps(F, _mm_add_ps(G,H));  /* Fp    */
    vcoul = _mm_mul_ps(qq, _mm_add_ps(Y, _mm_mul_ps(eps,F)));
    *vctot = _mm_add_ps(*vctot,vcoul);

    F = _mm_mul_ps(qq, _mm_add_ps(F, _mm_add_ps(G, _mm_add_ps(H,H))));

    return _mm_mul_ps(F,tabscale);
}
/* The returned force should be multiplied by -rinv to get fscal */
static inline __m128
gmx_mm_int_3_table_lj_ps(__m128 r, __m128 tabscale, float * VFtab, int offset,
                         __m128 c6, __m128 c12, __m128 *vvdwtot)
{
    __m128  rt,eps,eps2,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
    __m128i n0;
    int     n_a,n_b,n_c;

    rt   = _mm_mul_ps(r,tabscale);
    n0   = _mm_cvttps_epi32(rt);
    eps  = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
    eps2 = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a  = gmx_mm_extract_epi32(n0,0);
    n_b  = gmx_mm_extract_epi32(n0,1);
    n_c  = gmx_mm_extract_epi32(n0,2);

    /* See gmx_mm_int_4_table_lj_ps() for the meaning of the offset argument. */
    Yd   = _mm_load_ps(VFtab + 4*(offset+2)*n_a + 4*offset);
    Fd   = _mm_load_ps(VFtab + 4*(offset+2)*n_b + 4*offset);
    Gd   = _mm_load_ps(VFtab + 4*(offset+2)*n_c + 4*offset);
    Hd   = _mm_setzero_ps();
    Yr   = _mm_load_ps(VFtab + 4*(offset+2)*n_a + 4*offset + 4);
    Fr   = _mm_load_ps(VFtab + 4*(offset+2)*n_b + 4*offset + 4);
    Gr   = _mm_load_ps(VFtab + 4*(offset+2)*n_c + 4*offset + 4);
    Hr   = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
    _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);
    Hd   = _mm_mul_ps(Hd,eps2);               /* Heps2 */
    Gd   = _mm_mul_ps(Gd,eps);                /* Geps  */
    Fd   = _mm_add_ps(Fd, _mm_add_ps(Gd,Hd)); /* Fp    */
    Hr   = _mm_mul_ps(Hr,eps2);               /* Heps2 */
    Gr   = _mm_mul_ps(Gr,eps);                /* Geps  */
    Fr   = _mm_add_ps(Fr, _mm_add_ps(Gr,Hr)); /* Fp    */
    vvdw6  = _mm_mul_ps(c6, _mm_add_ps(Yd, _mm_mul_ps(eps,Fd)));
    vvdw12 = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps,Fr)));
    *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6,vvdw12));

    Fd = _mm_mul_ps(c6, _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd,Hd))));
    Fr = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr,Hr))));

    return _mm_mul_ps( _mm_add_ps(Fd,Fr), tabscale);
}
/* The returned force should be multiplied by -rinv to get fscal */
static inline __m128
gmx_mm_int_3_table_coulomb_and_lj_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq,
                                     __m128 c6, __m128 c12, __m128 *vctot, __m128 *vvdwtot)
{
    __m128  rt,eps,eps2,vcoul,Yc,Fc,Gc,Hc,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
    __m128i n0;
    int     n_a,n_b,n_c;

    rt   = _mm_mul_ps(r,tabscale);
    n0   = _mm_cvttps_epi32(rt);
    eps  = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
    eps2 = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a  = gmx_mm_extract_epi32(n0,0);
    n_b  = gmx_mm_extract_epi32(n0,1);
    n_c  = gmx_mm_extract_epi32(n0,2);

    Yc   = _mm_load_ps(VFtab + 12*n_a);
    Fc   = _mm_load_ps(VFtab + 12*n_b);
    Gc   = _mm_load_ps(VFtab + 12*n_c);
    Hc   = _mm_setzero_ps();
    Yd   = _mm_load_ps(VFtab + 12*n_a + 4);
    Fd   = _mm_load_ps(VFtab + 12*n_b + 4);
    Gd   = _mm_load_ps(VFtab + 12*n_c + 4);
    Hd   = _mm_setzero_ps();
    Yr   = _mm_load_ps(VFtab + 12*n_a + 8);
    Fr   = _mm_load_ps(VFtab + 12*n_b + 8);
    Gr   = _mm_load_ps(VFtab + 12*n_c + 8);
    Hr   = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Yc,Fc,Gc,Hc);
    _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
    _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);
    Hc   = _mm_mul_ps(Hc,eps2);               /* Heps2 */
    Gc   = _mm_mul_ps(Gc,eps);                /* Geps  */
    Fc   = _mm_add_ps(Fc, _mm_add_ps(Gc,Hc)); /* Fp    */
    Hd   = _mm_mul_ps(Hd,eps2);               /* Heps2 */
    Gd   = _mm_mul_ps(Gd,eps);                /* Geps  */
    Fd   = _mm_add_ps(Fd, _mm_add_ps(Gd,Hd)); /* Fp    */
    Hr   = _mm_mul_ps(Hr,eps2);               /* Heps2 */
    Gr   = _mm_mul_ps(Gr,eps);                /* Geps  */
    Fr   = _mm_add_ps(Fr, _mm_add_ps(Gr,Hr)); /* Fp    */

    vcoul  = _mm_mul_ps(qq, _mm_add_ps(Yc, _mm_mul_ps(eps,Fc)));
    *vctot = _mm_add_ps(*vctot,vcoul);

    vvdw6  = _mm_mul_ps(c6, _mm_add_ps(Yd, _mm_mul_ps(eps,Fd)));
    vvdw12 = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps,Fr)));
    *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6,vvdw12));

    Fc = _mm_mul_ps(qq, _mm_add_ps(Fc, _mm_add_ps(Gc, _mm_add_ps(Hc,Hc))));
    Fd = _mm_mul_ps(c6, _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd,Hd))));
    Fr = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr,Hr))));

    return _mm_mul_ps( _mm_add_ps(Fc, _mm_add_ps(Fd,Fr)), tabscale);
}
/* The returned force should be multiplied by -rinv to get fscal */
static inline __m128
gmx_mm_int_2_table_coulomb_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 *vctot)
{
    __m128  rt,eps,eps2,Y,F,G,H,vcoul;
    __m128i n0;
    int     n_a,n_b;

    rt    = _mm_mul_ps(r,tabscale);
    n0    = _mm_cvttps_epi32(rt);
    eps   = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
    eps2  = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a   = gmx_mm_extract_epi32(n0,0);
    n_b   = gmx_mm_extract_epi32(n0,1);
    Y     = _mm_load_ps(VFtab + 4*n_a);
    F     = _mm_load_ps(VFtab + 4*n_b);
    G     = _mm_setzero_ps();
    H     = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Y,F,G,H);
    H     = _mm_mul_ps(H,eps2);              /* Heps2 */
    G     = _mm_mul_ps(G,eps);               /* Geps  */
    F     = _mm_add_ps(F, _mm_add_ps(G,H));  /* Fp    */
    vcoul = _mm_mul_ps(qq, _mm_add_ps(Y, _mm_mul_ps(eps,F)));
    *vctot = _mm_add_ps(*vctot,vcoul);

    F = _mm_mul_ps(qq, _mm_add_ps(F, _mm_add_ps(G, _mm_add_ps(H,H))));

    return _mm_mul_ps(F,tabscale);
}
/* The returned force should be multiplied by -rinv to get fscal */
static inline __m128
gmx_mm_int_2_table_lj_ps(__m128 r, __m128 tabscale, float * VFtab, int offset,
                         __m128 c6, __m128 c12, __m128 *vvdwtot)
{
    __m128  rt,eps,eps2,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
    __m128i n0;
    int     n_a,n_b;

    rt   = _mm_mul_ps(r,tabscale);
    n0   = _mm_cvttps_epi32(rt);
    eps  = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
    eps2 = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a  = gmx_mm_extract_epi32(n0,0);
    n_b  = gmx_mm_extract_epi32(n0,1);

    /* See gmx_mm_int_4_table_lj_ps() for the meaning of the offset argument. */
    Yd   = _mm_load_ps(VFtab + 4*(offset+2)*n_a + 4*offset);
    Fd   = _mm_load_ps(VFtab + 4*(offset+2)*n_b + 4*offset);
    Gd   = _mm_setzero_ps();
    Hd   = _mm_setzero_ps();
    Yr   = _mm_load_ps(VFtab + 4*(offset+2)*n_a + 4*offset + 4);
    Fr   = _mm_load_ps(VFtab + 4*(offset+2)*n_b + 4*offset + 4);
    Gr   = _mm_setzero_ps();
    Hr   = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
    _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);
    Hd   = _mm_mul_ps(Hd,eps2);               /* Heps2 */
    Gd   = _mm_mul_ps(Gd,eps);                /* Geps  */
    Fd   = _mm_add_ps(Fd, _mm_add_ps(Gd,Hd)); /* Fp    */
    Hr   = _mm_mul_ps(Hr,eps2);               /* Heps2 */
    Gr   = _mm_mul_ps(Gr,eps);                /* Geps  */
    Fr   = _mm_add_ps(Fr, _mm_add_ps(Gr,Hr)); /* Fp    */
    vvdw6  = _mm_mul_ps(c6, _mm_add_ps(Yd, _mm_mul_ps(eps,Fd)));
    vvdw12 = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps,Fr)));
    *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6,vvdw12));

    Fd = _mm_mul_ps(c6, _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd,Hd))));
    Fr = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr,Hr))));

    return _mm_mul_ps( _mm_add_ps(Fd,Fr), tabscale);
}
/* The returned force should be multiplied by -rinv to get fscal */
static inline __m128
gmx_mm_int_2_table_coulomb_and_lj_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq,
                                     __m128 c6, __m128 c12, __m128 *vctot, __m128 *vvdwtot)
{
    __m128  rt,eps,eps2,vcoul,Yc,Fc,Gc,Hc,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
    __m128i n0;
    int     n_a,n_b;

    rt   = _mm_mul_ps(r,tabscale);
    n0   = _mm_cvttps_epi32(rt);
    eps  = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
    eps2 = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a  = gmx_mm_extract_epi32(n0,0);
    n_b  = gmx_mm_extract_epi32(n0,1);

    Yc   = _mm_load_ps(VFtab + 12*n_a);
    Fc   = _mm_load_ps(VFtab + 12*n_b);
    Gc   = _mm_setzero_ps();
    Hc   = _mm_setzero_ps();
    Yd   = _mm_load_ps(VFtab + 12*n_a + 4);
    Fd   = _mm_load_ps(VFtab + 12*n_b + 4);
    Gd   = _mm_setzero_ps();
    Hd   = _mm_setzero_ps();
    Yr   = _mm_load_ps(VFtab + 12*n_a + 8);
    Fr   = _mm_load_ps(VFtab + 12*n_b + 8);
    Gr   = _mm_setzero_ps();
    Hr   = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Yc,Fc,Gc,Hc);
    _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
    _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);
    Hc   = _mm_mul_ps(Hc,eps2);               /* Heps2 */
    Gc   = _mm_mul_ps(Gc,eps);                /* Geps  */
    Fc   = _mm_add_ps(Fc, _mm_add_ps(Gc,Hc)); /* Fp    */
    Hd   = _mm_mul_ps(Hd,eps2);               /* Heps2 */
    Gd   = _mm_mul_ps(Gd,eps);                /* Geps  */
    Fd   = _mm_add_ps(Fd, _mm_add_ps(Gd,Hd)); /* Fp    */
    Hr   = _mm_mul_ps(Hr,eps2);               /* Heps2 */
    Gr   = _mm_mul_ps(Gr,eps);                /* Geps  */
    Fr   = _mm_add_ps(Fr, _mm_add_ps(Gr,Hr)); /* Fp    */

    vcoul  = _mm_mul_ps(qq, _mm_add_ps(Yc, _mm_mul_ps(eps,Fc)));
    *vctot = _mm_add_ps(*vctot,vcoul);

    vvdw6  = _mm_mul_ps(c6, _mm_add_ps(Yd, _mm_mul_ps(eps,Fd)));
    vvdw12 = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps,Fr)));
    *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6,vvdw12));

    Fc = _mm_mul_ps(qq, _mm_add_ps(Fc, _mm_add_ps(Gc, _mm_add_ps(Hc,Hc))));
    Fd = _mm_mul_ps(c6, _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd,Hd))));
    Fr = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr,Hr))));

    return _mm_mul_ps( _mm_add_ps(Fc, _mm_add_ps(Fd,Fr)), tabscale);
}
/* The returned force should be multiplied by -rinv to get fscal */
static inline __m128
gmx_mm_int_1_table_coulomb_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 *vctot)
{
    __m128  rt,eps,eps2,Y,F,G,H,vcoul;
    __m128i n0;
    int     n_a;

    rt    = _mm_mul_ps(r,tabscale);
    n0    = _mm_cvttps_epi32(rt);
    eps   = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
    eps2  = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a   = gmx_mm_extract_epi32(n0,0);
    Y     = _mm_load_ps(VFtab + 4*n_a);
    F     = _mm_setzero_ps();
    G     = _mm_setzero_ps();
    H     = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Y,F,G,H);
    H     = _mm_mul_ps(H,eps2);              /* Heps2 */
    G     = _mm_mul_ps(G,eps);               /* Geps  */
    F     = _mm_add_ps(F, _mm_add_ps(G,H));  /* Fp    */
    vcoul = _mm_mul_ps(qq, _mm_add_ps(Y, _mm_mul_ps(eps,F)));
    *vctot = _mm_add_ps(*vctot,vcoul);

    F = _mm_mul_ps(qq, _mm_add_ps(F, _mm_add_ps(G, _mm_add_ps(H,H))));

    return _mm_mul_ps(F,tabscale);
}
/* The returned force should be multiplied by -rinv to get fscal */
static inline __m128
gmx_mm_int_1_table_lj_ps(__m128 r, __m128 tabscale, float * VFtab, int offset,
                         __m128 c6, __m128 c12, __m128 *vvdwtot)
{
    __m128  rt,eps,eps2,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
    __m128i n0;
    int     n_a;

    rt   = _mm_mul_ps(r,tabscale);
    n0   = _mm_cvttps_epi32(rt);
    eps  = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
    eps2 = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a  = gmx_mm_extract_epi32(n0,0);

    /* See gmx_mm_int_4_table_lj_ps() for the meaning of the offset argument. */
    Yd   = _mm_load_ps(VFtab + 4*(offset+2)*n_a + 4*offset);
    Fd   = _mm_setzero_ps();
    Gd   = _mm_setzero_ps();
    Hd   = _mm_setzero_ps();
    Yr   = _mm_load_ps(VFtab + 4*(offset+2)*n_a + 4*offset + 4);
    Fr   = _mm_setzero_ps();
    Gr   = _mm_setzero_ps();
    Hr   = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
    _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);
    Hd   = _mm_mul_ps(Hd,eps2);               /* Heps2 */
    Gd   = _mm_mul_ps(Gd,eps);                /* Geps  */
    Fd   = _mm_add_ps(Fd, _mm_add_ps(Gd,Hd)); /* Fp    */
    Hr   = _mm_mul_ps(Hr,eps2);               /* Heps2 */
    Gr   = _mm_mul_ps(Gr,eps);                /* Geps  */
    Fr   = _mm_add_ps(Fr, _mm_add_ps(Gr,Hr)); /* Fp    */
    vvdw6  = _mm_mul_ps(c6, _mm_add_ps(Yd, _mm_mul_ps(eps,Fd)));
    vvdw12 = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps,Fr)));
    *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6,vvdw12));

    Fd = _mm_mul_ps(c6, _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd,Hd))));
    Fr = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr,Hr))));

    return _mm_mul_ps( _mm_add_ps(Fd,Fr), tabscale);
}
/* The returned force should be multiplied by -rinv to get fscal */
static inline __m128
gmx_mm_int_1_table_coulomb_and_lj_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq,
                                     __m128 c6, __m128 c12, __m128 *vctot, __m128 *vvdwtot)
{
    __m128  rt,eps,eps2,vcoul,Yc,Fc,Gc,Hc,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
    __m128i n0;
    int     n_a;

    rt   = _mm_mul_ps(r,tabscale);
    n0   = _mm_cvttps_epi32(rt);
    eps  = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
    eps2 = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a  = gmx_mm_extract_epi32(n0,0);

    Yc   = _mm_load_ps(VFtab + 12*n_a);
    Fc   = _mm_setzero_ps();
    Gc   = _mm_setzero_ps();
    Hc   = _mm_setzero_ps();
    Yd   = _mm_load_ps(VFtab + 12*n_a + 4);
    Fd   = _mm_setzero_ps();
    Gd   = _mm_setzero_ps();
    Hd   = _mm_setzero_ps();
    Yr   = _mm_load_ps(VFtab + 12*n_a + 8);
    Fr   = _mm_setzero_ps();
    Gr   = _mm_setzero_ps();
    Hr   = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Yc,Fc,Gc,Hc);
    _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
    _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);
    Hc   = _mm_mul_ps(Hc,eps2);               /* Heps2 */
    Gc   = _mm_mul_ps(Gc,eps);                /* Geps  */
    Fc   = _mm_add_ps(Fc, _mm_add_ps(Gc,Hc)); /* Fp    */
    Hd   = _mm_mul_ps(Hd,eps2);               /* Heps2 */
    Gd   = _mm_mul_ps(Gd,eps);                /* Geps  */
    Fd   = _mm_add_ps(Fd, _mm_add_ps(Gd,Hd)); /* Fp    */
    Hr   = _mm_mul_ps(Hr,eps2);               /* Heps2 */
    Gr   = _mm_mul_ps(Gr,eps);                /* Geps  */
    Fr   = _mm_add_ps(Fr, _mm_add_ps(Gr,Hr)); /* Fp    */

    vcoul  = _mm_mul_ps(qq, _mm_add_ps(Yc, _mm_mul_ps(eps,Fc)));
    *vctot = _mm_add_ps(*vctot,vcoul);

    vvdw6  = _mm_mul_ps(c6, _mm_add_ps(Yd, _mm_mul_ps(eps,Fd)));
    vvdw12 = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps,Fr)));
    *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6,vvdw12));

    Fc = _mm_mul_ps(qq, _mm_add_ps(Fc, _mm_add_ps(Gc, _mm_add_ps(Hc,Hc))));
    Fd = _mm_mul_ps(c6, _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd,Hd))));
    Fr = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr,Hr))));

    return _mm_mul_ps( _mm_add_ps(Fc, _mm_add_ps(Fd,Fr)), tabscale);
}
/* The returned force should be multiplied by +rinv to get fscal */
static inline __m128
gmx_mm_int_4_genborn_ps(__m128 r, __m128 isai,
                        float * isaj1, float *isaj2, float *isaj3, float *isaj4,
                        __m128 gbtabscale, float * GBtab, __m128 qq, __m128 *dvdasum,
                        float *dvdaj1, float *dvdaj2, float *dvdaj3, float *dvdaj4,
                        __m128 *vgbtot)
{
    const __m128 half = {0.5,0.5,0.5,0.5};

    __m128  rt,eps,eps2,Y,F,G,H,VV,FF,ftmp,isaprod,t2,t3,t4,isaj,vgb,dvdatmp;
    __m128i n0;
    int     n_a,n_b,n_c,n_d;

    isaj = _mm_load_ss(isaj1);
    t2   = _mm_load_ss(isaj2);
    t3   = _mm_load_ss(isaj3);
    t4   = _mm_load_ss(isaj4);
    isaj = _mm_unpacklo_ps(isaj,t2);  /*  -  - t2 t1 */
    t3   = _mm_unpacklo_ps(t3,t4);    /*  -  - t4 t3 */
    isaj = _mm_movelh_ps(isaj,t3);    /* t4 t3 t2 t1 */

    isaprod    = _mm_mul_ps(isai,isaj);
    qq         = _mm_mul_ps(qq,isaprod);
    gbtabscale = _mm_mul_ps(isaprod,gbtabscale);

    rt   = _mm_mul_ps(r,gbtabscale);
    n0   = _mm_cvttps_epi32(rt);
    eps  = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
    eps2 = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a  = gmx_mm_extract_epi32(n0,0);
    n_b  = gmx_mm_extract_epi32(n0,1);
    n_c  = gmx_mm_extract_epi32(n0,2);
    n_d  = gmx_mm_extract_epi32(n0,3);
    Y    = _mm_load_ps(GBtab + 4*n_a);
    F    = _mm_load_ps(GBtab + 4*n_b);
    G    = _mm_load_ps(GBtab + 4*n_c);
    H    = _mm_load_ps(GBtab + 4*n_d);
    _MM_TRANSPOSE4_PS(Y,F,G,H);
    G    = _mm_mul_ps(G,eps);             /* Geps  */
    H    = _mm_mul_ps(H,eps2);            /* Heps2 */
    F    = _mm_add_ps(_mm_add_ps(F,G),H); /* Fp    */

    VV   = _mm_add_ps(Y, _mm_mul_ps(eps,F));
    FF   = _mm_add_ps(_mm_add_ps(F,G), _mm_add_ps(H,H));

    vgb     = _mm_mul_ps(qq,VV);
    *vgbtot = _mm_sub_ps(*vgbtot,vgb); /* Yes, the sign is correct */

    ftmp = _mm_mul_ps(_mm_mul_ps(qq,FF),gbtabscale);

    dvdatmp  = _mm_mul_ps(half, _mm_add_ps(vgb,_mm_mul_ps(ftmp,r)));

    *dvdasum = _mm_add_ps(*dvdasum,dvdatmp);

    dvdatmp  = _mm_mul_ps(_mm_mul_ps(dvdatmp,isaj),isaj);

    /* Update 4 dvda[j] values */
    Y  = _mm_load_ss(dvdaj1);
    F  = _mm_load_ss(dvdaj2);
    G  = _mm_load_ss(dvdaj3);
    H  = _mm_load_ss(dvdaj4);
    t3 = _mm_movehl_ps(_mm_setzero_ps(),dvdatmp);
    t2 = _mm_shuffle_ps(dvdatmp,dvdatmp,_MM_SHUFFLE(0,0,0,1));
    t4 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(0,0,0,1));

    _mm_store_ss( dvdaj1, _mm_add_ss( Y, dvdatmp ) );
    _mm_store_ss( dvdaj2, _mm_add_ss( F, t2 ) );
    _mm_store_ss( dvdaj3, _mm_add_ss( G, t3 ) );
    _mm_store_ss( dvdaj4, _mm_add_ss( H, t4 ) );

    return ftmp;
}
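
/* Note: the 0.5*(vgb + ftmp*r) term accumulated above is the usual GROMACS
 * expression for the derivative of the GB pair energy with respect to the
 * Born radii; it is added once to dvdasum for the i particle and once, scaled
 * by isaj*isaj, to each dvda[j].
 */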
/* The returned force should be multiplied by +rinv to get fscal */
static inline __m128
gmx_mm_int_3_genborn_ps(__m128 r, __m128 isai,
                        float * isaj1, float *isaj2, float *isaj3,
                        __m128 gbtabscale, float * GBtab, __m128 qq, __m128 *dvdasum,
                        float *dvdaj1, float *dvdaj2, float *dvdaj3,
                        __m128 *vgbtot)
{
    const __m128 half = {0.5,0.5,0.5,0.5};

    __m128  rt,eps,eps2,Y,F,G,H,VV,FF,ftmp,isaprod,t2,t3,t4,isaj,vgb,dvdatmp;
    __m128i n0;
    int     n_a,n_b,n_c,n_d;

    isaj = _mm_load_ss(isaj1);
    t2   = _mm_load_ss(isaj2);
    t3   = _mm_load_ss(isaj3);
    isaj = _mm_unpacklo_ps(isaj,t2);  /*  -  - t2 t1 */
    t3   = _mm_unpacklo_ps(t3,t3);    /*  -  - t3 t3 */
    isaj = _mm_movelh_ps(isaj,t3);    /* t3 t3 t2 t1 */

    isaprod    = _mm_mul_ps(isai,isaj);
    qq         = _mm_mul_ps(qq,isaprod);
    gbtabscale = _mm_mul_ps(isaprod,gbtabscale);

    rt   = _mm_mul_ps(r,gbtabscale);
    n0   = _mm_cvttps_epi32(rt);
    eps  = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
    eps2 = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a  = gmx_mm_extract_epi32(n0,0);
    n_b  = gmx_mm_extract_epi32(n0,1);
    n_c  = gmx_mm_extract_epi32(n0,2);
    Y    = _mm_load_ps(GBtab + 4*n_a);
    F    = _mm_load_ps(GBtab + 4*n_b);
    G    = _mm_load_ps(GBtab + 4*n_c);
    H    = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Y,F,G,H);
    G    = _mm_mul_ps(G,eps);             /* Geps  */
    H    = _mm_mul_ps(H,eps2);            /* Heps2 */
    F    = _mm_add_ps(_mm_add_ps(F,G),H); /* Fp    */

    VV   = _mm_add_ps(Y, _mm_mul_ps(eps,F));
    FF   = _mm_add_ps(_mm_add_ps(F,G), _mm_add_ps(H,H));

    vgb     = _mm_mul_ps(qq,VV);
    *vgbtot = _mm_sub_ps(*vgbtot,vgb); /* Yes, the sign is correct */

    ftmp = _mm_mul_ps(_mm_mul_ps(qq,FF),gbtabscale);

    dvdatmp  = _mm_mul_ps(half, _mm_add_ps(vgb,_mm_mul_ps(ftmp,r)));

    *dvdasum = _mm_add_ps(*dvdasum,dvdatmp);

    dvdatmp  = _mm_mul_ps(_mm_mul_ps(dvdatmp,isaj),isaj);

    /* Update 3 dvda[j] values */
    Y  = _mm_load_ss(dvdaj1);
    F  = _mm_load_ss(dvdaj2);
    G  = _mm_load_ss(dvdaj3);
    t3 = _mm_movehl_ps(_mm_setzero_ps(),dvdatmp);
    t2 = _mm_shuffle_ps(dvdatmp,dvdatmp,_MM_SHUFFLE(0,0,0,1));

    _mm_store_ss( dvdaj1, _mm_add_ss( Y, dvdatmp ) );
    _mm_store_ss( dvdaj2, _mm_add_ss( F, t2 ) );
    _mm_store_ss( dvdaj3, _mm_add_ss( G, t3 ) );

    return ftmp;
}
/* The returned force should be multiplied by +rinv to get fscal */
static inline __m128
gmx_mm_int_2_genborn_ps(__m128 r, __m128 isai,
                        float * isaj1, float *isaj2,
                        __m128 gbtabscale, float * GBtab, __m128 qq, __m128 *dvdasum,
                        float *dvdaj1, float *dvdaj2,
                        __m128 *vgbtot)
{
    const __m128 half = {0.5,0.5,0.5,0.5};

    __m128  rt,eps,eps2,Y,F,G,H,VV,FF,ftmp,isaprod,t2,t3,t4,isaj,vgb,dvdatmp;
    __m128i n0;
    int     n_a,n_b,n_c,n_d;

    isaj = _mm_load_ss(isaj1);
    t2   = _mm_load_ss(isaj2);
    isaj = _mm_unpacklo_ps(isaj,t2);  /*  -  - t2 t1 */

    isaprod    = _mm_mul_ps(isai,isaj);
    qq         = _mm_mul_ps(qq,isaprod);
    gbtabscale = _mm_mul_ps(isaprod,gbtabscale);

    rt   = _mm_mul_ps(r,gbtabscale);
    n0   = _mm_cvttps_epi32(rt);
    eps  = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
    eps2 = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a  = gmx_mm_extract_epi32(n0,0);
    n_b  = gmx_mm_extract_epi32(n0,1);
    Y    = _mm_load_ps(GBtab + 4*n_a);
    F    = _mm_load_ps(GBtab + 4*n_b);
    G    = _mm_setzero_ps();
    H    = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Y,F,G,H);
    G    = _mm_mul_ps(G,eps);             /* Geps  */
    H    = _mm_mul_ps(H,eps2);            /* Heps2 */
    F    = _mm_add_ps(_mm_add_ps(F,G),H); /* Fp    */

    VV   = _mm_add_ps(Y, _mm_mul_ps(eps,F));
    FF   = _mm_add_ps(_mm_add_ps(F,G), _mm_add_ps(H,H));

    vgb     = _mm_mul_ps(qq,VV);
    *vgbtot = _mm_sub_ps(*vgbtot,vgb); /* Yes, the sign is correct */

    ftmp = _mm_mul_ps(_mm_mul_ps(qq,FF),gbtabscale);

    dvdatmp  = _mm_mul_ps(half, _mm_add_ps(vgb,_mm_mul_ps(ftmp,r)));

    *dvdasum = _mm_add_ps(*dvdasum,dvdatmp);

    dvdatmp  = _mm_mul_ps(_mm_mul_ps(dvdatmp,isaj),isaj);

    /* Update 2 dvda[j] values */
    Y  = _mm_load_ss(dvdaj1);
    F  = _mm_load_ss(dvdaj2);
    t2 = _mm_shuffle_ps(dvdatmp,dvdatmp,_MM_SHUFFLE(0,0,0,1));

    _mm_store_ss( dvdaj1, _mm_add_ss( Y, dvdatmp ) );
    _mm_store_ss( dvdaj2, _mm_add_ss( F, t2 ) );

    return ftmp;
}
/* The returned force should be multiplied by +rinv to get fscal */
static inline __m128
gmx_mm_int_1_genborn_ps(__m128 r, __m128 isai,
                        float * isaj1,
                        __m128 gbtabscale, float * GBtab, __m128 qq, __m128 *dvdasum,
                        float *dvdaj1,
                        __m128 *vgbtot)
{
    const __m128 half = {0.5,0.5,0.5,0.5};

    __m128  rt,eps,eps2,Y,F,G,H,VV,FF,ftmp,isaprod,t2,t3,t4,isaj,vgb,dvdatmp;
    __m128i n0;
    int     n_a,n_b,n_c,n_d;

    isaj = _mm_load_ss(isaj1);

    isaprod    = _mm_mul_ps(isai,isaj);
    qq         = _mm_mul_ps(qq,isaprod);
    gbtabscale = _mm_mul_ps(isaprod,gbtabscale);

    rt   = _mm_mul_ps(r,gbtabscale);
    n0   = _mm_cvttps_epi32(rt);
    eps  = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
    eps2 = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a  = gmx_mm_extract_epi32(n0,0);
    Y    = _mm_load_ps(GBtab + 4*n_a);
    F    = _mm_setzero_ps();
    G    = _mm_setzero_ps();
    H    = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Y,F,G,H);
    G    = _mm_mul_ps(G,eps);             /* Geps  */
    H    = _mm_mul_ps(H,eps2);            /* Heps2 */
    F    = _mm_add_ps(_mm_add_ps(F,G),H); /* Fp    */

    VV   = _mm_add_ps(Y, _mm_mul_ps(eps,F));
    FF   = _mm_add_ps(_mm_add_ps(F,G), _mm_add_ps(H,H));

    vgb     = _mm_mul_ps(qq,VV);
    *vgbtot = _mm_sub_ps(*vgbtot,vgb); /* Yes, the sign is correct */

    ftmp = _mm_mul_ps(_mm_mul_ps(qq,FF),gbtabscale);

    dvdatmp  = _mm_mul_ps(half, _mm_add_ps(vgb,_mm_mul_ps(ftmp,r)));

    *dvdasum = _mm_add_ps(*dvdasum,dvdatmp);

    dvdatmp  = _mm_mul_ps(_mm_mul_ps(dvdatmp,isaj),isaj);

    /* Update 1 dvda[j] value */
    Y  = _mm_load_ss(dvdaj1);

    _mm_store_ss( dvdaj1, _mm_add_ss( Y, dvdatmp ) );

    return ftmp;
}
static inline void
gmx_mm_update_iforce_1atom_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                              float *fptr, float *fshiftptr)
{
    __m128 t1,t2,t3;

#ifdef GMX_SSE3
    fix1 = _mm_hadd_ps(fix1,fix1);
    fiy1 = _mm_hadd_ps(fiy1,fiz1);

    fix1 = _mm_hadd_ps(fix1,fiy1); /* fiz1 fiy1 fix1 fix1 */
#else
    /* SSE2 */
    /* transpose data */
    t1 = fix1;
    _MM_TRANSPOSE4_PS(fix1,t1,fiy1,fiz1);
    fix1 = _mm_add_ps(_mm_add_ps(fix1,t1), _mm_add_ps(fiy1,fiz1));
#endif
    t2 = _mm_load_ss(fptr);
    t2 = _mm_loadh_pi(t2,(__m64 *)(fptr+1));
    t3 = _mm_load_ss(fshiftptr);
    t3 = _mm_loadh_pi(t3,(__m64 *)(fshiftptr+1));

    t2 = _mm_add_ps(t2,fix1);
    t3 = _mm_add_ps(t3,fix1);

    _mm_store_ss(fptr,t2);
    _mm_storeh_pi((__m64 *)(fptr+1),t2);
    _mm_store_ss(fshiftptr,t3);
    _mm_storeh_pi((__m64 *)(fshiftptr+1),t3);
}
static inline void
gmx_mm_update_iforce_2atoms_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                               __m128 fix2, __m128 fiy2, __m128 fiz2,
                               float *fptr, float *fshiftptr)
{
    __m128 t1,t2,t4;

#ifdef GMX_SSE3
    fix1 = _mm_hadd_ps(fix1,fiy1);
    fiz1 = _mm_hadd_ps(fiz1,fix2);
    fiy2 = _mm_hadd_ps(fiy2,fiz2);

    fix1 = _mm_hadd_ps(fix1,fiz1); /* fix2 fiz1 fiy1 fix1 */
    fiy2 = _mm_hadd_ps(fiy2,fiy2); /*  -    -   fiz2 fiy2 */
#else
    /* SSE2 */
    /* transpose data */
    _MM_TRANSPOSE4_PS(fix1,fiy1,fiz1,fix2);
    t1 = _mm_unpacklo_ps(fiy2,fiz2);
    t2 = _mm_unpackhi_ps(fiy2,fiz2);

    fix1 = _mm_add_ps(_mm_add_ps(fix1,fiy1), _mm_add_ps(fiz1,fix2));
    t1   = _mm_add_ps(t1,t2);
    t2   = _mm_movehl_ps(t2,t1);
    fiy2 = _mm_add_ps(t1,t2);
#endif
    _mm_storeu_ps(fptr, _mm_add_ps(fix1,_mm_loadu_ps(fptr)));
    t1 = _mm_loadl_pi(t1,(__m64 *)(fptr+4));
    _mm_storel_pi((__m64 *)(fptr+4), _mm_add_ps(fiy2,t1));

    t4 = _mm_load_ss(fshiftptr+2);
    t4 = _mm_loadh_pi(t4,(__m64 *)(fshiftptr));

    t1 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,2)); /* fiy2  -   fix2 fiz1 */
    t1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(3,1,0,0));     /* fiy2 fix2  -   fiz1 */
    t2 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(1,0,0,1)); /* fiy1 fix1  -   fiz2 */

    t1 = _mm_add_ps(t1,t2);
    t1 = _mm_add_ps(t1,t4); /* y x - z */

    _mm_store_ss(fshiftptr+2,t1);
    _mm_storeh_pi((__m64 *)(fshiftptr),t1);
}
static inline void
gmx_mm_update_iforce_3atoms_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                               __m128 fix2, __m128 fiy2, __m128 fiz2,
                               __m128 fix3, __m128 fiy3, __m128 fiz3,
                               float *fptr, float *fshiftptr)
{
    __m128 t1,t2,t3,t4;

#ifdef GMX_SSE3
    fix1 = _mm_hadd_ps(fix1,fiy1);
    fiz1 = _mm_hadd_ps(fiz1,fix2);
    fiy2 = _mm_hadd_ps(fiy2,fiz2);
    fix3 = _mm_hadd_ps(fix3,fiy3);
    fiz3 = _mm_hadd_ps(fiz3,fiz3);

    fix1 = _mm_hadd_ps(fix1,fiz1); /* fix2 fiz1 fiy1 fix1 */
    fiy2 = _mm_hadd_ps(fiy2,fix3); /* fiy3 fix3 fiz2 fiy2 */
    fiz3 = _mm_hadd_ps(fiz3,fiz3); /*  -    -    -   fiz3 */
#else
    /* SSE2 */
    /* transpose data */
    _MM_TRANSPOSE4_PS(fix1,fiy1,fiz1,fix2);
    _MM_TRANSPOSE4_PS(fiy2,fiz2,fix3,fiy3);
    t2 = _mm_movehl_ps(_mm_setzero_ps(),fiz3);
    t1 = _mm_shuffle_ps(fiz3,fiz3,_MM_SHUFFLE(0,0,0,1));
    t3 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(0,0,0,1));

    fix1 = _mm_add_ps(_mm_add_ps(fix1,fiy1), _mm_add_ps(fiz1,fix2));
    fiy2 = _mm_add_ps(_mm_add_ps(fiy2,fiz2), _mm_add_ps(fix3,fiy3));
    fiz3 = _mm_add_ss(_mm_add_ps(fiz3,t1), _mm_add_ps(t2,t3));
#endif
    _mm_storeu_ps(fptr,   _mm_add_ps(fix1,_mm_loadu_ps(fptr)));
    _mm_storeu_ps(fptr+4, _mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));
    _mm_store_ss (fptr+8, _mm_add_ss(fiz3,_mm_load_ss(fptr+8)));

    t4 = _mm_load_ss(fshiftptr+2);
    t4 = _mm_loadh_pi(t4,(__m64 *)(fshiftptr));

    t1 = _mm_shuffle_ps(fiz3,fix1,_MM_SHUFFLE(1,0,0,0)); /* fiy1 fix1  -   fiz3 */
    t2 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(3,2,2,2)); /* fiy3 fix3  -   fiz1 */
    t3 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(3,3,0,1)); /* fix2 fix2 fiy2 fiz2 */
    t3 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(1,2,0,0));     /* fiy2 fix2  -   fiz2 */

    t1 = _mm_add_ps(t1,t2);
    t3 = _mm_add_ps(t3,t4);
    t1 = _mm_add_ps(t1,t3); /* y x - z */

    _mm_store_ss(fshiftptr+2,t1);
    _mm_storeh_pi((__m64 *)(fshiftptr),t1);
}
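
/* Usage note: the 3-atom variant typically serves three-site water i-particles;
 * fptr must point at nine consecutive force components (x1 y1 z1 ... z3) and
 * fshiftptr at the three shift-force components of the current shift index.
 */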
static inline void
gmx_mm_update_iforce_4atoms_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                               __m128 fix2, __m128 fiy2, __m128 fiz2,
                               __m128 fix3, __m128 fiy3, __m128 fiz3,
                               __m128 fix4, __m128 fiy4, __m128 fiz4,
                               float *fptr, float *fshiftptr)
{
    __m128 t1,t2,t3,t4,t5;

#ifdef GMX_SSE3
    fix1 = _mm_hadd_ps(fix1,fiy1);
    fiz1 = _mm_hadd_ps(fiz1,fix2);
    fiy2 = _mm_hadd_ps(fiy2,fiz2);
    fix3 = _mm_hadd_ps(fix3,fiy3);
    fiz3 = _mm_hadd_ps(fiz3,fix4);
    fiy4 = _mm_hadd_ps(fiy4,fiz4);

    fix1 = _mm_hadd_ps(fix1,fiz1); /* fix2 fiz1 fiy1 fix1 */
    fiy2 = _mm_hadd_ps(fiy2,fix3); /* fiy3 fix3 fiz2 fiy2 */
    fiz3 = _mm_hadd_ps(fiz3,fiy4); /* fiz4 fiy4 fix4 fiz3 */
#else
    /* SSE2 */
    /* transpose data */
    _MM_TRANSPOSE4_PS(fix1,fiy1,fiz1,fix2);
    _MM_TRANSPOSE4_PS(fiy2,fiz2,fix3,fiy3);
    _MM_TRANSPOSE4_PS(fiz3,fix4,fiy4,fiz4);

    fix1 = _mm_add_ps(_mm_add_ps(fix1,fiy1), _mm_add_ps(fiz1,fix2));
    fiy2 = _mm_add_ps(_mm_add_ps(fiy2,fiz2), _mm_add_ps(fix3,fiy3));
    fiz3 = _mm_add_ps(_mm_add_ps(fiz3,fix4), _mm_add_ps(fiy4,fiz4));
#endif
    _mm_storeu_ps(fptr,   _mm_add_ps(fix1,_mm_loadu_ps(fptr)));
    _mm_storeu_ps(fptr+4, _mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));
    _mm_storeu_ps(fptr+8, _mm_add_ps(fiz3,_mm_loadu_ps(fptr+8)));

    t5 = _mm_load_ss(fshiftptr+2);
    t5 = _mm_loadh_pi(t5,(__m64 *)(fshiftptr));

    t1 = _mm_shuffle_ps(fix1,fix1,_MM_SHUFFLE(1,0,2,2)); /* fiy1 fix1  -   fiz1 */
    t2 = _mm_shuffle_ps(fiy2,fiy2,_MM_SHUFFLE(3,2,1,1)); /* fiy3 fix3  -   fiz2 */
    t3 = _mm_shuffle_ps(fiz3,fiz3,_MM_SHUFFLE(2,1,0,0)); /* fiy4 fix4  -   fiz3 */
    t4 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,3)); /* fiy2 fiy2 fix2 fix2 */
    t4 = _mm_shuffle_ps(fiz3,t4,_MM_SHUFFLE(2,0,3,3));   /* fiy2 fix2  -   fiz4 */

    t1 = _mm_add_ps(t1,t2);
    t3 = _mm_add_ps(t3,t4);
    t1 = _mm_add_ps(t1,t3); /* y x - z */
    t5 = _mm_add_ps(t5,t1);

    _mm_store_ss(fshiftptr+2,t5);
    _mm_storeh_pi((__m64 *)(fshiftptr),t5);
}
static inline void
gmx_mm_update_1pot_ps(__m128 pot1, float *ptr1)
{
#ifdef GMX_SSE3
    pot1 = _mm_hadd_ps(pot1,pot1);
    pot1 = _mm_hadd_ps(pot1,pot1);
#else
    /* SSE2 */
    pot1 = _mm_add_ps(pot1,_mm_movehl_ps(pot1,pot1));
    pot1 = _mm_add_ps(pot1,_mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(0,0,0,1)));
#endif
    _mm_store_ss(ptr1,_mm_add_ss(pot1,_mm_load_ss(ptr1)));
}
static inline void
gmx_mm_update_2pot_ps(__m128 pot1, float *ptr1, __m128 pot2, float *ptr2)
{
    __m128 t1,t2;

#ifdef GMX_SSE3
    pot1 = _mm_hadd_ps(pot1,pot2);
    pot1 = _mm_hadd_ps(pot1,pot1);
    pot2 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(0,0,0,1));
#else
    /* SSE2 */
    t1   = _mm_movehl_ps(pot2,pot1); /* 2d 2c 1d 1c */
    t2   = _mm_movelh_ps(pot1,pot2); /* 2b 2a 1b 1a */
    t1   = _mm_add_ps(t1,t2);        /* 2  2  1  1  */
    t2   = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(3,3,1,1));
    pot1 = _mm_add_ps(t1,t2);        /* -  2  -  1  */
    pot2 = _mm_movehl_ps(t2,pot1);   /* -  -  -  2  */
#endif
    _mm_store_ss(ptr1,_mm_add_ss(pot1,_mm_load_ss(ptr1)));
    _mm_store_ss(ptr2,_mm_add_ss(pot2,_mm_load_ss(ptr2)));
}
static inline void
gmx_mm_update_4pot_ps(__m128 pot1, float *ptr1, __m128 pot2, float *ptr2,
                      __m128 pot3, float *ptr3, __m128 pot4, float *ptr4)
{
    _MM_TRANSPOSE4_PS(pot1,pot2,pot3,pot4);

    pot1 = _mm_add_ps(_mm_add_ps(pot1,pot2),_mm_add_ps(pot3,pot4));
    pot2 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(1,1,1,1));
    pot3 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(2,2,2,2));
    pot4 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(3,3,3,3));

    _mm_store_ss(ptr1,_mm_add_ss(pot1,_mm_load_ss(ptr1)));
    _mm_store_ss(ptr2,_mm_add_ss(pot2,_mm_load_ss(ptr2)));
    _mm_store_ss(ptr3,_mm_add_ss(pot3,_mm_load_ss(ptr3)));
    _mm_store_ss(ptr4,_mm_add_ss(pot4,_mm_load_ss(ptr4)));
}
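
/* Illustrative sketch (hypothetical names Vc, Vvdw, ggid): typical
 * end-of-i-particle bookkeeping, writing the accumulated Coulomb and VdW
 * energies into the per-energy-group arrays.
 */
static inline void
gmx_mm_example_store_energies_ps(__m128 vctot, __m128 vvdwtot,
                                 float *Vc, float *Vvdw, int ggid)
{
    gmx_mm_update_2pot_ps(vctot, Vc+ggid, vvdwtot, Vvdw+ggid);
}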