2 * This source code is part of
6 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
7 * Copyright (c) 2001-2009, The GROMACS Development Team
9 * Gromacs is a library for molecular simulation and trajectory analysis,
10 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
11 * a full list of developers and information, check out http://www.gromacs.org
13 * This program is free software; you can redistribute it and/or modify it under
14 * the terms of the GNU Lesser General Public License as published by the Free
15 * Software Foundation; either version 2 of the License, or (at your option) any
17 * As a special exception, you may use this file as part of a free software
18 * library without restriction. Specifically, if other files instantiate
19 * templates or use macros or inline functions from this file, or you compile
20 * this file and link it with other files to produce an executable, this
21 * file does not by itself cause the resulting executable to be covered by
22 * the GNU Lesser General Public License.
24 * In plain-speak: do not worry about classes/macros/templates either - only
25 * changes to the library have to be LGPL, not an application linking with it.
27 * To help fund GROMACS development, we humbly ask that you cite
28 * the papers people have written on it - you can find them on the website!
34 /* We require SSE2 now! */
39 #include <xmmintrin.h> /* SSE */
40 #include <emmintrin.h> /* SSE2 */
43 # include <pmmintrin.h> /* SSE3 */
46 # include <smmintrin.h> /* SSE4.1 */
/***************************************************
 *                                                 *
 *   COMPILER RANT WARNING:                        *
 *                                                 *
 * Ideally, this header would be filled with       *
 * simple static inline functions. Unfortunately,  *
 * many vendors provide really braindead compilers *
 * that either cannot handle more than 1-2 SSE     *
 * function parameters, or cannot handle           *
 * pointers to SSE __m128 datatypes as parameters  *
 * at all. Thus, for portability we have had to    *
 * implement all but the simplest routines as      *
 * macros.                                         *
 *                                                 *
 ***************************************************/
68 /***************************************************
70 * Wrappers/replacements for some instructions *
71 * not available in all SSE versions. *
73 ***************************************************/
/* gmx_mm_extract_epi32(x, imm): read the 32-bit integer in lane imm of the
 * __m128i x. Two alternative definitions follow.
 * NOTE(review): the preprocessor conditional that selects between them is
 * not visible in this view - presumably the first one is used only when
 * SSE4.1 is enabled; confirm against the full file.
 */
76 # define gmx_mm_extract_epi32(x, imm) _mm_extract_epi32(x,imm)
/* Alternative built from SSE2 only: shift the whole register right by
 * 4*imm bytes, then return its low 32 bits.
 */
78 # define gmx_mm_extract_epi32(x, imm) _mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm)))
/* Some compilers require a cast to change the interpretation
 * of a register from FP to Int and vice versa, and not all of
 * them provide instructions to do this. Roll our own wrappers...
 *
 * gmx_mm_castsi128_ps(a): reinterpret the bits of an __m128i as __m128.
 * gmx_mm_castps_si128(a): reinterpret the bits of an __m128 as __m128i.
 * Neither wrapper converts values; the 128 bits are passed through unchanged.
 */
#if (defined (_MSC_VER) || defined(__INTEL_COMPILER))
/* Use the dedicated cast intrinsics. */
#  define gmx_mm_castsi128_ps(a) _mm_castsi128_ps(a)
#  define gmx_mm_castps_si128(a) _mm_castps_si128(a)
#elif defined(__GNUC__)
/* GNU-style vector types of equal size can be reinterpreted with a plain cast. */
#  define gmx_mm_castsi128_ps(a) ((__m128)(a))
#  define gmx_mm_castps_si128(a) ((__m128i)(a))
#else
/* Unknown compiler: reinterpret through a pointer to the by-value argument. */
static __m128  gmx_mm_castsi128_ps(__m128i a) { return *(__m128 *)  &a; }
static __m128i gmx_mm_castps_si128(__m128 a)  { return *(__m128i *) &a; }
#endif
/* IO functions, just for debugging */

/* Print the label s followed by the four floats held in xmm (lane 0 first)
 * on one line of stdout.
 */
static void
printxmm(const char *s, __m128 xmm)
{
    float f[4];

    /* Unaligned store, so the stack buffer needs no special alignment. */
    _mm_storeu_ps(f, xmm);
    printf("%s: %8.5g %8.5g %8.5g %8.5g\n", s, f[0], f[1], f[2], f[3]);
}
/* Print the label s followed by the sum of the four floats held in xmm
 * (added in lane order 0,1,2,3) on one line of stdout.
 */
static void
printxmmsum(const char *s, __m128 xmm)
{
    float f[4];

    _mm_storeu_ps(f, xmm);
    printf("%s (sum): %15.10g\n", s, f[0]+f[1]+f[2]+f[3]);
}
/* Print the label s (right-aligned in 10 columns) followed by the four
 * 32-bit integers held in xmmi (lane 0 first) on one line of stdout.
 */
static void
printxmmi(const char *s, __m128i xmmi)
{
    int i[4];

    _mm_storeu_si128((__m128i *)i, xmmi);
    printf("%10s: %2d %2d %2d %2d\n", s, i[0], i[1], i[2], i[3]);
}
/************************
 *                      *
 * Simple math routines *
 *                      *
 ************************/

/* Return 1/sqrt(x) for each of the four lanes of x.
 * The hardware estimate lu = _mm_rsqrt_ps(x) is refined with one
 * Newton-Raphson step: 0.5 * lu * (3 - x*lu*lu).
 */
static inline __m128
gmx_mm_invsqrt_ps(__m128 x)
{
    const __m128 half  = _mm_set1_ps(0.5f);
    const __m128 three = _mm_set1_ps(3.0f);
    const __m128 lu    = _mm_rsqrt_ps(x);

    return _mm_mul_ps(half,_mm_mul_ps(_mm_sub_ps(three,_mm_mul_ps(_mm_mul_ps(lu,lu),x)),lu));
}
/* Return 1/x for each of the four lanes of x.
 * The hardware estimate lu = _mm_rcp_ps(x) is refined with one
 * Newton-Raphson step: lu * (2 - lu*x).
 */
static inline __m128
gmx_mm_inv_ps(__m128 x)
{
    const __m128 two = _mm_set1_ps(2.0f);
    const __m128 lu  = _mm_rcp_ps(x);

    return _mm_mul_ps(lu,_mm_sub_ps(two,_mm_mul_ps(lu,x)));
}
/* Return (dx*dx + dy*dy) + dz*dz for each of the four lanes. */
static inline __m128
gmx_mm_calc_rsq_ps(__m128 dx, __m128 dy, __m128 dz)
{
    const __m128 xx = _mm_mul_ps(dx,dx);
    const __m128 yy = _mm_mul_ps(dy,dy);
    const __m128 zz = _mm_mul_ps(dz,dz);

    /* Same association as a left-to-right sum: (xx + yy) + zz. */
    return _mm_add_ps(_mm_add_ps(xx,yy),zz);
}
/* Normal sum of four xmm registers: returns (t0+t1) + (t2+t3), lane by lane. */
static inline __m128
gmx_mm_sum4_ps(__m128 t0, __m128 t1, __m128 t2, __m128 t3)
{
    const __m128 first_pair  = _mm_add_ps(t0,t1);
    const __m128 second_pair = _mm_add_ps(t2,t3);

    return _mm_add_ps(first_pair,second_pair);
}
177 gmx_mm_log_ps(__m128 x
)
179 const __m128 exp_ps
= gmx_mm_castsi128_ps( _mm_set_epi32(0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000) );
180 const __m128 one_ps
= gmx_mm_castsi128_ps( _mm_set_epi32(0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000) );
181 const __m128 off_ps
= gmx_mm_castsi128_ps( _mm_set_epi32(0x3FBF8000, 0x3FBF8000, 0x3FBF8000, 0x3FBF8000) );
182 const __m128 mant_ps
= gmx_mm_castsi128_ps( _mm_set_epi32(0x007FFFFF, 0x007FFFFF, 0x007FFFFF, 0x007FFFFF) );
183 const __m128 base_ps
= gmx_mm_castsi128_ps( _mm_set_epi32(0x43800000, 0x43800000, 0x43800000, 0x43800000) );
184 const __m128 loge_ps
= gmx_mm_castsi128_ps( _mm_set_epi32(0x3F317218, 0x3F317218, 0x3F317218, 0x3F317218) );
186 const __m128 D5
= gmx_mm_castsi128_ps( _mm_set_epi32(0xBD0D0CC5, 0xBD0D0CC5, 0xBD0D0CC5, 0xBD0D0CC5) );
187 const __m128 D4
= gmx_mm_castsi128_ps( _mm_set_epi32(0x3EA2ECDD, 0x3EA2ECDD, 0x3EA2ECDD, 0x3EA2ECDD) );
188 const __m128 D3
= gmx_mm_castsi128_ps( _mm_set_epi32(0xBF9dA2C9, 0xBF9dA2C9, 0xBF9dA2C9, 0xBF9dA2C9) );
189 const __m128 D2
= gmx_mm_castsi128_ps( _mm_set_epi32(0x4026537B, 0x4026537B, 0x4026537B, 0x4026537B) );
190 const __m128 D1
= gmx_mm_castsi128_ps( _mm_set_epi32(0xC054bFAD, 0xC054bFAD, 0xC054bFAD, 0xC054bFAD) );
191 const __m128 D0
= gmx_mm_castsi128_ps( _mm_set_epi32(0x4047691A, 0x4047691A, 0x4047691A, 0x4047691A) );
193 __m128 xmm0
,xmm1
,xmm2
;
197 xmm1
= _mm_and_ps(xmm1
, exp_ps
);
198 xmm1
= gmx_mm_castsi128_ps( _mm_srli_epi32( gmx_mm_castps_si128(xmm1
),8) );
200 xmm1
= _mm_or_ps(xmm1
, one_ps
);
201 xmm1
= _mm_sub_ps(xmm1
, off_ps
);
203 xmm1
= _mm_mul_ps(xmm1
, base_ps
);
204 xmm0
= _mm_and_ps(xmm0
, mant_ps
);
205 xmm0
= _mm_or_ps(xmm0
, one_ps
);
207 xmm2
= _mm_mul_ps(xmm0
, D5
);
208 xmm2
= _mm_add_ps(xmm2
, D4
);
209 xmm2
= _mm_mul_ps(xmm2
,xmm0
);
210 xmm2
= _mm_add_ps(xmm2
, D3
);
211 xmm2
= _mm_mul_ps(xmm2
,xmm0
);
212 xmm2
= _mm_add_ps(xmm2
, D2
);
213 xmm2
= _mm_mul_ps(xmm2
,xmm0
);
214 xmm2
= _mm_add_ps(xmm2
, D1
);
215 xmm2
= _mm_mul_ps(xmm2
,xmm0
);
216 xmm2
= _mm_add_ps(xmm2
, D0
);
217 xmm0
= _mm_sub_ps(xmm0
, one_ps
);
218 xmm0
= _mm_mul_ps(xmm0
,xmm2
);
219 xmm1
= _mm_add_ps(xmm1
,xmm0
);
222 x
= _mm_mul_ps(x
, loge_ps
);
228 /* This exp-routine has a relative precision of 2^-22.33 bits (essentially single precision :-) ) */
230 gmx_mm_exp_ps(__m128 x
)
232 const __m128i half
= _mm_set_epi32(0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000); // 0.5e+0f
233 const __m128i base
= _mm_set_epi32(0x0000007F, 0x0000007F, 0x0000007F, 0x0000007F); // 127
234 const __m128i CC
= _mm_set_epi32(0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B); // log2(e)
236 const __m128i D5
= _mm_set_epi32(0x3AF61905, 0x3AF61905, 0x3AF61905, 0x3AF61905); // 1.8775767e-3f
237 const __m128i D4
= _mm_set_epi32(0x3C134806, 0x3C134806, 0x3C134806, 0x3C134806); // 8.9893397e-3f
238 const __m128i D3
= _mm_set_epi32(0x3D64AA23, 0x3D64AA23, 0x3D64AA23, 0x3D64AA23); // 5.5826318e-2f
239 const __m128i D2
= _mm_set_epi32(0x3E75EAD4, 0x3E75EAD4, 0x3E75EAD4, 0x3E75EAD4); // 2.4015361e-1f
240 const __m128i D1
= _mm_set_epi32(0x3F31727B, 0x3F31727B, 0x3F31727B, 0x3F31727B); // 6.9315308e-1f
241 const __m128i D0
= _mm_set_epi32(0x3F7FFFFF, 0x3F7FFFFF, 0x3F7FFFFF, 0x3F7FFFFF); // 9.9999994e-1f
246 xmm0
= _mm_mul_ps(x
,(__m128
) CC
);
247 xmm1
= _mm_sub_ps(xmm0
, (__m128
) half
);
248 xmm2
= _mm_cvtps_epi32(xmm1
);
249 xmm1
= _mm_cvtepi32_ps(xmm2
);
251 xmm2
= _mm_add_epi32(xmm2
,(__m128i
) base
);
252 xmm2
= _mm_slli_epi32(xmm2
,23);
254 xmm0
= _mm_sub_ps(xmm0
,xmm1
);
255 xmm1
= _mm_mul_ps(xmm0
,(__m128
) D5
);
256 xmm1
= _mm_add_ps(xmm1
,(__m128
) D4
);
257 xmm1
= _mm_mul_ps(xmm1
,xmm0
);
258 xmm1
= _mm_add_ps(xmm1
,(__m128
) D3
);
259 xmm1
= _mm_mul_ps(xmm1
,xmm0
);
260 xmm1
= _mm_add_ps(xmm1
,(__m128
) D2
);
261 xmm1
= _mm_mul_ps(xmm1
,xmm0
);
262 xmm1
= _mm_add_ps(xmm1
,(__m128
) D1
);
263 xmm1
= _mm_mul_ps(xmm1
,xmm0
);
264 xmm1
= _mm_add_ps(xmm1
,(__m128
) D0
);
265 xmm1
= _mm_mul_ps(xmm1
,(__m128
) xmm2
);
267 /* 18 instructions currently */
/* Compute sin(x) and cos(x) for all four lanes at once.
 *
 *   x       __m128 expression holding the arguments (evaluated once)
 *   sinval  __m128 lvalue that receives sin(x)
 *   cosval  __m128 lvalue that receives cos(x)
 *
 * Method: q = x*2/pi rounded to the nearest integer (add +-0.5 with the sign
 * of the product, then truncate). The reduced argument xl = x - q*kc1 - q*kc2
 * (kc1+kc2 is pi/2 split in a large and a small part) is fed to two short
 * polynomials, one odd (sine-like) and one even (cosine-like). Bit 0 of the
 * quadrant offset selects which polynomial is used and bit 1 selects the
 * sign; the cosine uses the sine's offset plus one.
 *
 * 2/pi is written as a literal so the macro does not depend on M_PI, which
 * is not part of standard C.
 */
#define GMX_MM_SINCOS_PS(x,sinval,cosval) { \
    const __m128  _sincosf_x           = (x); \
    const __m128  _sincosf_two_over_pi = {0.63661977236758134308,0.63661977236758134308,0.63661977236758134308,0.63661977236758134308}; \
    const __m128  _sincosf_half        = {0.5,0.5,0.5,0.5}; \
    const __m128  _sincosf_one         = {1.0,1.0,1.0,1.0}; \
    \
    const __m128i _sincosf_izero       = _mm_set1_epi32(0); \
    const __m128i _sincosf_ione        = _mm_set1_epi32(1); \
    const __m128i _sincosf_itwo        = _mm_set1_epi32(2); \
    const __m128i _sincosf_ithree      = _mm_set1_epi32(3); \
    \
    const __m128  _sincosf_kc1         = {1.57079625129,1.57079625129,1.57079625129,1.57079625129}; \
    const __m128  _sincosf_kc2         = {7.54978995489e-8,7.54978995489e-8,7.54978995489e-8,7.54978995489e-8}; \
    const __m128  _sincosf_cc0         = {-0.0013602249,-0.0013602249,-0.0013602249,-0.0013602249}; \
    const __m128  _sincosf_cc1         = {0.0416566950,0.0416566950,0.0416566950,0.0416566950}; \
    const __m128  _sincosf_cc2         = {-0.4999990225,-0.4999990225,-0.4999990225,-0.4999990225}; \
    const __m128  _sincosf_sc0         = {-0.0001950727,-0.0001950727,-0.0001950727,-0.0001950727}; \
    const __m128  _sincosf_sc1         = {0.0083320758,0.0083320758,0.0083320758,0.0083320758}; \
    const __m128  _sincosf_sc2         = {-0.1666665247,-0.1666665247,-0.1666665247,-0.1666665247}; \
    \
    const __m128  _sincosf_signbit     = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) ); \
    \
    __m128  _sincosf_xl; \
    __m128  _sincosf_xl2; \
    __m128  _sincosf_xl3; \
    __m128  _sincosf_qf; \
    __m128  _sincosf_p1; \
    __m128  _sincosf_cx; \
    __m128  _sincosf_sx; \
    __m128  _sincosf_ts; \
    __m128  _sincosf_tc; \
    __m128  _sincosf_tsn; \
    __m128  _sincosf_tcn; \
    __m128i _sincosf_q; \
    __m128i _sincosf_offsetSin; \
    __m128i _sincosf_offsetCos; \
    __m128  _sincosf_sinMask; \
    __m128  _sincosf_cosMask; \
    __m128  _sincosf_ct1; \
    __m128  _sincosf_ct2; \
    __m128  _sincosf_st1; \
    __m128  _sincosf_st2; \
    \
    /* q = round(x*2/pi): add 0.5 carrying the sign of the product, truncate */ \
    _sincosf_xl        = _mm_mul_ps(_sincosf_x,_sincosf_two_over_pi); \
    _sincosf_xl        = _mm_add_ps(_sincosf_xl,_mm_or_ps(_mm_and_ps(_sincosf_xl,_sincosf_signbit),_sincosf_half)); \
    _sincosf_q         = _mm_cvttps_epi32(_sincosf_xl); \
    _sincosf_qf        = _mm_cvtepi32_ps(_sincosf_q); \
    \
    /* Quadrant offsets: q mod 4 for the sine, one more for the cosine */ \
    _sincosf_offsetSin = _mm_and_si128(_sincosf_q,_sincosf_ithree); \
    _sincosf_offsetCos = _mm_add_epi32(_sincosf_offsetSin,_sincosf_ione); \
    \
    /* Reduced argument xl = (x - q*kc1) - q*kc2 */ \
    _sincosf_p1        = _mm_mul_ps(_sincosf_qf,_sincosf_kc1); \
    _sincosf_xl        = _mm_mul_ps(_sincosf_qf,_sincosf_kc2); \
    _sincosf_p1        = _mm_sub_ps(_sincosf_x,_sincosf_p1); \
    _sincosf_xl        = _mm_sub_ps(_sincosf_p1,_sincosf_xl); \
    \
    _sincosf_xl2       = _mm_mul_ps(_sincosf_xl,_sincosf_xl); \
    _sincosf_xl3       = _mm_mul_ps(_sincosf_xl2,_sincosf_xl); \
    \
    /* Horner steps of both polynomials, interleaved */ \
    _sincosf_ct1       = _mm_mul_ps(_sincosf_cc0,_sincosf_xl2); \
    _sincosf_ct1       = _mm_add_ps(_sincosf_ct1,_sincosf_cc1); \
    _sincosf_st1       = _mm_mul_ps(_sincosf_sc0,_sincosf_xl2); \
    _sincosf_st1       = _mm_add_ps(_sincosf_st1,_sincosf_sc1); \
    _sincosf_ct2       = _mm_mul_ps(_sincosf_ct1,_sincosf_xl2); \
    _sincosf_ct2       = _mm_add_ps(_sincosf_ct2,_sincosf_cc2); \
    _sincosf_st2       = _mm_mul_ps(_sincosf_st1,_sincosf_xl2); \
    _sincosf_st2       = _mm_add_ps(_sincosf_st2,_sincosf_sc2); \
    \
    /* cx = 1 + xl2*ct2 (even), sx = xl + xl3*st2 (odd) */ \
    _sincosf_cx        = _mm_mul_ps(_sincosf_ct2,_sincosf_xl2); \
    _sincosf_cx        = _mm_add_ps(_sincosf_cx,_sincosf_one); \
    _sincosf_sx        = _mm_mul_ps(_sincosf_st2,_sincosf_xl3); \
    _sincosf_sx        = _mm_add_ps(_sincosf_sx,_sincosf_xl); \
    \
    /* Bit 0 of the offset clear -> take sx, set -> take cx */ \
    _sincosf_sinMask   = gmx_mm_castsi128_ps( _mm_cmpeq_epi32( _mm_and_si128(_sincosf_offsetSin,_sincosf_ione), _sincosf_izero) ); \
    _sincosf_cosMask   = gmx_mm_castsi128_ps( _mm_cmpeq_epi32( _mm_and_si128(_sincosf_offsetCos,_sincosf_ione), _sincosf_izero) ); \
    _sincosf_ts        = _mm_or_ps( _mm_and_ps(_sincosf_sinMask,_sincosf_sx) , _mm_andnot_ps(_sincosf_sinMask,_sincosf_cx) ); \
    _sincosf_tc        = _mm_or_ps( _mm_and_ps(_sincosf_cosMask,_sincosf_sx) , _mm_andnot_ps(_sincosf_cosMask,_sincosf_cx) ); \
    \
    /* Bit 1 of the offset set -> flip the sign */ \
    _sincosf_sinMask   = gmx_mm_castsi128_ps( _mm_cmpeq_epi32( _mm_and_si128(_sincosf_offsetSin,_sincosf_itwo), _sincosf_izero) ); \
    _sincosf_tsn       = _mm_xor_ps(_sincosf_signbit,_sincosf_ts); \
    _sincosf_ts        = _mm_or_ps( _mm_and_ps(_sincosf_sinMask,_sincosf_ts) , _mm_andnot_ps(_sincosf_sinMask,_sincosf_tsn) ); \
    \
    _sincosf_cosMask   = gmx_mm_castsi128_ps( _mm_cmpeq_epi32( _mm_and_si128(_sincosf_offsetCos,_sincosf_itwo), _sincosf_izero) ); \
    _sincosf_tcn       = _mm_xor_ps(_sincosf_signbit,_sincosf_tc); \
    _sincosf_tc        = _mm_or_ps( _mm_and_ps(_sincosf_cosMask,_sincosf_tc) , _mm_andnot_ps(_sincosf_cosMask,_sincosf_tcn) ); \
    \
    sinval = _sincosf_ts; \
    cosval = _sincosf_tc; \
}
/* Load a single value from 1-4 places, merge into xmm register */

/* Gather four scalars: xmm1 = [*ptr1, *ptr2, *ptr3, *ptr4] (lane 0 first).
 * The body is a braced block so the temporaries stay local and the macro
 * can be expanded more than once in the same scope.
 */
#define GMX_MM_LOAD_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,xmm1) { \
    __m128 _txmm2,_txmm3,_txmm4;                           \
    xmm1   = _mm_load_ss(ptr1);              /* [a,0,0,0] */ \
    _txmm2 = _mm_load_ss(ptr2);              /* [b,0,0,0] */ \
    _txmm3 = _mm_load_ss(ptr3);              /* [c,0,0,0] */ \
    _txmm4 = _mm_load_ss(ptr4);              /* [d,0,0,0] */ \
    xmm1   = _mm_unpacklo_ps(xmm1,_txmm3);   /* [a,c,0,0] */ \
    _txmm2 = _mm_unpacklo_ps(_txmm2,_txmm4); /* [b,d,0,0] */ \
    xmm1   = _mm_unpacklo_ps(xmm1,_txmm2);   /* [a,b,c,d] */ \
}
/* Gather three scalars: xmm1 = [*ptr1, *ptr2, *ptr3, 0] (lane 0 first). */
#define GMX_MM_LOAD_3VALUES_PS(ptr1,ptr2,ptr3,xmm1) { \
    __m128 _txmm2,_txmm3;                             \
    xmm1   = _mm_load_ss(ptr1);              /* [a,0,0,0] */ \
    _txmm2 = _mm_load_ss(ptr2);              /* [b,0,0,0] */ \
    _txmm3 = _mm_load_ss(ptr3);              /* [c,0,0,0] */ \
    xmm1   = _mm_unpacklo_ps(xmm1,_txmm3);   /* [a,c,0,0] */ \
    xmm1   = _mm_unpacklo_ps(xmm1,_txmm2);   /* [a,b,c,0] */ \
}
/* Gather two scalars: xmm1 = [*ptr1, *ptr2, 0, 0] (lane 0 first). */
#define GMX_MM_LOAD_2VALUES_PS(ptr1,ptr2,xmm1) { \
    __m128 _txmm2;                               \
    xmm1   = _mm_load_ss(ptr1);              /* [a,0,0,0] */ \
    _txmm2 = _mm_load_ss(ptr2);              /* [b,0,0,0] */ \
    xmm1   = _mm_unpacklo_ps(xmm1,_txmm2);   /* [a,b,0,0] */ \
}
/* Load one scalar: xmm1 = [*ptr1, 0, 0, 0]. */
#define GMX_MM_LOAD_1VALUE_PS(ptr1,xmm1) { \
    xmm1 = _mm_load_ss(ptr1);              \
}
/* Store data in an xmm register into 1-4 different places */

/* Scatter the four lanes of xmm1: lane 0 -> *ptr1, 1 -> *ptr2, 2 -> *ptr3,
 * 3 -> *ptr4. xmm1 itself is not modified.
 */
#define GMX_MM_STORE_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,xmm1) { \
    __m128 _txmm2,_txmm3,_txmm4;                            \
    _txmm3 = _mm_movehl_ps(_mm_setzero_ps(),xmm1);               /* lane 2 moved to lane 0 */ \
    _txmm2 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(1,1,1,1));     /* lane 1 broadcast       */ \
    _txmm4 = _mm_shuffle_ps(_txmm3,_txmm3,_MM_SHUFFLE(1,1,1,1)); /* lane 3 broadcast       */ \
    _mm_store_ss(ptr1,xmm1);                                \
    _mm_store_ss(ptr2,_txmm2);                              \
    _mm_store_ss(ptr3,_txmm3);                              \
    _mm_store_ss(ptr4,_txmm4);                              \
}
/* Scatter lanes 0-2 of xmm1: lane 0 -> *ptr1, 1 -> *ptr2, 2 -> *ptr3.
 * Lane 3 is ignored; xmm1 itself is not modified.
 */
#define GMX_MM_STORE_3VALUES_PS(ptr1,ptr2,ptr3,xmm1) { \
    __m128 _txmm2,_txmm3;                              \
    _txmm3 = _mm_movehl_ps(_mm_setzero_ps(),xmm1);           /* lane 2 moved to lane 0 */ \
    _txmm2 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(1,1,1,1)); /* lane 1 broadcast       */ \
    _mm_store_ss(ptr1,xmm1);                           \
    _mm_store_ss(ptr2,_txmm2);                         \
    _mm_store_ss(ptr3,_txmm3);                         \
}
/* Scatter lanes 0-1 of xmm1: lane 0 -> *ptr1, lane 1 -> *ptr2.
 * xmm1 itself is not modified.
 */
#define GMX_MM_STORE_2VALUES_PS(ptr1,ptr2,xmm1) { \
    __m128 _txmm2;                                \
    _txmm2 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(1,1,1,1)); /* lane 1 broadcast */ \
    _mm_store_ss(ptr1,xmm1);                      \
    _mm_store_ss(ptr2,_txmm2);                    \
}
/* Store lane 0 of xmm1 to *ptr1. */
#define GMX_MM_STORE_1VALUE_PS(ptr1,xmm1) { \
    _mm_store_ss(ptr1,xmm1);                \
}
/* Similar to store, but increments value in memory */

/* Add the lanes of xmm1 to *ptr1..*ptr4 and the lanes of xmm2 to
 * *ptr5..*ptr8 (lane 0 goes to the first pointer of each group).
 * All loads happen before any store. Each pointer argument is expanded
 * twice (once to load, once to store), so it should have no side effects.
 */
#define GMX_MM_INCREMENT_8VALUES_PS(ptr1,ptr2,ptr3,ptr4,ptr5,ptr6,ptr7,ptr8,xmm1,xmm2) { \
    __m128 _tincr1,_tincr2;                                \
    GMX_MM_LOAD_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,_tincr1);   \
    GMX_MM_LOAD_4VALUES_PS(ptr5,ptr6,ptr7,ptr8,_tincr2);   \
    _tincr1 = _mm_add_ps(_tincr1,xmm1);                    \
    _tincr2 = _mm_add_ps(_tincr2,xmm2);                    \
    GMX_MM_STORE_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,_tincr1);  \
    GMX_MM_STORE_4VALUES_PS(ptr5,ptr6,ptr7,ptr8,_tincr2);  \
}
/* Add the four lanes of xmm1 to *ptr1..*ptr4 (lane 0 -> *ptr1).
 * Each pointer argument is expanded twice (load, then store).
 */
#define GMX_MM_INCREMENT_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,xmm1) { \
    __m128 _tincr;                                         \
    GMX_MM_LOAD_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,_tincr);    \
    _tincr = _mm_add_ps(_tincr,xmm1);                      \
    GMX_MM_STORE_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,_tincr);   \
}
/* Add lanes 0-2 of xmm1 to *ptr1..*ptr3 (lane 0 -> *ptr1).
 * Each pointer argument is expanded twice (load, then store).
 */
#define GMX_MM_INCREMENT_3VALUES_PS(ptr1,ptr2,ptr3,xmm1) { \
    __m128 _tincr;                                    \
    GMX_MM_LOAD_3VALUES_PS(ptr1,ptr2,ptr3,_tincr);    \
    _tincr = _mm_add_ps(_tincr,xmm1);                 \
    GMX_MM_STORE_3VALUES_PS(ptr1,ptr2,ptr3,_tincr);   \
}
/* Add lanes 0-1 of xmm1 to *ptr1 and *ptr2 (lane 0 -> *ptr1).
 * Each pointer argument is expanded twice (load, then store).
 */
#define GMX_MM_INCREMENT_2VALUES_PS(ptr1,ptr2,xmm1) { \
    __m128 _tincr;                               \
    GMX_MM_LOAD_2VALUES_PS(ptr1,ptr2,_tincr);    \
    _tincr = _mm_add_ps(_tincr,xmm1);            \
    GMX_MM_STORE_2VALUES_PS(ptr1,ptr2,_tincr);   \
}
/* Add lane 0 of xmm1 to *ptr1 (scalar add; the other lanes are not used).
 * The pointer argument is expanded twice (load, then store).
 */
#define GMX_MM_INCREMENT_1VALUE_PS(ptr1,xmm1) { \
    __m128 _tincr;                          \
    GMX_MM_LOAD_1VALUE_PS(ptr1,_tincr);     \
    _tincr = _mm_add_ss(_tincr,xmm1);       \
    GMX_MM_STORE_1VALUE_PS(ptr1,_tincr);    \
}
/* Routines to load pairs from 1-4 places, put in two separate xmm registers. Useful to load LJ parameters! */

/* Each ptrN points at two consecutive floats. On exit
 *   c6  = [ptr1[0], ptr2[0], ptr3[0], ptr4[0]]
 *   c12 = [ptr1[1], ptr2[1], ptr3[1], ptr4[1]]
 */
#define GMX_MM_LOAD_4PAIRS_PS(ptr1,ptr2,ptr3,ptr4,c6,c12) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4;                         \
    _tmp1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1)); /* [a0,a1,0,0]   */ \
    _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr2)); /* [b0,b1,0,0]   */ \
    _tmp3 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr3)); /* [c0,c1,0,0]   */ \
    _tmp4 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr4)); /* [d0,d1,0,0]   */ \
    _tmp1 = _mm_unpacklo_ps(_tmp1,_tmp3);                   /* [a0,c0,a1,c1] */ \
    _tmp2 = _mm_unpacklo_ps(_tmp2,_tmp4);                   /* [b0,d0,b1,d1] */ \
    c6    = _mm_unpacklo_ps(_tmp1,_tmp2);                   /* [a0,b0,c0,d0] */ \
    c12   = _mm_unpackhi_ps(_tmp1,_tmp2);                   /* [a1,b1,c1,d1] */ \
}
/* Each ptrN points at two consecutive floats. On exit
 *   c6  = [ptr1[0], ptr2[0], ptr3[0], 0]
 *   c12 = [ptr1[1], ptr2[1], ptr3[1], 0]
 */
#define GMX_MM_LOAD_3PAIRS_PS(ptr1,ptr2,ptr3,c6,c12) { \
    __m128 _tmp1,_tmp2,_tmp3;                          \
    _tmp1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1)); /* [a0,a1,0,0]   */ \
    _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr2)); /* [b0,b1,0,0]   */ \
    _tmp3 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr3)); /* [c0,c1,0,0]   */ \
    _tmp1 = _mm_unpacklo_ps(_tmp1,_tmp3);                   /* [a0,c0,a1,c1] */ \
    _tmp2 = _mm_unpacklo_ps(_tmp2,_mm_setzero_ps());        /* [b0,0,b1,0]   */ \
    c6    = _mm_unpacklo_ps(_tmp1,_tmp2);                   /* [a0,b0,c0,0]  */ \
    c12   = _mm_unpackhi_ps(_tmp1,_tmp2);                   /* [a1,b1,c1,0]  */ \
}
/* Each ptrN points at two consecutive floats. On exit
 *   c6  = [ptr1[0], ptr2[0], ptr1[1], ptr2[1]]  (lanes 2-3 are a by-product)
 *   c12 = [ptr1[1], ptr2[1], 0, 0]
 * The upper half of c12 is filled from a zero register, so c12 does not
 * have to be initialised by the caller before the call.
 */
#define GMX_MM_LOAD_2PAIRS_PS(ptr1,ptr2,c6,c12) { \
    __m128 _tmp1,_tmp2;                           \
    _tmp1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1)); /* [a0,a1,0,0]   */ \
    _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr2)); /* [b0,b1,0,0]   */ \
    c6    = _mm_unpacklo_ps(_tmp1,_tmp2);                   /* [a0,b0,a1,b1] */ \
    c12   = _mm_movehl_ps(_mm_setzero_ps(),c6);             /* [a1,b1,0,0]   */ \
}
/* ptr1 points at two consecutive floats. On exit
 *   c6  = [ptr1[0], 0, 0, 0]
 *   c12 = [ptr1[1], 0, 0, 0]
 * ptr1 is parenthesised before the offset is added so that any pointer
 * expression can be passed.
 */
#define GMX_MM_LOAD_1PAIR_PS(ptr1,c6,c12) { \
    c6    = _mm_load_ss(ptr1);              \
    c12   = _mm_load_ss((ptr1)+1);          \
}
/* Routines to load 1-4 rvecs from 1-4 places.
 * We mainly use these to load coordinates. The extra routines
 * are very efficient for the water-water loops, since we e.g.
 * know that a TIP4p water has 4 atoms, so we should load 12 floats+shuffle.
 */

/* One rvec (3 consecutive floats) from one pointer: each output register
 * gets its component in lane 0 and zeros in lanes 1-3.
 */
#define GMX_MM_LOAD_1RVEC_1POINTER_PS(ptr1,jx1,jy1,jz1) { \
    jx1 = _mm_load_ss(ptr1);                              \
    jy1 = _mm_load_ss((ptr1)+1);                          \
    jz1 = _mm_load_ss((ptr1)+2);                          \
}
/* Two rvecs (6 consecutive floats) from one pointer: ptr1[k] ends up in
 * lane 0 of the k-th output register, lanes 1-3 are zero.
 */
#define GMX_MM_LOAD_2RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2) { \
    jx1 = _mm_load_ss(ptr1);                              \
    jy1 = _mm_load_ss((ptr1)+1);                          \
    jz1 = _mm_load_ss((ptr1)+2);                          \
    jx2 = _mm_load_ss((ptr1)+3);                          \
    jy2 = _mm_load_ss((ptr1)+4);                          \
    jz2 = _mm_load_ss((ptr1)+5);                          \
}
/* Three rvecs (9 consecutive floats) from one pointer: ptr1[k] ends up in
 * lane 0 of the k-th output register, lanes 1-3 are zero.
 */
#define GMX_MM_LOAD_3RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    jx1 = _mm_load_ss(ptr1);                              \
    jy1 = _mm_load_ss((ptr1)+1);                          \
    jz1 = _mm_load_ss((ptr1)+2);                          \
    jx2 = _mm_load_ss((ptr1)+3);                          \
    jy2 = _mm_load_ss((ptr1)+4);                          \
    jz2 = _mm_load_ss((ptr1)+5);                          \
    jx3 = _mm_load_ss((ptr1)+6);                          \
    jy3 = _mm_load_ss((ptr1)+7);                          \
    jz3 = _mm_load_ss((ptr1)+8);                          \
}
/* Four rvecs (12 consecutive floats) from one pointer: ptr1[k] ends up in
 * lane 0 of the k-th output register, lanes 1-3 are zero.
 */
#define GMX_MM_LOAD_4RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    jx1 = _mm_load_ss(ptr1);                              \
    jy1 = _mm_load_ss((ptr1)+1);                          \
    jz1 = _mm_load_ss((ptr1)+2);                          \
    jx2 = _mm_load_ss((ptr1)+3);                          \
    jy2 = _mm_load_ss((ptr1)+4);                          \
    jz2 = _mm_load_ss((ptr1)+5);                          \
    jx3 = _mm_load_ss((ptr1)+6);                          \
    jy3 = _mm_load_ss((ptr1)+7);                          \
    jz3 = _mm_load_ss((ptr1)+8);                          \
    jx4 = _mm_load_ss((ptr1)+9);                          \
    jy4 = _mm_load_ss((ptr1)+10);                         \
    jz4 = _mm_load_ss((ptr1)+11);                         \
}
/* One rvec from each of two pointers. Lane 0 comes from ptr1, lane 1 from ptr2:
 *   jx1 = [x(1), x(2), 0, 0]
 *   jy1 = [y(1), y(2), z(1), z(2)]   (only lanes 0-1 are the y values)
 *   jz1 = [z(1), z(2), z(1), z(2)]
 * jz1 is built from jy1 alone, so it need not be initialised by the caller.
 * Pointer arguments are parenthesised before an offset is added.
 */
#define GMX_MM_LOAD_1RVEC_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1) { \
    __m128 _tmp1,_tmp2;                                   \
    _tmp1 = _mm_load_ss(ptr1);                            /* [x1,0,0,0]    */ \
    _tmp2 = _mm_load_ss(ptr2);                            /* [x2,0,0,0]    */ \
    _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)((ptr1)+1));      /* [x1,0,y1,z1]  */ \
    _tmp2 = _mm_loadh_pi(_tmp2,(__m64 *)((ptr2)+1));      /* [x2,0,y2,z2]  */ \
    jx1   = _mm_unpacklo_ps(_tmp1,_tmp2);                 /* [x1,x2,0,0]   */ \
    jy1   = _mm_unpackhi_ps(_tmp1,_tmp2);                 /* [y1,y2,z1,z2] */ \
    jz1   = _mm_movehl_ps(jy1,jy1);                       /* [z1,z2,z1,z2] */ \
}
/* Two rvecs (6 floats) from each of two pointers. In every output register
 * lane 0 holds the component read via ptr1 and lane 1 the one read via ptr2;
 * lanes 2-3 hold leftovers of the shuffles and carry no meaning.
 * Pointer arguments are parenthesised before an offset is added.
 */
#define GMX_MM_LOAD_2RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1, _tmp2;                                          \
    _tmp1 = _mm_loadu_ps(ptr1);                                   /* a: [x1,y1,z1,x2]  */ \
    jy1   = _mm_loadu_ps(ptr2);                                   /* b: [x1,y1,z1,x2]  */ \
    jy2   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr1)+4));   /* a: [y2,z2,0,0]    */ \
    _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr2)+4));   /* b: [y2,z2,0,0]    */ \
    jx1   = _mm_unpacklo_ps(_tmp1,jy1);                           /* [x1a,x1b,y1a,y1b] */ \
    jz1   = _mm_unpackhi_ps(_tmp1,jy1);                           /* [z1a,z1b,x2a,x2b] */ \
    jy2   = _mm_unpacklo_ps(jy2,_tmp2);                           /* [y2a,y2b,z2a,z2b] */ \
    jy1   = _mm_movehl_ps(jx1,jx1);                               /* [y1a,y1b,..]      */ \
    jx2   = _mm_movehl_ps(jz1,jz1);                               /* [x2a,x2b,..]      */ \
    jz2   = _mm_movehl_ps(jy2,jy2);                               /* [z2a,z2b,..]      */ \
}
/* Three rvecs (9 floats) from each of two pointers. In every output register
 * lane 0 holds the component read via ptr1 and lane 1 the one read via ptr2;
 * lanes 2-3 carry no meaning (zero for jz3, shuffle leftovers otherwise).
 * Pointer arguments are parenthesised before an offset is added.
 */
#define GMX_MM_LOAD_3RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1, _tmp2, _tmp3;                   \
    _tmp1 = _mm_loadu_ps(ptr1);                   /* a: [x1,y1,z1,x2]  */ \
    jy1   = _mm_loadu_ps(ptr2);                   /* b: [x1,y1,z1,x2]  */ \
    _tmp2 = _mm_loadu_ps((ptr1)+4);               /* a: [y2,z2,x3,y3]  */ \
    jz2   = _mm_loadu_ps((ptr2)+4);               /* b: [y2,z2,x3,y3]  */ \
    jz3   = _mm_load_ss((ptr1)+8);                /* a: [z3,0,0,0]     */ \
    _tmp3 = _mm_load_ss((ptr2)+8);                /* b: [z3,0,0,0]     */ \
    jx1   = _mm_unpacklo_ps(_tmp1,jy1);           /* [x1a,x1b,y1a,y1b] */ \
    jz1   = _mm_unpackhi_ps(_tmp1,jy1);           /* [z1a,z1b,x2a,x2b] */ \
    jy2   = _mm_unpacklo_ps(_tmp2,jz2);           /* [y2a,y2b,z2a,z2b] */ \
    jx3   = _mm_unpackhi_ps(_tmp2,jz2);           /* [x3a,x3b,y3a,y3b] */ \
    jy1   = _mm_movehl_ps(jx1,jx1);               /* [y1a,y1b,..]      */ \
    jx2   = _mm_movehl_ps(jz1,jz1);               /* [x2a,x2b,..]      */ \
    jz2   = _mm_movehl_ps(jy2,jy2);               /* [z2a,z2b,..]      */ \
    jy3   = _mm_movehl_ps(jx3,jx3);               /* [y3a,y3b,..]      */ \
    jz3   = _mm_unpacklo_ps(jz3,_tmp3);           /* [z3a,z3b,0,0]     */ \
}
/* Four rvecs (12 floats) from each of two pointers. In every output register
 * lane 0 holds the component read via ptr1 and lane 1 the one read via ptr2;
 * lanes 2-3 hold leftovers of the shuffles and carry no meaning.
 * Pointer arguments are parenthesised before an offset is added.
 */
#define GMX_MM_LOAD_4RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    __m128 _tmp1, _tmp2, _tmp3,_tmp4;             \
    _tmp1 = _mm_loadu_ps(ptr1);                   /* a: [x1,y1,z1,x2]  */ \
    jy1   = _mm_loadu_ps(ptr2);                   /* b: [x1,y1,z1,x2]  */ \
    _tmp2 = _mm_loadu_ps((ptr1)+4);               /* a: [y2,z2,x3,y3]  */ \
    jz2   = _mm_loadu_ps((ptr2)+4);               /* b: [y2,z2,x3,y3]  */ \
    _tmp3 = _mm_loadu_ps((ptr1)+8);               /* a: [z3,x4,y4,z4]  */ \
    _tmp4 = _mm_loadu_ps((ptr2)+8);               /* b: [z3,x4,y4,z4]  */ \
    jx1   = _mm_unpacklo_ps(_tmp1,jy1);           /* [x1a,x1b,y1a,y1b] */ \
    jz1   = _mm_unpackhi_ps(_tmp1,jy1);           /* [z1a,z1b,x2a,x2b] */ \
    jy2   = _mm_unpacklo_ps(_tmp2,jz2);           /* [y2a,y2b,z2a,z2b] */ \
    jx3   = _mm_unpackhi_ps(_tmp2,jz2);           /* [x3a,x3b,y3a,y3b] */ \
    jz3   = _mm_unpacklo_ps(_tmp3,_tmp4);         /* [z3a,z3b,x4a,x4b] */ \
    jy4   = _mm_unpackhi_ps(_tmp3,_tmp4);         /* [y4a,y4b,z4a,z4b] */ \
    jy1   = _mm_movehl_ps(jx1,jx1);               \
    jx2   = _mm_movehl_ps(jz1,jz1);               \
    jz2   = _mm_movehl_ps(jy2,jy2);               \
    jy3   = _mm_movehl_ps(jx3,jx3);               \
    jx4   = _mm_movehl_ps(jz3,jz3);               \
    jz4   = _mm_movehl_ps(jy4,jy4);               \
}
/* One rvec from each of three pointers; lanes 0,1,2 correspond to
 * ptr1,ptr2,ptr3:
 *   jx1 = [x(1), x(2), x(3), 0]
 *   jy1 = [y(1), y(2), y(3), y(3)]
 *   jz1 = [z(1), z(2), z(3), z(3)]
 * Pointer arguments are parenthesised before an offset is added.
 */
#define GMX_MM_LOAD_1RVEC_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1) { \
    __m128 _tmp1,_tmp3,_tmp4;                       \
    jx1   = _mm_load_ss(ptr1);                      \
    jy1   = _mm_load_ss(ptr2);                      \
    jz1   = _mm_load_ss(ptr3);                      \
    jx1   = _mm_loadh_pi(jx1,(__m64 *)((ptr1)+1));  /* [x1,0,y1,z1]  */ \
    jy1   = _mm_loadh_pi(jy1,(__m64 *)((ptr2)+1));  /* [x2,0,y2,z2]  */ \
    jz1   = _mm_loadh_pi(jz1,(__m64 *)((ptr3)+1));  /* [x3,0,y3,z3]  */ \
    _tmp1 = _mm_unpacklo_ps(jx1,jy1);               /* [x1,x2,0,0]   */ \
    _tmp3 = _mm_unpackhi_ps(jx1,jy1);               /* [y1,y2,z1,z2] */ \
    _tmp4 = _mm_unpackhi_ps(jz1,jz1);               /* [y3,y3,z3,z3] */ \
    jx1   = _mm_movelh_ps(_tmp1,jz1);               /* [x1,x2,x3,0]  */ \
    jy1   = _mm_movelh_ps(_tmp3,_tmp4);             /* [y1,y2,y3,y3] */ \
    jz1   = _mm_movehl_ps(_tmp4,_tmp3);             /* [z1,z2,z3,z3] */ \
}
/* Two rvecs (6 floats) from each of three pointers. In every output register
 * lanes 0,1,2 hold the component read via ptr1,ptr2,ptr3 and lane 3 is zero.
 * Pointer arguments are parenthesised before an offset is added.
 */
#define GMX_MM_LOAD_2RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1, _tmp2;                                          \
    jx1   = _mm_loadu_ps(ptr1);                                   /* a: [x1,y1,z1,x2]  */ \
    jy1   = _mm_loadu_ps(ptr2);                                   /* b: [x1,y1,z1,x2]  */ \
    jz1   = _mm_loadu_ps(ptr3);                                   /* c: [x1,y1,z1,x2]  */ \
    jx2   = _mm_setzero_ps();                                     /* empty fourth row  */ \
    _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2);                           /* rows -> x1,y1,z1,x2 */ \
    _tmp1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr1)+4));   /* a: [y2,z2,0,0]    */ \
    jz2   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr2)+4));   /* b: [y2,z2,0,0]    */ \
    _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr3)+4));   /* c: [y2,z2,0,0]    */ \
    _tmp1 = _mm_unpacklo_ps(_tmp1,_tmp2);                         /* [y2a,y2c,z2a,z2c] */ \
    jz2   = _mm_unpacklo_ps(jz2,_mm_setzero_ps());                /* [y2b,0,z2b,0]     */ \
    jy2   = _mm_unpacklo_ps(_tmp1,jz2);                           /* [y2a,y2b,y2c,0]   */ \
    jz2   = _mm_unpackhi_ps(_tmp1,jz2);                           /* [z2a,z2b,z2c,0]   */ \
}
/* Three rvecs (9 floats) from each of three pointers. In every output
 * register lanes 0,1,2 hold the component read via ptr1,ptr2,ptr3 and
 * lane 3 is zero.
 * Pointer arguments are parenthesised before an offset is added.
 */
#define GMX_MM_LOAD_3RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1, _tmp2;                                  \
    jx1   = _mm_loadu_ps(ptr1);                           /* floats 0-3 of each pointer  */ \
    jy1   = _mm_loadu_ps(ptr2);                           \
    jz1   = _mm_loadu_ps(ptr3);                           \
    jx2   = _mm_setzero_ps();                             \
    _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2);                   /* rows -> x1,y1,z1,x2         */ \
    jy2   = _mm_loadu_ps((ptr1)+4);                       /* floats 4-7 of each pointer  */ \
    jz2   = _mm_loadu_ps((ptr2)+4);                       \
    jx3   = _mm_loadu_ps((ptr3)+4);                       \
    jy3   = _mm_setzero_ps();                             \
    _MM_TRANSPOSE4_PS(jy2,jz2,jx3,jy3);                   /* rows -> y2,z2,x3,y3         */ \
    jz3   = _mm_load_ss((ptr1)+8);                        /* float 8 of each pointer     */ \
    _tmp1 = _mm_load_ss((ptr2)+8);                        \
    _tmp2 = _mm_load_ss((ptr3)+8);                        \
    jz3   = _mm_unpacklo_ps(jz3,_tmp2);                   /* [z3a,z3c,0,0]               */ \
    _tmp1 = _mm_unpacklo_ps(_tmp1,_mm_setzero_ps());      /* [z3b,0,0,0]                 */ \
    jz3   = _mm_unpacklo_ps(jz3,_tmp1);                   /* [z3a,z3b,z3c,0]             */ \
}
/* Four rvecs (12 floats) from each of three pointers, done as three 4x4
 * transposes with an all-zero fourth row. In every output register lanes
 * 0,1,2 hold the component read via ptr1,ptr2,ptr3 and lane 3 is zero.
 * Pointer arguments are parenthesised before an offset is added.
 */
#define GMX_MM_LOAD_4RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    jx1   = _mm_loadu_ps(ptr1);                 /* floats 0-3 of each pointer  */ \
    jy1   = _mm_loadu_ps(ptr2);                 \
    jz1   = _mm_loadu_ps(ptr3);                 \
    jx2   = _mm_setzero_ps();                   \
    _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2);         /* rows -> x1,y1,z1,x2         */ \
    jy2   = _mm_loadu_ps((ptr1)+4);             /* floats 4-7 of each pointer  */ \
    jz2   = _mm_loadu_ps((ptr2)+4);             \
    jx3   = _mm_loadu_ps((ptr3)+4);             \
    jy3   = _mm_setzero_ps();                   \
    _MM_TRANSPOSE4_PS(jy2,jz2,jx3,jy3);         /* rows -> y2,z2,x3,y3         */ \
    jz3   = _mm_loadu_ps((ptr1)+8);             /* floats 8-11 of each pointer */ \
    jx4   = _mm_loadu_ps((ptr2)+8);             \
    jy4   = _mm_loadu_ps((ptr3)+8);             \
    jz4   = _mm_setzero_ps();                   \
    _MM_TRANSPOSE4_PS(jz3,jx4,jy4,jz4);         /* rows -> z3,x4,y4,z4         */ \
}
/* One rvec from each of four pointers; lanes 0-3 correspond to ptr1-ptr4:
 *   jx1 = [x(1),x(2),x(3),x(4)], jy1 = [y(1),..,y(4)], jz1 = [z(1),..,z(4)]
 * Only the three floats of each rvec are read.
 * Pointer arguments are parenthesised before an offset is added.
 */
#define GMX_MM_LOAD_1RVEC_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5;               \
    jx1   = _mm_load_ss(ptr1);                          \
    _tmp1 = _mm_load_ss(ptr2);                          \
    jy1   = _mm_load_ss(ptr3);                          \
    jz1   = _mm_load_ss(ptr4);                          \
    jx1   = _mm_loadh_pi(jx1,(__m64 *)((ptr1)+1));      /* [x1,0,y1,z1]  */ \
    _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)((ptr2)+1));    /* [x2,0,y2,z2]  */ \
    jy1   = _mm_loadh_pi(jy1,(__m64 *)((ptr3)+1));      /* [x3,0,y3,z3]  */ \
    jz1   = _mm_loadh_pi(jz1,(__m64 *)((ptr4)+1));      /* [x4,0,y4,z4]  */ \
    _tmp2 = _mm_unpacklo_ps(jx1,_tmp1);                 /* [x1,x2,0,0]   */ \
    _tmp3 = _mm_unpacklo_ps(jy1,jz1);                   /* [x3,x4,0,0]   */ \
    _tmp4 = _mm_unpackhi_ps(jx1,_tmp1);                 /* [y1,y2,z1,z2] */ \
    _tmp5 = _mm_unpackhi_ps(jy1,jz1);                   /* [y3,y4,z3,z4] */ \
    jx1   = _mm_movelh_ps(_tmp2,_tmp3);                 /* [x1,x2,x3,x4] */ \
    jy1   = _mm_movelh_ps(_tmp4,_tmp5);                 /* [y1,y2,y3,y4] */ \
    jz1   = _mm_movehl_ps(_tmp5,_tmp4);                 /* [z1,z2,z3,z4] */ \
}
/* Two rvecs (6 floats) from each of four pointers. In every output register
 * lanes 0-3 hold the component read via ptr1-ptr4.
 * Pointer arguments are parenthesised before an offset is added.
 */
#define GMX_MM_LOAD_2RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1, _tmp2;                                          \
    jx1   = _mm_loadu_ps(ptr1);                                   /* floats 0-3 of each pointer */ \
    jy1   = _mm_loadu_ps(ptr2);                                   \
    jz1   = _mm_loadu_ps(ptr3);                                   \
    jx2   = _mm_loadu_ps(ptr4);                                   \
    _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2);                           /* rows -> x1,y1,z1,x2 */ \
    jy2   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr1)+4));   /* a: [y2,z2,0,0]    */ \
    jz2   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr2)+4));   /* b: [y2,z2,0,0]    */ \
    _tmp1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr3)+4));   /* c: [y2,z2,0,0]    */ \
    _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr4)+4));   /* d: [y2,z2,0,0]    */ \
    _tmp1 = _mm_unpacklo_ps(jy2,_tmp1);                           /* [y2a,y2c,z2a,z2c] */ \
    _tmp2 = _mm_unpacklo_ps(jz2,_tmp2);                           /* [y2b,y2d,z2b,z2d] */ \
    jy2   = _mm_unpacklo_ps(_tmp1,_tmp2);                         /* [y2a,y2b,y2c,y2d] */ \
    jz2   = _mm_unpackhi_ps(_tmp1,_tmp2);                         /* [z2a,z2b,z2c,z2d] */ \
}
/* Three rvecs (9 floats) from each of four pointers. In every output
 * register lanes 0-3 hold the component read via ptr1-ptr4.
 * Pointer arguments are parenthesised before an offset is added.
 */
#define GMX_MM_LOAD_3RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1, _tmp2, _tmp3;                   \
    jx1   = _mm_loadu_ps(ptr1);                   /* floats 0-3 of each pointer */ \
    jy1   = _mm_loadu_ps(ptr2);                   \
    jz1   = _mm_loadu_ps(ptr3);                   \
    jx2   = _mm_loadu_ps(ptr4);                   \
    _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2);           /* rows -> x1,y1,z1,x2        */ \
    jy2   = _mm_loadu_ps((ptr1)+4);               /* floats 4-7 of each pointer */ \
    jz2   = _mm_loadu_ps((ptr2)+4);               \
    jx3   = _mm_loadu_ps((ptr3)+4);               \
    jy3   = _mm_loadu_ps((ptr4)+4);               \
    _MM_TRANSPOSE4_PS(jy2,jz2,jx3,jy3);           /* rows -> y2,z2,x3,y3        */ \
    jz3   = _mm_load_ss((ptr1)+8);                /* float 8 of each pointer    */ \
    _tmp1 = _mm_load_ss((ptr2)+8);                \
    _tmp2 = _mm_load_ss((ptr3)+8);                \
    _tmp3 = _mm_load_ss((ptr4)+8);                \
    jz3   = _mm_unpacklo_ps(jz3,_tmp2);           /* [z3a,z3c,0,0]              */ \
    _tmp1 = _mm_unpacklo_ps(_tmp1,_tmp3);         /* [z3b,z3d,0,0]              */ \
    jz3   = _mm_unpacklo_ps(jz3,_tmp1);           /* [z3a,z3b,z3c,z3d]          */ \
}
/* Four rvecs (12 floats) from each of four pointers, done as three full
 * 4x4 transposes. In every output register lanes 0-3 hold the component
 * read via ptr1-ptr4.
 * Pointer arguments are parenthesised before an offset is added.
 */
#define GMX_MM_LOAD_4RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    jx1   = _mm_loadu_ps(ptr1);                 /* floats 0-3 of each pointer  */ \
    jy1   = _mm_loadu_ps(ptr2);                 \
    jz1   = _mm_loadu_ps(ptr3);                 \
    jx2   = _mm_loadu_ps(ptr4);                 \
    _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2);         /* rows -> x1,y1,z1,x2         */ \
    jy2   = _mm_loadu_ps((ptr1)+4);             /* floats 4-7 of each pointer  */ \
    jz2   = _mm_loadu_ps((ptr2)+4);             \
    jx3   = _mm_loadu_ps((ptr3)+4);             \
    jy3   = _mm_loadu_ps((ptr4)+4);             \
    _MM_TRANSPOSE4_PS(jy2,jz2,jx3,jy3);         /* rows -> y2,z2,x3,y3         */ \
    jz3   = _mm_loadu_ps((ptr1)+8);             /* floats 8-11 of each pointer */ \
    jx4   = _mm_loadu_ps((ptr2)+8);             \
    jy4   = _mm_loadu_ps((ptr3)+8);             \
    jz4   = _mm_loadu_ps((ptr4)+8);             \
    _MM_TRANSPOSE4_PS(jz3,jx4,jy4,jz4);         /* rows -> z3,x4,y4,z4         */ \
}
/* Routines to increment rvecs in memory, typically use for j particle force updates */

/* Add lane 0 of jx1,jy1,jz1 to ptr1[0],ptr1[1],ptr1[2].
 * jx1 and jy1 are used as scratch and are overwritten.
 * ptr1 is parenthesised before an offset is added.
 */
#define GMX_MM_INCREMENT_1RVEC_1POINTER_PS(ptr1,jx1,jy1,jz1) { \
    __m128 _tmp1;                                       \
    jy1   = _mm_unpacklo_ps(jy1,jz1);                   /* [y,z,..]       */ \
    jx1   = _mm_movelh_ps(jx1,jy1);                     /* [x,(x1),y,z]   */ \
    _tmp1 = _mm_load_ss(ptr1);                          /* [p0,0,0,0]     */ \
    _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)((ptr1)+1));    /* [p0,0,p1,p2]   */ \
    _tmp1 = _mm_add_ps(_tmp1,jx1);                      \
    _mm_store_ss(ptr1,_tmp1);                           /* lane 0 -> p0   */ \
    _mm_storeh_pi((__m64 *)((ptr1)+1),_tmp1);           /* lanes 2,3 -> p1,p2 */ \
}
/* Add lane 0 of jx1..jz2 to ptr1[0..5] (jx1 -> ptr1[0], ..., jz2 -> ptr1[5]).
 * jx1, jz1 and jy2 are used as scratch and are overwritten.
 * ptr1 is parenthesised before an offset is added.
 */
#define GMX_MM_INCREMENT_2RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1, _tmp2;                                          \
    _tmp1 = _mm_loadu_ps(ptr1);                                   /* p[0..3]       */ \
    _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr1)+4));   /* p[4..5]       */ \
    jx1   = _mm_unpacklo_ps(jx1,jy1);                             /* [x1,y1,..]    */ \
    jz1   = _mm_unpacklo_ps(jz1,jx2);                             /* [z1,x2,..]    */ \
    jy2   = _mm_unpacklo_ps(jy2,jz2);                             /* [y2,z2,..]    */ \
    jx1   = _mm_movelh_ps(jx1,jz1);                               /* [x1,y1,z1,x2] */ \
    _tmp1 = _mm_add_ps(_tmp1,jx1);                                \
    _tmp2 = _mm_add_ps(_tmp2,jy2);                                \
    _mm_storeu_ps(ptr1,_tmp1);                                    \
    _mm_storel_pi((__m64 *)((ptr1)+4),_tmp2);                     \
}
/* Add lane 0 of jx1..jz3 to ptr1[0..8] (jx1 -> ptr1[0], ..., jz3 -> ptr1[8]).
 * jx1, jz1, jy2 and jx3 are used as scratch and are overwritten.
 * ptr1 is parenthesised before an offset is added.
 */
#define GMX_MM_INCREMENT_3RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1, _tmp2, _tmp3;               \
    _tmp1 = _mm_loadu_ps(ptr1);               /* p[0..3]       */ \
    _tmp2 = _mm_loadu_ps((ptr1)+4);           /* p[4..7]       */ \
    _tmp3 = _mm_load_ss((ptr1)+8);            /* p[8]          */ \
    jx1   = _mm_unpacklo_ps(jx1,jy1);         /* [x1,y1,..]    */ \
    jz1   = _mm_unpacklo_ps(jz1,jx2);         /* [z1,x2,..]    */ \
    jy2   = _mm_unpacklo_ps(jy2,jz2);         /* [y2,z2,..]    */ \
    jx3   = _mm_unpacklo_ps(jx3,jy3);         /* [x3,y3,..]    */ \
    jx1   = _mm_movelh_ps(jx1,jz1);           /* [x1,y1,z1,x2] */ \
    jy2   = _mm_movelh_ps(jy2,jx3);           /* [y2,z2,x3,y3] */ \
    _tmp1 = _mm_add_ps(_tmp1,jx1);            \
    _tmp2 = _mm_add_ps(_tmp2,jy2);            \
    _tmp3 = _mm_add_ss(_tmp3,jz3);            \
    _mm_storeu_ps(ptr1,_tmp1);                \
    _mm_storeu_ps((ptr1)+4,_tmp2);            \
    _mm_store_ss((ptr1)+8,_tmp3);             \
}
/* Add lane 0 of jx1..jz4 to ptr1[0..11] (jx1 -> ptr1[0], ..., jz4 -> ptr1[11]).
 * jx1, jz1, jy2, jx3, jz3 and jy4 are used as scratch and are overwritten.
 * ptr1 is parenthesised before an offset is added.
 */
#define GMX_MM_INCREMENT_4RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    __m128 _tmp1, _tmp2, _tmp3;               \
    _tmp1 = _mm_loadu_ps(ptr1);               /* p[0..3]       */ \
    _tmp2 = _mm_loadu_ps((ptr1)+4);           /* p[4..7]       */ \
    _tmp3 = _mm_loadu_ps((ptr1)+8);           /* p[8..11]      */ \
    jx1   = _mm_unpacklo_ps(jx1,jy1);         /* [x1,y1,..]    */ \
    jz1   = _mm_unpacklo_ps(jz1,jx2);         /* [z1,x2,..]    */ \
    jy2   = _mm_unpacklo_ps(jy2,jz2);         /* [y2,z2,..]    */ \
    jx3   = _mm_unpacklo_ps(jx3,jy3);         /* [x3,y3,..]    */ \
    jz3   = _mm_unpacklo_ps(jz3,jx4);         /* [z3,x4,..]    */ \
    jy4   = _mm_unpacklo_ps(jy4,jz4);         /* [y4,z4,..]    */ \
    jx1   = _mm_movelh_ps(jx1,jz1);           /* [x1,y1,z1,x2] */ \
    jy2   = _mm_movelh_ps(jy2,jx3);           /* [y2,z2,x3,y3] */ \
    jz3   = _mm_movelh_ps(jz3,jy4);           /* [z3,x4,y4,z4] */ \
    _tmp1 = _mm_add_ps(_tmp1,jx1);            \
    _tmp2 = _mm_add_ps(_tmp2,jy2);            \
    _tmp3 = _mm_add_ps(_tmp3,jz3);            \
    _mm_storeu_ps(ptr1,_tmp1);                \
    _mm_storeu_ps((ptr1)+4,_tmp2);            \
    _mm_storeu_ps((ptr1)+8,_tmp3);            \
}
/* Add lane 0 of jx1,jy1,jz1 to ptr1[0..2] and lane 1 to ptr2[0..2].
 * jx1 is used as scratch and is overwritten.
 * Pointer arguments are parenthesised before an offset is added.
 */
#define GMX_MM_INCREMENT_1RVEC_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4;                               \
    _tmp1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1));       /* [a0,a1,0,0]       */ \
    _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)(ptr2));                  /* [a0,a1,b0,b1]     */ \
    _tmp2 = _mm_load_ss((ptr1)+2);                                /* a2                */ \
    _tmp3 = _mm_load_ss((ptr2)+2);                                /* b2                */ \
    jx1   = _mm_unpacklo_ps(jx1,jy1);                             /* [xa,ya,xb,yb]     */ \
    _tmp4 = _mm_shuffle_ps(jz1,jz1,_MM_SHUFFLE(0,0,0,1));         /* zb moved to lane 0 */ \
    _tmp1 = _mm_add_ps(_tmp1,jx1);                                \
    _mm_storel_pi((__m64 *)(ptr1),_tmp1);                         \
    _mm_storeh_pi((__m64 *)(ptr2),_tmp1);                         \
    _mm_store_ss((ptr1)+2,_mm_add_ss(_tmp2,jz1));                 \
    _mm_store_ss((ptr2)+2,_mm_add_ss(_tmp3,_tmp4));               \
}
/* Add lane 0 of jx1..jz2 to ptr1[0..5] and lane 1 to ptr2[0..5].
 * jx1, jz1 and jy2 are used as scratch and are overwritten.
 * Pointer arguments are parenthesised before an offset is added.
 */
#define GMX_MM_INCREMENT_2RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5;                         \
    _tmp1 = _mm_loadu_ps(ptr1);                                   /* a[0..3]             */ \
    _tmp2 = _mm_loadu_ps(ptr2);                                   /* b[0..3]             */ \
    _tmp3 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr1)+4));   /* [a4,a5,0,0]         */ \
    _tmp3 = _mm_loadh_pi(_tmp3,(__m64 *)((ptr2)+4));              /* [a4,a5,b4,b5]       */ \
    jx1   = _mm_unpacklo_ps(jx1,jy1);                             /* [x1a,y1a,x1b,y1b]   */ \
    jz1   = _mm_unpacklo_ps(jz1,jx2);                             /* [z1a,x2a,z1b,x2b]   */ \
    jy2   = _mm_unpacklo_ps(jy2,jz2);                             /* [y2a,z2a,y2b,z2b]   */ \
    _tmp4 = _mm_movelh_ps(jx1,jz1);                               /* increments for a[0..3] */ \
    _tmp5 = _mm_movehl_ps(jz1,jx1);                               /* increments for b[0..3] */ \
    _tmp1 = _mm_add_ps(_tmp1,_tmp4);                              \
    _tmp2 = _mm_add_ps(_tmp2,_tmp5);                              \
    _tmp3 = _mm_add_ps(_tmp3,jy2);                                \
    _mm_storeu_ps(ptr1,_tmp1);                                    \
    _mm_storeu_ps(ptr2,_tmp2);                                    \
    _mm_storel_pi((__m64 *)((ptr1)+4),_tmp3);                     \
    _mm_storeh_pi((__m64 *)((ptr2)+4),_tmp3);                     \
}
/* Add lane 0 of jx1..jz3 to ptr1[0..8] and lane 1 to ptr2[0..8].
 * jx1, jz1, jy2 and jx3 are used as scratch and are overwritten.
 * Pointer arguments are parenthesised before an offset is added.
 */
#define GMX_MM_INCREMENT_3RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11; \
    _tmp1  = _mm_loadu_ps(ptr1);                              /* a[0..3]           */ \
    _tmp2  = _mm_loadu_ps((ptr1)+4);                          /* a[4..7]           */ \
    _tmp3  = _mm_load_ss((ptr1)+8);                           /* a[8]              */ \
    _tmp4  = _mm_loadu_ps(ptr2);                              /* b[0..3]           */ \
    _tmp5  = _mm_loadu_ps((ptr2)+4);                          /* b[4..7]           */ \
    _tmp6  = _mm_load_ss((ptr2)+8);                           /* b[8]              */ \
    jx1    = _mm_unpacklo_ps(jx1,jy1);                        /* [x1a,y1a,x1b,y1b] */ \
    jz1    = _mm_unpacklo_ps(jz1,jx2);                        /* [z1a,x2a,z1b,x2b] */ \
    jy2    = _mm_unpacklo_ps(jy2,jz2);                        /* [y2a,z2a,y2b,z2b] */ \
    jx3    = _mm_unpacklo_ps(jx3,jy3);                        /* [x3a,y3a,x3b,y3b] */ \
    _tmp7  = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1));    /* z3b moved to lane 0 */ \
    _tmp8  = _mm_movelh_ps(jx1,jz1);                          /* increments a[0..3] */ \
    _tmp9  = _mm_movehl_ps(jz1,jx1);                          /* increments b[0..3] */ \
    _tmp10 = _mm_movelh_ps(jy2,jx3);                          /* increments a[4..7] */ \
    _tmp11 = _mm_movehl_ps(jx3,jy2);                          /* increments b[4..7] */ \
    _tmp1  = _mm_add_ps(_tmp1,_tmp8);                         \
    _tmp2  = _mm_add_ps(_tmp2,_tmp10);                        \
    _tmp3  = _mm_add_ss(_tmp3,jz3);                           \
    _tmp4  = _mm_add_ps(_tmp4,_tmp9);                         \
    _tmp5  = _mm_add_ps(_tmp5,_tmp11);                        \
    _tmp6  = _mm_add_ss(_tmp6,_tmp7);                         \
    _mm_storeu_ps(ptr1,_tmp1);                                \
    _mm_storeu_ps((ptr1)+4,_tmp2);                            \
    _mm_store_ss((ptr1)+8,_tmp3);                             \
    _mm_storeu_ps(ptr2,_tmp4);                                \
    _mm_storeu_ps((ptr2)+4,_tmp5);                            \
    _mm_store_ss((ptr2)+8,_tmp6);                             \
}
/* Add four xyz triplets to each of two float arrays, in place.
 *
 * Lane 0 of (jx1..jz4) is added to ptr1[0..11] and lane 1 to ptr2[0..11];
 * lanes 2 and 3 are ignored.
 *
 * ptr1, ptr2                   : float pointers to at least 12 floats each
 *                                (unaligned 128-bit loads/stores).
 * jx1, jz1, jy2, jx3, jz3, jy4 : modifiable __m128 lvalues; overwritten.
 * other j arguments            : __m128 values, only read.
 *
 * Expands to a braced block (not do/while).
 */
#define GMX_MM_INCREMENT_4RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp8,_tmp9,_tmp10,_tmp11,_tmp12,_tmp13; \
    _tmp1  = _mm_loadu_ps(ptr1); \
    _tmp2  = _mm_loadu_ps((ptr1)+4); \
    _tmp3  = _mm_loadu_ps((ptr1)+8); \
    _tmp4  = _mm_loadu_ps(ptr2); \
    _tmp5  = _mm_loadu_ps((ptr2)+4); \
    _tmp6  = _mm_loadu_ps((ptr2)+8); \
    jx1    = _mm_unpacklo_ps(jx1,jy1); \
    jz1    = _mm_unpacklo_ps(jz1,jx2); \
    jy2    = _mm_unpacklo_ps(jy2,jz2); \
    jx3    = _mm_unpacklo_ps(jx3,jy3); \
    jz3    = _mm_unpacklo_ps(jz3,jx4); \
    jy4    = _mm_unpacklo_ps(jy4,jz4); \
    /* movelh results hold lane-0 data (ptr1), movehl results lane-1 data (ptr2) */ \
    _tmp8  = _mm_movelh_ps(jx1,jz1); \
    _tmp9  = _mm_movehl_ps(jz1,jx1); \
    _tmp10 = _mm_movelh_ps(jy2,jx3); \
    _tmp11 = _mm_movehl_ps(jx3,jy2); \
    _tmp12 = _mm_movelh_ps(jz3,jy4); \
    _tmp13 = _mm_movehl_ps(jy4,jz3); \
    _tmp1  = _mm_add_ps(_tmp1,_tmp8); \
    _tmp2  = _mm_add_ps(_tmp2,_tmp10); \
    _tmp3  = _mm_add_ps(_tmp3,_tmp12); \
    _tmp4  = _mm_add_ps(_tmp4,_tmp9); \
    _tmp5  = _mm_add_ps(_tmp5,_tmp11); \
    _tmp6  = _mm_add_ps(_tmp6,_tmp13); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps((ptr1)+4,_tmp2); \
    _mm_storeu_ps((ptr1)+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps((ptr2)+4,_tmp5); \
    _mm_storeu_ps((ptr2)+8,_tmp6); \
}
/* Add one xyz triplet to each of three float arrays, in place.
 *
 * Lane k-1 of (jx1,jy1,jz1) is added to ptrk[0..2] for k = 1,2,3;
 * lane 3 is ignored.
 *
 * ptr1..ptr3 : float pointers to at least 3 floats each.
 * jx1        : modifiable __m128 lvalue; overwritten.
 * jy1, jz1   : __m128 values, only read.
 *
 * Each working register is laid out as (x, unused, y, z): x goes through a
 * scalar load/store, y and z through the upper 64 bits.
 *
 * Expands to a braced block (not do/while).
 */
#define GMX_MM_INCREMENT_1RVEC_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7; \
    _tmp1 = _mm_load_ss(ptr1); \
    _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)((ptr1)+1)); \
    _tmp2 = _mm_load_ss(ptr2); \
    _tmp2 = _mm_loadh_pi(_tmp2,(__m64 *)((ptr2)+1)); \
    _tmp3 = _mm_load_ss(ptr3); \
    _tmp3 = _mm_loadh_pi(_tmp3,(__m64 *)((ptr3)+1)); \
    /* _tmp4 = (y0,z0,y1,z1), _tmp5 = (y2,z2,y3,z3) */ \
    _tmp4 = _mm_unpacklo_ps(jy1,jz1); \
    _tmp5 = _mm_unpackhi_ps(jy1,jz1); \
    /* _tmp6 = (x1,x0,y1,z1) */ \
    _tmp6 = _mm_shuffle_ps(jx1,_tmp4,_MM_SHUFFLE(3,2,0,1)); \
    _tmp7 = _mm_shuffle_ps(jx1,jx1,_MM_SHUFFLE(0,0,0,2)); \
    /* jx1 = (x0,x1,y0,z0), _tmp7 = (x2,x0,y2,z2) */ \
    jx1   = _mm_movelh_ps(jx1,_tmp4); \
    _tmp7 = _mm_movelh_ps(_tmp7,_tmp5); \
    _tmp1 = _mm_add_ps(_tmp1,jx1); \
    _tmp2 = _mm_add_ps(_tmp2,_tmp6); \
    _tmp3 = _mm_add_ps(_tmp3,_tmp7); \
    _mm_store_ss(ptr1,_tmp1); \
    _mm_storeh_pi((__m64 *)((ptr1)+1),_tmp1); \
    _mm_store_ss(ptr2,_tmp2); \
    _mm_storeh_pi((__m64 *)((ptr2)+1),_tmp2); \
    _mm_store_ss(ptr3,_tmp3); \
    _mm_storeh_pi((__m64 *)((ptr3)+1),_tmp3); \
}
/* Add two xyz triplets to each of three float arrays, in place.
 *
 * Lane k-1 of (jx1,jy1,jz1,jx2,jy2,jz2) is added to ptrk[0..5] for
 * k = 1,2,3; lane 3 is ignored.
 *
 * ptr1..ptr3    : float pointers to at least 6 floats each (unaligned
 *                 128-bit and 64-bit loads/stores).
 * jx1, jz1, jy2 : modifiable __m128 lvalues; overwritten.
 * jy1, jx2, jz2 : __m128 values, only read.
 *
 * The 64-bit partial loads start from _mm_setzero_ps() so that no
 * uninitialized register contents are read or fed into the arithmetic.
 *
 * Expands to a braced block (not do/while).
 */
#define GMX_MM_INCREMENT_2RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
    _tmp1  = _mm_loadu_ps(ptr1); \
    _tmp2  = _mm_loadu_ps(ptr2); \
    _tmp3  = _mm_loadu_ps(ptr3); \
    /* _tmp4 = (ptr1[4],ptr1[5],ptr2[4],ptr2[5]); _tmp5 = (ptr3[4],ptr3[5],0,0) */ \
    _tmp4  = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr1)+4)); \
    _tmp4  = _mm_loadh_pi(_tmp4,(__m64 *)((ptr2)+4)); \
    _tmp5  = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr3)+4)); \
    _tmp6  = _mm_unpackhi_ps(jx1,jy1); \
    jx1    = _mm_unpacklo_ps(jx1,jy1); \
    _tmp7  = _mm_unpackhi_ps(jz1,jx2); \
    jz1    = _mm_unpacklo_ps(jz1,jx2); \
    _tmp8  = _mm_unpackhi_ps(jy2,jz2); \
    jy2    = _mm_unpacklo_ps(jy2,jz2); \
    /* _tmp9, _tmp10, _tmp6 = (x1,y1,z1,x2) of lanes 0, 1, 2 */ \
    _tmp9  = _mm_movelh_ps(jx1,jz1); \
    _tmp10 = _mm_movehl_ps(jz1,jx1); \
    _tmp6  = _mm_movelh_ps(_tmp6,_tmp7); \
    _tmp1  = _mm_add_ps(_tmp1,_tmp9); \
    _tmp2  = _mm_add_ps(_tmp2,_tmp10); \
    _tmp3  = _mm_add_ps(_tmp3,_tmp6); \
    _tmp4  = _mm_add_ps(_tmp4,jy2); \
    _tmp5  = _mm_add_ps(_tmp5,_tmp8); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr2,_tmp2); \
    _mm_storeu_ps(ptr3,_tmp3); \
    _mm_storel_pi((__m64 *)((ptr1)+4),_tmp4); \
    _mm_storeh_pi((__m64 *)((ptr2)+4),_tmp4); \
    _mm_storel_pi((__m64 *)((ptr3)+4),_tmp5); \
}
/* Add three xyz triplets to each of three float arrays, in place.
 *
 * Lane k-1 of (jx1..jz3) is added to ptrk[0..8] for k = 1,2,3;
 * lane 3 is ignored.
 *
 * ptr1..ptr3         : float pointers to at least 9 floats each (unaligned
 *                      128-bit loads/stores plus one scalar access).
 * jx1, jz1, jy2, jx3 : modifiable __m128 lvalues; overwritten.
 * other j arguments  : __m128 values, only read.
 *
 * Expands to a braced block (not do/while).
 */
#define GMX_MM_INCREMENT_3RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
    __m128 _tmp11,_tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19; \
    _tmp1  = _mm_loadu_ps(ptr1); \
    _tmp2  = _mm_loadu_ps((ptr1)+4); \
    _tmp3  = _mm_load_ss((ptr1)+8); \
    _tmp4  = _mm_loadu_ps(ptr2); \
    _tmp5  = _mm_loadu_ps((ptr2)+4); \
    _tmp6  = _mm_load_ss((ptr2)+8); \
    _tmp7  = _mm_loadu_ps(ptr3); \
    _tmp8  = _mm_loadu_ps((ptr3)+4); \
    _tmp9  = _mm_load_ss((ptr3)+8); \
    _tmp10 = _mm_unpackhi_ps(jx1,jy1); \
    jx1    = _mm_unpacklo_ps(jx1,jy1); \
    _tmp11 = _mm_unpackhi_ps(jz1,jx2); \
    jz1    = _mm_unpacklo_ps(jz1,jx2); \
    _tmp12 = _mm_unpackhi_ps(jy2,jz2); \
    jy2    = _mm_unpacklo_ps(jy2,jz2); \
    _tmp13 = _mm_unpackhi_ps(jx3,jy3); \
    jx3    = _mm_unpacklo_ps(jx3,jy3); \
    /* lane 0 of _tmp14 = z3 lane 1; lane 0 of _tmp15 = z3 lane 2 */ \
    _tmp14 = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1)); \
    _tmp15 = _mm_movehl_ps(jz3,jz3); \
    _tmp16 = _mm_movelh_ps(jx1,jz1); \
    _tmp17 = _mm_movehl_ps(jz1,jx1); \
    _tmp10 = _mm_movelh_ps(_tmp10,_tmp11); \
    _tmp18 = _mm_movelh_ps(jy2,jx3); \
    _tmp19 = _mm_movehl_ps(jx3,jy2); \
    _tmp12 = _mm_movelh_ps(_tmp12,_tmp13); \
    _tmp1  = _mm_add_ps(_tmp1,_tmp16); \
    _tmp2  = _mm_add_ps(_tmp2,_tmp18); \
    _tmp3  = _mm_add_ss(_tmp3,jz3); \
    _tmp4  = _mm_add_ps(_tmp4,_tmp17); \
    _tmp5  = _mm_add_ps(_tmp5,_tmp19); \
    _tmp6  = _mm_add_ss(_tmp6,_tmp14); \
    _tmp7  = _mm_add_ps(_tmp7,_tmp10); \
    _tmp8  = _mm_add_ps(_tmp8,_tmp12); \
    _tmp9  = _mm_add_ss(_tmp9,_tmp15); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps((ptr1)+4,_tmp2); \
    _mm_store_ss((ptr1)+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps((ptr2)+4,_tmp5); \
    _mm_store_ss((ptr2)+8,_tmp6); \
    _mm_storeu_ps(ptr3,_tmp7); \
    _mm_storeu_ps((ptr3)+4,_tmp8); \
    _mm_store_ss((ptr3)+8,_tmp9); \
}
/* Add four xyz triplets to each of three float arrays, in place.
 *
 * Lane k-1 of (jx1..jz4) is added to ptrk[0..11] for k = 1,2,3;
 * lane 3 is ignored.
 *
 * ptr1..ptr3                   : float pointers to at least 12 floats each
 *                                (unaligned 128-bit loads/stores).
 * jx1, jz1, jy2, jx3, jz3, jy4 : modifiable __m128 lvalues; overwritten.
 * other j arguments            : __m128 values, only read.
 *
 * Expands to a braced block (not do/while).
 */
#define GMX_MM_INCREMENT_4RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11; \
    __m128 _tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19,_tmp20,_tmp21; \
    _tmp1  = _mm_loadu_ps(ptr1); \
    _tmp2  = _mm_loadu_ps((ptr1)+4); \
    _tmp3  = _mm_loadu_ps((ptr1)+8); \
    _tmp4  = _mm_loadu_ps(ptr2); \
    _tmp5  = _mm_loadu_ps((ptr2)+4); \
    _tmp6  = _mm_loadu_ps((ptr2)+8); \
    _tmp7  = _mm_loadu_ps(ptr3); \
    _tmp8  = _mm_loadu_ps((ptr3)+4); \
    _tmp9  = _mm_loadu_ps((ptr3)+8); \
    /* unpacklo keeps lanes 0/1 interleaved, unpackhi lanes 2/3 */ \
    _tmp10 = _mm_unpackhi_ps(jx1,jy1); \
    jx1    = _mm_unpacklo_ps(jx1,jy1); \
    _tmp11 = _mm_unpackhi_ps(jz1,jx2); \
    jz1    = _mm_unpacklo_ps(jz1,jx2); \
    _tmp12 = _mm_unpackhi_ps(jy2,jz2); \
    jy2    = _mm_unpacklo_ps(jy2,jz2); \
    _tmp13 = _mm_unpackhi_ps(jx3,jy3); \
    jx3    = _mm_unpacklo_ps(jx3,jy3); \
    _tmp14 = _mm_unpackhi_ps(jz3,jx4); \
    jz3    = _mm_unpacklo_ps(jz3,jx4); \
    _tmp15 = _mm_unpackhi_ps(jy4,jz4); \
    jy4    = _mm_unpacklo_ps(jy4,jz4); \
    /* regroup into per-pointer rows of four consecutive floats */ \
    _tmp16 = _mm_movelh_ps(jx1,jz1); \
    _tmp17 = _mm_movehl_ps(jz1,jx1); \
    _tmp10 = _mm_movelh_ps(_tmp10,_tmp11); \
    _tmp18 = _mm_movelh_ps(jy2,jx3); \
    _tmp19 = _mm_movehl_ps(jx3,jy2); \
    _tmp12 = _mm_movelh_ps(_tmp12,_tmp13); \
    _tmp20 = _mm_movelh_ps(jz3,jy4); \
    _tmp21 = _mm_movehl_ps(jy4,jz3); \
    _tmp14 = _mm_movelh_ps(_tmp14,_tmp15); \
    _tmp1  = _mm_add_ps(_tmp1,_tmp16); \
    _tmp2  = _mm_add_ps(_tmp2,_tmp18); \
    _tmp3  = _mm_add_ps(_tmp3,_tmp20); \
    _tmp4  = _mm_add_ps(_tmp4,_tmp17); \
    _tmp5  = _mm_add_ps(_tmp5,_tmp19); \
    _tmp6  = _mm_add_ps(_tmp6,_tmp21); \
    _tmp7  = _mm_add_ps(_tmp7,_tmp10); \
    _tmp8  = _mm_add_ps(_tmp8,_tmp12); \
    _tmp9  = _mm_add_ps(_tmp9,_tmp14); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps((ptr1)+4,_tmp2); \
    _mm_storeu_ps((ptr1)+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps((ptr2)+4,_tmp5); \
    _mm_storeu_ps((ptr2)+8,_tmp6); \
    _mm_storeu_ps(ptr3,_tmp7); \
    _mm_storeu_ps((ptr3)+4,_tmp8); \
    _mm_storeu_ps((ptr3)+8,_tmp9); \
}
/* Add one xyz triplet to each of four float arrays, in place.
 *
 * Lane k-1 of (jx1,jy1,jz1) is added to ptrk[0..2] for k = 1..4.
 *
 * ptr1..ptr4    : float pointers to at least 3 floats each.
 * jx1, jy1, jz1 : __m128 values, only read.
 *
 * Each working register is laid out as (x, unused, y, z): x goes through a
 * scalar load/store, y and z through the upper 64 bits.
 *
 * Expands to a braced block (not do/while).
 */
#define GMX_MM_INCREMENT_1RVEC_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
    _tmp1  = _mm_load_ss(ptr1); \
    _tmp1  = _mm_loadh_pi(_tmp1,(__m64 *)((ptr1)+1)); \
    _tmp2  = _mm_load_ss(ptr2); \
    _tmp2  = _mm_loadh_pi(_tmp2,(__m64 *)((ptr2)+1)); \
    _tmp3  = _mm_load_ss(ptr3); \
    _tmp3  = _mm_loadh_pi(_tmp3,(__m64 *)((ptr3)+1)); \
    _tmp4  = _mm_load_ss(ptr4); \
    _tmp4  = _mm_loadh_pi(_tmp4,(__m64 *)((ptr4)+1)); \
    /* _tmp5 = (y0,z0,y1,z1), _tmp6 = (y2,z2,y3,z3) */ \
    _tmp5  = _mm_unpacklo_ps(jy1,jz1); \
    _tmp6  = _mm_unpackhi_ps(jy1,jz1); \
    /* _tmp7.._tmp10 = (xk,x0,yk,zk) for k = 0..3 */ \
    _tmp7  = _mm_shuffle_ps(jx1,_tmp5,_MM_SHUFFLE(1,0,0,0)); \
    _tmp8  = _mm_shuffle_ps(jx1,_tmp5,_MM_SHUFFLE(3,2,0,1)); \
    _tmp9  = _mm_shuffle_ps(jx1,_tmp6,_MM_SHUFFLE(1,0,0,2)); \
    _tmp10 = _mm_shuffle_ps(jx1,_tmp6,_MM_SHUFFLE(3,2,0,3)); \
    _tmp1  = _mm_add_ps(_tmp1,_tmp7); \
    _tmp2  = _mm_add_ps(_tmp2,_tmp8); \
    _tmp3  = _mm_add_ps(_tmp3,_tmp9); \
    _tmp4  = _mm_add_ps(_tmp4,_tmp10); \
    _mm_store_ss(ptr1,_tmp1); \
    _mm_storeh_pi((__m64 *)((ptr1)+1),_tmp1); \
    _mm_store_ss(ptr2,_tmp2); \
    _mm_storeh_pi((__m64 *)((ptr2)+1),_tmp2); \
    _mm_store_ss(ptr3,_tmp3); \
    _mm_storeh_pi((__m64 *)((ptr3)+1),_tmp3); \
    _mm_store_ss(ptr4,_tmp4); \
    _mm_storeh_pi((__m64 *)((ptr4)+1),_tmp4); \
}
/* Add two xyz triplets to each of four float arrays, in place.
 *
 * Lane k-1 of (jx1,jy1,jz1,jx2,jy2,jz2) is added to ptrk[0..5] for k = 1..4.
 *
 * ptr1..ptr4    : float pointers to at least 6 floats each (unaligned
 *                 128-bit and 64-bit loads/stores).
 * jx1, jz1, jy2 : modifiable __m128 lvalues; overwritten.
 * jy1, jx2, jz2 : __m128 values, only read.
 *
 * Expands to a braced block (not do/while).
 */
#define GMX_MM_INCREMENT_2RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11,_tmp12,_tmp13; \
    _tmp1  = _mm_loadu_ps(ptr1); \
    _tmp2  = _mm_loadu_ps(ptr2); \
    _tmp3  = _mm_loadu_ps(ptr3); \
    _tmp4  = _mm_loadu_ps(ptr4); \
    /* _tmp5 = (ptr1[4],ptr1[5],ptr2[4],ptr2[5]); _tmp6 likewise for ptr3/ptr4 */ \
    _tmp5  = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr1)+4)); \
    _tmp5  = _mm_loadh_pi(_tmp5,(__m64 *)((ptr2)+4)); \
    _tmp6  = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr3)+4)); \
    _tmp6  = _mm_loadh_pi(_tmp6,(__m64 *)((ptr4)+4)); \
    _tmp7  = _mm_unpackhi_ps(jx1,jy1); \
    jx1    = _mm_unpacklo_ps(jx1,jy1); \
    _tmp8  = _mm_unpackhi_ps(jz1,jx2); \
    jz1    = _mm_unpacklo_ps(jz1,jx2); \
    _tmp9  = _mm_unpackhi_ps(jy2,jz2); \
    jy2    = _mm_unpacklo_ps(jy2,jz2); \
    /* _tmp10.._tmp13 = (x1,y1,z1,x2) of lanes 0..3 */ \
    _tmp10 = _mm_movelh_ps(jx1,jz1); \
    _tmp11 = _mm_movehl_ps(jz1,jx1); \
    _tmp12 = _mm_movelh_ps(_tmp7,_tmp8); \
    _tmp13 = _mm_movehl_ps(_tmp8,_tmp7); \
    _tmp1  = _mm_add_ps(_tmp1,_tmp10); \
    _tmp2  = _mm_add_ps(_tmp2,_tmp11); \
    _tmp3  = _mm_add_ps(_tmp3,_tmp12); \
    _tmp4  = _mm_add_ps(_tmp4,_tmp13); \
    _tmp5  = _mm_add_ps(_tmp5,jy2); \
    _tmp6  = _mm_add_ps(_tmp6,_tmp9); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr2,_tmp2); \
    _mm_storeu_ps(ptr3,_tmp3); \
    _mm_storeu_ps(ptr4,_tmp4); \
    _mm_storel_pi((__m64 *)((ptr1)+4),_tmp5); \
    _mm_storeh_pi((__m64 *)((ptr2)+4),_tmp5); \
    _mm_storel_pi((__m64 *)((ptr3)+4),_tmp6); \
    _mm_storeh_pi((__m64 *)((ptr4)+4),_tmp6); \
}
/* Add three xyz triplets to each of four float arrays, in place.
 *
 * Lane k-1 of (jx1..jz3) is added to ptrk[0..8] for k = 1..4.
 *
 * ptr1..ptr4         : float pointers to at least 9 floats each (unaligned
 *                      128-bit loads/stores plus one scalar access).
 * jx1, jz1, jy2, jx3 : modifiable __m128 lvalues; overwritten.
 * other j arguments  : __m128 values, only read.
 *
 * Expands to a braced block (not do/while).
 */
#define GMX_MM_INCREMENT_3RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
    __m128 _tmp11,_tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19; \
    __m128 _tmp20,_tmp21,_tmp22,_tmp23,_tmp24,_tmp25; \
    _tmp1  = _mm_loadu_ps(ptr1); \
    _tmp2  = _mm_loadu_ps((ptr1)+4); \
    _tmp3  = _mm_load_ss((ptr1)+8); \
    _tmp4  = _mm_loadu_ps(ptr2); \
    _tmp5  = _mm_loadu_ps((ptr2)+4); \
    _tmp6  = _mm_load_ss((ptr2)+8); \
    _tmp7  = _mm_loadu_ps(ptr3); \
    _tmp8  = _mm_loadu_ps((ptr3)+4); \
    _tmp9  = _mm_load_ss((ptr3)+8); \
    _tmp10 = _mm_loadu_ps(ptr4); \
    _tmp11 = _mm_loadu_ps((ptr4)+4); \
    _tmp12 = _mm_load_ss((ptr4)+8); \
    _tmp13 = _mm_unpackhi_ps(jx1,jy1); \
    jx1    = _mm_unpacklo_ps(jx1,jy1); \
    _tmp14 = _mm_unpackhi_ps(jz1,jx2); \
    jz1    = _mm_unpacklo_ps(jz1,jx2); \
    _tmp15 = _mm_unpackhi_ps(jy2,jz2); \
    jy2    = _mm_unpacklo_ps(jy2,jz2); \
    _tmp16 = _mm_unpackhi_ps(jx3,jy3); \
    jx3    = _mm_unpacklo_ps(jx3,jy3); \
    /* lane 0 of _tmp17/_tmp18/_tmp19 = z3 of lanes 1/2/3 */ \
    _tmp17 = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1)); \
    _tmp18 = _mm_movehl_ps(jz3,jz3); \
    _tmp19 = _mm_shuffle_ps(_tmp18,_tmp18,_MM_SHUFFLE(0,0,0,1)); \
    _tmp20 = _mm_movelh_ps(jx1,jz1); \
    _tmp21 = _mm_movehl_ps(jz1,jx1); \
    _tmp22 = _mm_movelh_ps(_tmp13,_tmp14); \
    _tmp14 = _mm_movehl_ps(_tmp14,_tmp13); \
    _tmp23 = _mm_movelh_ps(jy2,jx3); \
    _tmp24 = _mm_movehl_ps(jx3,jy2); \
    _tmp25 = _mm_movelh_ps(_tmp15,_tmp16); \
    _tmp16 = _mm_movehl_ps(_tmp16,_tmp15); \
    _tmp1  = _mm_add_ps(_tmp1,_tmp20); \
    _tmp2  = _mm_add_ps(_tmp2,_tmp23); \
    _tmp3  = _mm_add_ss(_tmp3,jz3); \
    _tmp4  = _mm_add_ps(_tmp4,_tmp21); \
    _tmp5  = _mm_add_ps(_tmp5,_tmp24); \
    _tmp6  = _mm_add_ss(_tmp6,_tmp17); \
    _tmp7  = _mm_add_ps(_tmp7,_tmp22); \
    _tmp8  = _mm_add_ps(_tmp8,_tmp25); \
    _tmp9  = _mm_add_ss(_tmp9,_tmp18); \
    _tmp10 = _mm_add_ps(_tmp10,_tmp14); \
    _tmp11 = _mm_add_ps(_tmp11,_tmp16); \
    _tmp12 = _mm_add_ss(_tmp12,_tmp19); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps((ptr1)+4,_tmp2); \
    _mm_store_ss((ptr1)+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps((ptr2)+4,_tmp5); \
    _mm_store_ss((ptr2)+8,_tmp6); \
    _mm_storeu_ps(ptr3,_tmp7); \
    _mm_storeu_ps((ptr3)+4,_tmp8); \
    _mm_store_ss((ptr3)+8,_tmp9); \
    _mm_storeu_ps(ptr4,_tmp10); \
    _mm_storeu_ps((ptr4)+4,_tmp11); \
    _mm_store_ss((ptr4)+8,_tmp12); \
}
/* Add four xyz triplets to each of four float arrays, in place.
 *
 * Lane k-1 of (jx1..jz4) is added to ptrk[0..11] for k = 1..4. This is a
 * 12x4 -> 4x12 transpose followed by element-wise addition.
 *
 * ptr1..ptr4                   : float pointers to at least 12 floats each
 *                                (unaligned 128-bit loads/stores).
 * jx1, jz1, jy2, jx3, jz3, jy4 : modifiable __m128 lvalues; overwritten.
 * other j arguments            : __m128 values, only read.
 *
 * Expands to a braced block (not do/while).
 */
#define GMX_MM_INCREMENT_4RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11; \
    __m128 _tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19,_tmp20,_tmp21,_tmp22; \
    __m128 _tmp23,_tmp24; \
    _tmp1  = _mm_loadu_ps(ptr1); \
    _tmp2  = _mm_loadu_ps((ptr1)+4); \
    _tmp3  = _mm_loadu_ps((ptr1)+8); \
    _tmp4  = _mm_loadu_ps(ptr2); \
    _tmp5  = _mm_loadu_ps((ptr2)+4); \
    _tmp6  = _mm_loadu_ps((ptr2)+8); \
    _tmp7  = _mm_loadu_ps(ptr3); \
    _tmp8  = _mm_loadu_ps((ptr3)+4); \
    _tmp9  = _mm_loadu_ps((ptr3)+8); \
    _tmp10 = _mm_loadu_ps(ptr4); \
    _tmp11 = _mm_loadu_ps((ptr4)+4); \
    _tmp12 = _mm_loadu_ps((ptr4)+8); \
    /* unpacklo keeps lanes 0/1 interleaved, unpackhi lanes 2/3 */ \
    _tmp13 = _mm_unpackhi_ps(jx1,jy1); \
    jx1    = _mm_unpacklo_ps(jx1,jy1); \
    _tmp14 = _mm_unpackhi_ps(jz1,jx2); \
    jz1    = _mm_unpacklo_ps(jz1,jx2); \
    _tmp15 = _mm_unpackhi_ps(jy2,jz2); \
    jy2    = _mm_unpacklo_ps(jy2,jz2); \
    _tmp16 = _mm_unpackhi_ps(jx3,jy3); \
    jx3    = _mm_unpacklo_ps(jx3,jy3); \
    _tmp17 = _mm_unpackhi_ps(jz3,jx4); \
    jz3    = _mm_unpacklo_ps(jz3,jx4); \
    _tmp18 = _mm_unpackhi_ps(jy4,jz4); \
    jy4    = _mm_unpacklo_ps(jy4,jz4); \
    /* regroup: movelh -> lanes 0 and 2, movehl -> lanes 1 and 3 */ \
    _tmp19 = _mm_movelh_ps(jx1,jz1); \
    jz1    = _mm_movehl_ps(jz1,jx1); \
    _tmp20 = _mm_movelh_ps(_tmp13,_tmp14); \
    _tmp14 = _mm_movehl_ps(_tmp14,_tmp13); \
    _tmp21 = _mm_movelh_ps(jy2,jx3); \
    jx3    = _mm_movehl_ps(jx3,jy2); \
    _tmp22 = _mm_movelh_ps(_tmp15,_tmp16); \
    _tmp16 = _mm_movehl_ps(_tmp16,_tmp15); \
    _tmp23 = _mm_movelh_ps(jz3,jy4); \
    jy4    = _mm_movehl_ps(jy4,jz3); \
    _tmp24 = _mm_movelh_ps(_tmp17,_tmp18); \
    _tmp18 = _mm_movehl_ps(_tmp18,_tmp17); \
    _tmp1  = _mm_add_ps(_tmp1,_tmp19); \
    _tmp2  = _mm_add_ps(_tmp2,_tmp21); \
    _tmp3  = _mm_add_ps(_tmp3,_tmp23); \
    _tmp4  = _mm_add_ps(_tmp4,jz1); \
    _tmp5  = _mm_add_ps(_tmp5,jx3); \
    _tmp6  = _mm_add_ps(_tmp6,jy4); \
    _tmp7  = _mm_add_ps(_tmp7,_tmp20); \
    _tmp8  = _mm_add_ps(_tmp8,_tmp22); \
    _tmp9  = _mm_add_ps(_tmp9,_tmp24); \
    _tmp10 = _mm_add_ps(_tmp10,_tmp14); \
    _tmp11 = _mm_add_ps(_tmp11,_tmp16); \
    _tmp12 = _mm_add_ps(_tmp12,_tmp18); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps((ptr1)+4,_tmp2); \
    _mm_storeu_ps((ptr1)+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps((ptr2)+4,_tmp5); \
    _mm_storeu_ps((ptr2)+8,_tmp6); \
    _mm_storeu_ps(ptr3,_tmp7); \
    _mm_storeu_ps((ptr3)+4,_tmp8); \
    _mm_storeu_ps((ptr3)+8,_tmp9); \
    _mm_storeu_ps(ptr4,_tmp10); \
    _mm_storeu_ps((ptr4)+4,_tmp11); \
    _mm_storeu_ps((ptr4)+8,_tmp12); \
}
/* Subtract one xyz triplet from a float array, in place.
 *
 * Lane 0 of (jx1,jy1,jz1) is subtracted from ptr1[0..2]; lanes 1-3 are
 * ignored.
 *
 * ptr1     : float pointer to at least 3 floats.
 * jx1, jy1 : modifiable __m128 lvalues; overwritten.
 * jz1      : __m128 value, only read.
 *
 * The working register is laid out as (x, unused, y, z): x goes through a
 * scalar load/store, y and z through the upper 64 bits.
 *
 * Expands to a braced block (not do/while).
 */
#define GMX_MM_DECREMENT_1RVEC_1POINTER_PS(ptr1,jx1,jy1,jz1) { \
    __m128 _tmp1; \
    /* jy1 = (y0,z0,y1,z1); jx1 = (x0,x1,y0,z0) */ \
    jy1   = _mm_unpacklo_ps(jy1,jz1); \
    jx1   = _mm_movelh_ps(jx1,jy1); \
    _tmp1 = _mm_load_ss(ptr1); \
    _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)((ptr1)+1)); \
    _tmp1 = _mm_sub_ps(_tmp1,jx1); \
    _mm_store_ss(ptr1,_tmp1); \
    _mm_storeh_pi((__m64 *)((ptr1)+1),_tmp1); \
}
/* Subtract two xyz triplets from a float array, in place.
 *
 * Lane 0 of (jx1,jy1,jz1,jx2,jy2,jz2) is subtracted from ptr1[0..5];
 * lanes 1-3 are ignored.
 *
 * ptr1          : float pointer to at least 6 floats (unaligned 128-bit and
 *                 64-bit loads/stores).
 * jx1, jz1, jy2 : modifiable __m128 lvalues; overwritten.
 * jy1, jx2, jz2 : __m128 values, only read.
 *
 * Expands to a braced block (not do/while).
 */
#define GMX_MM_DECREMENT_2RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1, _tmp2; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr1)+4)); \
    jx1   = _mm_unpacklo_ps(jx1,jy1); \
    jz1   = _mm_unpacklo_ps(jz1,jx2); \
    jy2   = _mm_unpacklo_ps(jy2,jz2); \
    /* jx1 = lane-0 (x1,y1,z1,x2); low half of jy2 = lane-0 (y2,z2) */ \
    jx1   = _mm_movelh_ps(jx1,jz1); \
    _tmp1 = _mm_sub_ps(_tmp1,jx1); \
    _tmp2 = _mm_sub_ps(_tmp2,jy2); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storel_pi((__m64 *)((ptr1)+4),_tmp2); \
}
/* Subtract three xyz triplets from a float array, in place.
 *
 * Lane 0 of (jx1..jz3) is subtracted from ptr1[0..8]; lanes 1-3 are ignored.
 *
 * ptr1               : float pointer to at least 9 floats (unaligned 128-bit
 *                      loads/stores plus one scalar access).
 * jx1, jz1, jy2, jx3 : modifiable __m128 lvalues; overwritten.
 * other j arguments  : __m128 values, only read.
 *
 * Expands to a braced block (not do/while).
 */
#define GMX_MM_DECREMENT_3RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1, _tmp2, _tmp3; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps((ptr1)+4); \
    _tmp3 = _mm_load_ss((ptr1)+8); \
    jx1   = _mm_unpacklo_ps(jx1,jy1); \
    jz1   = _mm_unpacklo_ps(jz1,jx2); \
    jy2   = _mm_unpacklo_ps(jy2,jz2); \
    jx3   = _mm_unpacklo_ps(jx3,jy3); \
    /* jx1 = lane-0 (x1,y1,z1,x2); jy2 = lane-0 (y2,z2,x3,y3) */ \
    jx1   = _mm_movelh_ps(jx1,jz1); \
    jy2   = _mm_movelh_ps(jy2,jx3); \
    _tmp1 = _mm_sub_ps(_tmp1,jx1); \
    _tmp2 = _mm_sub_ps(_tmp2,jy2); \
    _tmp3 = _mm_sub_ss(_tmp3,jz3); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps((ptr1)+4,_tmp2); \
    _mm_store_ss((ptr1)+8,_tmp3); \
}
/* Subtract four xyz triplets from a float array, in place.
 *
 * Lane 0 of (jx1..jz4) is subtracted from ptr1[0..11]; lanes 1-3 are ignored.
 *
 * ptr1                         : float pointer to at least 12 floats
 *                                (unaligned 128-bit loads/stores).
 * jx1, jz1, jy2, jx3, jz3, jy4 : modifiable __m128 lvalues; overwritten.
 * other j arguments            : __m128 values, only read.
 *
 * Expands to a braced block (not do/while).
 */
#define GMX_MM_DECREMENT_4RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    __m128 _tmp1, _tmp2, _tmp3; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps((ptr1)+4); \
    _tmp3 = _mm_loadu_ps((ptr1)+8); \
    jx1   = _mm_unpacklo_ps(jx1,jy1); \
    jz1   = _mm_unpacklo_ps(jz1,jx2); \
    jy2   = _mm_unpacklo_ps(jy2,jz2); \
    jx3   = _mm_unpacklo_ps(jx3,jy3); \
    jz3   = _mm_unpacklo_ps(jz3,jx4); \
    jy4   = _mm_unpacklo_ps(jy4,jz4); \
    /* pack the lane-0 values into three rows of four consecutive floats */ \
    jx1   = _mm_movelh_ps(jx1,jz1); \
    jy2   = _mm_movelh_ps(jy2,jx3); \
    jz3   = _mm_movelh_ps(jz3,jy4); \
    _tmp1 = _mm_sub_ps(_tmp1,jx1); \
    _tmp2 = _mm_sub_ps(_tmp2,jy2); \
    _tmp3 = _mm_sub_ps(_tmp3,jz3); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps((ptr1)+4,_tmp2); \
    _mm_storeu_ps((ptr1)+8,_tmp3); \
}
/* Subtract one xyz triplet from each of two float arrays, in place.
 *
 * Lane 0 of (jx1,jy1,jz1) is subtracted from ptr1[0..2] and lane 1 from
 * ptr2[0..2]; lanes 2 and 3 are ignored.
 *
 * ptr1, ptr2 : float pointers to at least 3 floats each.
 * jx1        : modifiable __m128 lvalue; overwritten with the x/y interleave.
 * jy1, jz1   : __m128 values, only read.
 *
 * Expands to a braced block (not do/while).
 */
#define GMX_MM_DECREMENT_1RVEC_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4; \
    /* _tmp1 = (ptr1[0],ptr1[1],ptr2[0],ptr2[1]) */ \
    _tmp1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1)); \
    _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)(ptr2)); \
    _tmp2 = _mm_load_ss((ptr1)+2); \
    _tmp3 = _mm_load_ss((ptr2)+2); \
    /* jx1 = (x0,y0,x1,y1); _tmp4 lane 0 = z1 */ \
    jx1   = _mm_unpacklo_ps(jx1,jy1); \
    _tmp4 = _mm_shuffle_ps(jz1,jz1,_MM_SHUFFLE(0,0,0,1)); \
    _tmp1 = _mm_sub_ps(_tmp1,jx1); \
    _mm_storel_pi((__m64 *)(ptr1),_tmp1); \
    _mm_storeh_pi((__m64 *)(ptr2),_tmp1); \
    _mm_store_ss((ptr1)+2,_mm_sub_ss(_tmp2,jz1)); \
    _mm_store_ss((ptr2)+2,_mm_sub_ss(_tmp3,_tmp4)); \
}
/* Subtract two xyz triplets from each of two float arrays, in place.
 *
 * Lane 0 of (jx1,jy1,jz1,jx2,jy2,jz2) is subtracted from ptr1[0..5] and
 * lane 1 from ptr2[0..5]; lanes 2 and 3 are ignored.
 *
 * ptr1, ptr2    : float pointers to at least 6 floats each (unaligned
 *                 128-bit and 64-bit loads/stores).
 * jx1, jz1, jy2 : modifiable __m128 lvalues; overwritten.
 * jy1, jx2, jz2 : __m128 values, only read.
 *
 * Expands to a braced block (not do/while).
 */
#define GMX_MM_DECREMENT_2RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps(ptr2); \
    /* _tmp3 = (ptr1[4],ptr1[5],ptr2[4],ptr2[5]) */ \
    _tmp3 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr1)+4)); \
    _tmp3 = _mm_loadh_pi(_tmp3,(__m64 *)((ptr2)+4)); \
    jx1   = _mm_unpacklo_ps(jx1,jy1); \
    jz1   = _mm_unpacklo_ps(jz1,jx2); \
    jy2   = _mm_unpacklo_ps(jy2,jz2); \
    /* _tmp4 = lane-0 (x1,y1,z1,x2); _tmp5 = lane-1 (x1,y1,z1,x2) */ \
    _tmp4 = _mm_movelh_ps(jx1,jz1); \
    _tmp5 = _mm_movehl_ps(jz1,jx1); \
    _tmp1 = _mm_sub_ps(_tmp1,_tmp4); \
    _tmp2 = _mm_sub_ps(_tmp2,_tmp5); \
    _tmp3 = _mm_sub_ps(_tmp3,jy2); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr2,_tmp2); \
    _mm_storel_pi((__m64 *)((ptr1)+4),_tmp3); \
    _mm_storeh_pi((__m64 *)((ptr2)+4),_tmp3); \
}
/* Subtract three xyz triplets from each of two float arrays, in place.
 *
 * Lane 0 of (jx1..jz3) is subtracted from ptr1[0..8] and lane 1 from
 * ptr2[0..8]; lanes 2 and 3 are ignored.
 *
 * ptr1, ptr2         : float pointers to at least 9 floats each (unaligned
 *                      128-bit loads/stores plus one scalar access).
 * jx1, jz1, jy2, jx3 : modifiable __m128 lvalues; overwritten.
 * other j arguments  : __m128 values, only read.
 *
 * Expands to a braced block (not do/while).
 */
#define GMX_MM_DECREMENT_3RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11; \
    _tmp1  = _mm_loadu_ps(ptr1); \
    _tmp2  = _mm_loadu_ps((ptr1)+4); \
    _tmp3  = _mm_load_ss((ptr1)+8); \
    _tmp4  = _mm_loadu_ps(ptr2); \
    _tmp5  = _mm_loadu_ps((ptr2)+4); \
    _tmp6  = _mm_load_ss((ptr2)+8); \
    jx1    = _mm_unpacklo_ps(jx1,jy1); \
    jz1    = _mm_unpacklo_ps(jz1,jx2); \
    jy2    = _mm_unpacklo_ps(jy2,jz2); \
    jx3    = _mm_unpacklo_ps(jx3,jy3); \
    /* lane 0 of _tmp7 = lane 1 of jz3 */ \
    _tmp7  = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1)); \
    /* _tmp8/_tmp10: lane-0 data for ptr1; _tmp9/_tmp11: lane-1 data for ptr2 */ \
    _tmp8  = _mm_movelh_ps(jx1,jz1); \
    _tmp9  = _mm_movehl_ps(jz1,jx1); \
    _tmp10 = _mm_movelh_ps(jy2,jx3); \
    _tmp11 = _mm_movehl_ps(jx3,jy2); \
    _tmp1  = _mm_sub_ps(_tmp1,_tmp8); \
    _tmp2  = _mm_sub_ps(_tmp2,_tmp10); \
    _tmp3  = _mm_sub_ss(_tmp3,jz3); \
    _tmp4  = _mm_sub_ps(_tmp4,_tmp9); \
    _tmp5  = _mm_sub_ps(_tmp5,_tmp11); \
    _tmp6  = _mm_sub_ss(_tmp6,_tmp7); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps((ptr1)+4,_tmp2); \
    _mm_store_ss((ptr1)+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps((ptr2)+4,_tmp5); \
    _mm_store_ss((ptr2)+8,_tmp6); \
}
/* Subtract four xyz triplets from each of two float arrays, in place.
 *
 * Lane 0 of (jx1..jz4) is subtracted from ptr1[0..11] and lane 1 from
 * ptr2[0..11]; lanes 2 and 3 are ignored.
 *
 * ptr1, ptr2                   : float pointers to at least 12 floats each
 *                                (unaligned 128-bit loads/stores).
 * jx1, jz1, jy2, jx3, jz3, jy4 : modifiable __m128 lvalues; overwritten.
 * other j arguments            : __m128 values, only read.
 *
 * Expands to a braced block (not do/while).
 */
#define GMX_MM_DECREMENT_4RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp8,_tmp9,_tmp10,_tmp11,_tmp12,_tmp13; \
    _tmp1  = _mm_loadu_ps(ptr1); \
    _tmp2  = _mm_loadu_ps((ptr1)+4); \
    _tmp3  = _mm_loadu_ps((ptr1)+8); \
    _tmp4  = _mm_loadu_ps(ptr2); \
    _tmp5  = _mm_loadu_ps((ptr2)+4); \
    _tmp6  = _mm_loadu_ps((ptr2)+8); \
    jx1    = _mm_unpacklo_ps(jx1,jy1); \
    jz1    = _mm_unpacklo_ps(jz1,jx2); \
    jy2    = _mm_unpacklo_ps(jy2,jz2); \
    jx3    = _mm_unpacklo_ps(jx3,jy3); \
    jz3    = _mm_unpacklo_ps(jz3,jx4); \
    jy4    = _mm_unpacklo_ps(jy4,jz4); \
    /* movelh results hold lane-0 data (ptr1), movehl results lane-1 data (ptr2) */ \
    _tmp8  = _mm_movelh_ps(jx1,jz1); \
    _tmp9  = _mm_movehl_ps(jz1,jx1); \
    _tmp10 = _mm_movelh_ps(jy2,jx3); \
    _tmp11 = _mm_movehl_ps(jx3,jy2); \
    _tmp12 = _mm_movelh_ps(jz3,jy4); \
    _tmp13 = _mm_movehl_ps(jy4,jz3); \
    _tmp1  = _mm_sub_ps(_tmp1,_tmp8); \
    _tmp2  = _mm_sub_ps(_tmp2,_tmp10); \
    _tmp3  = _mm_sub_ps(_tmp3,_tmp12); \
    _tmp4  = _mm_sub_ps(_tmp4,_tmp9); \
    _tmp5  = _mm_sub_ps(_tmp5,_tmp11); \
    _tmp6  = _mm_sub_ps(_tmp6,_tmp13); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps((ptr1)+4,_tmp2); \
    _mm_storeu_ps((ptr1)+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps((ptr2)+4,_tmp5); \
    _mm_storeu_ps((ptr2)+8,_tmp6); \
}
/* Subtract one xyz triplet from each of three float arrays, in place.
 *
 * Lane k-1 of (jx1,jy1,jz1) is subtracted from ptrk[0..2] for k = 1,2,3;
 * lane 3 is ignored.
 *
 * ptr1..ptr3 : float pointers to at least 3 floats each.
 * jx1        : modifiable __m128 lvalue; overwritten.
 * jy1, jz1   : __m128 values, only read.
 *
 * Each working register is laid out as (x, unused, y, z): x goes through a
 * scalar load/store, y and z through the upper 64 bits.
 *
 * Expands to a braced block (not do/while).
 */
#define GMX_MM_DECREMENT_1RVEC_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7; \
    _tmp1 = _mm_load_ss(ptr1); \
    _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)((ptr1)+1)); \
    _tmp2 = _mm_load_ss(ptr2); \
    _tmp2 = _mm_loadh_pi(_tmp2,(__m64 *)((ptr2)+1)); \
    _tmp3 = _mm_load_ss(ptr3); \
    _tmp3 = _mm_loadh_pi(_tmp3,(__m64 *)((ptr3)+1)); \
    /* _tmp4 = (y0,z0,y1,z1), _tmp5 = (y2,z2,y3,z3) */ \
    _tmp4 = _mm_unpacklo_ps(jy1,jz1); \
    _tmp5 = _mm_unpackhi_ps(jy1,jz1); \
    /* _tmp6 = (x1,x0,y1,z1) */ \
    _tmp6 = _mm_shuffle_ps(jx1,_tmp4,_MM_SHUFFLE(3,2,0,1)); \
    _tmp7 = _mm_shuffle_ps(jx1,jx1,_MM_SHUFFLE(0,0,0,2)); \
    /* jx1 = (x0,x1,y0,z0), _tmp7 = (x2,x0,y2,z2) */ \
    jx1   = _mm_movelh_ps(jx1,_tmp4); \
    _tmp7 = _mm_movelh_ps(_tmp7,_tmp5); \
    _tmp1 = _mm_sub_ps(_tmp1,jx1); \
    _tmp2 = _mm_sub_ps(_tmp2,_tmp6); \
    _tmp3 = _mm_sub_ps(_tmp3,_tmp7); \
    _mm_store_ss(ptr1,_tmp1); \
    _mm_storeh_pi((__m64 *)((ptr1)+1),_tmp1); \
    _mm_store_ss(ptr2,_tmp2); \
    _mm_storeh_pi((__m64 *)((ptr2)+1),_tmp2); \
    _mm_store_ss(ptr3,_tmp3); \
    _mm_storeh_pi((__m64 *)((ptr3)+1),_tmp3); \
}
/* Subtract two xyz triplets from each of three float arrays, in place.
 *
 * Lane k-1 of (jx1,jy1,jz1,jx2,jy2,jz2) is subtracted from ptrk[0..5] for
 * k = 1,2,3; lane 3 is ignored.
 *
 * ptr1..ptr3    : float pointers to at least 6 floats each (unaligned
 *                 128-bit and 64-bit loads/stores).
 * jx1, jz1, jy2 : modifiable __m128 lvalues; overwritten.
 * jy1, jx2, jz2 : __m128 values, only read.
 *
 * The 64-bit partial loads start from _mm_setzero_ps() so that no
 * uninitialized register contents are read or fed into the arithmetic.
 *
 * Expands to a braced block (not do/while).
 */
#define GMX_MM_DECREMENT_2RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
    _tmp1  = _mm_loadu_ps(ptr1); \
    _tmp2  = _mm_loadu_ps(ptr2); \
    _tmp3  = _mm_loadu_ps(ptr3); \
    /* _tmp4 = (ptr1[4],ptr1[5],ptr2[4],ptr2[5]); _tmp5 = (ptr3[4],ptr3[5],0,0) */ \
    _tmp4  = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr1)+4)); \
    _tmp4  = _mm_loadh_pi(_tmp4,(__m64 *)((ptr2)+4)); \
    _tmp5  = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr3)+4)); \
    _tmp6  = _mm_unpackhi_ps(jx1,jy1); \
    jx1    = _mm_unpacklo_ps(jx1,jy1); \
    _tmp7  = _mm_unpackhi_ps(jz1,jx2); \
    jz1    = _mm_unpacklo_ps(jz1,jx2); \
    _tmp8  = _mm_unpackhi_ps(jy2,jz2); \
    jy2    = _mm_unpacklo_ps(jy2,jz2); \
    /* _tmp9, _tmp10, _tmp6 = (x1,y1,z1,x2) of lanes 0, 1, 2 */ \
    _tmp9  = _mm_movelh_ps(jx1,jz1); \
    _tmp10 = _mm_movehl_ps(jz1,jx1); \
    _tmp6  = _mm_movelh_ps(_tmp6,_tmp7); \
    _tmp1  = _mm_sub_ps(_tmp1,_tmp9); \
    _tmp2  = _mm_sub_ps(_tmp2,_tmp10); \
    _tmp3  = _mm_sub_ps(_tmp3,_tmp6); \
    _tmp4  = _mm_sub_ps(_tmp4,jy2); \
    _tmp5  = _mm_sub_ps(_tmp5,_tmp8); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr2,_tmp2); \
    _mm_storeu_ps(ptr3,_tmp3); \
    _mm_storel_pi((__m64 *)((ptr1)+4),_tmp4); \
    _mm_storeh_pi((__m64 *)((ptr2)+4),_tmp4); \
    _mm_storel_pi((__m64 *)((ptr3)+4),_tmp5); \
}
/* Subtract three xyz triplets from each of three float arrays, in place.
 *
 * Lane k-1 of (jx1..jz3) is subtracted from ptrk[0..8] for k = 1,2,3;
 * lane 3 is ignored.
 *
 * ptr1..ptr3         : float pointers to at least 9 floats each (unaligned
 *                      128-bit loads/stores plus one scalar access).
 * jx1, jz1, jy2, jx3 : modifiable __m128 lvalues; overwritten.
 * other j arguments  : __m128 values, only read.
 *
 * Expands to a braced block (not do/while).
 */
#define GMX_MM_DECREMENT_3RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
    __m128 _tmp11,_tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19; \
    _tmp1  = _mm_loadu_ps(ptr1); \
    _tmp2  = _mm_loadu_ps((ptr1)+4); \
    _tmp3  = _mm_load_ss((ptr1)+8); \
    _tmp4  = _mm_loadu_ps(ptr2); \
    _tmp5  = _mm_loadu_ps((ptr2)+4); \
    _tmp6  = _mm_load_ss((ptr2)+8); \
    _tmp7  = _mm_loadu_ps(ptr3); \
    _tmp8  = _mm_loadu_ps((ptr3)+4); \
    _tmp9  = _mm_load_ss((ptr3)+8); \
    _tmp10 = _mm_unpackhi_ps(jx1,jy1); \
    jx1    = _mm_unpacklo_ps(jx1,jy1); \
    _tmp11 = _mm_unpackhi_ps(jz1,jx2); \
    jz1    = _mm_unpacklo_ps(jz1,jx2); \
    _tmp12 = _mm_unpackhi_ps(jy2,jz2); \
    jy2    = _mm_unpacklo_ps(jy2,jz2); \
    _tmp13 = _mm_unpackhi_ps(jx3,jy3); \
    jx3    = _mm_unpacklo_ps(jx3,jy3); \
    /* lane 0 of _tmp14 = z3 lane 1; lane 0 of _tmp15 = z3 lane 2 */ \
    _tmp14 = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1)); \
    _tmp15 = _mm_movehl_ps(jz3,jz3); \
    _tmp16 = _mm_movelh_ps(jx1,jz1); \
    _tmp17 = _mm_movehl_ps(jz1,jx1); \
    _tmp10 = _mm_movelh_ps(_tmp10,_tmp11); \
    _tmp18 = _mm_movelh_ps(jy2,jx3); \
    _tmp19 = _mm_movehl_ps(jx3,jy2); \
    _tmp12 = _mm_movelh_ps(_tmp12,_tmp13); \
    _tmp1  = _mm_sub_ps(_tmp1,_tmp16); \
    _tmp2  = _mm_sub_ps(_tmp2,_tmp18); \
    _tmp3  = _mm_sub_ss(_tmp3,jz3); \
    _tmp4  = _mm_sub_ps(_tmp4,_tmp17); \
    _tmp5  = _mm_sub_ps(_tmp5,_tmp19); \
    _tmp6  = _mm_sub_ss(_tmp6,_tmp14); \
    _tmp7  = _mm_sub_ps(_tmp7,_tmp10); \
    _tmp8  = _mm_sub_ps(_tmp8,_tmp12); \
    _tmp9  = _mm_sub_ss(_tmp9,_tmp15); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps((ptr1)+4,_tmp2); \
    _mm_store_ss((ptr1)+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps((ptr2)+4,_tmp5); \
    _mm_store_ss((ptr2)+8,_tmp6); \
    _mm_storeu_ps(ptr3,_tmp7); \
    _mm_storeu_ps((ptr3)+4,_tmp8); \
    _mm_store_ss((ptr3)+8,_tmp9); \
}
/* Subtract four xyz triplets from each of three float arrays, in place.
 *
 * Lane k-1 of (jx1..jz4) is subtracted from ptrk[0..11] for k = 1,2,3;
 * lane 3 is ignored.
 *
 * ptr1..ptr3                   : float pointers to at least 12 floats each
 *                                (unaligned 128-bit loads/stores).
 * jx1, jz1, jy2, jx3, jz3, jy4 : modifiable __m128 lvalues; overwritten.
 * other j arguments            : __m128 values, only read.
 *
 * Expands to a braced block (not do/while).
 */
#define GMX_MM_DECREMENT_4RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11; \
    __m128 _tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19,_tmp20,_tmp21; \
    _tmp1  = _mm_loadu_ps(ptr1); \
    _tmp2  = _mm_loadu_ps((ptr1)+4); \
    _tmp3  = _mm_loadu_ps((ptr1)+8); \
    _tmp4  = _mm_loadu_ps(ptr2); \
    _tmp5  = _mm_loadu_ps((ptr2)+4); \
    _tmp6  = _mm_loadu_ps((ptr2)+8); \
    _tmp7  = _mm_loadu_ps(ptr3); \
    _tmp8  = _mm_loadu_ps((ptr3)+4); \
    _tmp9  = _mm_loadu_ps((ptr3)+8); \
    /* unpacklo keeps lanes 0/1 interleaved, unpackhi lanes 2/3 */ \
    _tmp10 = _mm_unpackhi_ps(jx1,jy1); \
    jx1    = _mm_unpacklo_ps(jx1,jy1); \
    _tmp11 = _mm_unpackhi_ps(jz1,jx2); \
    jz1    = _mm_unpacklo_ps(jz1,jx2); \
    _tmp12 = _mm_unpackhi_ps(jy2,jz2); \
    jy2    = _mm_unpacklo_ps(jy2,jz2); \
    _tmp13 = _mm_unpackhi_ps(jx3,jy3); \
    jx3    = _mm_unpacklo_ps(jx3,jy3); \
    _tmp14 = _mm_unpackhi_ps(jz3,jx4); \
    jz3    = _mm_unpacklo_ps(jz3,jx4); \
    _tmp15 = _mm_unpackhi_ps(jy4,jz4); \
    jy4    = _mm_unpacklo_ps(jy4,jz4); \
    /* regroup into per-pointer rows of four consecutive floats */ \
    _tmp16 = _mm_movelh_ps(jx1,jz1); \
    _tmp17 = _mm_movehl_ps(jz1,jx1); \
    _tmp10 = _mm_movelh_ps(_tmp10,_tmp11); \
    _tmp18 = _mm_movelh_ps(jy2,jx3); \
    _tmp19 = _mm_movehl_ps(jx3,jy2); \
    _tmp12 = _mm_movelh_ps(_tmp12,_tmp13); \
    _tmp20 = _mm_movelh_ps(jz3,jy4); \
    _tmp21 = _mm_movehl_ps(jy4,jz3); \
    _tmp14 = _mm_movelh_ps(_tmp14,_tmp15); \
    _tmp1  = _mm_sub_ps(_tmp1,_tmp16); \
    _tmp2  = _mm_sub_ps(_tmp2,_tmp18); \
    _tmp3  = _mm_sub_ps(_tmp3,_tmp20); \
    _tmp4  = _mm_sub_ps(_tmp4,_tmp17); \
    _tmp5  = _mm_sub_ps(_tmp5,_tmp19); \
    _tmp6  = _mm_sub_ps(_tmp6,_tmp21); \
    _tmp7  = _mm_sub_ps(_tmp7,_tmp10); \
    _tmp8  = _mm_sub_ps(_tmp8,_tmp12); \
    _tmp9  = _mm_sub_ps(_tmp9,_tmp14); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps((ptr1)+4,_tmp2); \
    _mm_storeu_ps((ptr1)+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps((ptr2)+4,_tmp5); \
    _mm_storeu_ps((ptr2)+8,_tmp6); \
    _mm_storeu_ps(ptr3,_tmp7); \
    _mm_storeu_ps((ptr3)+4,_tmp8); \
    _mm_storeu_ps((ptr3)+8,_tmp9); \
}
/* Subtract one xyz triplet from each of four float arrays, in place.
 *
 * Lane k-1 of (jx1,jy1,jz1) is subtracted from ptrk[0..2] for k = 1..4.
 *
 * ptr1..ptr4    : float pointers to at least 3 floats each.
 * jx1, jy1, jz1 : __m128 values, only read.
 *
 * Each working register is laid out as (x, unused, y, z): x goes through a
 * scalar load/store, y and z through the upper 64 bits.
 *
 * Expands to a braced block (not do/while).
 */
#define GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
    _tmp1  = _mm_load_ss(ptr1); \
    _tmp1  = _mm_loadh_pi(_tmp1,(__m64 *)((ptr1)+1)); \
    _tmp2  = _mm_load_ss(ptr2); \
    _tmp2  = _mm_loadh_pi(_tmp2,(__m64 *)((ptr2)+1)); \
    _tmp3  = _mm_load_ss(ptr3); \
    _tmp3  = _mm_loadh_pi(_tmp3,(__m64 *)((ptr3)+1)); \
    _tmp4  = _mm_load_ss(ptr4); \
    _tmp4  = _mm_loadh_pi(_tmp4,(__m64 *)((ptr4)+1)); \
    /* _tmp5 = (y0,z0,y1,z1), _tmp6 = (y2,z2,y3,z3) */ \
    _tmp5  = _mm_unpacklo_ps(jy1,jz1); \
    _tmp6  = _mm_unpackhi_ps(jy1,jz1); \
    /* _tmp7.._tmp10 = (xk,x0,yk,zk) for k = 0..3 */ \
    _tmp7  = _mm_shuffle_ps(jx1,_tmp5,_MM_SHUFFLE(1,0,0,0)); \
    _tmp8  = _mm_shuffle_ps(jx1,_tmp5,_MM_SHUFFLE(3,2,0,1)); \
    _tmp9  = _mm_shuffle_ps(jx1,_tmp6,_MM_SHUFFLE(1,0,0,2)); \
    _tmp10 = _mm_shuffle_ps(jx1,_tmp6,_MM_SHUFFLE(3,2,0,3)); \
    _tmp1  = _mm_sub_ps(_tmp1,_tmp7); \
    _tmp2  = _mm_sub_ps(_tmp2,_tmp8); \
    _tmp3  = _mm_sub_ps(_tmp3,_tmp9); \
    _tmp4  = _mm_sub_ps(_tmp4,_tmp10); \
    _mm_store_ss(ptr1,_tmp1); \
    _mm_storeh_pi((__m64 *)((ptr1)+1),_tmp1); \
    _mm_store_ss(ptr2,_tmp2); \
    _mm_storeh_pi((__m64 *)((ptr2)+1),_tmp2); \
    _mm_store_ss(ptr3,_tmp3); \
    _mm_storeh_pi((__m64 *)((ptr3)+1),_tmp3); \
    _mm_store_ss(ptr4,_tmp4); \
    _mm_storeh_pi((__m64 *)((ptr4)+1),_tmp4); \
}
/* Subtract two xyz triplets from each of four float arrays, in place.
 *
 * Lane k-1 of (jx1,jy1,jz1,jx2,jy2,jz2) is subtracted from ptrk[0..5] for
 * k = 1..4.
 *
 * ptr1..ptr4    : float pointers to at least 6 floats each (unaligned
 *                 128-bit and 64-bit loads/stores).
 * jx1, jz1, jy2 : modifiable __m128 lvalues; overwritten.
 * jy1, jx2, jz2 : __m128 values, only read.
 *
 * Expands to a braced block (not do/while).
 */
#define GMX_MM_DECREMENT_2RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11,_tmp12,_tmp13; \
    _tmp1  = _mm_loadu_ps(ptr1); \
    _tmp2  = _mm_loadu_ps(ptr2); \
    _tmp3  = _mm_loadu_ps(ptr3); \
    _tmp4  = _mm_loadu_ps(ptr4); \
    /* _tmp5 = (ptr1[4],ptr1[5],ptr2[4],ptr2[5]); _tmp6 likewise for ptr3/ptr4 */ \
    _tmp5  = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr1)+4)); \
    _tmp5  = _mm_loadh_pi(_tmp5,(__m64 *)((ptr2)+4)); \
    _tmp6  = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr3)+4)); \
    _tmp6  = _mm_loadh_pi(_tmp6,(__m64 *)((ptr4)+4)); \
    _tmp7  = _mm_unpackhi_ps(jx1,jy1); \
    jx1    = _mm_unpacklo_ps(jx1,jy1); \
    _tmp8  = _mm_unpackhi_ps(jz1,jx2); \
    jz1    = _mm_unpacklo_ps(jz1,jx2); \
    _tmp9  = _mm_unpackhi_ps(jy2,jz2); \
    jy2    = _mm_unpacklo_ps(jy2,jz2); \
    /* _tmp10.._tmp13 = (x1,y1,z1,x2) of lanes 0..3 */ \
    _tmp10 = _mm_movelh_ps(jx1,jz1); \
    _tmp11 = _mm_movehl_ps(jz1,jx1); \
    _tmp12 = _mm_movelh_ps(_tmp7,_tmp8); \
    _tmp13 = _mm_movehl_ps(_tmp8,_tmp7); \
    _tmp1  = _mm_sub_ps(_tmp1,_tmp10); \
    _tmp2  = _mm_sub_ps(_tmp2,_tmp11); \
    _tmp3  = _mm_sub_ps(_tmp3,_tmp12); \
    _tmp4  = _mm_sub_ps(_tmp4,_tmp13); \
    _tmp5  = _mm_sub_ps(_tmp5,jy2); \
    _tmp6  = _mm_sub_ps(_tmp6,_tmp9); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr2,_tmp2); \
    _mm_storeu_ps(ptr3,_tmp3); \
    _mm_storeu_ps(ptr4,_tmp4); \
    _mm_storel_pi((__m64 *)((ptr1)+4),_tmp5); \
    _mm_storeh_pi((__m64 *)((ptr2)+4),_tmp5); \
    _mm_storel_pi((__m64 *)((ptr3)+4),_tmp6); \
    _mm_storeh_pi((__m64 *)((ptr4)+4),_tmp6); \
}
/* Subtract three transposed rvecs from four memory locations.
 *
 * Lane k (k=0..3) of each of the nine j* registers is subtracted from the
 * nine consecutive floats at pointer k+1 (ptr1..ptr4), in the order
 * x1 y1 z1 x2 y2 z2 x3 y3 z3. All memory accesses are unaligned-safe.
 *
 * Side effect: jx1, jz1, jy2 and jx3 are reused as scratch registers and are
 * overwritten, so they must be modifiable lvalues.
 */
#define GMX_MM_DECREMENT_3RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
    __m128 _tmp11,_tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19; \
    __m128 _tmp20,_tmp21,_tmp22,_tmp23,_tmp24,_tmp25; \
    /* Per pointer: two full registers plus the single trailing z3 float */ \
    _tmp1  = _mm_loadu_ps(ptr1); \
    _tmp2  = _mm_loadu_ps((ptr1)+4); \
    _tmp3  = _mm_load_ss((ptr1)+8); \
    _tmp4  = _mm_loadu_ps(ptr2); \
    _tmp5  = _mm_loadu_ps((ptr2)+4); \
    _tmp6  = _mm_load_ss((ptr2)+8); \
    _tmp7  = _mm_loadu_ps(ptr3); \
    _tmp8  = _mm_loadu_ps((ptr3)+4); \
    _tmp9  = _mm_load_ss((ptr3)+8); \
    _tmp10 = _mm_loadu_ps(ptr4); \
    _tmp11 = _mm_loadu_ps((ptr4)+4); \
    _tmp12 = _mm_load_ss((ptr4)+8); \
    /* Transpose the first eight components into per-pointer layout */ \
    _tmp13 = _mm_unpackhi_ps(jx1,jy1); \
    jx1    = _mm_unpacklo_ps(jx1,jy1); \
    _tmp14 = _mm_unpackhi_ps(jz1,jx2); \
    jz1    = _mm_unpacklo_ps(jz1,jx2); \
    _tmp15 = _mm_unpackhi_ps(jy2,jz2); \
    jy2    = _mm_unpacklo_ps(jy2,jz2); \
    _tmp16 = _mm_unpackhi_ps(jx3,jy3); \
    jx3    = _mm_unpacklo_ps(jx3,jy3); \
    /* Move lanes 1, 2 and 3 of jz3 down to lane 0 for the scalar subtracts */ \
    _tmp17 = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1)); \
    _tmp18 = _mm_movehl_ps(jz3,jz3); \
    _tmp19 = _mm_shuffle_ps(_tmp18,_tmp18,_MM_SHUFFLE(0,0,0,1)); \
    _tmp20 = _mm_movelh_ps(jx1,jz1); \
    _tmp21 = _mm_movehl_ps(jz1,jx1); \
    _tmp22 = _mm_movelh_ps(_tmp13,_tmp14); \
    _tmp14 = _mm_movehl_ps(_tmp14,_tmp13); \
    _tmp23 = _mm_movelh_ps(jy2,jx3); \
    _tmp24 = _mm_movehl_ps(jx3,jy2); \
    _tmp25 = _mm_movelh_ps(_tmp15,_tmp16); \
    _tmp16 = _mm_movehl_ps(_tmp16,_tmp15); \
    /* Decrement */ \
    _tmp1  = _mm_sub_ps(_tmp1,_tmp20); \
    _tmp2  = _mm_sub_ps(_tmp2,_tmp23); \
    _tmp3  = _mm_sub_ss(_tmp3,jz3); \
    _tmp4  = _mm_sub_ps(_tmp4,_tmp21); \
    _tmp5  = _mm_sub_ps(_tmp5,_tmp24); \
    _tmp6  = _mm_sub_ss(_tmp6,_tmp17); \
    _tmp7  = _mm_sub_ps(_tmp7,_tmp22); \
    _tmp8  = _mm_sub_ps(_tmp8,_tmp25); \
    _tmp9  = _mm_sub_ss(_tmp9,_tmp18); \
    _tmp10 = _mm_sub_ps(_tmp10,_tmp14); \
    _tmp11 = _mm_sub_ps(_tmp11,_tmp16); \
    _tmp12 = _mm_sub_ss(_tmp12,_tmp19); \
    /* Write back */ \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps((ptr1)+4,_tmp2); \
    _mm_store_ss((ptr1)+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps((ptr2)+4,_tmp5); \
    _mm_store_ss((ptr2)+8,_tmp6); \
    _mm_storeu_ps(ptr3,_tmp7); \
    _mm_storeu_ps((ptr3)+4,_tmp8); \
    _mm_store_ss((ptr3)+8,_tmp9); \
    _mm_storeu_ps(ptr4,_tmp10); \
    _mm_storeu_ps((ptr4)+4,_tmp11); \
    _mm_store_ss((ptr4)+8,_tmp12); \
}
/* Subtract four transposed rvecs from four memory locations.
 *
 * Lane k (k=0..3) of each of the twelve j* registers is subtracted from the
 * twelve consecutive floats at pointer k+1 (ptr1..ptr4), in the order
 * x1 y1 z1 x2 y2 z2 x3 y3 z3 x4 y4 z4. All memory accesses are unaligned-safe.
 *
 * Side effect: jx1, jz1, jy2, jx3, jz3 and jy4 are reused as scratch
 * registers and are overwritten, so they must be modifiable lvalues.
 */
#define GMX_MM_DECREMENT_4RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11; \
    __m128 _tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19,_tmp20,_tmp21,_tmp22; \
    __m128 _tmp23,_tmp24; \
    /* Three full registers per pointer */ \
    _tmp1  = _mm_loadu_ps(ptr1); \
    _tmp2  = _mm_loadu_ps((ptr1)+4); \
    _tmp3  = _mm_loadu_ps((ptr1)+8); \
    _tmp4  = _mm_loadu_ps(ptr2); \
    _tmp5  = _mm_loadu_ps((ptr2)+4); \
    _tmp6  = _mm_loadu_ps((ptr2)+8); \
    _tmp7  = _mm_loadu_ps(ptr3); \
    _tmp8  = _mm_loadu_ps((ptr3)+4); \
    _tmp9  = _mm_loadu_ps((ptr3)+8); \
    _tmp10 = _mm_loadu_ps(ptr4); \
    _tmp11 = _mm_loadu_ps((ptr4)+4); \
    _tmp12 = _mm_loadu_ps((ptr4)+8); \
    /* Transpose the component registers into per-pointer layout */ \
    _tmp13 = _mm_unpackhi_ps(jx1,jy1); \
    jx1    = _mm_unpacklo_ps(jx1,jy1); \
    _tmp14 = _mm_unpackhi_ps(jz1,jx2); \
    jz1    = _mm_unpacklo_ps(jz1,jx2); \
    _tmp15 = _mm_unpackhi_ps(jy2,jz2); \
    jy2    = _mm_unpacklo_ps(jy2,jz2); \
    _tmp16 = _mm_unpackhi_ps(jx3,jy3); \
    jx3    = _mm_unpacklo_ps(jx3,jy3); \
    _tmp17 = _mm_unpackhi_ps(jz3,jx4); \
    jz3    = _mm_unpacklo_ps(jz3,jx4); \
    _tmp18 = _mm_unpackhi_ps(jy4,jz4); \
    jy4    = _mm_unpacklo_ps(jy4,jz4); \
    _tmp19 = _mm_movelh_ps(jx1,jz1); \
    jz1    = _mm_movehl_ps(jz1,jx1); \
    _tmp20 = _mm_movelh_ps(_tmp13,_tmp14); \
    _tmp14 = _mm_movehl_ps(_tmp14,_tmp13); \
    _tmp21 = _mm_movelh_ps(jy2,jx3); \
    jx3    = _mm_movehl_ps(jx3,jy2); \
    _tmp22 = _mm_movelh_ps(_tmp15,_tmp16); \
    _tmp16 = _mm_movehl_ps(_tmp16,_tmp15); \
    _tmp23 = _mm_movelh_ps(jz3,jy4); \
    jy4    = _mm_movehl_ps(jy4,jz3); \
    _tmp24 = _mm_movelh_ps(_tmp17,_tmp18); \
    _tmp18 = _mm_movehl_ps(_tmp18,_tmp17); \
    /* Decrement */ \
    _tmp1  = _mm_sub_ps(_tmp1,_tmp19); \
    _tmp2  = _mm_sub_ps(_tmp2,_tmp21); \
    _tmp3  = _mm_sub_ps(_tmp3,_tmp23); \
    _tmp4  = _mm_sub_ps(_tmp4,jz1); \
    _tmp5  = _mm_sub_ps(_tmp5,jx3); \
    _tmp6  = _mm_sub_ps(_tmp6,jy4); \
    _tmp7  = _mm_sub_ps(_tmp7,_tmp20); \
    _tmp8  = _mm_sub_ps(_tmp8,_tmp22); \
    _tmp9  = _mm_sub_ps(_tmp9,_tmp24); \
    _tmp10 = _mm_sub_ps(_tmp10,_tmp14); \
    _tmp11 = _mm_sub_ps(_tmp11,_tmp16); \
    _tmp12 = _mm_sub_ps(_tmp12,_tmp18); \
    /* Write back */ \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps((ptr1)+4,_tmp2); \
    _mm_storeu_ps((ptr1)+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps((ptr2)+4,_tmp5); \
    _mm_storeu_ps((ptr2)+8,_tmp6); \
    _mm_storeu_ps(ptr3,_tmp7); \
    _mm_storeu_ps((ptr3)+4,_tmp8); \
    _mm_storeu_ps((ptr3)+8,_tmp9); \
    _mm_storeu_ps(ptr4,_tmp10); \
    _mm_storeu_ps((ptr4)+4,_tmp11); \
    _mm_storeu_ps((ptr4)+8,_tmp12); \
}
/* Routine to be called with rswitch/rcut at the beginning of a kernel
 * to set up the 6 constants used for analytic 5th order switch calculations.
 *
 * With d = rcut - rswitch the macro assigns
 *   switch_C3 = -10/d^3,  switch_C4 =  15/d^4,  switch_C5 =  -6/d^5
 *   switch_D2 = -30/d^3,  switch_D3 =  60/d^4,  switch_D4 = -30/d^5
 * where 1/d is obtained from gmx_mm_inv_ps(). The six switch_* arguments
 * must be assignable __m128 lvalues.
 */
#define GMX_MM_SETUP_SWITCH5_PS(rswitch,rcut,switch_C3,switch_C4,switch_C5,switch_D2,switch_D3,switch_D4) { \
    const __m128 _swsetup_cm6  = _mm_set1_ps( -6.0f); \
    const __m128 _swsetup_cm10 = _mm_set1_ps(-10.0f); \
    const __m128 _swsetup_c15  = _mm_set1_ps( 15.0f); \
    const __m128 _swsetup_cm30 = _mm_set1_ps(-30.0f); \
    const __m128 _swsetup_c60  = _mm_set1_ps( 60.0f); \
    /* Prefixed local names so they cannot capture a caller argument */ \
    __m128 _swsetup_d,_swsetup_dinv,_swsetup_dinv2,_swsetup_dinv3,_swsetup_dinv4,_swsetup_dinv5; \
    \
    _swsetup_d     = _mm_sub_ps(rcut,rswitch); \
    _swsetup_dinv  = gmx_mm_inv_ps(_swsetup_d); \
    _swsetup_dinv2 = _mm_mul_ps(_swsetup_dinv,_swsetup_dinv); \
    _swsetup_dinv3 = _mm_mul_ps(_swsetup_dinv2,_swsetup_dinv); \
    _swsetup_dinv4 = _mm_mul_ps(_swsetup_dinv2,_swsetup_dinv2); \
    _swsetup_dinv5 = _mm_mul_ps(_swsetup_dinv3,_swsetup_dinv2); \
    \
    switch_C3 = _mm_mul_ps(_swsetup_cm10,_swsetup_dinv3); \
    switch_C4 = _mm_mul_ps(_swsetup_c15,_swsetup_dinv4); \
    switch_C5 = _mm_mul_ps(_swsetup_cm6,_swsetup_dinv5); \
    switch_D2 = _mm_mul_ps(_swsetup_cm30,_swsetup_dinv3); \
    switch_D3 = _mm_mul_ps(_swsetup_c60,_swsetup_dinv4); \
    switch_D4 = _mm_mul_ps(_swsetup_cm30,_swsetup_dinv5); \
}
/* Evaluate the 5th order switch function and its derivative.
 *
 * r is clamped to [rswitch,rcut]; with d = r_clamped - rswitch the macro assigns
 *   sw  = 1 + d^3*(sw_C3 + d*(sw_C4 + d*sw_C5))
 *   dsw =     d^2*(sw_D2 + d*(sw_D3 + d*sw_D4))
 * sw and dsw must be assignable __m128 lvalues.
 *
 * NOTE(review): the declaration of the two scratch registers was missing from
 * this listing; it is reconstructed here with prefixed names.
 */
#define GMX_MM_EVALUATE_SWITCH5_PS(r,rswitch,rcut,sw,dsw,sw_C3,sw_C4,sw_C5,sw_D2,sw_D3,sw_D4) { \
    const __m128 _sw_one = _mm_set1_ps(1.0f); \
    __m128 _sw_d,_sw_d2; \
    /* Clamp the distance into the switching interval */ \
    _sw_d  = _mm_max_ps(r,rswitch); \
    _sw_d  = _mm_min_ps(_sw_d,rcut); \
    _sw_d  = _mm_sub_ps(_sw_d,rswitch); \
    _sw_d2 = _mm_mul_ps(_sw_d,_sw_d); \
    /* Horner evaluation of both polynomials, interleaved */ \
    sw     = _mm_mul_ps(_sw_d,sw_C5); \
    dsw    = _mm_mul_ps(_sw_d,sw_D4); \
    sw     = _mm_add_ps(sw,sw_C4); \
    dsw    = _mm_add_ps(dsw,sw_D3); \
    sw     = _mm_mul_ps(sw,_sw_d); \
    dsw    = _mm_mul_ps(dsw,_sw_d); \
    sw     = _mm_add_ps(sw,sw_C3); \
    dsw    = _mm_add_ps(dsw,sw_D2); \
    sw     = _mm_mul_ps(sw,_mm_mul_ps(_sw_d,_sw_d2)); \
    dsw    = _mm_mul_ps(dsw,_sw_d2); \
    sw     = _mm_add_ps(sw,_sw_one); \
}
/* Plain Coulomb interaction for four pairs.
 *
 * Adds qq*rinv to *vctot.
 * Returns fscaltmp, multiply with rinvsq to get fscal!
 *
 * NOTE(review): the return statement was missing from this listing. Returning
 * the potential qq*rinv is reconstructed from the "Returns fscaltmp" contract
 * above - confirm against the upstream header.
 */
static inline __m128
gmx_mm_interaction_coulomb_ps(__m128 rinv, __m128 qq, __m128 *vctot)
{
    const __m128 vcoul = _mm_mul_ps(qq, rinv);

    *vctot = _mm_add_ps(*vctot, vcoul);

    return vcoul;
}
/* Potential-only plain Coulomb interaction for four pairs: adds qq*rinv to
 * *vctot and produces no force term.
 *
 * NOTE(review): the return-type line was missing from this listing. `void`
 * is reconstructed because the body yields no value - confirm upstream.
 */
static inline void
gmx_mm_interaction_coulomb_noforce_ps(__m128 rinv, __m128 qq, __m128 *vctot)
{
    const __m128 vcoul = _mm_mul_ps(qq, rinv);

    *vctot = _mm_add_ps(*vctot, vcoul);
}
/* Reaction-field Coulomb interaction for four pairs.
 *
 * Adds qq*(rinv + krf*rsq - crf) to *vctot and returns qq*(rinv - 2*krf*rsq).
 * Returns fscaltmp, multiply with rinvsq to get fscal!
 */
static inline __m128
gmx_mm_interaction_coulombrf_ps(const __m128 rinv, const __m128 rsq, const __m128 krf, const __m128 crf, const __m128 qq, __m128 *vctot)
{
    const __m128 two = _mm_set1_ps(2.0f);
    __m128       krsq,vcoul;

    krsq   = _mm_mul_ps(krf, rsq);
    vcoul  = _mm_mul_ps(qq, _mm_sub_ps(_mm_add_ps(rinv, krsq), crf));
    *vctot = _mm_add_ps(*vctot, vcoul);

    return _mm_mul_ps(qq, _mm_sub_ps(rinv, _mm_mul_ps(two, krsq)));
}
/* Potential-only reaction-field Coulomb interaction for four pairs: adds
 * qq*(rinv + krf*rsq - crf) to *vctot and produces no force term.
 *
 * NOTE(review): the return-type line and the local declarations were missing
 * from this listing. `void` and `__m128 krsq,vcoul` are reconstructed from
 * how the body uses them - confirm upstream.
 */
static inline void
gmx_mm_interaction_coulombrf_noforce_ps(__m128 rinv, __m128 rsq, __m128 krf, __m128 crf, __m128 qq, __m128 *vctot)
{
    __m128 krsq,vcoul;

    krsq   = _mm_mul_ps(krf, rsq);
    vcoul  = _mm_mul_ps(qq, _mm_sub_ps(_mm_add_ps(rinv, krsq), crf));
    *vctot = _mm_add_ps(*vctot, vcoul);
}
/* Lennard-Jones interaction for four pairs.
 *
 * With rinvsix = rinvsq^3 this adds c12*rinvsix^2 - c6*rinvsix to *vvdwtot
 * and returns 12*c12*rinvsix^2 - 6*c6*rinvsix.
 * Returns fscaltmp, multiply with rinvsq to get fscal!
 */
static inline __m128
gmx_mm_int_lj_ps(__m128 rinvsq, __m128 c6, __m128 c12, __m128 *vvdwtot)
{
    const __m128 six    = _mm_set1_ps(6.0f);
    const __m128 twelve = _mm_set1_ps(12.0f);
    __m128       rinvsix,vvdw6,vvdw12;

    rinvsix  = _mm_mul_ps(_mm_mul_ps(rinvsq, rinvsq), rinvsq);
    vvdw6    = _mm_mul_ps(c6, rinvsix);
    vvdw12   = _mm_mul_ps(c12, _mm_mul_ps(rinvsix, rinvsix));
    *vvdwtot = _mm_add_ps(*vvdwtot, _mm_sub_ps(vvdw12, vvdw6));

    return _mm_sub_ps(_mm_mul_ps(twelve, vvdw12), _mm_mul_ps(six, vvdw6));
}
/* Potential-only Lennard-Jones interaction for four pairs: with
 * rinvsix = rinvsq^3 this adds c12*rinvsix^2 - c6*rinvsix to *vvdwtot and
 * produces no force term.
 *
 * NOTE(review): the return-type line was missing from this listing. `void`
 * is reconstructed because the body yields no value - confirm upstream.
 */
static inline void
gmx_mm_int_lj_potonly_ps(__m128 rinvsq, __m128 c6, __m128 c12, __m128 *vvdwtot)
{
    __m128 rinvsix,vvdw6,vvdw12;

    rinvsix  = _mm_mul_ps(_mm_mul_ps(rinvsq, rinvsq), rinvsq);
    vvdw6    = _mm_mul_ps(c6, rinvsix);
    vvdw12   = _mm_mul_ps(c12, _mm_mul_ps(rinvsix, rinvsix));
    *vvdwtot = _mm_add_ps(*vvdwtot, _mm_sub_ps(vvdw12, vvdw6));
}
/* Tabulated Coulomb interaction for four pairs.
 *
 * The table index is the truncated value of r*tabscale and eps is the
 * remaining fraction. Every table point holds four floats (Y,F,G,H) that are
 * fetched with aligned loads, so VFtab must be 16-byte aligned.
 * Adds qq*(Y + eps*(F + G*eps + H*eps^2)) to *vctot.
 *
 * Return force should be multiplied by -rinv to get fscal
 */
static inline __m128
gmx_mm_int_4_table_coulomb_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 *vctot)
{
    __m128  rt,eps,eps2,Y,F,G,H,vcoul;
    __m128i n0;
    int     n_a,n_b,n_c,n_d;

    rt   = _mm_mul_ps(r, tabscale);
    n0   = _mm_cvttps_epi32(rt);
    eps  = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
    eps2 = _mm_mul_ps(eps, eps);

    /* Extract indices from n0 */
    n_a = gmx_mm_extract_epi32(n0,0);
    n_b = gmx_mm_extract_epi32(n0,1);
    n_c = gmx_mm_extract_epi32(n0,2);
    n_d = gmx_mm_extract_epi32(n0,3);

    /* One table row per pair, then transpose rows into Y/F/G/H columns */
    Y = _mm_load_ps(VFtab + 4*n_a);
    F = _mm_load_ps(VFtab + 4*n_b);
    G = _mm_load_ps(VFtab + 4*n_c);
    H = _mm_load_ps(VFtab + 4*n_d);
    _MM_TRANSPOSE4_PS(Y,F,G,H);

    H      = _mm_mul_ps(H, eps2);                 /* Heps2 */
    G      = _mm_mul_ps(G, eps);                  /* Geps */
    F      = _mm_add_ps(F, _mm_add_ps(G, H));     /* Fp */
    vcoul  = _mm_mul_ps(qq, _mm_add_ps(Y, _mm_mul_ps(eps, F)));
    *vctot = _mm_add_ps(*vctot, vcoul);

    /* Table derivative: Fp + Geps + 2*Heps2 */
    F = _mm_mul_ps(qq, _mm_add_ps(F, _mm_add_ps(G, _mm_add_ps(H, H))));

    return _mm_mul_ps(F, tabscale);
}
/* Tabulated Lennard-Jones interaction for four pairs.
 *
 * The table index is the truncated value of r*tabscale and eps is the
 * remaining fraction. Adds c6*Vdisp + c12*Vrep to *vvdwtot, where each V is
 * Y + eps*(F + G*eps + H*eps^2) from the dispersion and repulsion entries.
 * All table loads are aligned, so VFtab must be 16-byte aligned.
 *
 * Return force should be multiplied by -rinv to get fscal
 */
static inline __m128
gmx_mm_int_4_table_lj_ps(__m128 r, __m128 tabscale, float * VFtab, int offset, __m128 c6, __m128 c12, __m128 *vvdwtot)
{
    __m128       rt,eps,eps2,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
    __m128i      n0;
    int          n_a,n_b,n_c,n_d;
    /* For a few cases, like TIP4p waters, there are particles with LJ-only interactions in a loop where
     * the table data might contain both coulomb and LJ. To handle this case, we use an offset value of 0
     * if the data is an LJ-only table, and 1 if it is actually a mixed coul+lj table.
     */
    const int    stride = 4*(offset+2);          /* floats per table point */
    const float *disp   = VFtab + 4*offset;      /* dispersion entry of point 0 */
    const float *rep    = disp + 4;              /* repulsion entry of point 0 */

    rt   = _mm_mul_ps(r, tabscale);
    n0   = _mm_cvttps_epi32(rt);
    eps  = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
    eps2 = _mm_mul_ps(eps, eps);

    /* Extract indices from n0 */
    n_a = gmx_mm_extract_epi32(n0,0);
    n_b = gmx_mm_extract_epi32(n0,1);
    n_c = gmx_mm_extract_epi32(n0,2);
    n_d = gmx_mm_extract_epi32(n0,3);

    Yd = _mm_load_ps(disp + stride*n_a);
    Fd = _mm_load_ps(disp + stride*n_b);
    Gd = _mm_load_ps(disp + stride*n_c);
    Hd = _mm_load_ps(disp + stride*n_d);
    Yr = _mm_load_ps(rep + stride*n_a);
    Fr = _mm_load_ps(rep + stride*n_b);
    Gr = _mm_load_ps(rep + stride*n_c);
    Hr = _mm_load_ps(rep + stride*n_d);
    _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
    _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);

    Hd       = _mm_mul_ps(Hd, eps2);                 /* Heps2 */
    Gd       = _mm_mul_ps(Gd, eps);                  /* Geps */
    Fd       = _mm_add_ps(Fd, _mm_add_ps(Gd, Hd));   /* Fp */
    Hr       = _mm_mul_ps(Hr, eps2);                 /* Heps2 */
    Gr       = _mm_mul_ps(Gr, eps);                  /* Geps */
    Fr       = _mm_add_ps(Fr, _mm_add_ps(Gr, Hr));   /* Fp */
    vvdw6    = _mm_mul_ps(c6, _mm_add_ps(Yd, _mm_mul_ps(eps, Fd)));
    vvdw12   = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps, Fr)));
    *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6, vvdw12));

    /* Table derivatives: Fp + Geps + 2*Heps2 */
    Fd = _mm_mul_ps(c6, _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd, Hd))));
    Fr = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr, Hr))));

    return _mm_mul_ps(_mm_add_ps(Fd, Fr), tabscale);
}
/* Tabulated Coulomb + Lennard-Jones interaction for four pairs.
 *
 * The table index is the truncated value of r*tabscale and eps is the
 * remaining fraction. Each table point holds 12 floats: the Coulomb entry at
 * +0, dispersion at +4 and repulsion at +8, four floats (Y,F,G,H) each.
 * Adds qq*Vcoul to *vctot and c6*Vdisp + c12*Vrep to *vvdwtot, where
 * V = Y + eps*(F + G*eps + H*eps^2). All table loads are aligned, so VFtab
 * must be 16-byte aligned.
 *
 * Return force should be multiplied by -rinv to get fscal
 */
static inline __m128
gmx_mm_int_4_table_coulomb_and_lj_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 c6, __m128 c12,
                                     __m128 *vctot, __m128 *vvdwtot)
{
    __m128       rt,eps,eps2,vcoul,Yc,Fc,Gc,Hc,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
    __m128i      n0;
    int          n_a,n_b,n_c,n_d;
    const float *row_a,*row_b,*row_c,*row_d;

    rt   = _mm_mul_ps(r, tabscale);
    n0   = _mm_cvttps_epi32(rt);
    eps  = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
    eps2 = _mm_mul_ps(eps, eps);

    /* Extract indices from n0 */
    n_a = gmx_mm_extract_epi32(n0,0);
    n_b = gmx_mm_extract_epi32(n0,1);
    n_c = gmx_mm_extract_epi32(n0,2);
    n_d = gmx_mm_extract_epi32(n0,3);

    row_a = VFtab + 12*n_a;
    row_b = VFtab + 12*n_b;
    row_c = VFtab + 12*n_c;
    row_d = VFtab + 12*n_d;

    Yc = _mm_load_ps(row_a);
    Fc = _mm_load_ps(row_b);
    Gc = _mm_load_ps(row_c);
    Hc = _mm_load_ps(row_d);
    Yd = _mm_load_ps(row_a + 4);
    Fd = _mm_load_ps(row_b + 4);
    Gd = _mm_load_ps(row_c + 4);
    Hd = _mm_load_ps(row_d + 4);
    Yr = _mm_load_ps(row_a + 8);
    Fr = _mm_load_ps(row_b + 8);
    Gr = _mm_load_ps(row_c + 8);
    Hr = _mm_load_ps(row_d + 8);
    _MM_TRANSPOSE4_PS(Yc,Fc,Gc,Hc);
    _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
    _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);

    Hc = _mm_mul_ps(Hc, eps2);                 /* Heps2 */
    Gc = _mm_mul_ps(Gc, eps);                  /* Geps */
    Fc = _mm_add_ps(Fc, _mm_add_ps(Gc, Hc));   /* Fp */
    Hd = _mm_mul_ps(Hd, eps2);                 /* Heps2 */
    Gd = _mm_mul_ps(Gd, eps);                  /* Geps */
    Fd = _mm_add_ps(Fd, _mm_add_ps(Gd, Hd));   /* Fp */
    Hr = _mm_mul_ps(Hr, eps2);                 /* Heps2 */
    Gr = _mm_mul_ps(Gr, eps);                  /* Geps */
    Fr = _mm_add_ps(Fr, _mm_add_ps(Gr, Hr));   /* Fp */

    vcoul  = _mm_mul_ps(qq, _mm_add_ps(Yc, _mm_mul_ps(eps, Fc)));
    *vctot = _mm_add_ps(*vctot, vcoul);

    vvdw6    = _mm_mul_ps(c6, _mm_add_ps(Yd, _mm_mul_ps(eps, Fd)));
    vvdw12   = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps, Fr)));
    *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6, vvdw12));

    /* Table derivatives: Fp + Geps + 2*Heps2 */
    Fc = _mm_mul_ps(qq, _mm_add_ps(Fc, _mm_add_ps(Gc, _mm_add_ps(Hc, Hc))));
    Fd = _mm_mul_ps(c6, _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd, Hd))));
    Fr = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr, Hr))));

    return _mm_mul_ps(_mm_add_ps(Fc, _mm_add_ps(Fd, Fr)), tabscale);
}
/* Tabulated Coulomb interaction for three pairs (lanes 0-2).
 *
 * Same evaluation as the four-pair routine, but only three table rows are
 * loaded; the fourth row is zero, so lane 3 contributes nothing.
 * Adds qq*(Y + eps*(F + G*eps + H*eps^2)) to *vctot. Table loads are
 * aligned, so VFtab must be 16-byte aligned.
 *
 * Return force should be multiplied by -rinv to get fscal
 */
static inline __m128
gmx_mm_int_3_table_coulomb_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 *vctot)
{
    __m128  rt,eps,eps2,Y,F,G,H,vcoul;
    __m128i n0;
    int     n_a,n_b,n_c;

    rt   = _mm_mul_ps(r, tabscale);
    n0   = _mm_cvttps_epi32(rt);
    eps  = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
    eps2 = _mm_mul_ps(eps, eps);

    /* Extract indices from n0 */
    n_a = gmx_mm_extract_epi32(n0,0);
    n_b = gmx_mm_extract_epi32(n0,1);
    n_c = gmx_mm_extract_epi32(n0,2);

    Y = _mm_load_ps(VFtab + 4*n_a);
    F = _mm_load_ps(VFtab + 4*n_b);
    G = _mm_load_ps(VFtab + 4*n_c);
    H = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Y,F,G,H);

    H      = _mm_mul_ps(H, eps2);                 /* Heps2 */
    G      = _mm_mul_ps(G, eps);                  /* Geps */
    F      = _mm_add_ps(F, _mm_add_ps(G, H));     /* Fp */
    vcoul  = _mm_mul_ps(qq, _mm_add_ps(Y, _mm_mul_ps(eps, F)));
    *vctot = _mm_add_ps(*vctot, vcoul);

    /* Table derivative: Fp + Geps + 2*Heps2 */
    F = _mm_mul_ps(qq, _mm_add_ps(F, _mm_add_ps(G, _mm_add_ps(H, H))));

    return _mm_mul_ps(F, tabscale);
}
/* Tabulated Lennard-Jones interaction for three pairs (lanes 0-2).
 *
 * Same evaluation as the four-pair routine, but only three table rows are
 * loaded; the fourth row is zero, so lane 3 contributes nothing.
 * Adds c6*Vdisp + c12*Vrep to *vvdwtot. Table loads are aligned, so VFtab
 * must be 16-byte aligned.
 *
 * Return force should be multiplied by -rinv to get fscal
 */
static inline __m128
gmx_mm_int_3_table_lj_ps(__m128 r, __m128 tabscale, float * VFtab, int offset, __m128 c6, __m128 c12, __m128 *vvdwtot)
{
    __m128       rt,eps,eps2,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
    __m128i      n0;
    int          n_a,n_b,n_c;
    /* For a few cases, like TIP4p waters, there are particles with LJ-only interactions in a loop where
     * the table data might contain both coulomb and LJ. To handle this case, we use an offset value of 0
     * if the data is an LJ-only table, and 1 if it is actually a mixed coul+lj table.
     *
     * The LJ data starts 4*offset floats into each table point, exactly as in
     * gmx_mm_int_4_table_lj_ps. The earlier "+ offset" skipped a single float,
     * which read the wrong entry and misaligned the load for mixed tables.
     */
    const int    stride = 4*(offset+2);          /* floats per table point */
    const float *disp   = VFtab + 4*offset;      /* dispersion entry of point 0 */
    const float *rep    = disp + 4;              /* repulsion entry of point 0 */

    rt   = _mm_mul_ps(r, tabscale);
    n0   = _mm_cvttps_epi32(rt);
    eps  = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
    eps2 = _mm_mul_ps(eps, eps);

    /* Extract indices from n0 */
    n_a = gmx_mm_extract_epi32(n0,0);
    n_b = gmx_mm_extract_epi32(n0,1);
    n_c = gmx_mm_extract_epi32(n0,2);

    Yd = _mm_load_ps(disp + stride*n_a);
    Fd = _mm_load_ps(disp + stride*n_b);
    Gd = _mm_load_ps(disp + stride*n_c);
    Hd = _mm_setzero_ps();
    Yr = _mm_load_ps(rep + stride*n_a);
    Fr = _mm_load_ps(rep + stride*n_b);
    Gr = _mm_load_ps(rep + stride*n_c);
    Hr = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
    _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);

    Hd       = _mm_mul_ps(Hd, eps2);                 /* Heps2 */
    Gd       = _mm_mul_ps(Gd, eps);                  /* Geps */
    Fd       = _mm_add_ps(Fd, _mm_add_ps(Gd, Hd));   /* Fp */
    Hr       = _mm_mul_ps(Hr, eps2);                 /* Heps2 */
    Gr       = _mm_mul_ps(Gr, eps);                  /* Geps */
    Fr       = _mm_add_ps(Fr, _mm_add_ps(Gr, Hr));   /* Fp */
    vvdw6    = _mm_mul_ps(c6, _mm_add_ps(Yd, _mm_mul_ps(eps, Fd)));
    vvdw12   = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps, Fr)));
    *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6, vvdw12));

    /* Table derivatives: Fp + Geps + 2*Heps2 */
    Fd = _mm_mul_ps(c6, _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd, Hd))));
    Fr = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr, Hr))));

    return _mm_mul_ps(_mm_add_ps(Fd, Fr), tabscale);
}
/* Tabulated Coulomb + Lennard-Jones interaction for three pairs (lanes 0-2).
 *
 * Same evaluation as the four-pair routine (12 floats per table point:
 * Coulomb at +0, dispersion at +4, repulsion at +8), but only three table
 * rows are loaded; the fourth row is zero, so lane 3 contributes nothing.
 * Adds qq*Vcoul to *vctot and c6*Vdisp + c12*Vrep to *vvdwtot. Table loads
 * are aligned, so VFtab must be 16-byte aligned.
 *
 * Return force should be multiplied by -rinv to get fscal
 */
static inline __m128
gmx_mm_int_3_table_coulomb_and_lj_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 c6, __m128 c12,
                                     __m128 *vctot, __m128 *vvdwtot)
{
    __m128       rt,eps,eps2,vcoul,Yc,Fc,Gc,Hc,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
    __m128i      n0;
    int          n_a,n_b,n_c;
    const float *row_a,*row_b,*row_c;

    rt   = _mm_mul_ps(r, tabscale);
    n0   = _mm_cvttps_epi32(rt);
    eps  = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
    eps2 = _mm_mul_ps(eps, eps);

    /* Extract indices from n0 */
    n_a = gmx_mm_extract_epi32(n0,0);
    n_b = gmx_mm_extract_epi32(n0,1);
    n_c = gmx_mm_extract_epi32(n0,2);

    row_a = VFtab + 12*n_a;
    row_b = VFtab + 12*n_b;
    row_c = VFtab + 12*n_c;

    Yc = _mm_load_ps(row_a);
    Fc = _mm_load_ps(row_b);
    Gc = _mm_load_ps(row_c);
    Hc = _mm_setzero_ps();
    Yd = _mm_load_ps(row_a + 4);
    Fd = _mm_load_ps(row_b + 4);
    Gd = _mm_load_ps(row_c + 4);
    Hd = _mm_setzero_ps();
    Yr = _mm_load_ps(row_a + 8);
    Fr = _mm_load_ps(row_b + 8);
    Gr = _mm_load_ps(row_c + 8);
    Hr = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Yc,Fc,Gc,Hc);
    _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
    _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);

    Hc = _mm_mul_ps(Hc, eps2);                 /* Heps2 */
    Gc = _mm_mul_ps(Gc, eps);                  /* Geps */
    Fc = _mm_add_ps(Fc, _mm_add_ps(Gc, Hc));   /* Fp */
    Hd = _mm_mul_ps(Hd, eps2);                 /* Heps2 */
    Gd = _mm_mul_ps(Gd, eps);                  /* Geps */
    Fd = _mm_add_ps(Fd, _mm_add_ps(Gd, Hd));   /* Fp */
    Hr = _mm_mul_ps(Hr, eps2);                 /* Heps2 */
    Gr = _mm_mul_ps(Gr, eps);                  /* Geps */
    Fr = _mm_add_ps(Fr, _mm_add_ps(Gr, Hr));   /* Fp */

    vcoul  = _mm_mul_ps(qq, _mm_add_ps(Yc, _mm_mul_ps(eps, Fc)));
    *vctot = _mm_add_ps(*vctot, vcoul);

    vvdw6    = _mm_mul_ps(c6, _mm_add_ps(Yd, _mm_mul_ps(eps, Fd)));
    vvdw12   = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps, Fr)));
    *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6, vvdw12));

    /* Table derivatives: Fp + Geps + 2*Heps2 */
    Fc = _mm_mul_ps(qq, _mm_add_ps(Fc, _mm_add_ps(Gc, _mm_add_ps(Hc, Hc))));
    Fd = _mm_mul_ps(c6, _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd, Hd))));
    Fr = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr, Hr))));

    return _mm_mul_ps(_mm_add_ps(Fc, _mm_add_ps(Fd, Fr)), tabscale);
}
/* Tabulated Coulomb interaction for two pairs (lanes 0-1).
 *
 * Same evaluation as the four-pair routine, but only two table rows are
 * loaded; rows three and four are zero, so lanes 2-3 contribute nothing.
 * Adds qq*(Y + eps*(F + G*eps + H*eps^2)) to *vctot. Table loads are
 * aligned, so VFtab must be 16-byte aligned.
 *
 * Return force should be multiplied by -rinv to get fscal
 */
static inline __m128
gmx_mm_int_2_table_coulomb_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 *vctot)
{
    __m128  rt,eps,eps2,Y,F,G,H,vcoul;
    __m128i n0;
    int     n_a,n_b;

    rt   = _mm_mul_ps(r, tabscale);
    n0   = _mm_cvttps_epi32(rt);
    eps  = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
    eps2 = _mm_mul_ps(eps, eps);

    /* Extract indices from n0 */
    n_a = gmx_mm_extract_epi32(n0,0);
    n_b = gmx_mm_extract_epi32(n0,1);

    Y = _mm_load_ps(VFtab + 4*n_a);
    F = _mm_load_ps(VFtab + 4*n_b);
    G = _mm_setzero_ps();
    H = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Y,F,G,H);

    H      = _mm_mul_ps(H, eps2);                 /* Heps2 */
    G      = _mm_mul_ps(G, eps);                  /* Geps */
    F      = _mm_add_ps(F, _mm_add_ps(G, H));     /* Fp */
    vcoul  = _mm_mul_ps(qq, _mm_add_ps(Y, _mm_mul_ps(eps, F)));
    *vctot = _mm_add_ps(*vctot, vcoul);

    /* Table derivative: Fp + Geps + 2*Heps2 */
    F = _mm_mul_ps(qq, _mm_add_ps(F, _mm_add_ps(G, _mm_add_ps(H, H))));

    return _mm_mul_ps(F, tabscale);
}
/* Tabulated Lennard-Jones interaction for two pairs (lanes 0-1).
 *
 * Same evaluation as the four-pair routine, but only two table rows are
 * loaded; rows three and four are zero, so lanes 2-3 contribute nothing.
 * Adds c6*Vdisp + c12*Vrep to *vvdwtot. Table loads are aligned, so VFtab
 * must be 16-byte aligned.
 *
 * Return force should be multiplied by -rinv to get fscal
 */
static inline __m128
gmx_mm_int_2_table_lj_ps(__m128 r, __m128 tabscale, float * VFtab, int offset, __m128 c6, __m128 c12, __m128 *vvdwtot)
{
    __m128       rt,eps,eps2,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
    __m128i      n0;
    int          n_a,n_b;
    /* For a few cases, like TIP4p waters, there are particles with LJ-only interactions in a loop where
     * the table data might contain both coulomb and LJ. To handle this case, we use an offset value of 0
     * if the data is an LJ-only table, and 1 if it is actually a mixed coul+lj table.
     *
     * The LJ data starts 4*offset floats into each table point, exactly as in
     * gmx_mm_int_4_table_lj_ps. The earlier "+ offset" skipped a single float,
     * which read the wrong entry and misaligned the load for mixed tables.
     */
    const int    stride = 4*(offset+2);          /* floats per table point */
    const float *disp   = VFtab + 4*offset;      /* dispersion entry of point 0 */
    const float *rep    = disp + 4;              /* repulsion entry of point 0 */

    rt   = _mm_mul_ps(r, tabscale);
    n0   = _mm_cvttps_epi32(rt);
    eps  = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
    eps2 = _mm_mul_ps(eps, eps);

    /* Extract indices from n0 */
    n_a = gmx_mm_extract_epi32(n0,0);
    n_b = gmx_mm_extract_epi32(n0,1);

    Yd = _mm_load_ps(disp + stride*n_a);
    Fd = _mm_load_ps(disp + stride*n_b);
    Gd = _mm_setzero_ps();
    Hd = _mm_setzero_ps();
    Yr = _mm_load_ps(rep + stride*n_a);
    Fr = _mm_load_ps(rep + stride*n_b);
    Gr = _mm_setzero_ps();
    Hr = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
    _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);

    Hd       = _mm_mul_ps(Hd, eps2);                 /* Heps2 */
    Gd       = _mm_mul_ps(Gd, eps);                  /* Geps */
    Fd       = _mm_add_ps(Fd, _mm_add_ps(Gd, Hd));   /* Fp */
    Hr       = _mm_mul_ps(Hr, eps2);                 /* Heps2 */
    Gr       = _mm_mul_ps(Gr, eps);                  /* Geps */
    Fr       = _mm_add_ps(Fr, _mm_add_ps(Gr, Hr));   /* Fp */
    vvdw6    = _mm_mul_ps(c6, _mm_add_ps(Yd, _mm_mul_ps(eps, Fd)));
    vvdw12   = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps, Fr)));
    *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6, vvdw12));

    /* Table derivatives: Fp + Geps + 2*Heps2 */
    Fd = _mm_mul_ps(c6, _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd, Hd))));
    Fr = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr, Hr))));

    return _mm_mul_ps(_mm_add_ps(Fd, Fr), tabscale);
}
/* Tabulated Coulomb + Lennard-Jones interaction for two pairs (lanes 0-1).
 *
 * Same evaluation as the four-pair routine (12 floats per table point:
 * Coulomb at +0, dispersion at +4, repulsion at +8), but only two table rows
 * are loaded; rows three and four are zero, so lanes 2-3 contribute nothing.
 * Adds qq*Vcoul to *vctot and c6*Vdisp + c12*Vrep to *vvdwtot. Table loads
 * are aligned, so VFtab must be 16-byte aligned.
 *
 * Return force should be multiplied by -rinv to get fscal
 */
static inline __m128
gmx_mm_int_2_table_coulomb_and_lj_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 c6, __m128 c12,
                                     __m128 *vctot, __m128 *vvdwtot)
{
    __m128       rt,eps,eps2,vcoul,Yc,Fc,Gc,Hc,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
    __m128i      n0;
    int          n_a,n_b;
    const float *row_a,*row_b;

    rt   = _mm_mul_ps(r, tabscale);
    n0   = _mm_cvttps_epi32(rt);
    eps  = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
    eps2 = _mm_mul_ps(eps, eps);

    /* Extract indices from n0 */
    n_a = gmx_mm_extract_epi32(n0,0);
    n_b = gmx_mm_extract_epi32(n0,1);

    row_a = VFtab + 12*n_a;
    row_b = VFtab + 12*n_b;

    Yc = _mm_load_ps(row_a);
    Fc = _mm_load_ps(row_b);
    Gc = _mm_setzero_ps();
    Hc = _mm_setzero_ps();
    Yd = _mm_load_ps(row_a + 4);
    Fd = _mm_load_ps(row_b + 4);
    Gd = _mm_setzero_ps();
    Hd = _mm_setzero_ps();
    Yr = _mm_load_ps(row_a + 8);
    Fr = _mm_load_ps(row_b + 8);
    Gr = _mm_setzero_ps();
    Hr = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Yc,Fc,Gc,Hc);
    _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
    _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);

    Hc = _mm_mul_ps(Hc, eps2);                 /* Heps2 */
    Gc = _mm_mul_ps(Gc, eps);                  /* Geps */
    Fc = _mm_add_ps(Fc, _mm_add_ps(Gc, Hc));   /* Fp */
    Hd = _mm_mul_ps(Hd, eps2);                 /* Heps2 */
    Gd = _mm_mul_ps(Gd, eps);                  /* Geps */
    Fd = _mm_add_ps(Fd, _mm_add_ps(Gd, Hd));   /* Fp */
    Hr = _mm_mul_ps(Hr, eps2);                 /* Heps2 */
    Gr = _mm_mul_ps(Gr, eps);                  /* Geps */
    Fr = _mm_add_ps(Fr, _mm_add_ps(Gr, Hr));   /* Fp */

    vcoul  = _mm_mul_ps(qq, _mm_add_ps(Yc, _mm_mul_ps(eps, Fc)));
    *vctot = _mm_add_ps(*vctot, vcoul);

    vvdw6    = _mm_mul_ps(c6, _mm_add_ps(Yd, _mm_mul_ps(eps, Fd)));
    vvdw12   = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps, Fr)));
    *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6, vvdw12));

    /* Table derivatives: Fp + Geps + 2*Heps2 */
    Fc = _mm_mul_ps(qq, _mm_add_ps(Fc, _mm_add_ps(Gc, _mm_add_ps(Hc, Hc))));
    Fd = _mm_mul_ps(c6, _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd, Hd))));
    Fr = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr, Hr))));

    return _mm_mul_ps(_mm_add_ps(Fc, _mm_add_ps(Fd, Fr)), tabscale);
}
/* Tabulated Coulomb interaction for a single pair (lane 0).
 *
 * Same evaluation as the four-pair routine, but only one table row is
 * loaded; the other three rows are zero, so lanes 1-3 contribute nothing.
 * Adds qq*(Y + eps*(F + G*eps + H*eps^2)) to *vctot. The table load is
 * aligned, so VFtab must be 16-byte aligned.
 *
 * Return force should be multiplied by -rinv to get fscal
 */
static inline __m128
gmx_mm_int_1_table_coulomb_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 *vctot)
{
    __m128  rt,eps,eps2,Y,F,G,H,vcoul;
    __m128i n0;
    int     n_a;

    rt   = _mm_mul_ps(r, tabscale);
    n0   = _mm_cvttps_epi32(rt);
    eps  = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
    eps2 = _mm_mul_ps(eps, eps);

    /* Extract indices from n0 */
    n_a = gmx_mm_extract_epi32(n0,0);

    Y = _mm_load_ps(VFtab + 4*n_a);
    F = _mm_setzero_ps();
    G = _mm_setzero_ps();
    H = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Y,F,G,H);

    H      = _mm_mul_ps(H, eps2);                 /* Heps2 */
    G      = _mm_mul_ps(G, eps);                  /* Geps */
    F      = _mm_add_ps(F, _mm_add_ps(G, H));     /* Fp */
    vcoul  = _mm_mul_ps(qq, _mm_add_ps(Y, _mm_mul_ps(eps, F)));
    *vctot = _mm_add_ps(*vctot, vcoul);

    /* Table derivative: Fp + Geps + 2*Heps2 */
    F = _mm_mul_ps(qq, _mm_add_ps(F, _mm_add_ps(G, _mm_add_ps(H, H))));

    return _mm_mul_ps(F, tabscale);
}
/* Tabulated Lennard-Jones interaction for a single pair (lane 0).
 *
 * Same evaluation as the four-pair routine, but only one table row is
 * loaded; the other three rows are zero, so lanes 1-3 contribute nothing.
 * Adds c6*Vdisp + c12*Vrep to *vvdwtot. Table loads are aligned, so VFtab
 * must be 16-byte aligned.
 *
 * Return force should be multiplied by -rinv to get fscal
 */
static inline __m128
gmx_mm_int_1_table_lj_ps(__m128 r, __m128 tabscale, float * VFtab, int offset, __m128 c6, __m128 c12, __m128 *vvdwtot)
{
    __m128       rt,eps,eps2,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
    __m128i      n0;
    int          n_a;
    /* For a few cases, like TIP4p waters, there are particles with LJ-only interactions in a loop where
     * the table data might contain both coulomb and LJ. To handle this case, we use an offset value of 0
     * if the data is an LJ-only table, and 1 if it is actually a mixed coul+lj table.
     *
     * The LJ data starts 4*offset floats into each table point, exactly as in
     * gmx_mm_int_4_table_lj_ps. The earlier "+ offset" skipped a single float,
     * which read the wrong entry and misaligned the load for mixed tables.
     */
    const int    stride = 4*(offset+2);          /* floats per table point */
    const float *disp   = VFtab + 4*offset;      /* dispersion entry of point 0 */
    const float *rep    = disp + 4;              /* repulsion entry of point 0 */

    rt   = _mm_mul_ps(r, tabscale);
    n0   = _mm_cvttps_epi32(rt);
    eps  = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
    eps2 = _mm_mul_ps(eps, eps);

    /* Extract indices from n0 */
    n_a = gmx_mm_extract_epi32(n0,0);

    Yd = _mm_load_ps(disp + stride*n_a);
    Fd = _mm_setzero_ps();
    Gd = _mm_setzero_ps();
    Hd = _mm_setzero_ps();
    Yr = _mm_load_ps(rep + stride*n_a);
    Fr = _mm_setzero_ps();
    Gr = _mm_setzero_ps();
    Hr = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
    _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);

    Hd       = _mm_mul_ps(Hd, eps2);                 /* Heps2 */
    Gd       = _mm_mul_ps(Gd, eps);                  /* Geps */
    Fd       = _mm_add_ps(Fd, _mm_add_ps(Gd, Hd));   /* Fp */
    Hr       = _mm_mul_ps(Hr, eps2);                 /* Heps2 */
    Gr       = _mm_mul_ps(Gr, eps);                  /* Geps */
    Fr       = _mm_add_ps(Fr, _mm_add_ps(Gr, Hr));   /* Fp */
    vvdw6    = _mm_mul_ps(c6, _mm_add_ps(Yd, _mm_mul_ps(eps, Fd)));
    vvdw12   = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps, Fr)));
    *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6, vvdw12));

    /* Table derivatives: Fp + Geps + 2*Heps2 */
    Fd = _mm_mul_ps(c6, _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd, Hd))));
    Fr = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr, Hr))));

    return _mm_mul_ps(_mm_add_ps(Fd, Fr), tabscale);
}
/* Tabulated Coulomb + Lennard-Jones interaction for a single pair (lane 0).
 *
 * Same evaluation as the four-pair routine (12 floats per table point:
 * Coulomb at +0, dispersion at +4, repulsion at +8), but only one table row
 * is loaded; the other three rows are zero, so lanes 1-3 contribute nothing.
 * Adds qq*Vcoul to *vctot and c6*Vdisp + c12*Vrep to *vvdwtot. Table loads
 * are aligned, so VFtab must be 16-byte aligned.
 *
 * Return force should be multiplied by -rinv to get fscal
 */
static inline __m128
gmx_mm_int_1_table_coulomb_and_lj_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 c6, __m128 c12,
                                     __m128 *vctot, __m128 *vvdwtot)
{
    __m128       rt,eps,eps2,vcoul,Yc,Fc,Gc,Hc,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
    __m128i      n0;
    int          n_a;
    const float *row_a;

    rt   = _mm_mul_ps(r, tabscale);
    n0   = _mm_cvttps_epi32(rt);
    eps  = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
    eps2 = _mm_mul_ps(eps, eps);

    /* Extract indices from n0 */
    n_a = gmx_mm_extract_epi32(n0,0);

    row_a = VFtab + 12*n_a;

    Yc = _mm_load_ps(row_a);
    Fc = _mm_setzero_ps();
    Gc = _mm_setzero_ps();
    Hc = _mm_setzero_ps();
    Yd = _mm_load_ps(row_a + 4);
    Fd = _mm_setzero_ps();
    Gd = _mm_setzero_ps();
    Hd = _mm_setzero_ps();
    Yr = _mm_load_ps(row_a + 8);
    Fr = _mm_setzero_ps();
    Gr = _mm_setzero_ps();
    Hr = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Yc,Fc,Gc,Hc);
    _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
    _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);

    Hc = _mm_mul_ps(Hc, eps2);                 /* Heps2 */
    Gc = _mm_mul_ps(Gc, eps);                  /* Geps */
    Fc = _mm_add_ps(Fc, _mm_add_ps(Gc, Hc));   /* Fp */
    Hd = _mm_mul_ps(Hd, eps2);                 /* Heps2 */
    Gd = _mm_mul_ps(Gd, eps);                  /* Geps */
    Fd = _mm_add_ps(Fd, _mm_add_ps(Gd, Hd));   /* Fp */
    Hr = _mm_mul_ps(Hr, eps2);                 /* Heps2 */
    Gr = _mm_mul_ps(Gr, eps);                  /* Geps */
    Fr = _mm_add_ps(Fr, _mm_add_ps(Gr, Hr));   /* Fp */

    vcoul  = _mm_mul_ps(qq, _mm_add_ps(Yc, _mm_mul_ps(eps, Fc)));
    *vctot = _mm_add_ps(*vctot, vcoul);

    vvdw6    = _mm_mul_ps(c6, _mm_add_ps(Yd, _mm_mul_ps(eps, Fd)));
    vvdw12   = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps, Fr)));
    *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6, vvdw12));

    /* Table derivatives: Fp + Geps + 2*Heps2 */
    Fc = _mm_mul_ps(qq, _mm_add_ps(Fc, _mm_add_ps(Gc, _mm_add_ps(Hc, Hc))));
    Fd = _mm_mul_ps(c6, _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd, Hd))));
    Fr = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr, Hr))));

    return _mm_mul_ps(_mm_add_ps(Fc, _mm_add_ps(Fd, Fr)), tabscale);
}
/* Tabulated generalized-Born interaction for four pairs.
 *
 * The four values at isaj1..isaj4 are gathered into one register and
 * multiplied with isai; that product scales both qq and gbtabscale. The table
 * index is the truncated value of r*gbtabscale (scaled) and eps the remaining
 * fraction; each table point holds four floats (Y,F,G,H) fetched with
 * aligned loads, so GBtab must be 16-byte aligned.
 *
 * Side effects:
 *   *vgbtot   is decremented by vgb = qq*VV
 *   *dvdasum  is incremented by dvdatmp = 0.5*(vgb + ftmp*r)
 *   *dvdaj1..*dvdaj4 are each incremented by their lane of dvdatmp*isaj*isaj
 *
 * Return force should be multiplied by +rinv to get fscal
 *
 * NOTE(review): the final parameter line, the declaration of n0 and the
 * return statement were missing from this listing. `__m128 *vgbtot` as the
 * last parameter and `return ftmp` are reconstructed from how the body uses
 * them - confirm against the upstream header.
 */
static inline __m128
gmx_mm_int_4_genborn_ps(__m128 r, __m128 isai,
                        float * isaj1, float *isaj2, float *isaj3, float *isaj4,
                        __m128 gbtabscale, float * GBtab, __m128 qq, __m128 *dvdasum,
                        float *dvdaj1, float *dvdaj2, float *dvdaj3, float *dvdaj4,
                        __m128 *vgbtot)
{
    const __m128 half = _mm_set1_ps(0.5f);
    __m128       rt,eps,eps2,Y,F,G,H,VV,FF,ftmp,isaprod,t2,t3,t4,isaj,vgb,dvdatmp;
    __m128i      n0;
    int          n_a,n_b,n_c,n_d;

    /* Gather the four isaj values into one register */
    isaj = _mm_load_ss(isaj1);
    t2   = _mm_load_ss(isaj2);
    t3   = _mm_load_ss(isaj3);
    t4   = _mm_load_ss(isaj4);
    isaj = _mm_unpacklo_ps(isaj, t2);  /*  -  -  t2 t1 */
    t3   = _mm_unpacklo_ps(t3, t4);    /*  -  -  t4 t3 */
    isaj = _mm_movelh_ps(isaj, t3);    /* t4 t3 t2 t1 */

    isaprod    = _mm_mul_ps(isai, isaj);
    qq         = _mm_mul_ps(qq, isaprod);
    gbtabscale = _mm_mul_ps(isaprod, gbtabscale);

    rt   = _mm_mul_ps(r, gbtabscale);
    n0   = _mm_cvttps_epi32(rt);
    eps  = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
    eps2 = _mm_mul_ps(eps, eps);

    /* Extract indices from n0 */
    n_a = gmx_mm_extract_epi32(n0,0);
    n_b = gmx_mm_extract_epi32(n0,1);
    n_c = gmx_mm_extract_epi32(n0,2);
    n_d = gmx_mm_extract_epi32(n0,3);

    Y = _mm_load_ps(GBtab + 4*n_a);
    F = _mm_load_ps(GBtab + 4*n_b);
    G = _mm_load_ps(GBtab + 4*n_c);
    H = _mm_load_ps(GBtab + 4*n_d);
    _MM_TRANSPOSE4_PS(Y,F,G,H);

    G = _mm_mul_ps(G, eps);                  /* Geps */
    H = _mm_mul_ps(H, eps2);                 /* Heps2 */
    F = _mm_add_ps(_mm_add_ps(F, G), H);     /* Fp */

    VV = _mm_add_ps(Y, _mm_mul_ps(eps, F));
    FF = _mm_add_ps(_mm_add_ps(F, G), _mm_add_ps(H, H));

    vgb     = _mm_mul_ps(qq, VV);
    *vgbtot = _mm_sub_ps(*vgbtot, vgb);      /* Yes, the sign is correct */

    ftmp = _mm_mul_ps(_mm_mul_ps(qq, FF), gbtabscale);

    dvdatmp = _mm_mul_ps(half, _mm_add_ps(vgb, _mm_mul_ps(ftmp, r)));

    *dvdasum = _mm_add_ps(*dvdasum, dvdatmp);

    dvdatmp = _mm_mul_ps(_mm_mul_ps(dvdatmp, isaj), isaj);

    /* Update 4 dada[j] values */
    Y  = _mm_load_ss(dvdaj1);
    F  = _mm_load_ss(dvdaj2);
    G  = _mm_load_ss(dvdaj3);
    H  = _mm_load_ss(dvdaj4);
    /* Bring lanes 1, 2 and 3 of dvdatmp down to lane 0 */
    t3 = _mm_movehl_ps(_mm_setzero_ps(), dvdatmp);
    t2 = _mm_shuffle_ps(dvdatmp, dvdatmp, _MM_SHUFFLE(0,0,0,1));
    t4 = _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(0,0,0,1));

    _mm_store_ss(dvdaj1, _mm_add_ss(Y, dvdatmp));
    _mm_store_ss(dvdaj2, _mm_add_ss(F, t2));
    _mm_store_ss(dvdaj3, _mm_add_ss(G, t3));
    _mm_store_ss(dvdaj4, _mm_add_ss(H, t4));

    return ftmp;
}
/*
 * Tabulated "genborn" interaction for three j entries (SIMD lanes 0..2).
 *
 * Lane 3 carries a copy of isaj3, but its table row is all zeros, so it adds
 * nothing to *vgbtot or *dvdasum and its returned force is zero.
 *
 * r           distances; multiplied by the scaled table spacing to get the
 *             table position.
 * isai        multiplied lane-wise with the gathered isaj values.
 * isaj1..3    each points to one float, gathered into lanes 0..2.
 * gbtabscale  table scale; multiplied by isai*isaj before use.
 * GBtab       table with four floats (Y,F,G,H) per point. Read with
 *             _mm_load_ps at GBtab+4*n, so it must be 16-byte aligned.
 * qq          multiplied by isai*isaj before use.
 * dvdasum     in/out: 0.5*(vgb + ftmp*r) is added lane-wise.
 * dvdaj1..3   in/out: each gets its lane of that term times isaj^2 added.
 * vgbtot      in/out: vgb is SUBTRACTED lane-wise.
 *
 * Return force should be multiplied by +rinv to get fscal
 *
 * NOTE(review): this block was rebuilt from a garbled listing. The position
 * of vgbtot in the parameter list, the n0 declaration and the return of ftmp
 * were inferred from how the names are used - confirm against the callers.
 */
static inline __m128
gmx_mm_int_3_genborn_ps(__m128 r, __m128 isai,
                        float * isaj1, float *isaj2, float *isaj3,
                        __m128 gbtabscale, float * GBtab, __m128 qq, __m128 *dvdasum,
                        float *dvdaj1, float *dvdaj2, float *dvdaj3,
                        __m128 *vgbtot)
{
    /* Use the set intrinsic instead of a brace list, which depends on how
     * the compiler happens to represent __m128.
     */
    const __m128 half = _mm_set1_ps(0.5f);

    __m128  rt,eps,eps2,Y,F,G,H,VV,FF,ftmp,isaprod,t2,t3,isaj,vgb,dvdatmp;
    __m128i n0;
    int     n_a,n_b,n_c;

    /* Gather the three isaj values; lane 3 duplicates isaj3 */
    isaj       = _mm_load_ss(isaj1);
    t2         = _mm_load_ss(isaj2);
    t3         = _mm_load_ss(isaj3);
    isaj       = _mm_unpacklo_ps(isaj,t2);  /*  -  -  t2 t1 */
    t3         = _mm_unpacklo_ps(t3,t3);    /*  -  -  t3 t3 */
    isaj       = _mm_movelh_ps(isaj,t3);    /* t3 t3 t2 t1 */

    isaprod    = _mm_mul_ps(isai,isaj);
    qq         = _mm_mul_ps(qq,isaprod);
    gbtabscale = _mm_mul_ps(isaprod,gbtabscale);

    /* Table position: integer part selects the point, eps is the fraction */
    rt         = _mm_mul_ps(r,gbtabscale);
    n0         = _mm_cvttps_epi32(rt);
    eps        = _mm_sub_ps(rt,_mm_cvtepi32_ps(n0));
    eps2       = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a        = gmx_mm_extract_epi32(n0,0);
    n_b        = gmx_mm_extract_epi32(n0,1);
    n_c        = gmx_mm_extract_epi32(n0,2);

    /* Three real table rows plus a zero row for the unused lane */
    Y          = _mm_load_ps(GBtab + 4*n_a);
    F          = _mm_load_ps(GBtab + 4*n_b);
    G          = _mm_load_ps(GBtab + 4*n_c);
    H          = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Y,F,G,H);

    G          = _mm_mul_ps(G,eps);                   /* Geps  */
    H          = _mm_mul_ps(H,eps2);                  /* Heps2 */
    F          = _mm_add_ps(_mm_add_ps(F,G),H);       /* Fp    */

    VV         = _mm_add_ps(Y,_mm_mul_ps(eps,F));
    FF         = _mm_add_ps(_mm_add_ps(F,G),_mm_add_ps(H,H));

    vgb        = _mm_mul_ps(qq,VV);
    *vgbtot    = _mm_sub_ps(*vgbtot,vgb); /* Yes, the sign is correct */

    ftmp       = _mm_mul_ps(_mm_mul_ps(qq,FF),gbtabscale);

    dvdatmp    = _mm_mul_ps(half,_mm_add_ps(vgb,_mm_mul_ps(ftmp,r)));

    *dvdasum   = _mm_add_ps(*dvdasum,dvdatmp);

    dvdatmp    = _mm_mul_ps(_mm_mul_ps(dvdatmp,isaj),isaj);

    /* Update 3 dvda[j] values: move each lane of dvdatmp to element 0 */
    Y          = _mm_load_ss(dvdaj1);
    F          = _mm_load_ss(dvdaj2);
    G          = _mm_load_ss(dvdaj3);
    t3         = _mm_movehl_ps(_mm_setzero_ps(),dvdatmp);
    t2         = _mm_shuffle_ps(dvdatmp,dvdatmp,_MM_SHUFFLE(0,0,0,1));

    _mm_store_ss(dvdaj1,_mm_add_ss(Y,dvdatmp));
    _mm_store_ss(dvdaj2,_mm_add_ss(F,t2));
    _mm_store_ss(dvdaj3,_mm_add_ss(G,t3));

    return ftmp;
}
/*
 * Tabulated "genborn" interaction for two j entries (SIMD lanes 0..1).
 *
 * Lanes 2 and 3 have isaj = 0 and all-zero table rows, so they add nothing
 * to *vgbtot or *dvdasum and their returned force is zero.
 *
 * r           distances; multiplied by the scaled table spacing to get the
 *             table position.
 * isai        multiplied lane-wise with the gathered isaj values.
 * isaj1..2    each points to one float, gathered into lanes 0..1.
 * gbtabscale  table scale; multiplied by isai*isaj before use.
 * GBtab       table with four floats (Y,F,G,H) per point. Read with
 *             _mm_load_ps at GBtab+4*n, so it must be 16-byte aligned.
 * qq          multiplied by isai*isaj before use.
 * dvdasum     in/out: 0.5*(vgb + ftmp*r) is added lane-wise.
 * dvdaj1..2   in/out: each gets its lane of that term times isaj^2 added.
 * vgbtot      in/out: vgb is SUBTRACTED lane-wise.
 *
 * Return force should be multiplied by +rinv to get fscal
 *
 * NOTE(review): this block was rebuilt from a garbled listing. The position
 * of vgbtot in the parameter list, the n0 declaration and the return of ftmp
 * were inferred from how the names are used - confirm against the callers.
 */
static inline __m128
gmx_mm_int_2_genborn_ps(__m128 r, __m128 isai,
                        float * isaj1, float *isaj2,
                        __m128 gbtabscale, float * GBtab, __m128 qq, __m128 *dvdasum,
                        float *dvdaj1, float *dvdaj2,
                        __m128 *vgbtot)
{
    /* Use the set intrinsic instead of a brace list, which depends on how
     * the compiler happens to represent __m128.
     */
    const __m128 half = _mm_set1_ps(0.5f);

    __m128  rt,eps,eps2,Y,F,G,H,VV,FF,ftmp,isaprod,t2,isaj,vgb,dvdatmp;
    __m128i n0;
    int     n_a,n_b;

    /* Gather the two isaj values; the upper two lanes stay zero */
    isaj       = _mm_load_ss(isaj1);
    t2         = _mm_load_ss(isaj2);
    isaj       = _mm_unpacklo_ps(isaj,t2);  /*  -  -  t2 t1 */

    isaprod    = _mm_mul_ps(isai,isaj);
    qq         = _mm_mul_ps(qq,isaprod);
    gbtabscale = _mm_mul_ps(isaprod,gbtabscale);

    /* Table position: integer part selects the point, eps is the fraction */
    rt         = _mm_mul_ps(r,gbtabscale);
    n0         = _mm_cvttps_epi32(rt);
    eps        = _mm_sub_ps(rt,_mm_cvtepi32_ps(n0));
    eps2       = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a        = gmx_mm_extract_epi32(n0,0);
    n_b        = gmx_mm_extract_epi32(n0,1);

    /* Two real table rows plus zero rows for the unused lanes */
    Y          = _mm_load_ps(GBtab + 4*n_a);
    F          = _mm_load_ps(GBtab + 4*n_b);
    G          = _mm_setzero_ps();
    H          = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Y,F,G,H);

    G          = _mm_mul_ps(G,eps);                   /* Geps  */
    H          = _mm_mul_ps(H,eps2);                  /* Heps2 */
    F          = _mm_add_ps(_mm_add_ps(F,G),H);       /* Fp    */

    VV         = _mm_add_ps(Y,_mm_mul_ps(eps,F));
    FF         = _mm_add_ps(_mm_add_ps(F,G),_mm_add_ps(H,H));

    vgb        = _mm_mul_ps(qq,VV);
    *vgbtot    = _mm_sub_ps(*vgbtot,vgb); /* Yes, the sign is correct */

    ftmp       = _mm_mul_ps(_mm_mul_ps(qq,FF),gbtabscale);

    dvdatmp    = _mm_mul_ps(half,_mm_add_ps(vgb,_mm_mul_ps(ftmp,r)));

    *dvdasum   = _mm_add_ps(*dvdasum,dvdatmp);

    dvdatmp    = _mm_mul_ps(_mm_mul_ps(dvdatmp,isaj),isaj);

    /* Update 2 dvda[j] values: move lane 1 of dvdatmp to element 0 */
    Y          = _mm_load_ss(dvdaj1);
    F          = _mm_load_ss(dvdaj2);
    t2         = _mm_shuffle_ps(dvdatmp,dvdatmp,_MM_SHUFFLE(0,0,0,1));

    _mm_store_ss(dvdaj1,_mm_add_ss(Y,dvdatmp));
    _mm_store_ss(dvdaj2,_mm_add_ss(F,t2));

    return ftmp;
}
/*
 * Tabulated "genborn" interaction for a single j entry (SIMD lane 0).
 *
 * Lanes 1..3 have isaj = 0 and all-zero table rows, so they add nothing to
 * *vgbtot or *dvdasum and their returned force is zero.
 *
 * r           distance; multiplied by the scaled table spacing to get the
 *             table position.
 * isai        multiplied lane-wise with the loaded isaj value.
 * isaj1       points to one float, loaded into lane 0.
 * gbtabscale  table scale; multiplied by isai*isaj before use.
 * GBtab       table with four floats (Y,F,G,H) per point. Read with
 *             _mm_load_ps at GBtab+4*n, so it must be 16-byte aligned.
 * qq          multiplied by isai*isaj before use.
 * dvdasum     in/out: 0.5*(vgb + ftmp*r) is added lane-wise.
 * dvdaj1      in/out: gets lane 0 of that term times isaj^2 added.
 * vgbtot      in/out: vgb is SUBTRACTED lane-wise.
 *
 * Return force should be multiplied by +rinv to get fscal
 *
 * NOTE(review): this block was rebuilt from a garbled listing. The isaj1,
 * dvdaj1 and vgbtot parameters (placed as in the 2/3/4-entry variants), the
 * n0 declaration and the return of ftmp were inferred from how the names
 * are used - confirm against the callers.
 */
static inline __m128
gmx_mm_int_1_genborn_ps(__m128 r, __m128 isai,
                        float * isaj1,
                        __m128 gbtabscale, float * GBtab, __m128 qq, __m128 *dvdasum,
                        float *dvdaj1,
                        __m128 *vgbtot)
{
    /* Use the set intrinsic instead of a brace list, which depends on how
     * the compiler happens to represent __m128.
     */
    const __m128 half = _mm_set1_ps(0.5f);

    __m128  rt,eps,eps2,Y,F,G,H,VV,FF,ftmp,isaprod,isaj,vgb,dvdatmp;
    __m128i n0;
    int     n_a;

    /* Only lane 0 carries data; the other lanes of isaj are zero */
    isaj       = _mm_load_ss(isaj1);

    isaprod    = _mm_mul_ps(isai,isaj);
    qq         = _mm_mul_ps(qq,isaprod);
    gbtabscale = _mm_mul_ps(isaprod,gbtabscale);

    /* Table position: integer part selects the point, eps is the fraction */
    rt         = _mm_mul_ps(r,gbtabscale);
    n0         = _mm_cvttps_epi32(rt);
    eps        = _mm_sub_ps(rt,_mm_cvtepi32_ps(n0));
    eps2       = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a        = gmx_mm_extract_epi32(n0,0);

    /* One real table row plus zero rows for the unused lanes */
    Y          = _mm_load_ps(GBtab + 4*n_a);
    F          = _mm_setzero_ps();
    G          = _mm_setzero_ps();
    H          = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Y,F,G,H);

    G          = _mm_mul_ps(G,eps);                   /* Geps  */
    H          = _mm_mul_ps(H,eps2);                  /* Heps2 */
    F          = _mm_add_ps(_mm_add_ps(F,G),H);       /* Fp    */

    VV         = _mm_add_ps(Y,_mm_mul_ps(eps,F));
    FF         = _mm_add_ps(_mm_add_ps(F,G),_mm_add_ps(H,H));

    vgb        = _mm_mul_ps(qq,VV);
    *vgbtot    = _mm_sub_ps(*vgbtot,vgb); /* Yes, the sign is correct */

    ftmp       = _mm_mul_ps(_mm_mul_ps(qq,FF),gbtabscale);

    dvdatmp    = _mm_mul_ps(half,_mm_add_ps(vgb,_mm_mul_ps(ftmp,r)));

    *dvdasum   = _mm_add_ps(*dvdasum,dvdatmp);

    dvdatmp    = _mm_mul_ps(_mm_mul_ps(dvdatmp,isaj),isaj);

    /* Update 1 dvda[j] value */
    Y          = _mm_load_ss(dvdaj1);

    _mm_store_ss(dvdaj1,_mm_add_ss(Y,dvdatmp));

    return ftmp;
}
/*
 * Sum each of the three force accumulators over its four lanes and add the
 * resulting x,y,z to fptr[0..2] and to fshiftptr[0..2].
 *
 * NOTE(review): this block was rebuilt from a garbled listing. The return
 * type, the order of fptr/fshiftptr, the name of the SSE3 guard macro and
 * the initial value of t1 were not visible and are assumptions - confirm
 * against the callers and the include section of this file.
 */
static inline void
gmx_mm_update_iforce_1atom_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                              float *fptr,
                              float *fshiftptr)
{
    __m128 t2,t3;

#ifdef GMX_SSE3
    fix1 = _mm_hadd_ps(fix1,fix1);
    fiy1 = _mm_hadd_ps(fiy1,fiz1);

    fix1 = _mm_hadd_ps(fix1,fiy1); /* fiz1 fiy1 fix1 fix1 */
#else
    /* SSE2: transpose data, then add the four rows.
     * t1 only pads the transpose; its sum lands in element 1, which is
     * never stored. Seed it with fix1 to mirror the SSE3 layout.
     */
    __m128 t1 = fix1;

    _MM_TRANSPOSE4_PS(fix1,t1,fiy1,fiz1);
    fix1 = _mm_add_ps(_mm_add_ps(fix1,t1), _mm_add_ps(fiy1,fiz1));
#endif

    /* Load x into element 0 and y,z into elements 2,3 to match fix1 */
    t2 = _mm_load_ss(fptr);
    t2 = _mm_loadh_pi(t2,(__m64 *)(fptr+1));
    t3 = _mm_load_ss(fshiftptr);
    t3 = _mm_loadh_pi(t3,(__m64 *)(fshiftptr+1));

    t2 = _mm_add_ps(t2,fix1);
    t3 = _mm_add_ps(t3,fix1);

    _mm_store_ss(fptr,t2);
    _mm_storeh_pi((__m64 *)(fptr+1),t2);
    _mm_store_ss(fshiftptr,t3);
    _mm_storeh_pi((__m64 *)(fshiftptr+1),t3);
}
/*
 * Sum each of the six force accumulators (x,y,z for two i atoms) over its
 * four lanes. The six sums are added to fptr[0..5] in the order
 * x1 y1 z1 x2 y2 z2, and the per-component totals over both atoms are added
 * to fshiftptr[0..2].
 *
 * NOTE(review): this block was rebuilt from a garbled listing. The return
 * type, the order of fptr/fshiftptr and the name of the SSE3 guard macro
 * were not visible and are assumptions - confirm against the callers and
 * the include section of this file.
 */
static inline void
gmx_mm_update_iforce_2atoms_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                               __m128 fix2, __m128 fiy2, __m128 fiz2,
                               float *fptr,
                               float *fshiftptr)
{
    __m128 t1,t2,t4;

#ifdef GMX_SSE3
    fix1 = _mm_hadd_ps(fix1,fiy1);
    fiz1 = _mm_hadd_ps(fiz1,fix2);
    fiy2 = _mm_hadd_ps(fiy2,fiz2);

    fix1 = _mm_hadd_ps(fix1,fiz1); /* fix2 fiz1 fiy1 fix1 */
    fiy2 = _mm_hadd_ps(fiy2,fiy2); /*  -    -   fiz2 fiy2 */
#else
    /* SSE2 */
    /* transpose data */
    _MM_TRANSPOSE4_PS(fix1,fiy1,fiz1,fix2);
    t1   = _mm_unpacklo_ps(fiy2,fiz2);
    t2   = _mm_unpackhi_ps(fiy2,fiz2);

    fix1 = _mm_add_ps(_mm_add_ps(fix1,fiy1), _mm_add_ps(fiz1,fix2));
    t1   = _mm_add_ps(t1,t2);
    t2   = _mm_movehl_ps(t2,t1);
    fiy2 = _mm_add_ps(t1,t2);
#endif

    _mm_storeu_ps(fptr, _mm_add_ps(fix1,_mm_loadu_ps(fptr)));
    /* Load into a zeroed register: only the low half is stored back, but
     * on the SSE3 path t1 would otherwise be read before it is ever set.
     */
    t1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(fptr+4));
    _mm_storel_pi((__m64 *)(fptr+4), _mm_add_ps(fiy2,t1));

    /* Shift force is kept as (y x - z): z in element 0, x,y in 2,3 */
    t4 = _mm_load_ss(fshiftptr+2);
    t4 = _mm_loadh_pi(t4,(__m64 *)(fshiftptr));

    t1 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,2)); /* fiy2  -   fix2 fiz1 */
    t1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(3,1,0,0));     /* fiy2 fix2  -   fiz1 */
    t2 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(1,0,0,1)); /* fiy1 fix1  -   fiz2 */

    t1 = _mm_add_ps(t1,t2);
    t1 = _mm_add_ps(t1,t4); /* y x - z */

    _mm_store_ss(fshiftptr+2,t1);
    _mm_storeh_pi((__m64 *)(fshiftptr),t1);
}
/*
 * Sum each of the nine force accumulators (x,y,z for three i atoms) over
 * its four lanes. The nine sums are added to fptr[0..8] in the order
 * x1 y1 z1 x2 y2 z2 x3 y3 z3, and the per-component totals over all three
 * atoms are added to fshiftptr[0..2].
 *
 * NOTE(review): this block was rebuilt from a garbled listing. The return
 * type, the order of fptr/fshiftptr, the local declarations and the name of
 * the SSE3 guard macro were not visible and are assumptions - confirm
 * against the callers and the include section of this file.
 */
static inline void
gmx_mm_update_iforce_3atoms_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                               __m128 fix2, __m128 fiy2, __m128 fiz2,
                               __m128 fix3, __m128 fiy3, __m128 fiz3,
                               float *fptr,
                               float *fshiftptr)
{
    __m128 t1,t2,t3,t4;

#ifdef GMX_SSE3
    fix1 = _mm_hadd_ps(fix1,fiy1);
    fiz1 = _mm_hadd_ps(fiz1,fix2);
    fiy2 = _mm_hadd_ps(fiy2,fiz2);
    fix3 = _mm_hadd_ps(fix3,fiy3);
    fiz3 = _mm_hadd_ps(fiz3,fiz3);

    fix1 = _mm_hadd_ps(fix1,fiz1); /* fix2 fiz1 fiy1 fix1 */
    fiy2 = _mm_hadd_ps(fiy2,fix3); /* fiy3 fix3 fiz2 fiy2 */
    fiz3 = _mm_hadd_ps(fiz3,fiz3); /*  -    -    -   fiz3 */
#else
    /* SSE2 */
    /* transpose data */
    _MM_TRANSPOSE4_PS(fix1,fiy1,fiz1,fix2);
    _MM_TRANSPOSE4_PS(fiy2,fiz2,fix3,fiy3);
    /* Bring lanes 1,2,3 of fiz3 down to element 0 for a scalar sum */
    t2   = _mm_movehl_ps(_mm_setzero_ps(),fiz3);
    t1   = _mm_shuffle_ps(fiz3,fiz3,_MM_SHUFFLE(0,0,0,1));
    t3   = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(0,0,0,1));

    fix1 = _mm_add_ps(_mm_add_ps(fix1,fiy1), _mm_add_ps(fiz1,fix2));
    fiy2 = _mm_add_ps(_mm_add_ps(fiy2,fiz2), _mm_add_ps(fix3,fiy3));
    fiz3 = _mm_add_ss(_mm_add_ps(fiz3,t1)  , _mm_add_ps(t2,t3));
#endif

    _mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));
    _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));
    _mm_store_ss (fptr+8,_mm_add_ss(fiz3,_mm_load_ss(fptr+8) ));

    /* Shift force is kept as (y x - z): z in element 0, x,y in 2,3 */
    t4 = _mm_load_ss(fshiftptr+2);
    t4 = _mm_loadh_pi(t4,(__m64 *)(fshiftptr));

    t1 = _mm_shuffle_ps(fiz3,fix1,_MM_SHUFFLE(1,0,0,0)); /* fiy1 fix1  -   fiz3 */
    t2 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(3,2,2,2)); /* fiy3 fix3  -   fiz1 */
    t3 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(3,3,0,1)); /* fix2 fix2 fiy2 fiz2 */
    t3 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(1,2,0,0));     /* fiy2 fix2  -   fiz2 */

    t1 = _mm_add_ps(t1,t2);
    t3 = _mm_add_ps(t3,t4);
    t1 = _mm_add_ps(t1,t3); /* y x - z */

    _mm_store_ss(fshiftptr+2,t1);
    _mm_storeh_pi((__m64 *)(fshiftptr),t1);
}
/*
 * Sum each of the twelve force accumulators (x,y,z for four i atoms) over
 * its four lanes. The twelve sums are added to fptr[0..11] in the order
 * x1 y1 z1 x2 y2 z2 x3 y3 z3 x4 y4 z4, and the per-component totals over
 * all four atoms are added to fshiftptr[0..2].
 *
 * NOTE(review): this block was rebuilt from a garbled listing. The return
 * type, the order of fptr/fshiftptr and the name of the SSE3 guard macro
 * were not visible and are assumptions - confirm against the callers and
 * the include section of this file.
 */
static inline void
gmx_mm_update_iforce_4atoms_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                               __m128 fix2, __m128 fiy2, __m128 fiz2,
                               __m128 fix3, __m128 fiy3, __m128 fiz3,
                               __m128 fix4, __m128 fiy4, __m128 fiz4,
                               float *fptr,
                               float *fshiftptr)
{
    __m128 t1,t2,t3,t4,t5;

#ifdef GMX_SSE3
    fix1 = _mm_hadd_ps(fix1,fiy1);
    fiz1 = _mm_hadd_ps(fiz1,fix2);
    fiy2 = _mm_hadd_ps(fiy2,fiz2);
    fix3 = _mm_hadd_ps(fix3,fiy3);
    fiz3 = _mm_hadd_ps(fiz3,fix4);
    fiy4 = _mm_hadd_ps(fiy4,fiz4);

    fix1 = _mm_hadd_ps(fix1,fiz1); /* fix2 fiz1 fiy1 fix1 */
    fiy2 = _mm_hadd_ps(fiy2,fix3); /* fiy3 fix3 fiz2 fiy2 */
    fiz3 = _mm_hadd_ps(fiz3,fiy4); /* fiz4 fiy4 fix4 fiz3 */
#else
    /* SSE2 */
    /* transpose data */
    _MM_TRANSPOSE4_PS(fix1,fiy1,fiz1,fix2);
    _MM_TRANSPOSE4_PS(fiy2,fiz2,fix3,fiy3);
    _MM_TRANSPOSE4_PS(fiz3,fix4,fiy4,fiz4);

    fix1 = _mm_add_ps(_mm_add_ps(fix1,fiy1), _mm_add_ps(fiz1,fix2));
    fiy2 = _mm_add_ps(_mm_add_ps(fiy2,fiz2), _mm_add_ps(fix3,fiy3));
    fiz3 = _mm_add_ps(_mm_add_ps(fiz3,fix4), _mm_add_ps(fiy4,fiz4));
#endif

    _mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));
    _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));
    _mm_storeu_ps(fptr+8,_mm_add_ps(fiz3,_mm_loadu_ps(fptr+8)));

    /* Shift force is kept as (y x - z): z in element 0, x,y in 2,3 */
    t5 = _mm_load_ss(fshiftptr+2);
    t5 = _mm_loadh_pi(t5,(__m64 *)(fshiftptr));

    t1 = _mm_shuffle_ps(fix1,fix1,_MM_SHUFFLE(1,0,2,2)); /* fiy1 fix1  -   fiz1 */
    t2 = _mm_shuffle_ps(fiy2,fiy2,_MM_SHUFFLE(3,2,1,1)); /* fiy3 fix3  -   fiz2 */
    t3 = _mm_shuffle_ps(fiz3,fiz3,_MM_SHUFFLE(2,1,0,0)); /* fiy4 fix4  -   fiz3 */
    t4 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,3)); /* fiy2 fiy2 fix2 fix2 */
    t4 = _mm_shuffle_ps(fiz3,t4  ,_MM_SHUFFLE(2,0,3,3)); /* fiy2 fix2  -   fiz4 */

    t1 = _mm_add_ps(t1,t2);
    t3 = _mm_add_ps(t3,t4);
    t1 = _mm_add_ps(t1,t3); /* y x - z */
    t5 = _mm_add_ps(t5,t1);

    _mm_store_ss(fshiftptr+2,t5);
    _mm_storeh_pi((__m64 *)(fshiftptr),t5);
}
/*
 * Add the sum of the four lanes of pot1 to *ptr1.
 *
 * NOTE(review): this block was rebuilt from a garbled listing. The return
 * type and the name of the SSE3 guard macro were not visible and are
 * assumptions - confirm against the include section of this file.
 */
static inline void
gmx_mm_update_1pot_ps(__m128 pot1, float *ptr1)
{
#ifdef GMX_SSE3
    pot1 = _mm_hadd_ps(pot1,pot1);
    pot1 = _mm_hadd_ps(pot1,pot1);
#else
    /* SSE2: fold the upper half onto the lower, then element 1 onto 0 */
    pot1 = _mm_add_ps(pot1,_mm_movehl_ps(pot1,pot1));
    pot1 = _mm_add_ps(pot1,_mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(0,0,0,1)));
#endif
    _mm_store_ss(ptr1,_mm_add_ss(pot1,_mm_load_ss(ptr1)));
}
/*
 * Add the sum of the four lanes of pot1 to *ptr1 and the sum of the four
 * lanes of pot2 to *ptr2.
 *
 * NOTE(review): this block was rebuilt from a garbled listing. The return
 * type, the declaration of t1/t2 and the name of the SSE3 guard macro were
 * not visible and are assumptions - confirm against the include section of
 * this file.
 */
static inline void
gmx_mm_update_2pot_ps(__m128 pot1, float *ptr1, __m128 pot2, float *ptr2)
{
#ifdef GMX_SSE3
    pot1 = _mm_hadd_ps(pot1,pot2);
    pot1 = _mm_hadd_ps(pot1,pot1);
    pot2 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(0,0,0,1));
#else
    /* SSE2 */
    __m128 t1,t2;

    t1   = _mm_movehl_ps(pot2,pot1); /* 2d 2c 1d 1c */
    t2   = _mm_movelh_ps(pot1,pot2); /* 2b 2a 1b 1a */
    t1   = _mm_add_ps(t1,t2);        /* 2  2  1  1  */
    t2   = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(3,3,1,1));
    pot1 = _mm_add_ps(t1,t2);        /* -  2  -  1  */
    pot2 = _mm_movehl_ps(t2,pot1);   /* -  -  -  2  */
#endif

    _mm_store_ss(ptr1,_mm_add_ss(pot1,_mm_load_ss(ptr1)));
    _mm_store_ss(ptr2,_mm_add_ss(pot2,_mm_load_ss(ptr2)));
}
/*
 * Add the sum of the four lanes of potN to *ptrN, for N = 1..4.
 *
 * NOTE(review): this block was rebuilt from a garbled listing. The return
 * type was not visible and is assumed to be void, since no result is
 * computed other than the four stores.
 */
static inline void
gmx_mm_update_4pot_ps(__m128 pot1, float *ptr1, __m128 pot2, float *ptr2,
                      __m128 pot3, float *ptr3, __m128 pot4, float *ptr4)
{
    /* After the transpose, adding the four rows gives one register whose
     * elements 0..3 are the lane sums of the original pot1..pot4.
     */
    _MM_TRANSPOSE4_PS(pot1,pot2,pot3,pot4);

    pot1 = _mm_add_ps(_mm_add_ps(pot1,pot2),_mm_add_ps(pot3,pot4));

    /* Broadcast elements 1..3 so each sum sits in element 0 for add_ss */
    pot2 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(1,1,1,1));
    pot3 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(2,2,2,2));
    pot4 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(3,3,3,3));

    _mm_store_ss(ptr1,_mm_add_ss(pot1,_mm_load_ss(ptr1)));
    _mm_store_ss(ptr2,_mm_add_ss(pot2,_mm_load_ss(ptr2)));
    _mm_store_ss(ptr3,_mm_add_ss(pot3,_mm_load_ss(ptr3)));
    _mm_store_ss(ptr4,_mm_add_ss(pot4,_mm_load_ss(ptr4)));
}