/*
 * This source code is part of
 *
 *                G R O M A C S
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2009, The GROMACS Development Team
 *
 * Gromacs is a library for molecular simulation and trajectory analysis,
 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 * a full list of developers and information, check out http://www.gromacs.org
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option) any
 * later version.
 *
 * As a special exception, you may use this file as part of a free software
 * library without restriction. Specifically, if other files instantiate
 * templates or use macros or inline functions from this file, or you compile
 * this file and link it with other files to produce an executable, this
 * file does not by itself cause the resulting executable to be covered by
 * the GNU Lesser General Public License.
 *
 * In plain-speak: do not worry about classes/macros/templates either - only
 * changes to the library have to be LGPL, not an application linking with it.
 *
 * To help fund GROMACS development, we humbly ask that you cite
 * the papers people have written on it - you can find them on the website!
 */
/* We require SSE2 now! */

#include <math.h>
#include <stdio.h>

#include <xmmintrin.h> /* SSE */
#include <emmintrin.h> /* SSE2 */

#  include <pmmintrin.h> /* SSE3 */

#  include <smmintrin.h> /* SSE4.1 */
/***************************************************
 *                                                 *
 *             COMPILER RANT WARNING:              *
 *                                                 *
 * Ideally, this header would be filled with       *
 * simple static inline functions. Unfortunately,  *
 * many vendors provide really braindead compilers *
 * that either cannot handle more than 1-2 SSE     *
 * function parameters, or cannot handle pointers  *
 * to SSE __m128 datatypes as parameters at all.   *
 * Thus, for portability we have had to implement  *
 * all but the simplest routines as macros.        *
 *                                                 *
 ***************************************************/
/***************************************************
 *                                                 *
 *   Wrappers/replacements for some instructions   *
 *       not available in all SSE versions.        *
 *                                                 *
 ***************************************************/

/* SSE4.1 provides _mm_extract_epi32(); on plain SSE2 it is emulated with a shift. */
#  define gmx_mm_extract_epi32(x, imm) _mm_extract_epi32(x,imm)

#  define gmx_mm_extract_epi32(x, imm) _mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm)))
/* Some compilers require a cast to change the interpretation
 * of a register from FP to Int and vice versa, and not all of
 * them provide instructions to do this. Roll our own wrappers...
 */
#if (defined (_MSC_VER) || defined(__INTEL_COMPILER))
#  define gmx_mm_castsi128_ps(a) _mm_castsi128_ps(a)
#  define gmx_mm_castps_si128(a) _mm_castps_si128(a)
#  define gmx_mm_castps_ps128(a) (a)
#elif defined(__GNUC__)
#  define gmx_mm_castsi128_ps(a) ((__m128)(a))
#  define gmx_mm_castps_si128(a) ((__m128i)(a))
#  define gmx_mm_castps_ps128(a) ((__m128)(a))
#else
/* Fallback: reinterpret the register bits through a pointer cast */
static __m128  gmx_mm_castsi128_ps(__m128i a) { return *(__m128 *)  &a; }
static __m128i gmx_mm_castps_si128(__m128 a)  { return *(__m128i *) &a; }
static __m128  gmx_mm_castps_ps128(__m128 a)  { return *(__m128 *)  &a; }
#endif
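
/* Illustrative sketch (not part of the original header): the cast wrappers only
 * reinterpret bits, so an integer constant can be turned into a packed-float mask.
 * The helper name below is hypothetical.
 */
static __m128
gmx_mm_example_signbit_mask(void)
{
    /* 0x80000000 in every element, viewed as packed floats;
     * usable with _mm_xor_ps to flip the sign of all four lanes.
     */
    return gmx_mm_castsi128_ps(_mm_set1_epi32(0x80000000));
}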
/* IO functions, just for debugging */

static void
printxmm(const char *s, __m128 xmm)
{
    float f[4];

    _mm_storeu_ps(f,xmm);
    printf("%s: %8.5g %8.5g %8.5g %8.5g\n",s,f[0],f[1],f[2],f[3]);
}


static void
printxmmsum(const char *s, __m128 xmm)
{
    float f[4];

    _mm_storeu_ps(f,xmm);
    printf("%s (sum): %15.10g\n",s,f[0]+f[1]+f[2]+f[3]);
}


static void
printxmmi(const char *s, __m128i xmmi)
{
    int i[4];

    _mm_storeu_si128((__m128i *)i,xmmi);
    printf("%10s: %2d %2d %2d %2d\n",s,i[0],i[1],i[2],i[3]);
}
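
/* Illustrative usage sketch (assumption): dumping intermediate registers while
 * debugging a kernel; 'rinv' and 'vctot' are hypothetical variable names.
 *
 *     printxmm("rinv",rinv);
 *     printxmmsum("vctot",vctot);
 */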
/************************
 *                      *
 * Simple math routines *
 *                      *
 ************************/

static inline __m128
gmx_mm_invsqrt_ps(__m128 x)
{
    const __m128 half  = {0.5,0.5,0.5,0.5};
    const __m128 three = {3.0,3.0,3.0,3.0};

    __m128 lu = _mm_rsqrt_ps(x);

    /* One Newton-Raphson iteration on the rsqrt estimate: lu' = 0.5*lu*(3 - x*lu*lu) */
    return _mm_mul_ps(half,_mm_mul_ps(_mm_sub_ps(three,_mm_mul_ps(_mm_mul_ps(lu,lu),x)),lu));
}

static inline __m128
gmx_mm_inv_ps(__m128 x)
{
    const __m128 two = {2.0f,2.0f,2.0f,2.0f};

    __m128 lu = _mm_rcp_ps(x);

    /* One Newton-Raphson iteration on the rcp estimate: lu' = lu*(2 - x*lu) */
    return _mm_mul_ps(lu,_mm_sub_ps(two,_mm_mul_ps(lu,x)));
}

static inline __m128
gmx_mm_calc_rsq_ps(__m128 dx, __m128 dy, __m128 dz)
{
    return _mm_add_ps( _mm_add_ps( _mm_mul_ps(dx,dx), _mm_mul_ps(dy,dy) ), _mm_mul_ps(dz,dz) );
}
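
/* Illustrative sketch (assumption, not part of the original header): the usual
 * kernel pattern combines the two routines above to get 1/r for four particle
 * pairs at once. The helper name is hypothetical.
 */
static inline __m128
gmx_mm_example_rinv_ps(__m128 dx, __m128 dy, __m128 dz)
{
    __m128 rsq = gmx_mm_calc_rsq_ps(dx,dy,dz); /* dx*dx + dy*dy + dz*dz */
    return gmx_mm_invsqrt_ps(rsq);             /* 1/sqrt(r^2) = 1/r     */
}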

/* Normal sum of four xmm registers */
static inline __m128
gmx_mm_sum4_ps(__m128 t0, __m128 t1, __m128 t2, __m128 t3)
{
    t0 = _mm_add_ps(t0,t1);
    t2 = _mm_add_ps(t2,t3);
    return _mm_add_ps(t0,t2);
}

static inline __m128
gmx_mm_log_ps(__m128 x)
{
    const __m128 exp_ps  = gmx_mm_castsi128_ps( _mm_set_epi32(0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000) );
    const __m128 one_ps  = gmx_mm_castsi128_ps( _mm_set_epi32(0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000) );
    const __m128 off_ps  = gmx_mm_castsi128_ps( _mm_set_epi32(0x3FBF8000, 0x3FBF8000, 0x3FBF8000, 0x3FBF8000) );
    const __m128 mant_ps = gmx_mm_castsi128_ps( _mm_set_epi32(0x007FFFFF, 0x007FFFFF, 0x007FFFFF, 0x007FFFFF) );
    const __m128 base_ps = gmx_mm_castsi128_ps( _mm_set_epi32(0x43800000, 0x43800000, 0x43800000, 0x43800000) );
    const __m128 loge_ps = gmx_mm_castsi128_ps( _mm_set_epi32(0x3F317218, 0x3F317218, 0x3F317218, 0x3F317218) );

    const __m128 D5 = gmx_mm_castsi128_ps( _mm_set_epi32(0xBD0D0CC5, 0xBD0D0CC5, 0xBD0D0CC5, 0xBD0D0CC5) );
    const __m128 D4 = gmx_mm_castsi128_ps( _mm_set_epi32(0x3EA2ECDD, 0x3EA2ECDD, 0x3EA2ECDD, 0x3EA2ECDD) );
    const __m128 D3 = gmx_mm_castsi128_ps( _mm_set_epi32(0xBF9DA2C9, 0xBF9DA2C9, 0xBF9DA2C9, 0xBF9DA2C9) );
    const __m128 D2 = gmx_mm_castsi128_ps( _mm_set_epi32(0x4026537B, 0x4026537B, 0x4026537B, 0x4026537B) );
    const __m128 D1 = gmx_mm_castsi128_ps( _mm_set_epi32(0xC054BFAD, 0xC054BFAD, 0xC054BFAD, 0xC054BFAD) );
    const __m128 D0 = gmx_mm_castsi128_ps( _mm_set_epi32(0x4047691A, 0x4047691A, 0x4047691A, 0x4047691A) );

    __m128 xmm0,xmm1,xmm2;

    xmm0 = x;
    xmm1 = xmm0;
    xmm1 = _mm_and_ps(xmm1, exp_ps);
    xmm1 = gmx_mm_castsi128_ps( _mm_srli_epi32( gmx_mm_castps_si128(xmm1),8) );

    xmm1 = _mm_or_ps(xmm1, one_ps);
    xmm1 = _mm_sub_ps(xmm1, off_ps);

    xmm1 = _mm_mul_ps(xmm1, base_ps);
    xmm0 = _mm_and_ps(xmm0, mant_ps);
    xmm0 = _mm_or_ps(xmm0, one_ps);

    /* Polynomial approximation on the mantissa */
    xmm2 = _mm_mul_ps(xmm0, D5);
    xmm2 = _mm_add_ps(xmm2, D4);
    xmm2 = _mm_mul_ps(xmm2,xmm0);
    xmm2 = _mm_add_ps(xmm2, D3);
    xmm2 = _mm_mul_ps(xmm2,xmm0);
    xmm2 = _mm_add_ps(xmm2, D2);
    xmm2 = _mm_mul_ps(xmm2,xmm0);
    xmm2 = _mm_add_ps(xmm2, D1);
    xmm2 = _mm_mul_ps(xmm2,xmm0);
    xmm2 = _mm_add_ps(xmm2, D0);
    xmm0 = _mm_sub_ps(xmm0, one_ps);
    xmm0 = _mm_mul_ps(xmm0,xmm2);
    xmm1 = _mm_add_ps(xmm1,xmm0);

    x = xmm1;
    x = _mm_mul_ps(x, loge_ps);

    return x;
}

/* This exp() routine has a relative precision of about 2^-22.33
 * (essentially single precision :-) ).
 */
static inline __m128
gmx_mm_exp_ps(__m128 x)
{
    const __m128i half = _mm_set_epi32(0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000);   // 0.5e+0f
    const __m128i base = _mm_set_epi32(0x0000007F, 0x0000007F, 0x0000007F, 0x0000007F);   // 127
    const __m128i CC   = _mm_set_epi32(0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B);   // log2(e)

    const __m128i D5 = _mm_set_epi32(0x3AF61905, 0x3AF61905, 0x3AF61905, 0x3AF61905);   // 1.8775767e-3f
    const __m128i D4 = _mm_set_epi32(0x3C134806, 0x3C134806, 0x3C134806, 0x3C134806);   // 8.9893397e-3f
    const __m128i D3 = _mm_set_epi32(0x3D64AA23, 0x3D64AA23, 0x3D64AA23, 0x3D64AA23);   // 5.5826318e-2f
    const __m128i D2 = _mm_set_epi32(0x3E75EAD4, 0x3E75EAD4, 0x3E75EAD4, 0x3E75EAD4);   // 2.4015361e-1f
    const __m128i D1 = _mm_set_epi32(0x3F31727B, 0x3F31727B, 0x3F31727B, 0x3F31727B);   // 6.9315308e-1f
    const __m128i D0 = _mm_set_epi32(0x3F7FFFFF, 0x3F7FFFFF, 0x3F7FFFFF, 0x3F7FFFFF);   // 9.9999994e-1f

    __m128  xmm0,xmm1;
    __m128i xmm2;

    xmm0 = _mm_mul_ps(x,gmx_mm_castsi128_ps(CC));
    xmm1 = _mm_sub_ps(xmm0, gmx_mm_castsi128_ps(half));
    xmm2 = _mm_cvtps_epi32(xmm1);
    xmm1 = _mm_cvtepi32_ps(xmm2);

    xmm2 = _mm_add_epi32(xmm2,base);
    xmm2 = _mm_slli_epi32(xmm2,23);

    xmm0 = _mm_sub_ps(xmm0,xmm1);
    xmm1 = _mm_mul_ps(xmm0,gmx_mm_castsi128_ps(D5));
    xmm1 = _mm_add_ps(xmm1,gmx_mm_castsi128_ps(D4));
    xmm1 = _mm_mul_ps(xmm1,xmm0);
    xmm1 = _mm_add_ps(xmm1,gmx_mm_castsi128_ps(D3));
    xmm1 = _mm_mul_ps(xmm1,xmm0);
    xmm1 = _mm_add_ps(xmm1,gmx_mm_castsi128_ps(D2));
    xmm1 = _mm_mul_ps(xmm1,xmm0);
    xmm1 = _mm_add_ps(xmm1,gmx_mm_castsi128_ps(D1));
    xmm1 = _mm_mul_ps(xmm1,xmm0);
    xmm1 = _mm_add_ps(xmm1,gmx_mm_castsi128_ps(D0));
    xmm1 = _mm_mul_ps(xmm1,gmx_mm_castsi128_ps(xmm2));

    /* 18 instructions currently */
    return xmm1;
}
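
/* Illustrative sketch (assumption): four Boltzmann-style factors exp(-beta*E)
 * evaluated at once with the routine above; 'beta' and 'energy' are
 * hypothetical example inputs and the helper name is made up.
 */
static inline __m128
gmx_mm_example_boltzmann_ps(__m128 beta, __m128 energy)
{
    __m128 arg = _mm_mul_ps(beta,energy);          /* beta*E       */
    arg        = _mm_sub_ps(_mm_setzero_ps(),arg); /* -beta*E      */
    return gmx_mm_exp_ps(arg);                     /* exp(-beta*E) */
}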

#define GMX_MM_SINCOS_PS(x,sinval,cosval) \
{ \
    const __m128 _sincosf_two_over_pi = {2.0/M_PI,2.0/M_PI,2.0/M_PI,2.0/M_PI}; \
    const __m128 _sincosf_half        = {0.5,0.5,0.5,0.5}; \
    const __m128 _sincosf_one         = {1.0,1.0,1.0,1.0}; \
    const __m128i _sincosf_izero      = _mm_set1_epi32(0); \
    const __m128i _sincosf_ione       = _mm_set1_epi32(1); \
    const __m128i _sincosf_itwo       = _mm_set1_epi32(2); \
    const __m128i _sincosf_ithree     = _mm_set1_epi32(3); \
    const __m128 _sincosf_kc1 = {1.57079625129,1.57079625129,1.57079625129,1.57079625129}; \
    const __m128 _sincosf_kc2 = {7.54978995489e-8,7.54978995489e-8,7.54978995489e-8,7.54978995489e-8}; \
    const __m128 _sincosf_cc0 = {-0.0013602249,-0.0013602249,-0.0013602249,-0.0013602249}; \
    const __m128 _sincosf_cc1 = {0.0416566950,0.0416566950,0.0416566950,0.0416566950}; \
    const __m128 _sincosf_cc2 = {-0.4999990225,-0.4999990225,-0.4999990225,-0.4999990225}; \
    const __m128 _sincosf_sc0 = {-0.0001950727,-0.0001950727,-0.0001950727,-0.0001950727}; \
    const __m128 _sincosf_sc1 = {0.0083320758,0.0083320758,0.0083320758,0.0083320758}; \
    const __m128 _sincosf_sc2 = {-0.1666665247,-0.1666665247,-0.1666665247,-0.1666665247}; \
    __m128 _sincosf_signbit = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) ); \
    __m128 _sincosf_tiny    = gmx_mm_castsi128_ps( _mm_set1_epi32(0x3e400000) ); \
    __m128 _sincosf_xl; \
    __m128 _sincosf_xl2; \
    __m128 _sincosf_xl3; \
    __m128 _sincosf_qf; \
    __m128 _sincosf_absxl; \
    __m128 _sincosf_p1; \
    __m128 _sincosf_cx; \
    __m128 _sincosf_sx; \
    __m128 _sincosf_ts; \
    __m128 _sincosf_tc; \
    __m128 _sincosf_tsn; \
    __m128 _sincosf_tcn; \
    __m128i _sincosf_q; \
    __m128i _sincosf_offsetSin; \
    __m128i _sincosf_offsetCos; \
    __m128 _sincosf_sinMask; \
    __m128 _sincosf_cosMask; \
    __m128 _sincosf_isTiny; \
    __m128 _sincosf_ct0; \
    __m128 _sincosf_ct1; \
    __m128 _sincosf_ct2; \
    __m128 _sincosf_st1; \
    __m128 _sincosf_st2; \
    _sincosf_xl = _mm_mul_ps(x,_sincosf_two_over_pi); \
    _sincosf_xl = _mm_add_ps(_sincosf_xl,_mm_or_ps(_mm_and_ps(_sincosf_xl,_sincosf_signbit),_sincosf_half)); \
    _sincosf_q  = _mm_cvttps_epi32(_sincosf_xl); \
    _sincosf_qf = _mm_cvtepi32_ps(_sincosf_q); \
    _sincosf_offsetSin = _mm_and_si128(_sincosf_q,_sincosf_ithree); \
    _sincosf_offsetCos = _mm_add_epi32(_sincosf_offsetSin,_sincosf_ione); \
    _sincosf_p1 = _mm_mul_ps(_sincosf_qf,_sincosf_kc1); \
    _sincosf_xl = _mm_mul_ps(_sincosf_qf,_sincosf_kc2); \
    _sincosf_p1 = _mm_sub_ps(x,_sincosf_p1); \
    _sincosf_xl = _mm_sub_ps(_sincosf_p1,_sincosf_xl); \
    _sincosf_absxl  = _mm_andnot_ps(_sincosf_signbit,_sincosf_xl); \
    _sincosf_isTiny = _mm_cmpgt_ps(_sincosf_tiny,_sincosf_absxl); \
    _sincosf_xl2 = _mm_mul_ps(_sincosf_xl,_sincosf_xl); \
    _sincosf_xl3 = _mm_mul_ps(_sincosf_xl2,_sincosf_xl); \
    _sincosf_ct1 = _mm_mul_ps(_sincosf_cc0,_sincosf_xl2); \
    _sincosf_ct1 = _mm_add_ps(_sincosf_ct1,_sincosf_cc1); \
    _sincosf_st1 = _mm_mul_ps(_sincosf_sc0,_sincosf_xl2); \
    _sincosf_st1 = _mm_add_ps(_sincosf_st1,_sincosf_sc1); \
    _sincosf_ct2 = _mm_mul_ps(_sincosf_ct1,_sincosf_xl2); \
    _sincosf_ct2 = _mm_add_ps(_sincosf_ct2,_sincosf_cc2); \
    _sincosf_st2 = _mm_mul_ps(_sincosf_st1,_sincosf_xl2); \
    _sincosf_st2 = _mm_add_ps(_sincosf_st2,_sincosf_sc2); \
    _sincosf_cx = _mm_mul_ps(_sincosf_ct2,_sincosf_xl2); \
    _sincosf_cx = _mm_add_ps(_sincosf_cx,_sincosf_one); \
    _sincosf_sx = _mm_mul_ps(_sincosf_st2,_sincosf_xl3); \
    _sincosf_sx = _mm_add_ps(_sincosf_sx,_sincosf_xl); \
    _sincosf_sinMask = gmx_mm_castsi128_ps( _mm_cmpeq_epi32( _mm_and_si128(_sincosf_offsetSin,_sincosf_ione), _sincosf_izero) ); \
    _sincosf_cosMask = gmx_mm_castsi128_ps( _mm_cmpeq_epi32( _mm_and_si128(_sincosf_offsetCos,_sincosf_ione), _sincosf_izero) ); \
    _sincosf_ts = _mm_or_ps( _mm_and_ps(_sincosf_sinMask,_sincosf_sx) , _mm_andnot_ps(_sincosf_sinMask,_sincosf_cx) ); \
    _sincosf_tc = _mm_or_ps( _mm_and_ps(_sincosf_cosMask,_sincosf_sx) , _mm_andnot_ps(_sincosf_cosMask,_sincosf_cx) ); \
    _sincosf_sinMask = gmx_mm_castsi128_ps( _mm_cmpeq_epi32( _mm_and_si128(_sincosf_offsetSin,_sincosf_itwo), _sincosf_izero) ); \
    _sincosf_tsn = _mm_xor_ps(_sincosf_signbit,_sincosf_ts); \
    _sincosf_ts  = _mm_or_ps( _mm_and_ps(_sincosf_sinMask,_sincosf_ts) , _mm_andnot_ps(_sincosf_sinMask,_sincosf_tsn) ); \
    _sincosf_cosMask = gmx_mm_castsi128_ps( _mm_cmpeq_epi32( _mm_and_si128(_sincosf_offsetCos,_sincosf_itwo), _sincosf_izero) ); \
    _sincosf_tcn = _mm_xor_ps(_sincosf_signbit,_sincosf_tc); \
    _sincosf_tc  = _mm_or_ps( _mm_and_ps(_sincosf_cosMask,_sincosf_tc) , _mm_andnot_ps(_sincosf_cosMask,_sincosf_tcn) ); \
    sinval = _sincosf_ts; \
    cosval = _sincosf_tc; \
}
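
/* Illustrative sketch (assumption): the macro writes both results into
 * caller-provided __m128 variables, so typical use for four angles looks like:
 *
 *     __m128 theta = _mm_set_ps(0.1f,0.2f,0.3f,0.4f);
 *     __m128 sintheta,costheta;
 *     GMX_MM_SINCOS_PS(theta,sintheta,costheta);
 */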

/* Load a single value from 1-4 places, merge into xmm register */

#define GMX_MM_LOAD_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,xmm1) \
{ \
    __m128 _txmm2,_txmm3,_txmm4; \
    xmm1   = _mm_load_ss(ptr1); \
    _txmm2 = _mm_load_ss(ptr2); \
    _txmm3 = _mm_load_ss(ptr3); \
    _txmm4 = _mm_load_ss(ptr4); \
    xmm1   = _mm_unpacklo_ps(xmm1,_txmm3); \
    _txmm2 = _mm_unpacklo_ps(_txmm2,_txmm4); \
    xmm1   = _mm_unpacklo_ps(xmm1,_txmm2); \
}


#define GMX_MM_LOAD_3VALUES_PS(ptr1,ptr2,ptr3,xmm1) \
{ \
    __m128 _txmm2,_txmm3; \
    xmm1   = _mm_load_ss(ptr1); \
    _txmm2 = _mm_load_ss(ptr2); \
    _txmm3 = _mm_load_ss(ptr3); \
    xmm1   = _mm_unpacklo_ps(xmm1,_txmm3); \
    xmm1   = _mm_unpacklo_ps(xmm1,_txmm2); \
}


#define GMX_MM_LOAD_2VALUES_PS(ptr1,ptr2,xmm1) \
{ \
    __m128 _txmm2; \
    xmm1   = _mm_load_ss(ptr1); \
    _txmm2 = _mm_load_ss(ptr2); \
    xmm1   = _mm_unpacklo_ps(xmm1,_txmm2); \
}


#define GMX_MM_LOAD_1VALUE_PS(ptr1,xmm1) \
{ \
    xmm1 = _mm_load_ss(ptr1); \
}

/* Store data in an xmm register into 1-4 different places */
#define GMX_MM_STORE_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,xmm1) \
{ \
    __m128 _txmm2,_txmm3,_txmm4; \
    _txmm3 = _mm_movehl_ps(_mm_setzero_ps(),xmm1); \
    _txmm2 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(1,1,1,1)); \
    _txmm4 = _mm_shuffle_ps(_txmm3,_txmm3,_MM_SHUFFLE(1,1,1,1)); \
    _mm_store_ss(ptr1,xmm1); \
    _mm_store_ss(ptr2,_txmm2); \
    _mm_store_ss(ptr3,_txmm3); \
    _mm_store_ss(ptr4,_txmm4); \
}


#define GMX_MM_STORE_3VALUES_PS(ptr1,ptr2,ptr3,xmm1) \
{ \
    __m128 _txmm2,_txmm3; \
    _txmm3 = _mm_movehl_ps(_mm_setzero_ps(),xmm1); \
    _txmm2 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(1,1,1,1)); \
    _mm_store_ss(ptr1,xmm1); \
    _mm_store_ss(ptr2,_txmm2); \
    _mm_store_ss(ptr3,_txmm3); \
}


#define GMX_MM_STORE_2VALUES_PS(ptr1,ptr2,xmm1) \
{ \
    __m128 _txmm2; \
    _txmm2 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(1,1,1,1)); \
    _mm_store_ss(ptr1,xmm1); \
    _mm_store_ss(ptr2,_txmm2); \
}


#define GMX_MM_STORE_1VALUE_PS(ptr1,xmm1) \
{ \
    _mm_store_ss(ptr1,xmm1); \
}

/* Similar to store, but increments the value in memory */
#define GMX_MM_INCREMENT_8VALUES_PS(ptr1,ptr2,ptr3,ptr4,ptr5,ptr6,ptr7,ptr8,xmm1,xmm2) \
{ \
    __m128 _tincr1,_tincr2; \
    GMX_MM_LOAD_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,_tincr1); \
    GMX_MM_LOAD_4VALUES_PS(ptr5,ptr6,ptr7,ptr8,_tincr2); \
    _tincr1 = _mm_add_ps(_tincr1,xmm1); \
    _tincr2 = _mm_add_ps(_tincr2,xmm2); \
    GMX_MM_STORE_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,_tincr1); \
    GMX_MM_STORE_4VALUES_PS(ptr5,ptr6,ptr7,ptr8,_tincr2); \
}

#define GMX_MM_INCREMENT_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,xmm1) \
{ \
    __m128 _tincr; \
    GMX_MM_LOAD_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,_tincr); \
    _tincr = _mm_add_ps(_tincr,xmm1); \
    GMX_MM_STORE_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,_tincr); \
}

#define GMX_MM_INCREMENT_3VALUES_PS(ptr1,ptr2,ptr3,xmm1) \
{ \
    __m128 _tincr; \
    GMX_MM_LOAD_3VALUES_PS(ptr1,ptr2,ptr3,_tincr); \
    _tincr = _mm_add_ps(_tincr,xmm1); \
    GMX_MM_STORE_3VALUES_PS(ptr1,ptr2,ptr3,_tincr); \
}

#define GMX_MM_INCREMENT_2VALUES_PS(ptr1,ptr2,xmm1) \
{ \
    __m128 _tincr; \
    GMX_MM_LOAD_2VALUES_PS(ptr1,ptr2,_tincr); \
    _tincr = _mm_add_ps(_tincr,xmm1); \
    GMX_MM_STORE_2VALUES_PS(ptr1,ptr2,_tincr); \
}

#define GMX_MM_INCREMENT_1VALUE_PS(ptr1,xmm1) \
{ \
    __m128 _tincr; \
    GMX_MM_LOAD_1VALUE_PS(ptr1,_tincr); \
    _tincr = _mm_add_ss(_tincr,xmm1); \
    GMX_MM_STORE_1VALUE_PS(ptr1,_tincr); \
}
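
/* Illustrative sketch (assumption): a register holding four per-interaction
 * energies can be accumulated into four scattered table entries in one call;
 * 'Vvdw', the group indices and 'vvdwtot' are hypothetical names.
 *
 *     GMX_MM_INCREMENT_4VALUES_PS(Vvdw+gid1,Vvdw+gid2,Vvdw+gid3,Vvdw+gid4,vvdwtot);
 */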

/* Routines to load pairs from 1-4 places, put in two separate xmm registers.
 * Useful to load LJ parameters!
 */
#define GMX_MM_LOAD_4PAIRS_PS(ptr1,ptr2,ptr3,ptr4,c6,c12) \
{ \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4; \
    _tmp1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1)); \
    _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr2)); \
    _tmp3 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr3)); \
    _tmp4 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr4)); \
    _tmp1 = _mm_unpacklo_ps(_tmp1,_tmp3); \
    _tmp2 = _mm_unpacklo_ps(_tmp2,_tmp4); \
    c6    = _mm_unpacklo_ps(_tmp1,_tmp2); \
    c12   = _mm_unpackhi_ps(_tmp1,_tmp2); \
}

#define GMX_MM_LOAD_3PAIRS_PS(ptr1,ptr2,ptr3,c6,c12) \
{ \
    __m128 _tmp1,_tmp2,_tmp3; \
    _tmp1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1)); \
    _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr2)); \
    _tmp3 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr3)); \
    _tmp1 = _mm_unpacklo_ps(_tmp1,_tmp3); \
    _tmp2 = _mm_unpacklo_ps(_tmp2,_mm_setzero_ps()); \
    c6    = _mm_unpacklo_ps(_tmp1,_tmp2); \
    c12   = _mm_unpackhi_ps(_tmp1,_tmp2); \
}


#define GMX_MM_LOAD_2PAIRS_PS(ptr1,ptr2,c6,c12) \
{ \
    __m128 _tmp1,_tmp2; \
    _tmp1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1)); \
    _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr2)); \
    c6    = _mm_unpacklo_ps(_tmp1,_tmp2); \
    c12   = _mm_movehl_ps(c12,c6); \
}

#define GMX_MM_LOAD_1PAIR_PS(ptr1,c6,c12) \
{ \
    c6  = _mm_load_ss(ptr1); \
    c12 = _mm_load_ss(ptr1+1); \
}
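
/* Illustrative sketch (assumption): 'nbfp' would point to packed (c6,c12)
 * parameter pairs; one call fills both Lennard-Jones coefficient registers
 * for four j-atoms. The pointer arithmetic shown is hypothetical.
 *
 *     GMX_MM_LOAD_4PAIRS_PS(nbfp+2*tj1,nbfp+2*tj2,nbfp+2*tj3,nbfp+2*tj4,c6,c12);
 */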

/* Routines to load 1-4 rvecs from 1-4 places.
 * We mainly use these to load coordinates. The extra routines
 * are very efficient for the water-water loops, since we e.g.
 * know that a TIP4P water has 4 atoms, so we should load 12 floats + shuffle.
 */
#define GMX_MM_LOAD_1RVEC_1POINTER_PS(ptr1,jx1,jy1,jz1) { \
    jx1 = _mm_load_ss(ptr1); \
    jy1 = _mm_load_ss((ptr1)+1); \
    jz1 = _mm_load_ss((ptr1)+2); \
}

#define GMX_MM_LOAD_2RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2) { \
    jx1 = _mm_load_ss(ptr1); \
    jy1 = _mm_load_ss((ptr1)+1); \
    jz1 = _mm_load_ss((ptr1)+2); \
    jx2 = _mm_load_ss((ptr1)+3); \
    jy2 = _mm_load_ss((ptr1)+4); \
    jz2 = _mm_load_ss((ptr1)+5); \
}


#define GMX_MM_LOAD_3RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    jx1 = _mm_load_ss(ptr1); \
    jy1 = _mm_load_ss((ptr1)+1); \
    jz1 = _mm_load_ss((ptr1)+2); \
    jx2 = _mm_load_ss((ptr1)+3); \
    jy2 = _mm_load_ss((ptr1)+4); \
    jz2 = _mm_load_ss((ptr1)+5); \
    jx3 = _mm_load_ss((ptr1)+6); \
    jy3 = _mm_load_ss((ptr1)+7); \
    jz3 = _mm_load_ss((ptr1)+8); \
}


#define GMX_MM_LOAD_4RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    jx1 = _mm_load_ss(ptr1); \
    jy1 = _mm_load_ss((ptr1)+1); \
    jz1 = _mm_load_ss((ptr1)+2); \
    jx2 = _mm_load_ss((ptr1)+3); \
    jy2 = _mm_load_ss((ptr1)+4); \
    jz2 = _mm_load_ss((ptr1)+5); \
    jx3 = _mm_load_ss((ptr1)+6); \
    jy3 = _mm_load_ss((ptr1)+7); \
    jz3 = _mm_load_ss((ptr1)+8); \
    jx4 = _mm_load_ss((ptr1)+9); \
    jy4 = _mm_load_ss((ptr1)+10); \
    jz4 = _mm_load_ss((ptr1)+11); \
}
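
/* Illustrative sketch (assumption): for a three-site water stored contiguously
 * at 'x+j3' (hypothetical pointer and index), all nine coordinates come from a
 * single pointer:
 *
 *     GMX_MM_LOAD_3RVECS_1POINTER_PS(x+j3, jx1,jy1,jz1, jx2,jy2,jz2, jx3,jy3,jz3);
 */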

#define GMX_MM_LOAD_1RVEC_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1) { \
    __m128 _tmp1,_tmp2; \
    _tmp1 = _mm_load_ss(ptr1); \
    _tmp2 = _mm_load_ss(ptr2); \
    _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)(ptr1+1)); \
    _tmp2 = _mm_loadh_pi(_tmp2,(__m64 *)(ptr2+1)); \
    jx1 = _mm_unpacklo_ps(_tmp1,_tmp2); \
    jy1 = _mm_unpackhi_ps(_tmp1,_tmp2); \
    jx1 = _mm_unpacklo_ps(_tmp1,_tmp2); \
    jz1 = _mm_movehl_ps(jz1,jy1); \
}

#define GMX_MM_LOAD_2RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1, _tmp2; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    jy1   = _mm_loadu_ps(ptr2); \
    jy2   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4)); \
    _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr2+4)); \
    jx1 = _mm_unpacklo_ps(_tmp1,jy1); \
    jz1 = _mm_unpackhi_ps(_tmp1,jy1); \
    jy2 = _mm_unpacklo_ps(jy2,_tmp2); \
    jy1 = _mm_movehl_ps(jx1,jx1); \
    jx2 = _mm_movehl_ps(jz1,jz1); \
    jz2 = _mm_movehl_ps(jy2,jy2); \
}


#define GMX_MM_LOAD_3RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1, _tmp2, _tmp3; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    jy1   = _mm_loadu_ps(ptr2); \
    _tmp2 = _mm_loadu_ps(ptr1+4); \
    jz2   = _mm_loadu_ps(ptr2+4); \
    jz3   = _mm_load_ss(ptr1+8); \
    _tmp3 = _mm_load_ss(ptr2+8); \
    jx1 = _mm_unpacklo_ps(_tmp1,jy1); \
    jz1 = _mm_unpackhi_ps(_tmp1,jy1); \
    jy2 = _mm_unpacklo_ps(_tmp2,jz2); \
    jx3 = _mm_unpackhi_ps(_tmp2,jz2); \
    jy1 = _mm_movehl_ps(jx1,jx1); \
    jx2 = _mm_movehl_ps(jz1,jz1); \
    jz2 = _mm_movehl_ps(jy2,jy2); \
    jy3 = _mm_movehl_ps(jx3,jx3); \
    jz3 = _mm_unpacklo_ps(jz3,_tmp3); \
}


#define GMX_MM_LOAD_4RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    __m128 _tmp1, _tmp2, _tmp3,_tmp4; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    jy1   = _mm_loadu_ps(ptr2); \
    _tmp2 = _mm_loadu_ps(ptr1+4); \
    jz2   = _mm_loadu_ps(ptr2+4); \
    _tmp3 = _mm_loadu_ps(ptr1+8); \
    _tmp4 = _mm_loadu_ps(ptr2+8); \
    jx1 = _mm_unpacklo_ps(_tmp1,jy1); \
    jz1 = _mm_unpackhi_ps(_tmp1,jy1); \
    jy2 = _mm_unpacklo_ps(_tmp2,jz2); \
    jx3 = _mm_unpackhi_ps(_tmp2,jz2); \
    jz3 = _mm_unpacklo_ps(_tmp3,_tmp4); \
    jy4 = _mm_unpackhi_ps(_tmp3,_tmp4); \
    jy1 = _mm_movehl_ps(jx1,jx1); \
    jx2 = _mm_movehl_ps(jz1,jz1); \
    jz2 = _mm_movehl_ps(jy2,jy2); \
    jy3 = _mm_movehl_ps(jx3,jx3); \
    jx4 = _mm_movehl_ps(jz3,jz3); \
    jz4 = _mm_movehl_ps(jy4,jy4); \
}

#define GMX_MM_LOAD_1RVEC_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1) { \
    __m128 _tmp1,_tmp3,_tmp4; \
    jx1 = _mm_load_ss(ptr1); \
    jy1 = _mm_load_ss(ptr2); \
    jz1 = _mm_load_ss(ptr3); \
    jx1 = _mm_loadh_pi(jx1,(__m64 *)(ptr1+1)); \
    jy1 = _mm_loadh_pi(jy1,(__m64 *)(ptr2+1)); \
    jz1 = _mm_loadh_pi(jz1,(__m64 *)(ptr3+1)); \
    _tmp1 = _mm_unpacklo_ps(jx1,jy1); \
    _tmp3 = _mm_unpackhi_ps(jx1,jy1); \
    _tmp4 = _mm_unpackhi_ps(jz1,jz1); \
    jx1 = _mm_movelh_ps(_tmp1,jz1); \
    jy1 = _mm_movelh_ps(_tmp3,_tmp4); \
    jz1 = _mm_movehl_ps(_tmp4,_tmp3); \
}


#define GMX_MM_LOAD_2RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1, _tmp2; \
    jx1 = _mm_loadu_ps(ptr1); \
    jy1 = _mm_loadu_ps(ptr2); \
    jz1 = _mm_loadu_ps(ptr3); \
    jx2 = _mm_setzero_ps(); \
    _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2); \
    _tmp1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4)); \
    jz2   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr2+4)); \
    _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr3+4)); \
    _tmp1 = _mm_unpacklo_ps(_tmp1,_tmp2); \
    jz2 = _mm_unpacklo_ps(jz2,_mm_setzero_ps()); \
    jy2 = _mm_unpacklo_ps(_tmp1,jz2); \
    jz2 = _mm_unpackhi_ps(_tmp1,jz2); \
}


#define GMX_MM_LOAD_3RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1, _tmp2; \
    jx1 = _mm_loadu_ps(ptr1); \
    jy1 = _mm_loadu_ps(ptr2); \
    jz1 = _mm_loadu_ps(ptr3); \
    jx2 = _mm_setzero_ps(); \
    _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2); \
    jy2 = _mm_loadu_ps(ptr1+4); \
    jz2 = _mm_loadu_ps(ptr2+4); \
    jx3 = _mm_loadu_ps(ptr3+4); \
    jy3 = _mm_setzero_ps(); \
    _MM_TRANSPOSE4_PS(jy2,jz2,jx3,jy3); \
    jz3   = _mm_load_ss(ptr1+8); \
    _tmp1 = _mm_load_ss(ptr2+8); \
    _tmp2 = _mm_load_ss(ptr3+8); \
    jz3   = _mm_unpacklo_ps(jz3,_tmp2); \
    _tmp1 = _mm_unpacklo_ps(_tmp1,_mm_setzero_ps()); \
    jz3   = _mm_unpacklo_ps(jz3,_tmp1); \
}


#define GMX_MM_LOAD_4RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    jx1 = _mm_loadu_ps(ptr1); \
    jy1 = _mm_loadu_ps(ptr2); \
    jz1 = _mm_loadu_ps(ptr3); \
    jx2 = _mm_setzero_ps(); \
    _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2); \
    jy2 = _mm_loadu_ps(ptr1+4); \
    jz2 = _mm_loadu_ps(ptr2+4); \
    jx3 = _mm_loadu_ps(ptr3+4); \
    jy3 = _mm_setzero_ps(); \
    _MM_TRANSPOSE4_PS(jy2,jz2,jx3,jy3); \
    jz3 = _mm_loadu_ps(ptr1+8); \
    jx4 = _mm_loadu_ps(ptr2+8); \
    jy4 = _mm_loadu_ps(ptr3+8); \
    jz4 = _mm_setzero_ps(); \
    _MM_TRANSPOSE4_PS(jz3,jx4,jy4,jz4); \
}

#define GMX_MM_LOAD_1RVEC_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5; \
    jx1   = _mm_load_ss(ptr1); \
    _tmp1 = _mm_load_ss(ptr2); \
    jy1   = _mm_load_ss(ptr3); \
    jz1   = _mm_load_ss(ptr4); \
    jx1   = _mm_loadh_pi(jx1,(__m64 *)(ptr1+1)); \
    _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)(ptr2+1)); \
    jy1   = _mm_loadh_pi(jy1,(__m64 *)(ptr3+1)); \
    jz1   = _mm_loadh_pi(jz1,(__m64 *)(ptr4+1)); \
    _tmp2 = _mm_unpacklo_ps(jx1,_tmp1); \
    _tmp3 = _mm_unpacklo_ps(jy1,jz1); \
    _tmp4 = _mm_unpackhi_ps(jx1,_tmp1); \
    _tmp5 = _mm_unpackhi_ps(jy1,jz1); \
    jx1 = _mm_movelh_ps(_tmp2,_tmp3); \
    jy1 = _mm_movelh_ps(_tmp4,_tmp5); \
    jz1 = _mm_movehl_ps(_tmp5,_tmp4); \
}


#define GMX_MM_LOAD_2RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1, _tmp2; \
    jx1 = _mm_loadu_ps(ptr1); \
    jy1 = _mm_loadu_ps(ptr2); \
    jz1 = _mm_loadu_ps(ptr3); \
    jx2 = _mm_loadu_ps(ptr4); \
    _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2); \
    jy2   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4)); \
    jz2   = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr2+4)); \
    _tmp1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr3+4)); \
    _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr4+4)); \
    _tmp1 = _mm_unpacklo_ps(jy2,_tmp1); \
    _tmp2 = _mm_unpacklo_ps(jz2,_tmp2); \
    jy2 = _mm_unpacklo_ps(_tmp1,_tmp2); \
    jz2 = _mm_unpackhi_ps(_tmp1,_tmp2); \
}


#define GMX_MM_LOAD_3RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1, _tmp2, _tmp3; \
    jx1 = _mm_loadu_ps(ptr1); \
    jy1 = _mm_loadu_ps(ptr2); \
    jz1 = _mm_loadu_ps(ptr3); \
    jx2 = _mm_loadu_ps(ptr4); \
    _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2); \
    jy2 = _mm_loadu_ps(ptr1+4); \
    jz2 = _mm_loadu_ps(ptr2+4); \
    jx3 = _mm_loadu_ps(ptr3+4); \
    jy3 = _mm_loadu_ps(ptr4+4); \
    _MM_TRANSPOSE4_PS(jy2,jz2,jx3,jy3); \
    jz3   = _mm_load_ss(ptr1+8); \
    _tmp1 = _mm_load_ss(ptr2+8); \
    _tmp2 = _mm_load_ss(ptr3+8); \
    _tmp3 = _mm_load_ss(ptr4+8); \
    jz3   = _mm_unpacklo_ps(jz3,_tmp2); \
    _tmp1 = _mm_unpacklo_ps(_tmp1,_tmp3); \
    jz3   = _mm_unpacklo_ps(jz3,_tmp1); \
}


#define GMX_MM_LOAD_4RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    jx1 = _mm_loadu_ps(ptr1); \
    jy1 = _mm_loadu_ps(ptr2); \
    jz1 = _mm_loadu_ps(ptr3); \
    jx2 = _mm_loadu_ps(ptr4); \
    _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2); \
    jy2 = _mm_loadu_ps(ptr1+4); \
    jz2 = _mm_loadu_ps(ptr2+4); \
    jx3 = _mm_loadu_ps(ptr3+4); \
    jy3 = _mm_loadu_ps(ptr4+4); \
    _MM_TRANSPOSE4_PS(jy2,jz2,jx3,jy3); \
    jz3 = _mm_loadu_ps(ptr1+8); \
    jx4 = _mm_loadu_ps(ptr2+8); \
    jy4 = _mm_loadu_ps(ptr3+8); \
    jz4 = _mm_loadu_ps(ptr4+8); \
    _MM_TRANSPOSE4_PS(jz3,jx4,jy4,jz4); \
}
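
/* Illustrative sketch (assumption): coordinates of four unrelated j-atoms,
 * addressed through four separate pointers, end up transposed into one
 * register per component; 'x' and the indices are hypothetical.
 *
 *     GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j1,x+j2,x+j3,x+j4,jx1,jy1,jz1);
 */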

/* Routines to increment rvecs in memory, typically used for j particle force updates */
#define GMX_MM_INCREMENT_1RVEC_1POINTER_PS(ptr1,jx1,jy1,jz1) { \
    __m128 _tmp1; \
    jy1 = _mm_unpacklo_ps(jy1,jz1); \
    jx1 = _mm_movelh_ps(jx1,jy1); \
    _tmp1 = _mm_load_ss(ptr1); \
    _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)(ptr1+1)); \
    _tmp1 = _mm_add_ps(_tmp1,jx1); \
    _mm_store_ss(ptr1,_tmp1); \
    _mm_storeh_pi((__m64 *)(ptr1+1),_tmp1); \
}


#define GMX_MM_INCREMENT_2RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1, _tmp2; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4)); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    jx1 = _mm_movelh_ps(jx1,jz1); \
    _tmp1 = _mm_add_ps(_tmp1,jx1); \
    _tmp2 = _mm_add_ps(_tmp2,jy2); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storel_pi((__m64 *)(ptr1+4),_tmp2); \
}


#define GMX_MM_INCREMENT_3RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1, _tmp2, _tmp3; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps(ptr1+4); \
    _tmp3 = _mm_load_ss(ptr1+8); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    jx3 = _mm_unpacklo_ps(jx3,jy3); \
    jx1 = _mm_movelh_ps(jx1,jz1); \
    jy2 = _mm_movelh_ps(jy2,jx3); \
    _tmp1 = _mm_add_ps(_tmp1,jx1); \
    _tmp2 = _mm_add_ps(_tmp2,jy2); \
    _tmp3 = _mm_add_ss(_tmp3,jz3); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr1+4,_tmp2); \
    _mm_store_ss(ptr1+8,_tmp3); \
}


#define GMX_MM_INCREMENT_4RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    __m128 _tmp1, _tmp2, _tmp3; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps(ptr1+4); \
    _tmp3 = _mm_loadu_ps(ptr1+8); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    jx3 = _mm_unpacklo_ps(jx3,jy3); \
    jz3 = _mm_unpacklo_ps(jz3,jx4); \
    jy4 = _mm_unpacklo_ps(jy4,jz4); \
    jx1 = _mm_movelh_ps(jx1,jz1); \
    jy2 = _mm_movelh_ps(jy2,jx3); \
    jz3 = _mm_movelh_ps(jz3,jy4); \
    _tmp1 = _mm_add_ps(_tmp1,jx1); \
    _tmp2 = _mm_add_ps(_tmp2,jy2); \
    _tmp3 = _mm_add_ps(_tmp3,jz3); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr1+4,_tmp2); \
    _mm_storeu_ps(ptr1+8,_tmp3); \
}
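
/* Illustrative sketch (assumption): after the inner loop, accumulated j-forces
 * are added back to memory; 'f' and 'j3' are hypothetical names.
 *
 *     GMX_MM_INCREMENT_1RVEC_1POINTER_PS(f+j3,fjx1,fjy1,fjz1);
 */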

#define GMX_MM_INCREMENT_1RVEC_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4; \
    _tmp1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1)); \
    _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)(ptr2)); \
    _tmp2 = _mm_load_ss(ptr1+2); \
    _tmp3 = _mm_load_ss(ptr2+2); \
    jx1   = _mm_unpacklo_ps(jx1,jy1); \
    _tmp4 = _mm_shuffle_ps(jz1,jz1,_MM_SHUFFLE(0,0,0,1)); \
    _tmp1 = _mm_add_ps(_tmp1,jx1); \
    _mm_storel_pi((__m64 *)(ptr1),_tmp1); \
    _mm_storeh_pi((__m64 *)(ptr2),_tmp1); \
    _mm_store_ss(ptr1+2,_mm_add_ss(_tmp2,jz1)); \
    _mm_store_ss(ptr2+2,_mm_add_ss(_tmp3,_tmp4)); \
}


#define GMX_MM_INCREMENT_2RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps(ptr2); \
    _tmp3 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4)); \
    _tmp3 = _mm_loadh_pi(_tmp3,(__m64 *)(ptr2+4)); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    _tmp4 = _mm_movelh_ps(jx1,jz1); \
    _tmp5 = _mm_movehl_ps(jz1,jx1); \
    _tmp1 = _mm_add_ps(_tmp1,_tmp4); \
    _tmp2 = _mm_add_ps(_tmp2,_tmp5); \
    _tmp3 = _mm_add_ps(_tmp3,jy2); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr2,_tmp2); \
    _mm_storel_pi((__m64 *)(ptr1+4),_tmp3); \
    _mm_storeh_pi((__m64 *)(ptr2+4),_tmp3); \
}


#define GMX_MM_INCREMENT_3RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps(ptr1+4); \
    _tmp3 = _mm_load_ss(ptr1+8); \
    _tmp4 = _mm_loadu_ps(ptr2); \
    _tmp5 = _mm_loadu_ps(ptr2+4); \
    _tmp6 = _mm_load_ss(ptr2+8); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    jx3 = _mm_unpacklo_ps(jx3,jy3); \
    _tmp7  = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1)); \
    _tmp8  = _mm_movelh_ps(jx1,jz1); \
    _tmp9  = _mm_movehl_ps(jz1,jx1); \
    _tmp10 = _mm_movelh_ps(jy2,jx3); \
    _tmp11 = _mm_movehl_ps(jx3,jy2); \
    _tmp1 = _mm_add_ps(_tmp1,_tmp8); \
    _tmp2 = _mm_add_ps(_tmp2,_tmp10); \
    _tmp3 = _mm_add_ss(_tmp3,jz3); \
    _tmp4 = _mm_add_ps(_tmp4,_tmp9); \
    _tmp5 = _mm_add_ps(_tmp5,_tmp11); \
    _tmp6 = _mm_add_ss(_tmp6,_tmp7); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr1+4,_tmp2); \
    _mm_store_ss(ptr1+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps(ptr2+4,_tmp5); \
    _mm_store_ss(ptr2+8,_tmp6); \
}


#define GMX_MM_INCREMENT_4RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11,_tmp12,_tmp13; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps(ptr1+4); \
    _tmp3 = _mm_loadu_ps(ptr1+8); \
    _tmp4 = _mm_loadu_ps(ptr2); \
    _tmp5 = _mm_loadu_ps(ptr2+4); \
    _tmp6 = _mm_loadu_ps(ptr2+8); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    jx3 = _mm_unpacklo_ps(jx3,jy3); \
    jz3 = _mm_unpacklo_ps(jz3,jx4); \
    jy4 = _mm_unpacklo_ps(jy4,jz4); \
    _tmp8  = _mm_movelh_ps(jx1,jz1); \
    _tmp9  = _mm_movehl_ps(jz1,jx1); \
    _tmp10 = _mm_movelh_ps(jy2,jx3); \
    _tmp11 = _mm_movehl_ps(jx3,jy2); \
    _tmp12 = _mm_movelh_ps(jz3,jy4); \
    _tmp13 = _mm_movehl_ps(jy4,jz3); \
    _tmp1 = _mm_add_ps(_tmp1,_tmp8); \
    _tmp2 = _mm_add_ps(_tmp2,_tmp10); \
    _tmp3 = _mm_add_ps(_tmp3,_tmp12); \
    _tmp4 = _mm_add_ps(_tmp4,_tmp9); \
    _tmp5 = _mm_add_ps(_tmp5,_tmp11); \
    _tmp6 = _mm_add_ps(_tmp6,_tmp13); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr1+4,_tmp2); \
    _mm_storeu_ps(ptr1+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps(ptr2+4,_tmp5); \
    _mm_storeu_ps(ptr2+8,_tmp6); \
}

#define GMX_MM_INCREMENT_1RVEC_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7; \
    _tmp1 = _mm_load_ss(ptr1); \
    _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)(ptr1+1)); \
    _tmp2 = _mm_load_ss(ptr2); \
    _tmp2 = _mm_loadh_pi(_tmp2,(__m64 *)(ptr2+1)); \
    _tmp3 = _mm_load_ss(ptr3); \
    _tmp3 = _mm_loadh_pi(_tmp3,(__m64 *)(ptr3+1)); \
    _tmp4 = _mm_unpacklo_ps(jy1,jz1); \
    _tmp5 = _mm_unpackhi_ps(jy1,jz1); \
    _tmp6 = _mm_shuffle_ps(jx1,_tmp4,_MM_SHUFFLE(3,2,0,1)); \
    _tmp7 = _mm_shuffle_ps(jx1,jx1,_MM_SHUFFLE(0,0,0,2)); \
    jx1   = _mm_movelh_ps(jx1,_tmp4); \
    _tmp7 = _mm_movelh_ps(_tmp7,_tmp5); \
    _tmp1 = _mm_add_ps(_tmp1,jx1); \
    _tmp2 = _mm_add_ps(_tmp2,_tmp6); \
    _tmp3 = _mm_add_ps(_tmp3,_tmp7); \
    _mm_store_ss(ptr1,_tmp1); \
    _mm_storeh_pi((__m64 *)(ptr1+1),_tmp1); \
    _mm_store_ss(ptr2,_tmp2); \
    _mm_storeh_pi((__m64 *)(ptr2+1),_tmp2); \
    _mm_store_ss(ptr3,_tmp3); \
    _mm_storeh_pi((__m64 *)(ptr3+1),_tmp3); \
}


#define GMX_MM_INCREMENT_2RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps(ptr2); \
    _tmp3 = _mm_loadu_ps(ptr3); \
    _tmp4 = _mm_loadl_pi(_tmp4,(__m64 *)(ptr1+4)); \
    _tmp4 = _mm_loadh_pi(_tmp4,(__m64 *)(ptr2+4)); \
    _tmp5 = _mm_loadl_pi(_tmp5,(__m64 *)(ptr3+4)); \
    _tmp6 = _mm_unpackhi_ps(jx1,jy1); \
    jx1   = _mm_unpacklo_ps(jx1,jy1); \
    _tmp7 = _mm_unpackhi_ps(jz1,jx2); \
    jz1   = _mm_unpacklo_ps(jz1,jx2); \
    _tmp8 = _mm_unpackhi_ps(jy2,jz2); \
    jy2   = _mm_unpacklo_ps(jy2,jz2); \
    _tmp9  = _mm_movelh_ps(jx1,jz1); \
    _tmp10 = _mm_movehl_ps(jz1,jx1); \
    _tmp6  = _mm_movelh_ps(_tmp6,_tmp7); \
    _tmp1 = _mm_add_ps(_tmp1,_tmp9); \
    _tmp2 = _mm_add_ps(_tmp2,_tmp10); \
    _tmp3 = _mm_add_ps(_tmp3,_tmp6); \
    _tmp4 = _mm_add_ps(_tmp4,jy2); \
    _tmp5 = _mm_add_ps(_tmp5,_tmp8); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr2,_tmp2); \
    _mm_storeu_ps(ptr3,_tmp3); \
    _mm_storel_pi((__m64 *)(ptr1+4),_tmp4); \
    _mm_storeh_pi((__m64 *)(ptr2+4),_tmp4); \
    _mm_storel_pi((__m64 *)(ptr3+4),_tmp5); \
}


#define GMX_MM_INCREMENT_3RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
    __m128 _tmp11,_tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps(ptr1+4); \
    _tmp3 = _mm_load_ss(ptr1+8); \
    _tmp4 = _mm_loadu_ps(ptr2); \
    _tmp5 = _mm_loadu_ps(ptr2+4); \
    _tmp6 = _mm_load_ss(ptr2+8); \
    _tmp7 = _mm_loadu_ps(ptr3); \
    _tmp8 = _mm_loadu_ps(ptr3+4); \
    _tmp9 = _mm_load_ss(ptr3+8); \
    _tmp10 = _mm_unpackhi_ps(jx1,jy1); \
    jx1    = _mm_unpacklo_ps(jx1,jy1); \
    _tmp11 = _mm_unpackhi_ps(jz1,jx2); \
    jz1    = _mm_unpacklo_ps(jz1,jx2); \
    _tmp12 = _mm_unpackhi_ps(jy2,jz2); \
    jy2    = _mm_unpacklo_ps(jy2,jz2); \
    _tmp13 = _mm_unpackhi_ps(jx3,jy3); \
    jx3    = _mm_unpacklo_ps(jx3,jy3); \
    _tmp14 = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1)); \
    _tmp15 = _mm_movehl_ps(jz3,jz3); \
    _tmp16 = _mm_movelh_ps(jx1,jz1); \
    _tmp17 = _mm_movehl_ps(jz1,jx1); \
    _tmp10 = _mm_movelh_ps(_tmp10,_tmp11); \
    _tmp18 = _mm_movelh_ps(jy2,jx3); \
    _tmp19 = _mm_movehl_ps(jx3,jy2); \
    _tmp12 = _mm_movelh_ps(_tmp12,_tmp13); \
    _tmp1 = _mm_add_ps(_tmp1,_tmp16); \
    _tmp2 = _mm_add_ps(_tmp2,_tmp18); \
    _tmp3 = _mm_add_ss(_tmp3,jz3); \
    _tmp4 = _mm_add_ps(_tmp4,_tmp17); \
    _tmp5 = _mm_add_ps(_tmp5,_tmp19); \
    _tmp6 = _mm_add_ss(_tmp6,_tmp14); \
    _tmp7 = _mm_add_ps(_tmp7,_tmp10); \
    _tmp8 = _mm_add_ps(_tmp8,_tmp12); \
    _tmp9 = _mm_add_ss(_tmp9,_tmp15); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr1+4,_tmp2); \
    _mm_store_ss(ptr1+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps(ptr2+4,_tmp5); \
    _mm_store_ss(ptr2+8,_tmp6); \
    _mm_storeu_ps(ptr3,_tmp7); \
    _mm_storeu_ps(ptr3+4,_tmp8); \
    _mm_store_ss(ptr3+8,_tmp9); \
}


#define GMX_MM_INCREMENT_4RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11; \
    __m128 _tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19,_tmp20,_tmp21; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps(ptr1+4); \
    _tmp3 = _mm_loadu_ps(ptr1+8); \
    _tmp4 = _mm_loadu_ps(ptr2); \
    _tmp5 = _mm_loadu_ps(ptr2+4); \
    _tmp6 = _mm_loadu_ps(ptr2+8); \
    _tmp7 = _mm_loadu_ps(ptr3); \
    _tmp8 = _mm_loadu_ps(ptr3+4); \
    _tmp9 = _mm_loadu_ps(ptr3+8); \
    _tmp10 = _mm_unpackhi_ps(jx1,jy1); \
    jx1    = _mm_unpacklo_ps(jx1,jy1); \
    _tmp11 = _mm_unpackhi_ps(jz1,jx2); \
    jz1    = _mm_unpacklo_ps(jz1,jx2); \
    _tmp12 = _mm_unpackhi_ps(jy2,jz2); \
    jy2    = _mm_unpacklo_ps(jy2,jz2); \
    _tmp13 = _mm_unpackhi_ps(jx3,jy3); \
    jx3    = _mm_unpacklo_ps(jx3,jy3); \
    _tmp14 = _mm_unpackhi_ps(jz3,jx4); \
    jz3    = _mm_unpacklo_ps(jz3,jx4); \
    _tmp15 = _mm_unpackhi_ps(jy4,jz4); \
    jy4    = _mm_unpacklo_ps(jy4,jz4); \
    _tmp16 = _mm_movelh_ps(jx1,jz1); \
    _tmp17 = _mm_movehl_ps(jz1,jx1); \
    _tmp10 = _mm_movelh_ps(_tmp10,_tmp11); \
    _tmp18 = _mm_movelh_ps(jy2,jx3); \
    _tmp19 = _mm_movehl_ps(jx3,jy2); \
    _tmp12 = _mm_movelh_ps(_tmp12,_tmp13); \
    _tmp20 = _mm_movelh_ps(jz3,jy4); \
    _tmp21 = _mm_movehl_ps(jy4,jz3); \
    _tmp14 = _mm_movelh_ps(_tmp14,_tmp15); \
    _tmp1 = _mm_add_ps(_tmp1,_tmp16); \
    _tmp2 = _mm_add_ps(_tmp2,_tmp18); \
    _tmp3 = _mm_add_ps(_tmp3,_tmp20); \
    _tmp4 = _mm_add_ps(_tmp4,_tmp17); \
    _tmp5 = _mm_add_ps(_tmp5,_tmp19); \
    _tmp6 = _mm_add_ps(_tmp6,_tmp21); \
    _tmp7 = _mm_add_ps(_tmp7,_tmp10); \
    _tmp8 = _mm_add_ps(_tmp8,_tmp12); \
    _tmp9 = _mm_add_ps(_tmp9,_tmp14); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr1+4,_tmp2); \
    _mm_storeu_ps(ptr1+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps(ptr2+4,_tmp5); \
    _mm_storeu_ps(ptr2+8,_tmp6); \
    _mm_storeu_ps(ptr3,_tmp7); \
    _mm_storeu_ps(ptr3+4,_tmp8); \
    _mm_storeu_ps(ptr3+8,_tmp9); \
}

#define GMX_MM_INCREMENT_1RVEC_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
    _tmp1 = _mm_load_ss(ptr1); \
    _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)(ptr1+1)); \
    _tmp2 = _mm_load_ss(ptr2); \
    _tmp2 = _mm_loadh_pi(_tmp2,(__m64 *)(ptr2+1)); \
    _tmp3 = _mm_load_ss(ptr3); \
    _tmp3 = _mm_loadh_pi(_tmp3,(__m64 *)(ptr3+1)); \
    _tmp4 = _mm_load_ss(ptr4); \
    _tmp4 = _mm_loadh_pi(_tmp4,(__m64 *)(ptr4+1)); \
    _tmp5 = _mm_unpacklo_ps(jy1,jz1); \
    _tmp6 = _mm_unpackhi_ps(jy1,jz1); \
    _tmp7  = _mm_shuffle_ps(jx1,_tmp5,_MM_SHUFFLE(1,0,0,0)); \
    _tmp8  = _mm_shuffle_ps(jx1,_tmp5,_MM_SHUFFLE(3,2,0,1)); \
    _tmp9  = _mm_shuffle_ps(jx1,_tmp6,_MM_SHUFFLE(1,0,0,2)); \
    _tmp10 = _mm_shuffle_ps(jx1,_tmp6,_MM_SHUFFLE(3,2,0,3)); \
    _tmp1 = _mm_add_ps(_tmp1,_tmp7); \
    _tmp2 = _mm_add_ps(_tmp2,_tmp8); \
    _tmp3 = _mm_add_ps(_tmp3,_tmp9); \
    _tmp4 = _mm_add_ps(_tmp4,_tmp10); \
    _mm_store_ss(ptr1,_tmp1); \
    _mm_storeh_pi((__m64 *)(ptr1+1),_tmp1); \
    _mm_store_ss(ptr2,_tmp2); \
    _mm_storeh_pi((__m64 *)(ptr2+1),_tmp2); \
    _mm_store_ss(ptr3,_tmp3); \
    _mm_storeh_pi((__m64 *)(ptr3+1),_tmp3); \
    _mm_store_ss(ptr4,_tmp4); \
    _mm_storeh_pi((__m64 *)(ptr4+1),_tmp4); \
}


#define GMX_MM_INCREMENT_2RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11,_tmp12,_tmp13; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps(ptr2); \
    _tmp3 = _mm_loadu_ps(ptr3); \
    _tmp4 = _mm_loadu_ps(ptr4); \
    _tmp5 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4)); \
    _tmp5 = _mm_loadh_pi(_tmp5,(__m64 *)(ptr2+4)); \
    _tmp6 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr3+4)); \
    _tmp6 = _mm_loadh_pi(_tmp6,(__m64 *)(ptr4+4)); \
    _tmp7 = _mm_unpackhi_ps(jx1,jy1); \
    jx1   = _mm_unpacklo_ps(jx1,jy1); \
    _tmp8 = _mm_unpackhi_ps(jz1,jx2); \
    jz1   = _mm_unpacklo_ps(jz1,jx2); \
    _tmp9 = _mm_unpackhi_ps(jy2,jz2); \
    jy2   = _mm_unpacklo_ps(jy2,jz2); \
    _tmp10 = _mm_movelh_ps(jx1,jz1); \
    _tmp11 = _mm_movehl_ps(jz1,jx1); \
    _tmp12 = _mm_movelh_ps(_tmp7,_tmp8); \
    _tmp13 = _mm_movehl_ps(_tmp8,_tmp7); \
    _tmp1 = _mm_add_ps(_tmp1,_tmp10); \
    _tmp2 = _mm_add_ps(_tmp2,_tmp11); \
    _tmp3 = _mm_add_ps(_tmp3,_tmp12); \
    _tmp4 = _mm_add_ps(_tmp4,_tmp13); \
    _tmp5 = _mm_add_ps(_tmp5,jy2); \
    _tmp6 = _mm_add_ps(_tmp6,_tmp9); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr2,_tmp2); \
    _mm_storeu_ps(ptr3,_tmp3); \
    _mm_storeu_ps(ptr4,_tmp4); \
    _mm_storel_pi((__m64 *)(ptr1+4),_tmp5); \
    _mm_storeh_pi((__m64 *)(ptr2+4),_tmp5); \
    _mm_storel_pi((__m64 *)(ptr3+4),_tmp6); \
    _mm_storeh_pi((__m64 *)(ptr4+4),_tmp6); \
}


#define GMX_MM_INCREMENT_3RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
    __m128 _tmp11,_tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19; \
    __m128 _tmp20,_tmp21,_tmp22,_tmp23,_tmp24,_tmp25; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps(ptr1+4); \
    _tmp3 = _mm_load_ss(ptr1+8); \
    _tmp4 = _mm_loadu_ps(ptr2); \
    _tmp5 = _mm_loadu_ps(ptr2+4); \
    _tmp6 = _mm_load_ss(ptr2+8); \
    _tmp7 = _mm_loadu_ps(ptr3); \
    _tmp8 = _mm_loadu_ps(ptr3+4); \
    _tmp9 = _mm_load_ss(ptr3+8); \
    _tmp10 = _mm_loadu_ps(ptr4); \
    _tmp11 = _mm_loadu_ps(ptr4+4); \
    _tmp12 = _mm_load_ss(ptr4+8); \
    _tmp13 = _mm_unpackhi_ps(jx1,jy1); \
    jx1    = _mm_unpacklo_ps(jx1,jy1); \
    _tmp14 = _mm_unpackhi_ps(jz1,jx2); \
    jz1    = _mm_unpacklo_ps(jz1,jx2); \
    _tmp15 = _mm_unpackhi_ps(jy2,jz2); \
    jy2    = _mm_unpacklo_ps(jy2,jz2); \
    _tmp16 = _mm_unpackhi_ps(jx3,jy3); \
    jx3    = _mm_unpacklo_ps(jx3,jy3); \
    _tmp17 = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1)); \
    _tmp18 = _mm_movehl_ps(jz3,jz3); \
    _tmp19 = _mm_shuffle_ps(_tmp18,_tmp18,_MM_SHUFFLE(0,0,0,1)); \
    _tmp20 = _mm_movelh_ps(jx1,jz1); \
    _tmp21 = _mm_movehl_ps(jz1,jx1); \
    _tmp22 = _mm_movelh_ps(_tmp13,_tmp14); \
    _tmp14 = _mm_movehl_ps(_tmp14,_tmp13); \
    _tmp23 = _mm_movelh_ps(jy2,jx3); \
    _tmp24 = _mm_movehl_ps(jx3,jy2); \
    _tmp25 = _mm_movelh_ps(_tmp15,_tmp16); \
    _tmp16 = _mm_movehl_ps(_tmp16,_tmp15); \
    _tmp1 = _mm_add_ps(_tmp1,_tmp20); \
    _tmp2 = _mm_add_ps(_tmp2,_tmp23); \
    _tmp3 = _mm_add_ss(_tmp3,jz3); \
    _tmp4 = _mm_add_ps(_tmp4,_tmp21); \
    _tmp5 = _mm_add_ps(_tmp5,_tmp24); \
    _tmp6 = _mm_add_ss(_tmp6,_tmp17); \
    _tmp7 = _mm_add_ps(_tmp7,_tmp22); \
    _tmp8 = _mm_add_ps(_tmp8,_tmp25); \
    _tmp9 = _mm_add_ss(_tmp9,_tmp18); \
    _tmp10 = _mm_add_ps(_tmp10,_tmp14); \
    _tmp11 = _mm_add_ps(_tmp11,_tmp16); \
    _tmp12 = _mm_add_ss(_tmp12,_tmp19); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr1+4,_tmp2); \
    _mm_store_ss(ptr1+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps(ptr2+4,_tmp5); \
    _mm_store_ss(ptr2+8,_tmp6); \
    _mm_storeu_ps(ptr3,_tmp7); \
    _mm_storeu_ps(ptr3+4,_tmp8); \
    _mm_store_ss(ptr3+8,_tmp9); \
    _mm_storeu_ps(ptr4,_tmp10); \
    _mm_storeu_ps(ptr4+4,_tmp11); \
    _mm_store_ss(ptr4+8,_tmp12); \
}


#define GMX_MM_INCREMENT_4RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11; \
    __m128 _tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19,_tmp20,_tmp21,_tmp22; \
    __m128 _tmp23,_tmp24; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps(ptr1+4); \
    _tmp3 = _mm_loadu_ps(ptr1+8); \
    _tmp4 = _mm_loadu_ps(ptr2); \
    _tmp5 = _mm_loadu_ps(ptr2+4); \
    _tmp6 = _mm_loadu_ps(ptr2+8); \
    _tmp7 = _mm_loadu_ps(ptr3); \
    _tmp8 = _mm_loadu_ps(ptr3+4); \
    _tmp9 = _mm_loadu_ps(ptr3+8); \
    _tmp10 = _mm_loadu_ps(ptr4); \
    _tmp11 = _mm_loadu_ps(ptr4+4); \
    _tmp12 = _mm_loadu_ps(ptr4+8); \
    _tmp13 = _mm_unpackhi_ps(jx1,jy1); \
    jx1    = _mm_unpacklo_ps(jx1,jy1); \
    _tmp14 = _mm_unpackhi_ps(jz1,jx2); \
    jz1    = _mm_unpacklo_ps(jz1,jx2); \
    _tmp15 = _mm_unpackhi_ps(jy2,jz2); \
    jy2    = _mm_unpacklo_ps(jy2,jz2); \
    _tmp16 = _mm_unpackhi_ps(jx3,jy3); \
    jx3    = _mm_unpacklo_ps(jx3,jy3); \
    _tmp17 = _mm_unpackhi_ps(jz3,jx4); \
    jz3    = _mm_unpacklo_ps(jz3,jx4); \
    _tmp18 = _mm_unpackhi_ps(jy4,jz4); \
    jy4    = _mm_unpacklo_ps(jy4,jz4); \
    _tmp19 = _mm_movelh_ps(jx1,jz1); \
    jz1    = _mm_movehl_ps(jz1,jx1); \
    _tmp20 = _mm_movelh_ps(_tmp13,_tmp14); \
    _tmp14 = _mm_movehl_ps(_tmp14,_tmp13); \
    _tmp21 = _mm_movelh_ps(jy2,jx3); \
    jx3    = _mm_movehl_ps(jx3,jy2); \
    _tmp22 = _mm_movelh_ps(_tmp15,_tmp16); \
    _tmp16 = _mm_movehl_ps(_tmp16,_tmp15); \
    _tmp23 = _mm_movelh_ps(jz3,jy4); \
    jy4    = _mm_movehl_ps(jy4,jz3); \
    _tmp24 = _mm_movelh_ps(_tmp17,_tmp18); \
    _tmp18 = _mm_movehl_ps(_tmp18,_tmp17); \
    _tmp1 = _mm_add_ps(_tmp1,_tmp19); \
    _tmp2 = _mm_add_ps(_tmp2,_tmp21); \
    _tmp3 = _mm_add_ps(_tmp3,_tmp23); \
    _tmp4 = _mm_add_ps(_tmp4,jz1); \
    _tmp5 = _mm_add_ps(_tmp5,jx3); \
    _tmp6 = _mm_add_ps(_tmp6,jy4); \
    _tmp7 = _mm_add_ps(_tmp7,_tmp20); \
    _tmp8 = _mm_add_ps(_tmp8,_tmp22); \
    _tmp9 = _mm_add_ps(_tmp9,_tmp24); \
    _tmp10 = _mm_add_ps(_tmp10,_tmp14); \
    _tmp11 = _mm_add_ps(_tmp11,_tmp16); \
    _tmp12 = _mm_add_ps(_tmp12,_tmp18); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr1+4,_tmp2); \
    _mm_storeu_ps(ptr1+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps(ptr2+4,_tmp5); \
    _mm_storeu_ps(ptr2+8,_tmp6); \
    _mm_storeu_ps(ptr3,_tmp7); \
    _mm_storeu_ps(ptr3+4,_tmp8); \
    _mm_storeu_ps(ptr3+8,_tmp9); \
    _mm_storeu_ps(ptr4,_tmp10); \
    _mm_storeu_ps(ptr4+4,_tmp11); \
    _mm_storeu_ps(ptr4+8,_tmp12); \
}
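
/* The decrement routines below mirror the increment routines above: the same
 * shuffling, but the values in memory are reduced by the register contents
 * (handy when a force contribution enters with the opposite sign).
 */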

#define GMX_MM_DECREMENT_1RVEC_1POINTER_PS(ptr1,jx1,jy1,jz1) { \
    __m128 _tmp1; \
    jy1 = _mm_unpacklo_ps(jy1,jz1); \
    jx1 = _mm_movelh_ps(jx1,jy1); \
    _tmp1 = _mm_load_ss(ptr1); \
    _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)(ptr1+1)); \
    _tmp1 = _mm_sub_ps(_tmp1,jx1); \
    _mm_store_ss(ptr1,_tmp1); \
    _mm_storeh_pi((__m64 *)(ptr1+1),_tmp1); \
}


#define GMX_MM_DECREMENT_2RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1, _tmp2; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4)); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    jx1 = _mm_movelh_ps(jx1,jz1); \
    _tmp1 = _mm_sub_ps(_tmp1,jx1); \
    _tmp2 = _mm_sub_ps(_tmp2,jy2); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storel_pi((__m64 *)(ptr1+4),_tmp2); \
}


#define GMX_MM_DECREMENT_3RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1, _tmp2, _tmp3; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps(ptr1+4); \
    _tmp3 = _mm_load_ss(ptr1+8); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    jx3 = _mm_unpacklo_ps(jx3,jy3); \
    jx1 = _mm_movelh_ps(jx1,jz1); \
    jy2 = _mm_movelh_ps(jy2,jx3); \
    _tmp1 = _mm_sub_ps(_tmp1,jx1); \
    _tmp2 = _mm_sub_ps(_tmp2,jy2); \
    _tmp3 = _mm_sub_ss(_tmp3,jz3); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr1+4,_tmp2); \
    _mm_store_ss(ptr1+8,_tmp3); \
}


#define GMX_MM_DECREMENT_4RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    __m128 _tmp1, _tmp2, _tmp3; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps(ptr1+4); \
    _tmp3 = _mm_loadu_ps(ptr1+8); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    jx3 = _mm_unpacklo_ps(jx3,jy3); \
    jz3 = _mm_unpacklo_ps(jz3,jx4); \
    jy4 = _mm_unpacklo_ps(jy4,jz4); \
    jx1 = _mm_movelh_ps(jx1,jz1); \
    jy2 = _mm_movelh_ps(jy2,jx3); \
    jz3 = _mm_movelh_ps(jz3,jy4); \
    _tmp1 = _mm_sub_ps(_tmp1,jx1); \
    _tmp2 = _mm_sub_ps(_tmp2,jy2); \
    _tmp3 = _mm_sub_ps(_tmp3,jz3); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr1+4,_tmp2); \
    _mm_storeu_ps(ptr1+8,_tmp3); \
}

#define GMX_MM_DECREMENT_1RVEC_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4; \
    _tmp1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1)); \
    _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)(ptr2)); \
    _tmp2 = _mm_load_ss(ptr1+2); \
    _tmp3 = _mm_load_ss(ptr2+2); \
    jx1   = _mm_unpacklo_ps(jx1,jy1); \
    _tmp4 = _mm_shuffle_ps(jz1,jz1,_MM_SHUFFLE(0,0,0,1)); \
    _tmp1 = _mm_sub_ps(_tmp1,jx1); \
    _mm_storel_pi((__m64 *)(ptr1),_tmp1); \
    _mm_storeh_pi((__m64 *)(ptr2),_tmp1); \
    _mm_store_ss(ptr1+2,_mm_sub_ss(_tmp2,jz1)); \
    _mm_store_ss(ptr2+2,_mm_sub_ss(_tmp3,_tmp4)); \
}


#define GMX_MM_DECREMENT_2RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps(ptr2); \
    _tmp3 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4)); \
    _tmp3 = _mm_loadh_pi(_tmp3,(__m64 *)(ptr2+4)); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    _tmp4 = _mm_movelh_ps(jx1,jz1); \
    _tmp5 = _mm_movehl_ps(jz1,jx1); \
    _tmp1 = _mm_sub_ps(_tmp1,_tmp4); \
    _tmp2 = _mm_sub_ps(_tmp2,_tmp5); \
    _tmp3 = _mm_sub_ps(_tmp3,jy2); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr2,_tmp2); \
    _mm_storel_pi((__m64 *)(ptr1+4),_tmp3); \
    _mm_storeh_pi((__m64 *)(ptr2+4),_tmp3); \
}


#define GMX_MM_DECREMENT_3RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps(ptr1+4); \
    _tmp3 = _mm_load_ss(ptr1+8); \
    _tmp4 = _mm_loadu_ps(ptr2); \
    _tmp5 = _mm_loadu_ps(ptr2+4); \
    _tmp6 = _mm_load_ss(ptr2+8); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    jx3 = _mm_unpacklo_ps(jx3,jy3); \
    _tmp7  = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1)); \
    _tmp8  = _mm_movelh_ps(jx1,jz1); \
    _tmp9  = _mm_movehl_ps(jz1,jx1); \
    _tmp10 = _mm_movelh_ps(jy2,jx3); \
    _tmp11 = _mm_movehl_ps(jx3,jy2); \
    _tmp1 = _mm_sub_ps(_tmp1,_tmp8); \
    _tmp2 = _mm_sub_ps(_tmp2,_tmp10); \
    _tmp3 = _mm_sub_ss(_tmp3,jz3); \
    _tmp4 = _mm_sub_ps(_tmp4,_tmp9); \
    _tmp5 = _mm_sub_ps(_tmp5,_tmp11); \
    _tmp6 = _mm_sub_ss(_tmp6,_tmp7); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr1+4,_tmp2); \
    _mm_store_ss(ptr1+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps(ptr2+4,_tmp5); \
    _mm_store_ss(ptr2+8,_tmp6); \
}


#define GMX_MM_DECREMENT_4RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11,_tmp12,_tmp13; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps(ptr1+4); \
    _tmp3 = _mm_loadu_ps(ptr1+8); \
    _tmp4 = _mm_loadu_ps(ptr2); \
    _tmp5 = _mm_loadu_ps(ptr2+4); \
    _tmp6 = _mm_loadu_ps(ptr2+8); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    jx3 = _mm_unpacklo_ps(jx3,jy3); \
    jz3 = _mm_unpacklo_ps(jz3,jx4); \
    jy4 = _mm_unpacklo_ps(jy4,jz4); \
    _tmp8  = _mm_movelh_ps(jx1,jz1); \
    _tmp9  = _mm_movehl_ps(jz1,jx1); \
    _tmp10 = _mm_movelh_ps(jy2,jx3); \
    _tmp11 = _mm_movehl_ps(jx3,jy2); \
    _tmp12 = _mm_movelh_ps(jz3,jy4); \
    _tmp13 = _mm_movehl_ps(jy4,jz3); \
    _tmp1 = _mm_sub_ps(_tmp1,_tmp8); \
    _tmp2 = _mm_sub_ps(_tmp2,_tmp10); \
    _tmp3 = _mm_sub_ps(_tmp3,_tmp12); \
    _tmp4 = _mm_sub_ps(_tmp4,_tmp9); \
    _tmp5 = _mm_sub_ps(_tmp5,_tmp11); \
    _tmp6 = _mm_sub_ps(_tmp6,_tmp13); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr1+4,_tmp2); \
    _mm_storeu_ps(ptr1+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps(ptr2+4,_tmp5); \
    _mm_storeu_ps(ptr2+8,_tmp6); \
}
1530 #define GMX_MM_DECREMENT_1RVEC_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1) { \
1531 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7; \
1532 _tmp1 = _mm_load_ss(ptr1); \
1533 _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)(ptr1+1)); \
1534 _tmp2 = _mm_load_ss(ptr2); \
1535 _tmp2 = _mm_loadh_pi(_tmp2,(__m64 *)(ptr2+1)); \
1536 _tmp3 = _mm_load_ss(ptr3); \
1537 _tmp3 = _mm_loadh_pi(_tmp3,(__m64 *)(ptr3+1)); \
1538 _tmp4 = _mm_unpacklo_ps(jy1,jz1); \
1539 _tmp5 = _mm_unpackhi_ps(jy1,jz1); \
1540 _tmp6 = _mm_shuffle_ps(jx1,_tmp4,_MM_SHUFFLE(3,2,0,1)); \
1541 _tmp7 = _mm_shuffle_ps(jx1,jx1,_MM_SHUFFLE(0,0,0,2)); \
1542 jx1 = _mm_movelh_ps(jx1,_tmp4); \
1543 _tmp7 = _mm_movelh_ps(_tmp7,_tmp5); \
1544 _tmp1 = _mm_sub_ps(_tmp1,jx1); \
1545 _tmp2 = _mm_sub_ps(_tmp2,_tmp6); \
1546 _tmp3 = _mm_sub_ps(_tmp3,_tmp7); \
1547 _mm_store_ss(ptr1,_tmp1); \
1548 _mm_storeh_pi((__m64 *)(ptr1+1),_tmp1); \
1549 _mm_store_ss(ptr2,_tmp2); \
1550 _mm_storeh_pi((__m64 *)(ptr2+1),_tmp2); \
1551 _mm_store_ss(ptr3,_tmp3); \
1552 _mm_storeh_pi((__m64 *)(ptr3+1),_tmp3); \
1556 #define GMX_MM_DECREMENT_2RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2) { \
1557 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
1558 _tmp1 = _mm_loadu_ps(ptr1); \
1559 _tmp2 = _mm_loadu_ps(ptr2); \
1560 _tmp3 = _mm_loadu_ps(ptr3); \
1561 _tmp4 = _mm_loadl_pi(_tmp4,(__m64 *)(ptr1+4)); \
1562 _tmp4 = _mm_loadh_pi(_tmp4,(__m64 *)(ptr2+4)); \
1563 _tmp5 = _mm_loadl_pi(_tmp5,(__m64 *)(ptr3+4)); \
1564 _tmp6 = _mm_unpackhi_ps(jx1,jy1); \
1565 jx1 = _mm_unpacklo_ps(jx1,jy1); \
1566 _tmp7 = _mm_unpackhi_ps(jz1,jx2); \
1567 jz1 = _mm_unpacklo_ps(jz1,jx2); \
1568 _tmp8 = _mm_unpackhi_ps(jy2,jz2); \
1569 jy2 = _mm_unpacklo_ps(jy2,jz2); \
1570 _tmp9 = _mm_movelh_ps(jx1,jz1); \
1571 _tmp10 = _mm_movehl_ps(jz1,jx1); \
1572 _tmp6 = _mm_movelh_ps(_tmp6,_tmp7); \
1573 _tmp1 = _mm_sub_ps(_tmp1,_tmp9); \
1574 _tmp2 = _mm_sub_ps(_tmp2,_tmp10); \
1575 _tmp3 = _mm_sub_ps(_tmp3,_tmp6); \
1576 _tmp4 = _mm_sub_ps(_tmp4,jy2); \
1577 _tmp5 = _mm_sub_ps(_tmp5,_tmp8); \
1578 _mm_storeu_ps(ptr1,_tmp1); \
1579 _mm_storeu_ps(ptr2,_tmp2); \
1580 _mm_storeu_ps(ptr3,_tmp3); \
1581 _mm_storel_pi((__m64 *)(ptr1+4),_tmp4); \
1582 _mm_storeh_pi((__m64 *)(ptr2+4),_tmp4); \
    _mm_storel_pi((__m64 *)(ptr3+4),_tmp5); \
}
1587 #define GMX_MM_DECREMENT_3RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
1588 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
1589 __m128 _tmp11,_tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19; \
1590 _tmp1 = _mm_loadu_ps(ptr1); \
1591 _tmp2 = _mm_loadu_ps(ptr1+4); \
1592 _tmp3 = _mm_load_ss(ptr1+8); \
1593 _tmp4 = _mm_loadu_ps(ptr2); \
1594 _tmp5 = _mm_loadu_ps(ptr2+4); \
1595 _tmp6 = _mm_load_ss(ptr2+8); \
1596 _tmp7 = _mm_loadu_ps(ptr3); \
1597 _tmp8 = _mm_loadu_ps(ptr3+4); \
1598 _tmp9 = _mm_load_ss(ptr3+8); \
1599 _tmp10 = _mm_unpackhi_ps(jx1,jy1); \
1600 jx1 = _mm_unpacklo_ps(jx1,jy1); \
1601 _tmp11 = _mm_unpackhi_ps(jz1,jx2); \
1602 jz1 = _mm_unpacklo_ps(jz1,jx2); \
1603 _tmp12 = _mm_unpackhi_ps(jy2,jz2); \
1604 jy2 = _mm_unpacklo_ps(jy2,jz2); \
1605 _tmp13 = _mm_unpackhi_ps(jx3,jy3); \
1606 jx3 = _mm_unpacklo_ps(jx3,jy3); \
1607 _tmp14 = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1)); \
1608 _tmp15 = _mm_movehl_ps(jz3,jz3); \
1609 _tmp16 = _mm_movelh_ps(jx1,jz1); \
1610 _tmp17 = _mm_movehl_ps(jz1,jx1); \
1611 _tmp10 = _mm_movelh_ps(_tmp10,_tmp11); \
1612 _tmp18 = _mm_movelh_ps(jy2,jx3); \
1613 _tmp19 = _mm_movehl_ps(jx3,jy2); \
1614 _tmp12 = _mm_movelh_ps(_tmp12,_tmp13); \
1615 _tmp1 = _mm_sub_ps(_tmp1,_tmp16); \
1616 _tmp2 = _mm_sub_ps(_tmp2,_tmp18); \
1617 _tmp3 = _mm_sub_ss(_tmp3,jz3); \
1618 _tmp4 = _mm_sub_ps(_tmp4,_tmp17); \
1619 _tmp5 = _mm_sub_ps(_tmp5,_tmp19); \
1620 _tmp6 = _mm_sub_ss(_tmp6,_tmp14); \
1621 _tmp7 = _mm_sub_ps(_tmp7,_tmp10); \
1622 _tmp8 = _mm_sub_ps(_tmp8,_tmp12); \
1623 _tmp9 = _mm_sub_ss(_tmp9,_tmp15); \
1624 _mm_storeu_ps(ptr1,_tmp1); \
1625 _mm_storeu_ps(ptr1+4,_tmp2); \
1626 _mm_store_ss(ptr1+8,_tmp3); \
1627 _mm_storeu_ps(ptr2,_tmp4); \
1628 _mm_storeu_ps(ptr2+4,_tmp5); \
1629 _mm_store_ss(ptr2+8,_tmp6); \
1630 _mm_storeu_ps(ptr3,_tmp7); \
1631 _mm_storeu_ps(ptr3+4,_tmp8); \
    _mm_store_ss(ptr3+8,_tmp9); \
}
1636 #define GMX_MM_DECREMENT_4RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
1637 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11; \
1638 __m128 _tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19,_tmp20,_tmp21; \
1639 _tmp1 = _mm_loadu_ps(ptr1); \
1640 _tmp2 = _mm_loadu_ps(ptr1+4); \
1641 _tmp3 = _mm_loadu_ps(ptr1+8); \
1642 _tmp4 = _mm_loadu_ps(ptr2); \
1643 _tmp5 = _mm_loadu_ps(ptr2+4); \
1644 _tmp6 = _mm_loadu_ps(ptr2+8); \
1645 _tmp7 = _mm_loadu_ps(ptr3); \
1646 _tmp8 = _mm_loadu_ps(ptr3+4); \
1647 _tmp9 = _mm_loadu_ps(ptr3+8); \
1648 _tmp10 = _mm_unpackhi_ps(jx1,jy1); \
1649 jx1 = _mm_unpacklo_ps(jx1,jy1); \
1650 _tmp11 = _mm_unpackhi_ps(jz1,jx2); \
1651 jz1 = _mm_unpacklo_ps(jz1,jx2); \
1652 _tmp12 = _mm_unpackhi_ps(jy2,jz2); \
1653 jy2 = _mm_unpacklo_ps(jy2,jz2); \
1654 _tmp13 = _mm_unpackhi_ps(jx3,jy3); \
1655 jx3 = _mm_unpacklo_ps(jx3,jy3); \
1656 _tmp14 = _mm_unpackhi_ps(jz3,jx4); \
1657 jz3 = _mm_unpacklo_ps(jz3,jx4); \
1658 _tmp15 = _mm_unpackhi_ps(jy4,jz4); \
1659 jy4 = _mm_unpacklo_ps(jy4,jz4); \
1660 _tmp16 = _mm_movelh_ps(jx1,jz1); \
1661 _tmp17 = _mm_movehl_ps(jz1,jx1); \
1662 _tmp10 = _mm_movelh_ps(_tmp10,_tmp11); \
1663 _tmp18 = _mm_movelh_ps(jy2,jx3); \
1664 _tmp19 = _mm_movehl_ps(jx3,jy2); \
1665 _tmp12 = _mm_movelh_ps(_tmp12,_tmp13); \
1666 _tmp20 = _mm_movelh_ps(jz3,jy4); \
1667 _tmp21 = _mm_movehl_ps(jy4,jz3); \
1668 _tmp14 = _mm_movelh_ps(_tmp14,_tmp15); \
1669 _tmp1 = _mm_sub_ps(_tmp1,_tmp16); \
1670 _tmp2 = _mm_sub_ps(_tmp2,_tmp18); \
1671 _tmp3 = _mm_sub_ps(_tmp3,_tmp20); \
1672 _tmp4 = _mm_sub_ps(_tmp4,_tmp17); \
1673 _tmp5 = _mm_sub_ps(_tmp5,_tmp19); \
1674 _tmp6 = _mm_sub_ps(_tmp6,_tmp21); \
1675 _tmp7 = _mm_sub_ps(_tmp7,_tmp10); \
1676 _tmp8 = _mm_sub_ps(_tmp8,_tmp12); \
1677 _tmp9 = _mm_sub_ps(_tmp9,_tmp14); \
1678 _mm_storeu_ps(ptr1,_tmp1); \
1679 _mm_storeu_ps(ptr1+4,_tmp2); \
1680 _mm_storeu_ps(ptr1+8,_tmp3); \
1681 _mm_storeu_ps(ptr2,_tmp4); \
1682 _mm_storeu_ps(ptr2+4,_tmp5); \
1683 _mm_storeu_ps(ptr2+8,_tmp6); \
1684 _mm_storeu_ps(ptr3,_tmp7); \
1685 _mm_storeu_ps(ptr3+4,_tmp8); \
    _mm_storeu_ps(ptr3+8,_tmp9); \
}
1692 #define GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1) { \
1693 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
1694 _tmp1 = _mm_load_ss(ptr1); \
1695 _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)(ptr1+1)); \
1696 _tmp2 = _mm_load_ss(ptr2); \
1697 _tmp2 = _mm_loadh_pi(_tmp2,(__m64 *)(ptr2+1)); \
1698 _tmp3 = _mm_load_ss(ptr3); \
1699 _tmp3 = _mm_loadh_pi(_tmp3,(__m64 *)(ptr3+1)); \
1700 _tmp4 = _mm_load_ss(ptr4); \
1701 _tmp4 = _mm_loadh_pi(_tmp4,(__m64 *)(ptr4+1)); \
1702 _tmp5 = _mm_unpacklo_ps(jy1,jz1); \
1703 _tmp6 = _mm_unpackhi_ps(jy1,jz1); \
1704 _tmp7 = _mm_shuffle_ps(jx1,_tmp5,_MM_SHUFFLE(1,0,0,0)); \
1705 _tmp8 = _mm_shuffle_ps(jx1,_tmp5,_MM_SHUFFLE(3,2,0,1)); \
1706 _tmp9 = _mm_shuffle_ps(jx1,_tmp6,_MM_SHUFFLE(1,0,0,2)); \
1707 _tmp10 = _mm_shuffle_ps(jx1,_tmp6,_MM_SHUFFLE(3,2,0,3)); \
1708 _tmp1 = _mm_sub_ps(_tmp1,_tmp7); \
1709 _tmp2 = _mm_sub_ps(_tmp2,_tmp8); \
1710 _tmp3 = _mm_sub_ps(_tmp3,_tmp9); \
1711 _tmp4 = _mm_sub_ps(_tmp4,_tmp10); \
1712 _mm_store_ss(ptr1,_tmp1); \
1713 _mm_storeh_pi((__m64 *)(ptr1+1),_tmp1); \
1714 _mm_store_ss(ptr2,_tmp2); \
1715 _mm_storeh_pi((__m64 *)(ptr2+1),_tmp2); \
1716 _mm_store_ss(ptr3,_tmp3); \
1717 _mm_storeh_pi((__m64 *)(ptr3+1),_tmp3); \
1718 _mm_store_ss(ptr4,_tmp4); \
    _mm_storeh_pi((__m64 *)(ptr4+1),_tmp4); \
}
1724 #define GMX_MM_DECREMENT_2RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2) { \
1725 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11,_tmp12,_tmp13; \
1726 _tmp1 = _mm_loadu_ps(ptr1); \
1727 _tmp2 = _mm_loadu_ps(ptr2); \
1728 _tmp3 = _mm_loadu_ps(ptr3); \
1729 _tmp4 = _mm_loadu_ps(ptr4); \
1730 _tmp5 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4)); \
1731 _tmp5 = _mm_loadh_pi(_tmp5,(__m64 *)(ptr2+4)); \
1732 _tmp6 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr3+4)); \
1733 _tmp6 = _mm_loadh_pi(_tmp6,(__m64 *)(ptr4+4)); \
1734 _tmp7 = _mm_unpackhi_ps(jx1,jy1); \
1735 jx1 = _mm_unpacklo_ps(jx1,jy1); \
1736 _tmp8 = _mm_unpackhi_ps(jz1,jx2); \
1737 jz1 = _mm_unpacklo_ps(jz1,jx2); \
1738 _tmp9 = _mm_unpackhi_ps(jy2,jz2); \
1739 jy2 = _mm_unpacklo_ps(jy2,jz2); \
1740 _tmp10 = _mm_movelh_ps(jx1,jz1); \
1741 _tmp11 = _mm_movehl_ps(jz1,jx1); \
1742 _tmp12 = _mm_movelh_ps(_tmp7,_tmp8); \
1743 _tmp13 = _mm_movehl_ps(_tmp8,_tmp7); \
1744 _tmp1 = _mm_sub_ps(_tmp1,_tmp10); \
1745 _tmp2 = _mm_sub_ps(_tmp2,_tmp11); \
1746 _tmp3 = _mm_sub_ps(_tmp3,_tmp12); \
1747 _tmp4 = _mm_sub_ps(_tmp4,_tmp13); \
1748 _tmp5 = _mm_sub_ps(_tmp5,jy2); \
1749 _tmp6 = _mm_sub_ps(_tmp6,_tmp9); \
1750 _mm_storeu_ps(ptr1,_tmp1); \
1751 _mm_storeu_ps(ptr2,_tmp2); \
1752 _mm_storeu_ps(ptr3,_tmp3); \
1753 _mm_storeu_ps(ptr4,_tmp4); \
1754 _mm_storel_pi((__m64 *)(ptr1+4),_tmp5); \
1755 _mm_storeh_pi((__m64 *)(ptr2+4),_tmp5); \
1756 _mm_storel_pi((__m64 *)(ptr3+4),_tmp6); \
    _mm_storeh_pi((__m64 *)(ptr4+4),_tmp6); \
}
1761 #define GMX_MM_DECREMENT_3RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
1762 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
1763 __m128 _tmp11,_tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19; \
1764 __m128 _tmp20,_tmp21,_tmp22,_tmp23,_tmp24,_tmp25; \
1765 _tmp1 = _mm_loadu_ps(ptr1); \
1766 _tmp2 = _mm_loadu_ps(ptr1+4); \
1767 _tmp3 = _mm_load_ss(ptr1+8); \
1768 _tmp4 = _mm_loadu_ps(ptr2); \
1769 _tmp5 = _mm_loadu_ps(ptr2+4); \
1770 _tmp6 = _mm_load_ss(ptr2+8); \
1771 _tmp7 = _mm_loadu_ps(ptr3); \
1772 _tmp8 = _mm_loadu_ps(ptr3+4); \
1773 _tmp9 = _mm_load_ss(ptr3+8); \
1774 _tmp10 = _mm_loadu_ps(ptr4); \
1775 _tmp11 = _mm_loadu_ps(ptr4+4); \
1776 _tmp12 = _mm_load_ss(ptr4+8); \
1777 _tmp13 = _mm_unpackhi_ps(jx1,jy1); \
1778 jx1 = _mm_unpacklo_ps(jx1,jy1); \
1779 _tmp14 = _mm_unpackhi_ps(jz1,jx2); \
1780 jz1 = _mm_unpacklo_ps(jz1,jx2); \
1781 _tmp15 = _mm_unpackhi_ps(jy2,jz2); \
1782 jy2 = _mm_unpacklo_ps(jy2,jz2); \
1783 _tmp16 = _mm_unpackhi_ps(jx3,jy3); \
1784 jx3 = _mm_unpacklo_ps(jx3,jy3); \
1785 _tmp17 = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1)); \
1786 _tmp18 = _mm_movehl_ps(jz3,jz3); \
1787 _tmp19 = _mm_shuffle_ps(_tmp18,_tmp18,_MM_SHUFFLE(0,0,0,1)); \
1788 _tmp20 = _mm_movelh_ps(jx1,jz1); \
1789 _tmp21 = _mm_movehl_ps(jz1,jx1); \
1790 _tmp22 = _mm_movelh_ps(_tmp13,_tmp14); \
1791 _tmp14 = _mm_movehl_ps(_tmp14,_tmp13); \
1792 _tmp23 = _mm_movelh_ps(jy2,jx3); \
1793 _tmp24 = _mm_movehl_ps(jx3,jy2); \
1794 _tmp25 = _mm_movelh_ps(_tmp15,_tmp16); \
1795 _tmp16 = _mm_movehl_ps(_tmp16,_tmp15); \
1796 _tmp1 = _mm_sub_ps(_tmp1,_tmp20); \
1797 _tmp2 = _mm_sub_ps(_tmp2,_tmp23); \
1798 _tmp3 = _mm_sub_ss(_tmp3,jz3); \
1799 _tmp4 = _mm_sub_ps(_tmp4,_tmp21); \
1800 _tmp5 = _mm_sub_ps(_tmp5,_tmp24); \
1801 _tmp6 = _mm_sub_ss(_tmp6,_tmp17); \
1802 _tmp7 = _mm_sub_ps(_tmp7,_tmp22); \
1803 _tmp8 = _mm_sub_ps(_tmp8,_tmp25); \
1804 _tmp9 = _mm_sub_ss(_tmp9,_tmp18); \
1805 _tmp10 = _mm_sub_ps(_tmp10,_tmp14); \
1806 _tmp11 = _mm_sub_ps(_tmp11,_tmp16); \
1807 _tmp12 = _mm_sub_ss(_tmp12,_tmp19); \
1808 _mm_storeu_ps(ptr1,_tmp1); \
1809 _mm_storeu_ps(ptr1+4,_tmp2); \
1810 _mm_store_ss(ptr1+8,_tmp3); \
1811 _mm_storeu_ps(ptr2,_tmp4); \
1812 _mm_storeu_ps(ptr2+4,_tmp5); \
1813 _mm_store_ss(ptr2+8,_tmp6); \
1814 _mm_storeu_ps(ptr3,_tmp7); \
1815 _mm_storeu_ps(ptr3+4,_tmp8); \
1816 _mm_store_ss(ptr3+8,_tmp9); \
1817 _mm_storeu_ps(ptr4,_tmp10); \
1818 _mm_storeu_ps(ptr4+4,_tmp11); \
    _mm_store_ss(ptr4+8,_tmp12); \
}
1823 #define GMX_MM_DECREMENT_4RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
1824 __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11; \
1825 __m128 _tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19,_tmp20,_tmp21,_tmp22;\
1826 __m128 _tmp23,_tmp24; \
1827 _tmp1 = _mm_loadu_ps(ptr1); \
1828 _tmp2 = _mm_loadu_ps(ptr1+4); \
1829 _tmp3 = _mm_loadu_ps(ptr1+8); \
1830 _tmp4 = _mm_loadu_ps(ptr2); \
1831 _tmp5 = _mm_loadu_ps(ptr2+4); \
1832 _tmp6 = _mm_loadu_ps(ptr2+8); \
1833 _tmp7 = _mm_loadu_ps(ptr3); \
1834 _tmp8 = _mm_loadu_ps(ptr3+4); \
1835 _tmp9 = _mm_loadu_ps(ptr3+8); \
1836 _tmp10 = _mm_loadu_ps(ptr4); \
1837 _tmp11 = _mm_loadu_ps(ptr4+4); \
1838 _tmp12 = _mm_loadu_ps(ptr4+8); \
1839 _tmp13 = _mm_unpackhi_ps(jx1,jy1); \
1840 jx1 = _mm_unpacklo_ps(jx1,jy1); \
1841 _tmp14 = _mm_unpackhi_ps(jz1,jx2); \
1842 jz1 = _mm_unpacklo_ps(jz1,jx2); \
1843 _tmp15 = _mm_unpackhi_ps(jy2,jz2); \
1844 jy2 = _mm_unpacklo_ps(jy2,jz2); \
1845 _tmp16 = _mm_unpackhi_ps(jx3,jy3); \
1846 jx3 = _mm_unpacklo_ps(jx3,jy3); \
1847 _tmp17 = _mm_unpackhi_ps(jz3,jx4); \
1848 jz3 = _mm_unpacklo_ps(jz3,jx4); \
1849 _tmp18 = _mm_unpackhi_ps(jy4,jz4); \
1850 jy4 = _mm_unpacklo_ps(jy4,jz4); \
1851 _tmp19 = _mm_movelh_ps(jx1,jz1); \
1852 jz1 = _mm_movehl_ps(jz1,jx1); \
1853 _tmp20 = _mm_movelh_ps(_tmp13,_tmp14); \
1854 _tmp14 = _mm_movehl_ps(_tmp14,_tmp13); \
1855 _tmp21 = _mm_movelh_ps(jy2,jx3); \
1856 jx3 = _mm_movehl_ps(jx3,jy2); \
1857 _tmp22 = _mm_movelh_ps(_tmp15,_tmp16); \
1858 _tmp16 = _mm_movehl_ps(_tmp16,_tmp15); \
1859 _tmp23 = _mm_movelh_ps(jz3,jy4); \
1860 jy4 = _mm_movehl_ps(jy4,jz3); \
1861 _tmp24 = _mm_movelh_ps(_tmp17,_tmp18); \
1862 _tmp18 = _mm_movehl_ps(_tmp18,_tmp17); \
1863 _tmp1 = _mm_sub_ps(_tmp1,_tmp19); \
1864 _tmp2 = _mm_sub_ps(_tmp2,_tmp21); \
1865 _tmp3 = _mm_sub_ps(_tmp3,_tmp23); \
1866 _tmp4 = _mm_sub_ps(_tmp4,jz1); \
1867 _tmp5 = _mm_sub_ps(_tmp5,jx3); \
1868 _tmp6 = _mm_sub_ps(_tmp6,jy4); \
1869 _tmp7 = _mm_sub_ps(_tmp7,_tmp20); \
1870 _tmp8 = _mm_sub_ps(_tmp8,_tmp22); \
1871 _tmp9 = _mm_sub_ps(_tmp9,_tmp24); \
1872 _tmp10 = _mm_sub_ps(_tmp10,_tmp14); \
1873 _tmp11 = _mm_sub_ps(_tmp11,_tmp16); \
1874 _tmp12 = _mm_sub_ps(_tmp12,_tmp18); \
1875 _mm_storeu_ps(ptr1,_tmp1); \
1876 _mm_storeu_ps(ptr1+4,_tmp2); \
1877 _mm_storeu_ps(ptr1+8,_tmp3); \
1878 _mm_storeu_ps(ptr2,_tmp4); \
1879 _mm_storeu_ps(ptr2+4,_tmp5); \
1880 _mm_storeu_ps(ptr2+8,_tmp6); \
1881 _mm_storeu_ps(ptr3,_tmp7); \
1882 _mm_storeu_ps(ptr3+4,_tmp8); \
1883 _mm_storeu_ps(ptr3+8,_tmp9); \
1884 _mm_storeu_ps(ptr4,_tmp10); \
1885 _mm_storeu_ps(ptr4+4,_tmp11); \
    _mm_storeu_ps(ptr4+8,_tmp12); \
}
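
/*
 * Illustrative usage sketch (not part of the kernel API): the decrement macros
 * above subtract transposed j-particle forces from an interleaved x/y/z force
 * array. The pointer and register names below (fj1..fj4, fjx1 etc.) are
 * hypothetical kernel-local names, not names defined in this header:
 *
 *     float  *fj1,*fj2,*fj3,*fj4;      pointers to the x-component of four j-atoms
 *     __m128  fjx1,fjy1,fjz1;          force components of atom 1 for the four j's
 *     GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(fj1,fj2,fj3,fj4,fjx1,fjy1,fjz1);
 */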
/* Routine to be called with rswitch/rcut at the beginning of a kernel
 * to set up the six constants used for analytic 5th order switch calculations.
 */
#define GMX_MM_SETUP_SWITCH5_PS(rswitch,rcut,switch_C3,switch_C4,switch_C5,switch_D2,switch_D3,switch_D4) { \
    const __m128 _swsetup_cm6  = { -6.0, -6.0, -6.0, -6.0}; \
    const __m128 _swsetup_cm10 = {-10.0,-10.0,-10.0,-10.0}; \
    const __m128 _swsetup_c15  = { 15.0, 15.0, 15.0, 15.0}; \
    const __m128 _swsetup_cm30 = {-30.0,-30.0,-30.0,-30.0}; \
    const __m128 _swsetup_c60  = { 60.0, 60.0, 60.0, 60.0}; \
    __m128 d,dinv,dinv2,dinv3,dinv4,dinv5; \
    d     = _mm_sub_ps(rcut,rswitch); \
    dinv  = gmx_mm_inv_ps(d); \
    dinv2 = _mm_mul_ps(dinv,dinv); \
    dinv3 = _mm_mul_ps(dinv2,dinv); \
    dinv4 = _mm_mul_ps(dinv2,dinv2); \
    dinv5 = _mm_mul_ps(dinv3,dinv2); \
    switch_C3 = _mm_mul_ps(_swsetup_cm10,dinv3); \
    switch_C4 = _mm_mul_ps(_swsetup_c15,dinv4); \
    switch_C5 = _mm_mul_ps(_swsetup_cm6,dinv5); \
    switch_D2 = _mm_mul_ps(_swsetup_cm30,dinv3); \
    switch_D3 = _mm_mul_ps(_swsetup_c60,dinv4); \
    switch_D4 = _mm_mul_ps(_swsetup_cm30,dinv5); \
}
#define GMX_MM_EVALUATE_SWITCH5_PS(r,rswitch,rcut,sw,dsw,sw_C3,sw_C4,sw_C5,sw_D2,sw_D3,sw_D4) { \
    const __m128 _sw_one = { 1.0, 1.0, 1.0, 1.0}; \
    __m128 d,d2; \
    d    = _mm_max_ps(r,rswitch); \
    d    = _mm_min_ps(d,rcut); \
    d    = _mm_sub_ps(d,rswitch); \
    d2   = _mm_mul_ps(d,d); \
    sw   = _mm_mul_ps(d,sw_C5); \
    dsw  = _mm_mul_ps(d,sw_D4); \
    sw   = _mm_add_ps(sw,sw_C4); \
    dsw  = _mm_add_ps(dsw,sw_D3); \
    sw   = _mm_mul_ps(sw,d); \
    dsw  = _mm_mul_ps(dsw,d); \
    sw   = _mm_add_ps(sw,sw_C3); \
    dsw  = _mm_add_ps(dsw,sw_D2); \
    sw   = _mm_mul_ps(sw,_mm_mul_ps(d,d2)); \
    dsw  = _mm_mul_ps(dsw,d2); \
    sw   = _mm_add_ps(sw,_sw_one); \
}
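
/*
 * Usage sketch (illustrative, with kernel-local names; r, rsw, rc and the
 * switch_* registers are assumed to be set up by the calling kernel):
 *
 *     __m128 C3,C4,C5,D2,D3,D4,sw,dsw;
 *     GMX_MM_SETUP_SWITCH5_PS(rsw,rc,C3,C4,C5,D2,D3,D4);
 *     GMX_MM_EVALUATE_SWITCH5_PS(r,rsw,rc,sw,dsw,C3,C4,C5,D2,D3,D4);
 *
 * sw is the switch value multiplying the potential, dsw its derivative with
 * respect to r, used to correct the force inside the switching region.
 */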
/* Returns fscaltmp, multiply with rinvsq to get fscal! */
static inline __m128
gmx_mm_interaction_coulomb_ps(__m128 rinv, __m128 qq, __m128 *vctot)
{
    __m128 vcoul = _mm_mul_ps(qq,rinv);

    *vctot = _mm_add_ps(*vctot,vcoul);

    return vcoul;
}
static inline void
gmx_mm_interaction_coulomb_noforce_ps(__m128 rinv, __m128 qq, __m128 *vctot)
{
    __m128 vcoul = _mm_mul_ps(qq,rinv);

    *vctot = _mm_add_ps(*vctot,vcoul);
}
/* Returns fscaltmp, multiply with rinvsq to get fscal! */
static inline __m128
gmx_mm_interaction_coulombrf_ps(const __m128 rinv, const __m128 rsq, const __m128 krf, const __m128 crf, const __m128 qq, __m128 *vctot)
{
    const __m128 two = {2.0,2.0,2.0,2.0};
    __m128 vcoul,krsq;

    krsq   = _mm_mul_ps(krf,rsq);
    vcoul  = _mm_mul_ps(qq, _mm_sub_ps(_mm_add_ps(rinv,krsq),crf));
    *vctot = _mm_add_ps(*vctot,vcoul);

    return _mm_mul_ps(qq, _mm_sub_ps(rinv, _mm_mul_ps(two,krsq)));
}
static inline void
gmx_mm_interaction_coulombrf_noforce_ps(__m128 rinv, __m128 rsq, __m128 krf, __m128 crf, __m128 qq, __m128 *vctot)
{
    __m128 vcoul,krsq;

    krsq   = _mm_mul_ps(krf,rsq);
    vcoul  = _mm_mul_ps(qq, _mm_sub_ps(_mm_add_ps(rinv,krsq),crf));
    *vctot = _mm_add_ps(*vctot,vcoul);
}
/* Returns fscaltmp, multiply with rinvsq to get fscal! */
static inline __m128
gmx_mm_int_lj_ps(__m128 rinvsq, __m128 c6, __m128 c12, __m128 *vvdwtot)
{
    const __m128 six    = {6.0,6.0,6.0,6.0};
    const __m128 twelve = {12.0,12.0,12.0,12.0};

    __m128 rinvsix,vvdw6,vvdw12;

    rinvsix  = _mm_mul_ps(_mm_mul_ps(rinvsq,rinvsq),rinvsq);
    vvdw6    = _mm_mul_ps(c6,rinvsix);
    vvdw12   = _mm_mul_ps(c12, _mm_mul_ps(rinvsix,rinvsix));
    *vvdwtot = _mm_add_ps(*vvdwtot, _mm_sub_ps(vvdw12,vvdw6));

    return _mm_sub_ps( _mm_mul_ps(twelve,vvdw12),_mm_mul_ps(six,vvdw6));
}
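
/*
 * Sketch of the fscaltmp convention used by the two routines above
 * (illustrative; the accumulators vctot/vvdwtot and the rinv/rinvsq/qq/c6/c12
 * registers are kernel-local names assumed to exist in the caller):
 *
 *     __m128 fscal = _mm_mul_ps(rinvsq,
 *                        _mm_add_ps(gmx_mm_interaction_coulomb_ps(rinv,qq,&vctot),
 *                                   gmx_mm_int_lj_ps(rinvsq,c6,c12,&vvdwtot)));
 */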
static inline void
gmx_mm_int_lj_potonly_ps(__m128 rinvsq, __m128 c6, __m128 c12, __m128 *vvdwtot)
{
    __m128 rinvsix,vvdw6,vvdw12;

    rinvsix  = _mm_mul_ps(_mm_mul_ps(rinvsq,rinvsq),rinvsq);
    vvdw6    = _mm_mul_ps(c6,rinvsix);
    vvdw12   = _mm_mul_ps(c12, _mm_mul_ps(rinvsix,rinvsix));
    *vvdwtot = _mm_add_ps(*vvdwtot, _mm_sub_ps(vvdw12,vvdw6));
}
/* Return force should be multiplied by -rinv to get fscal */
static inline __m128
gmx_mm_int_4_table_coulomb_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 *vctot)
{
    __m128  rt,eps,eps2,Y,F,G,H,vcoul;
    __m128i n0;
    int     n_a,n_b,n_c,n_d;

    rt   = _mm_mul_ps(r,tabscale);
    n0   = _mm_cvttps_epi32(rt);
    eps  = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
    eps2 = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a  = gmx_mm_extract_epi32(n0,0);
    n_b  = gmx_mm_extract_epi32(n0,1);
    n_c  = gmx_mm_extract_epi32(n0,2);
    n_d  = gmx_mm_extract_epi32(n0,3);
    Y    = _mm_load_ps(VFtab + 4*n_a);
    F    = _mm_load_ps(VFtab + 4*n_b);
    G    = _mm_load_ps(VFtab + 4*n_c);
    H    = _mm_load_ps(VFtab + 4*n_d);
    _MM_TRANSPOSE4_PS(Y,F,G,H);
    H    = _mm_mul_ps(H,eps2);             /* Heps2 */
    G    = _mm_mul_ps(G,eps);              /* Geps  */
    F    = _mm_add_ps(F, _mm_add_ps(G,H)); /* Fp    */
    vcoul  = _mm_mul_ps(qq, _mm_add_ps(Y, _mm_mul_ps(eps,F)));
    *vctot = _mm_add_ps(*vctot,vcoul);

    F    = _mm_mul_ps(qq, _mm_add_ps(F, _mm_add_ps(G, _mm_add_ps(H,H))));

    return _mm_mul_ps(F,tabscale);
}
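
/*
 * Table layout assumed by the routine above: each table point stores four
 * floats Y,F,G,H, so point n_a starts at VFtab + 4*n_a. A caller converts the
 * returned value to a scalar force as sketched here (fstmp, fscal and rinv are
 * hypothetical kernel-local names):
 *
 *     __m128 fstmp = gmx_mm_int_4_table_coulomb_ps(r,tabscale,VFtab,qq,&vctot);
 *     __m128 fscal = _mm_sub_ps(_mm_setzero_ps(), _mm_mul_ps(fstmp,rinv));
 */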
/* Return force should be multiplied by -rinv to get fscal */
static inline __m128
gmx_mm_int_4_table_lj_ps(__m128 r, __m128 tabscale, float * VFtab, int offset, __m128 c6, __m128 c12, __m128 *vvdwtot)
{
    __m128  rt,eps,eps2,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
    __m128i n0;
    int     n_a,n_b,n_c,n_d;

    rt   = _mm_mul_ps(r,tabscale);
    n0   = _mm_cvttps_epi32(rt);
    eps  = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
    eps2 = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a  = gmx_mm_extract_epi32(n0,0);
    n_b  = gmx_mm_extract_epi32(n0,1);
    n_c  = gmx_mm_extract_epi32(n0,2);
    n_d  = gmx_mm_extract_epi32(n0,3);

    /* For a few cases, like TIP4p waters, there are particles with LJ-only interactions in a loop where
     * the table data might contain both coulomb and LJ. To handle this case, we use an offset value of 0
     * if the data is an LJ-only table, and 1 if it is actually a mixed coul+lj table.
     */
    Yd   = _mm_load_ps(VFtab + 4*(offset+2)*n_a + 4*offset);
    Fd   = _mm_load_ps(VFtab + 4*(offset+2)*n_b + 4*offset);
    Gd   = _mm_load_ps(VFtab + 4*(offset+2)*n_c + 4*offset);
    Hd   = _mm_load_ps(VFtab + 4*(offset+2)*n_d + 4*offset);
    Yr   = _mm_load_ps(VFtab + 4*(offset+2)*n_a + 4*offset + 4);
    Fr   = _mm_load_ps(VFtab + 4*(offset+2)*n_b + 4*offset + 4);
    Gr   = _mm_load_ps(VFtab + 4*(offset+2)*n_c + 4*offset + 4);
    Hr   = _mm_load_ps(VFtab + 4*(offset+2)*n_d + 4*offset + 4);
    _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
    _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);
    Hd   = _mm_mul_ps(Hd,eps2);               /* Heps2 */
    Gd   = _mm_mul_ps(Gd,eps);                /* Geps  */
    Fd   = _mm_add_ps(Fd, _mm_add_ps(Gd,Hd)); /* Fp    */
    Hr   = _mm_mul_ps(Hr,eps2);               /* Heps2 */
    Gr   = _mm_mul_ps(Gr,eps);                /* Geps  */
    Fr   = _mm_add_ps(Fr, _mm_add_ps(Gr,Hr)); /* Fp    */
    vvdw6    = _mm_mul_ps(c6,  _mm_add_ps(Yd, _mm_mul_ps(eps,Fd)));
    vvdw12   = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps,Fr)));
    *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6,vvdw12));

    Fd   = _mm_mul_ps(c6,  _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd,Hd))));
    Fr   = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr,Hr))));

    return _mm_mul_ps( _mm_add_ps(Fd,Fr),tabscale);
}
/* Return force should be multiplied by -rinv to get fscal */
static inline __m128
gmx_mm_int_4_table_coulomb_and_lj_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 c6, __m128 c12,
                                     __m128 *vctot, __m128 *vvdwtot)
{
    __m128  rt,eps,eps2,vcoul,Yc,Fc,Gc,Hc,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
    __m128i n0;
    int     n_a,n_b,n_c,n_d;

    rt   = _mm_mul_ps(r,tabscale);
    n0   = _mm_cvttps_epi32(rt);
    eps  = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
    eps2 = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a  = gmx_mm_extract_epi32(n0,0);
    n_b  = gmx_mm_extract_epi32(n0,1);
    n_c  = gmx_mm_extract_epi32(n0,2);
    n_d  = gmx_mm_extract_epi32(n0,3);

    Yc   = _mm_load_ps(VFtab + 12*n_a);
    Fc   = _mm_load_ps(VFtab + 12*n_b);
    Gc   = _mm_load_ps(VFtab + 12*n_c);
    Hc   = _mm_load_ps(VFtab + 12*n_d);
    Yd   = _mm_load_ps(VFtab + 12*n_a + 4);
    Fd   = _mm_load_ps(VFtab + 12*n_b + 4);
    Gd   = _mm_load_ps(VFtab + 12*n_c + 4);
    Hd   = _mm_load_ps(VFtab + 12*n_d + 4);
    Yr   = _mm_load_ps(VFtab + 12*n_a + 8);
    Fr   = _mm_load_ps(VFtab + 12*n_b + 8);
    Gr   = _mm_load_ps(VFtab + 12*n_c + 8);
    Hr   = _mm_load_ps(VFtab + 12*n_d + 8);
    _MM_TRANSPOSE4_PS(Yc,Fc,Gc,Hc);
    _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
    _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);
    Hc   = _mm_mul_ps(Hc,eps2);               /* Heps2 */
    Gc   = _mm_mul_ps(Gc,eps);                /* Geps  */
    Fc   = _mm_add_ps(Fc, _mm_add_ps(Gc,Hc)); /* Fp    */
    Hd   = _mm_mul_ps(Hd,eps2);               /* Heps2 */
    Gd   = _mm_mul_ps(Gd,eps);                /* Geps  */
    Fd   = _mm_add_ps(Fd, _mm_add_ps(Gd,Hd)); /* Fp    */
    Hr   = _mm_mul_ps(Hr,eps2);               /* Heps2 */
    Gr   = _mm_mul_ps(Gr,eps);                /* Geps  */
    Fr   = _mm_add_ps(Fr, _mm_add_ps(Gr,Hr)); /* Fp    */

    vcoul  = _mm_mul_ps(qq, _mm_add_ps(Yc, _mm_mul_ps(eps,Fc)));
    *vctot = _mm_add_ps(*vctot,vcoul);

    vvdw6    = _mm_mul_ps(c6,  _mm_add_ps(Yd, _mm_mul_ps(eps,Fd)));
    vvdw12   = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps,Fr)));
    *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6,vvdw12));

    Fc   = _mm_mul_ps(qq,  _mm_add_ps(Fc, _mm_add_ps(Gc, _mm_add_ps(Hc,Hc))));
    Fd   = _mm_mul_ps(c6,  _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd,Hd))));
    Fr   = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr,Hr))));

    return _mm_mul_ps( _mm_add_ps(Fc,_mm_add_ps(Fd,Fr)),tabscale);
}
/* Return force should be multiplied by -rinv to get fscal */
static inline __m128
gmx_mm_int_3_table_coulomb_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 *vctot)
{
    __m128  rt,eps,eps2,Y,F,G,H,vcoul;
    __m128i n0;
    int     n_a,n_b,n_c;

    rt   = _mm_mul_ps(r,tabscale);
    n0   = _mm_cvttps_epi32(rt);
    eps  = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
    eps2 = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a  = gmx_mm_extract_epi32(n0,0);
    n_b  = gmx_mm_extract_epi32(n0,1);
    n_c  = gmx_mm_extract_epi32(n0,2);
    Y    = _mm_load_ps(VFtab + 4*n_a);
    F    = _mm_load_ps(VFtab + 4*n_b);
    G    = _mm_load_ps(VFtab + 4*n_c);
    H    = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Y,F,G,H);
    H    = _mm_mul_ps(H,eps2);             /* Heps2 */
    G    = _mm_mul_ps(G,eps);              /* Geps  */
    F    = _mm_add_ps(F, _mm_add_ps(G,H)); /* Fp    */
    vcoul  = _mm_mul_ps(qq, _mm_add_ps(Y, _mm_mul_ps(eps,F)));
    *vctot = _mm_add_ps(*vctot,vcoul);

    F    = _mm_mul_ps(qq, _mm_add_ps(F, _mm_add_ps(G, _mm_add_ps(H,H))));

    return _mm_mul_ps(F,tabscale);
}
/* Return force should be multiplied by -rinv to get fscal */
static inline __m128
gmx_mm_int_3_table_lj_ps(__m128 r, __m128 tabscale, float * VFtab, int offset, __m128 c6, __m128 c12, __m128 *vvdwtot)
{
    __m128  rt,eps,eps2,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
    __m128i n0;
    int     n_a,n_b,n_c;

    rt   = _mm_mul_ps(r,tabscale);
    n0   = _mm_cvttps_epi32(rt);
    eps  = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
    eps2 = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a  = gmx_mm_extract_epi32(n0,0);
    n_b  = gmx_mm_extract_epi32(n0,1);
    n_c  = gmx_mm_extract_epi32(n0,2);

    /* For a few cases, like TIP4p waters, there are particles with LJ-only interactions in a loop where
     * the table data might contain both coulomb and LJ. To handle this case, we use an offset value of 0
     * if the data is an LJ-only table, and 1 if it is actually a mixed coul+lj table.
     */
    Yd   = _mm_load_ps(VFtab + 4*(offset+2)*n_a + 4*offset);
    Fd   = _mm_load_ps(VFtab + 4*(offset+2)*n_b + 4*offset);
    Gd   = _mm_load_ps(VFtab + 4*(offset+2)*n_c + 4*offset);
    Hd   = _mm_setzero_ps();
    Yr   = _mm_load_ps(VFtab + 4*(offset+2)*n_a + 4*offset + 4);
    Fr   = _mm_load_ps(VFtab + 4*(offset+2)*n_b + 4*offset + 4);
    Gr   = _mm_load_ps(VFtab + 4*(offset+2)*n_c + 4*offset + 4);
    Hr   = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
    _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);
    Hd   = _mm_mul_ps(Hd,eps2);               /* Heps2 */
    Gd   = _mm_mul_ps(Gd,eps);                /* Geps  */
    Fd   = _mm_add_ps(Fd, _mm_add_ps(Gd,Hd)); /* Fp    */
    Hr   = _mm_mul_ps(Hr,eps2);               /* Heps2 */
    Gr   = _mm_mul_ps(Gr,eps);                /* Geps  */
    Fr   = _mm_add_ps(Fr, _mm_add_ps(Gr,Hr)); /* Fp    */
    vvdw6    = _mm_mul_ps(c6,  _mm_add_ps(Yd, _mm_mul_ps(eps,Fd)));
    vvdw12   = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps,Fr)));
    *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6,vvdw12));

    Fd   = _mm_mul_ps(c6,  _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd,Hd))));
    Fr   = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr,Hr))));

    return _mm_mul_ps( _mm_add_ps(Fd,Fr),tabscale);
}
/* Return force should be multiplied by -rinv to get fscal */
static inline __m128
gmx_mm_int_3_table_coulomb_and_lj_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 c6, __m128 c12,
                                     __m128 *vctot, __m128 *vvdwtot)
{
    __m128  rt,eps,eps2,vcoul,Yc,Fc,Gc,Hc,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
    __m128i n0;
    int     n_a,n_b,n_c;

    rt   = _mm_mul_ps(r,tabscale);
    n0   = _mm_cvttps_epi32(rt);
    eps  = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
    eps2 = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a  = gmx_mm_extract_epi32(n0,0);
    n_b  = gmx_mm_extract_epi32(n0,1);
    n_c  = gmx_mm_extract_epi32(n0,2);

    Yc   = _mm_load_ps(VFtab + 12*n_a);
    Fc   = _mm_load_ps(VFtab + 12*n_b);
    Gc   = _mm_load_ps(VFtab + 12*n_c);
    Hc   = _mm_setzero_ps();
    Yd   = _mm_load_ps(VFtab + 12*n_a + 4);
    Fd   = _mm_load_ps(VFtab + 12*n_b + 4);
    Gd   = _mm_load_ps(VFtab + 12*n_c + 4);
    Hd   = _mm_setzero_ps();
    Yr   = _mm_load_ps(VFtab + 12*n_a + 8);
    Fr   = _mm_load_ps(VFtab + 12*n_b + 8);
    Gr   = _mm_load_ps(VFtab + 12*n_c + 8);
    Hr   = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Yc,Fc,Gc,Hc);
    _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
    _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);
    Hc   = _mm_mul_ps(Hc,eps2);               /* Heps2 */
    Gc   = _mm_mul_ps(Gc,eps);                /* Geps  */
    Fc   = _mm_add_ps(Fc, _mm_add_ps(Gc,Hc)); /* Fp    */
    Hd   = _mm_mul_ps(Hd,eps2);               /* Heps2 */
    Gd   = _mm_mul_ps(Gd,eps);                /* Geps  */
    Fd   = _mm_add_ps(Fd, _mm_add_ps(Gd,Hd)); /* Fp    */
    Hr   = _mm_mul_ps(Hr,eps2);               /* Heps2 */
    Gr   = _mm_mul_ps(Gr,eps);                /* Geps  */
    Fr   = _mm_add_ps(Fr, _mm_add_ps(Gr,Hr)); /* Fp    */

    vcoul  = _mm_mul_ps(qq, _mm_add_ps(Yc, _mm_mul_ps(eps,Fc)));
    *vctot = _mm_add_ps(*vctot,vcoul);

    vvdw6    = _mm_mul_ps(c6,  _mm_add_ps(Yd, _mm_mul_ps(eps,Fd)));
    vvdw12   = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps,Fr)));
    *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6,vvdw12));

    Fc   = _mm_mul_ps(qq,  _mm_add_ps(Fc, _mm_add_ps(Gc, _mm_add_ps(Hc,Hc))));
    Fd   = _mm_mul_ps(c6,  _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd,Hd))));
    Fr   = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr,Hr))));

    return _mm_mul_ps( _mm_add_ps(Fc,_mm_add_ps(Fd,Fr)),tabscale);
}
/* Return force should be multiplied by -rinv to get fscal */
static inline __m128
gmx_mm_int_2_table_coulomb_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 *vctot)
{
    __m128  rt,eps,eps2,Y,F,G,H,vcoul;
    __m128i n0;
    int     n_a,n_b;

    rt   = _mm_mul_ps(r,tabscale);
    n0   = _mm_cvttps_epi32(rt);
    eps  = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
    eps2 = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a  = gmx_mm_extract_epi32(n0,0);
    n_b  = gmx_mm_extract_epi32(n0,1);
    Y    = _mm_load_ps(VFtab + 4*n_a);
    F    = _mm_load_ps(VFtab + 4*n_b);
    G    = _mm_setzero_ps();
    H    = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Y,F,G,H);
    H    = _mm_mul_ps(H,eps2);             /* Heps2 */
    G    = _mm_mul_ps(G,eps);              /* Geps  */
    F    = _mm_add_ps(F, _mm_add_ps(G,H)); /* Fp    */
    vcoul  = _mm_mul_ps(qq, _mm_add_ps(Y, _mm_mul_ps(eps,F)));
    *vctot = _mm_add_ps(*vctot,vcoul);

    F    = _mm_mul_ps(qq, _mm_add_ps(F, _mm_add_ps(G, _mm_add_ps(H,H))));

    return _mm_mul_ps(F,tabscale);
}
/* Return force should be multiplied by -rinv to get fscal */
static inline __m128
gmx_mm_int_2_table_lj_ps(__m128 r, __m128 tabscale, float * VFtab, int offset, __m128 c6, __m128 c12, __m128 *vvdwtot)
{
    __m128  rt,eps,eps2,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
    __m128i n0;
    int     n_a,n_b;

    rt   = _mm_mul_ps(r,tabscale);
    n0   = _mm_cvttps_epi32(rt);
    eps  = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
    eps2 = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a  = gmx_mm_extract_epi32(n0,0);
    n_b  = gmx_mm_extract_epi32(n0,1);

    /* For a few cases, like TIP4p waters, there are particles with LJ-only interactions in a loop where
     * the table data might contain both coulomb and LJ. To handle this case, we use an offset value of 0
     * if the data is an LJ-only table, and 1 if it is actually a mixed coul+lj table.
     */
    Yd   = _mm_load_ps(VFtab + 4*(offset+2)*n_a + 4*offset);
    Fd   = _mm_load_ps(VFtab + 4*(offset+2)*n_b + 4*offset);
    Gd   = _mm_setzero_ps();
    Hd   = _mm_setzero_ps();
    Yr   = _mm_load_ps(VFtab + 4*(offset+2)*n_a + 4*offset + 4);
    Fr   = _mm_load_ps(VFtab + 4*(offset+2)*n_b + 4*offset + 4);
    Gr   = _mm_setzero_ps();
    Hr   = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
    _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);
    Hd   = _mm_mul_ps(Hd,eps2);               /* Heps2 */
    Gd   = _mm_mul_ps(Gd,eps);                /* Geps  */
    Fd   = _mm_add_ps(Fd, _mm_add_ps(Gd,Hd)); /* Fp    */
    Hr   = _mm_mul_ps(Hr,eps2);               /* Heps2 */
    Gr   = _mm_mul_ps(Gr,eps);                /* Geps  */
    Fr   = _mm_add_ps(Fr, _mm_add_ps(Gr,Hr)); /* Fp    */
    vvdw6    = _mm_mul_ps(c6,  _mm_add_ps(Yd, _mm_mul_ps(eps,Fd)));
    vvdw12   = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps,Fr)));
    *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6,vvdw12));

    Fd   = _mm_mul_ps(c6,  _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd,Hd))));
    Fr   = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr,Hr))));

    return _mm_mul_ps( _mm_add_ps(Fd,Fr),tabscale);
}
/* Return force should be multiplied by -rinv to get fscal */
static inline __m128
gmx_mm_int_2_table_coulomb_and_lj_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 c6, __m128 c12,
                                     __m128 *vctot, __m128 *vvdwtot)
{
    __m128  rt,eps,eps2,vcoul,Yc,Fc,Gc,Hc,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
    __m128i n0;
    int     n_a,n_b;

    rt   = _mm_mul_ps(r,tabscale);
    n0   = _mm_cvttps_epi32(rt);
    eps  = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
    eps2 = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a  = gmx_mm_extract_epi32(n0,0);
    n_b  = gmx_mm_extract_epi32(n0,1);

    Yc   = _mm_load_ps(VFtab + 12*n_a);
    Fc   = _mm_load_ps(VFtab + 12*n_b);
    Gc   = _mm_setzero_ps();
    Hc   = _mm_setzero_ps();
    Yd   = _mm_load_ps(VFtab + 12*n_a + 4);
    Fd   = _mm_load_ps(VFtab + 12*n_b + 4);
    Gd   = _mm_setzero_ps();
    Hd   = _mm_setzero_ps();
    Yr   = _mm_load_ps(VFtab + 12*n_a + 8);
    Fr   = _mm_load_ps(VFtab + 12*n_b + 8);
    Gr   = _mm_setzero_ps();
    Hr   = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Yc,Fc,Gc,Hc);
    _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
    _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);
    Hc   = _mm_mul_ps(Hc,eps2);               /* Heps2 */
    Gc   = _mm_mul_ps(Gc,eps);                /* Geps  */
    Fc   = _mm_add_ps(Fc, _mm_add_ps(Gc,Hc)); /* Fp    */
    Hd   = _mm_mul_ps(Hd,eps2);               /* Heps2 */
    Gd   = _mm_mul_ps(Gd,eps);                /* Geps  */
    Fd   = _mm_add_ps(Fd, _mm_add_ps(Gd,Hd)); /* Fp    */
    Hr   = _mm_mul_ps(Hr,eps2);               /* Heps2 */
    Gr   = _mm_mul_ps(Gr,eps);                /* Geps  */
    Fr   = _mm_add_ps(Fr, _mm_add_ps(Gr,Hr)); /* Fp    */

    vcoul  = _mm_mul_ps(qq, _mm_add_ps(Yc, _mm_mul_ps(eps,Fc)));
    *vctot = _mm_add_ps(*vctot,vcoul);

    vvdw6    = _mm_mul_ps(c6,  _mm_add_ps(Yd, _mm_mul_ps(eps,Fd)));
    vvdw12   = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps,Fr)));
    *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6,vvdw12));

    Fc   = _mm_mul_ps(qq,  _mm_add_ps(Fc, _mm_add_ps(Gc, _mm_add_ps(Hc,Hc))));
    Fd   = _mm_mul_ps(c6,  _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd,Hd))));
    Fr   = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr,Hr))));

    return _mm_mul_ps( _mm_add_ps(Fc,_mm_add_ps(Fd,Fr)),tabscale);
}
/* Return force should be multiplied by -rinv to get fscal */
static inline __m128
gmx_mm_int_1_table_coulomb_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 *vctot)
{
    __m128  rt,eps,eps2,Y,F,G,H,vcoul;
    __m128i n0;
    int     n_a;

    rt   = _mm_mul_ps(r,tabscale);
    n0   = _mm_cvttps_epi32(rt);
    eps  = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
    eps2 = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a  = gmx_mm_extract_epi32(n0,0);
    Y    = _mm_load_ps(VFtab + 4*n_a);
    F    = _mm_setzero_ps();
    G    = _mm_setzero_ps();
    H    = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Y,F,G,H);
    H    = _mm_mul_ps(H,eps2);             /* Heps2 */
    G    = _mm_mul_ps(G,eps);              /* Geps  */
    F    = _mm_add_ps(F, _mm_add_ps(G,H)); /* Fp    */
    vcoul  = _mm_mul_ps(qq, _mm_add_ps(Y, _mm_mul_ps(eps,F)));
    *vctot = _mm_add_ps(*vctot,vcoul);

    F    = _mm_mul_ps(qq, _mm_add_ps(F, _mm_add_ps(G, _mm_add_ps(H,H))));

    return _mm_mul_ps(F,tabscale);
}
/* Return force should be multiplied by -rinv to get fscal */
static inline __m128
gmx_mm_int_1_table_lj_ps(__m128 r, __m128 tabscale, float * VFtab, int offset, __m128 c6, __m128 c12, __m128 *vvdwtot)
{
    __m128  rt,eps,eps2,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
    __m128i n0;
    int     n_a;

    rt   = _mm_mul_ps(r,tabscale);
    n0   = _mm_cvttps_epi32(rt);
    eps  = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
    eps2 = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a  = gmx_mm_extract_epi32(n0,0);

    /* For a few cases, like TIP4p waters, there are particles with LJ-only interactions in a loop where
     * the table data might contain both coulomb and LJ. To handle this case, we use an offset value of 0
     * if the data is an LJ-only table, and 1 if it is actually a mixed coul+lj table.
     */
    Yd   = _mm_load_ps(VFtab + 4*(offset+2)*n_a + 4*offset);
    Fd   = _mm_setzero_ps();
    Gd   = _mm_setzero_ps();
    Hd   = _mm_setzero_ps();
    Yr   = _mm_load_ps(VFtab + 4*(offset+2)*n_a + 4*offset + 4);
    Fr   = _mm_setzero_ps();
    Gr   = _mm_setzero_ps();
    Hr   = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
    _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);
    Hd   = _mm_mul_ps(Hd,eps2);               /* Heps2 */
    Gd   = _mm_mul_ps(Gd,eps);                /* Geps  */
    Fd   = _mm_add_ps(Fd, _mm_add_ps(Gd,Hd)); /* Fp    */
    Hr   = _mm_mul_ps(Hr,eps2);               /* Heps2 */
    Gr   = _mm_mul_ps(Gr,eps);                /* Geps  */
    Fr   = _mm_add_ps(Fr, _mm_add_ps(Gr,Hr)); /* Fp    */
    vvdw6    = _mm_mul_ps(c6,  _mm_add_ps(Yd, _mm_mul_ps(eps,Fd)));
    vvdw12   = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps,Fr)));
    *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6,vvdw12));

    Fd   = _mm_mul_ps(c6,  _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd,Hd))));
    Fr   = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr,Hr))));

    return _mm_mul_ps( _mm_add_ps(Fd,Fr),tabscale);
}
/* Return force should be multiplied by -rinv to get fscal */
static inline __m128
gmx_mm_int_1_table_coulomb_and_lj_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 c6, __m128 c12,
                                     __m128 *vctot, __m128 *vvdwtot)
{
    __m128  rt,eps,eps2,vcoul,Yc,Fc,Gc,Hc,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
    __m128i n0;
    int     n_a;

    rt   = _mm_mul_ps(r,tabscale);
    n0   = _mm_cvttps_epi32(rt);
    eps  = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
    eps2 = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a  = gmx_mm_extract_epi32(n0,0);

    Yc   = _mm_load_ps(VFtab + 12*n_a);
    Fc   = _mm_setzero_ps();
    Gc   = _mm_setzero_ps();
    Hc   = _mm_setzero_ps();
    Yd   = _mm_load_ps(VFtab + 12*n_a + 4);
    Fd   = _mm_setzero_ps();
    Gd   = _mm_setzero_ps();
    Hd   = _mm_setzero_ps();
    Yr   = _mm_load_ps(VFtab + 12*n_a + 8);
    Fr   = _mm_setzero_ps();
    Gr   = _mm_setzero_ps();
    Hr   = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Yc,Fc,Gc,Hc);
    _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
    _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);
    Hc   = _mm_mul_ps(Hc,eps2);               /* Heps2 */
    Gc   = _mm_mul_ps(Gc,eps);                /* Geps  */
    Fc   = _mm_add_ps(Fc, _mm_add_ps(Gc,Hc)); /* Fp    */
    Hd   = _mm_mul_ps(Hd,eps2);               /* Heps2 */
    Gd   = _mm_mul_ps(Gd,eps);                /* Geps  */
    Fd   = _mm_add_ps(Fd, _mm_add_ps(Gd,Hd)); /* Fp    */
    Hr   = _mm_mul_ps(Hr,eps2);               /* Heps2 */
    Gr   = _mm_mul_ps(Gr,eps);                /* Geps  */
    Fr   = _mm_add_ps(Fr, _mm_add_ps(Gr,Hr)); /* Fp    */

    vcoul  = _mm_mul_ps(qq, _mm_add_ps(Yc, _mm_mul_ps(eps,Fc)));
    *vctot = _mm_add_ps(*vctot,vcoul);

    vvdw6    = _mm_mul_ps(c6,  _mm_add_ps(Yd, _mm_mul_ps(eps,Fd)));
    vvdw12   = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps,Fr)));
    *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6,vvdw12));

    Fc   = _mm_mul_ps(qq,  _mm_add_ps(Fc, _mm_add_ps(Gc, _mm_add_ps(Hc,Hc))));
    Fd   = _mm_mul_ps(c6,  _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd,Hd))));
    Fr   = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr,Hr))));

    return _mm_mul_ps( _mm_add_ps(Fc,_mm_add_ps(Fd,Fr)),tabscale);
}
/* Return force should be multiplied by +rinv to get fscal */
static inline __m128
gmx_mm_int_4_genborn_ps(__m128 r, __m128 isai,
                        float * isaj1, float *isaj2, float *isaj3, float *isaj4,
                        __m128 gbtabscale, float * GBtab, __m128 qq, __m128 *dvdasum,
                        float *dvdaj1, float *dvdaj2, float *dvdaj3, float *dvdaj4,
                        __m128 *vgbtot)
{
    const __m128 half = {0.5,0.5,0.5,0.5};

    __m128  rt,eps,eps2,Y,F,G,H,VV,FF,ftmp,isaprod,t2,t3,t4,isaj,vgb,dvdatmp;
    __m128i n0;
    int     n_a,n_b,n_c,n_d;

    /* Assemble isaj from the four scalar values */
    isaj = _mm_load_ss(isaj1);
    t2   = _mm_load_ss(isaj2);
    t3   = _mm_load_ss(isaj3);
    t4   = _mm_load_ss(isaj4);
    isaj = _mm_unpacklo_ps(isaj,t2);  /* -  -  t2 t1 */
    t3   = _mm_unpacklo_ps(t3,t4);    /* -  -  t4 t3 */
    isaj = _mm_movelh_ps(isaj,t3);    /* t4 t3 t2 t1 */

    isaprod    = _mm_mul_ps(isai,isaj);
    qq         = _mm_mul_ps(qq,isaprod);
    gbtabscale = _mm_mul_ps( isaprod, gbtabscale );

    rt   = _mm_mul_ps(r,gbtabscale);
    n0   = _mm_cvttps_epi32(rt);
    eps  = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
    eps2 = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a  = gmx_mm_extract_epi32(n0,0);
    n_b  = gmx_mm_extract_epi32(n0,1);
    n_c  = gmx_mm_extract_epi32(n0,2);
    n_d  = gmx_mm_extract_epi32(n0,3);
    Y    = _mm_load_ps(GBtab + 4*n_a);
    F    = _mm_load_ps(GBtab + 4*n_b);
    G    = _mm_load_ps(GBtab + 4*n_c);
    H    = _mm_load_ps(GBtab + 4*n_d);
    _MM_TRANSPOSE4_PS(Y,F,G,H);
    G    = _mm_mul_ps(G,eps);              /* Geps  */
    H    = _mm_mul_ps(H,eps2);             /* Heps2 */
    F    = _mm_add_ps(_mm_add_ps(F,G),H);  /* Fp    */

    VV   = _mm_add_ps(Y, _mm_mul_ps(eps,F));
    FF   = _mm_add_ps(_mm_add_ps(F,G), _mm_add_ps(H,H));

    vgb     = _mm_mul_ps(qq, VV);
    *vgbtot = _mm_sub_ps(*vgbtot,vgb); /* Yes, the sign is correct */

    ftmp    = _mm_mul_ps(_mm_mul_ps(qq, FF), gbtabscale);

    dvdatmp = _mm_mul_ps(half, _mm_add_ps(vgb,_mm_mul_ps(ftmp,r)));

    *dvdasum = _mm_add_ps(*dvdasum,dvdatmp);

    dvdatmp  = _mm_mul_ps(_mm_mul_ps(dvdatmp,isaj), isaj);

    /* Update 4 dvda[j] values */
    Y    = _mm_load_ss(dvdaj1);
    F    = _mm_load_ss(dvdaj2);
    G    = _mm_load_ss(dvdaj3);
    H    = _mm_load_ss(dvdaj4);
    t3   = _mm_movehl_ps(_mm_setzero_ps(),dvdatmp);
    t2   = _mm_shuffle_ps(dvdatmp,dvdatmp,_MM_SHUFFLE(0,0,0,1));
    t4   = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(0,0,0,1));

    _mm_store_ss( dvdaj1 , _mm_add_ss( Y, dvdatmp) );
    _mm_store_ss( dvdaj2 , _mm_add_ss( F, t2) );
    _mm_store_ss( dvdaj3 , _mm_add_ss( G, t3) );
    _mm_store_ss( dvdaj4 , _mm_add_ss( H, t4) );

    return ftmp;
}
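
/*
 * Sketch of how the generalized-Born helper is meant to be called (illustrative
 * only; all variable names here are hypothetical kernel-local registers):
 *
 *     __m128 ftmp  = gmx_mm_int_4_genborn_ps(r,isai,isaj1,isaj2,isaj3,isaj4,
 *                                            gbtabscale,GBtab,qq,&dvdasum,
 *                                            dvdaj1,dvdaj2,dvdaj3,dvdaj4,&vgbtot);
 *     __m128 fscal = _mm_mul_ps(ftmp,rinv);   (note the +rinv convention here)
 */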
/* Return force should be multiplied by +rinv to get fscal */
static inline __m128
gmx_mm_int_3_genborn_ps(__m128 r, __m128 isai,
                        float * isaj1, float *isaj2, float *isaj3,
                        __m128 gbtabscale, float * GBtab, __m128 qq, __m128 *dvdasum,
                        float *dvdaj1, float *dvdaj2, float *dvdaj3,
                        __m128 *vgbtot)
{
    const __m128 half = {0.5,0.5,0.5,0.5};

    __m128  rt,eps,eps2,Y,F,G,H,VV,FF,ftmp,isaprod,t2,t3,t4,isaj,vgb,dvdatmp;
    __m128i n0;
    int     n_a,n_b,n_c;

    /* Assemble isaj from the three scalar values */
    isaj = _mm_load_ss(isaj1);
    t2   = _mm_load_ss(isaj2);
    t3   = _mm_load_ss(isaj3);
    isaj = _mm_unpacklo_ps(isaj,t2);  /* -  -  t2 t1 */
    t3   = _mm_unpacklo_ps(t3,t3);    /* -  -  t3 t3 */
    isaj = _mm_movelh_ps(isaj,t3);    /* t3 t3 t2 t1 */

    isaprod    = _mm_mul_ps(isai,isaj);
    qq         = _mm_mul_ps(qq,isaprod);
    gbtabscale = _mm_mul_ps( isaprod, gbtabscale );

    rt   = _mm_mul_ps(r,gbtabscale);
    n0   = _mm_cvttps_epi32(rt);
    eps  = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
    eps2 = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a  = gmx_mm_extract_epi32(n0,0);
    n_b  = gmx_mm_extract_epi32(n0,1);
    n_c  = gmx_mm_extract_epi32(n0,2);
    Y    = _mm_load_ps(GBtab + 4*n_a);
    F    = _mm_load_ps(GBtab + 4*n_b);
    G    = _mm_load_ps(GBtab + 4*n_c);
    H    = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Y,F,G,H);
    G    = _mm_mul_ps(G,eps);              /* Geps  */
    H    = _mm_mul_ps(H,eps2);             /* Heps2 */
    F    = _mm_add_ps(_mm_add_ps(F,G),H);  /* Fp    */

    VV   = _mm_add_ps(Y, _mm_mul_ps(eps,F));
    FF   = _mm_add_ps(_mm_add_ps(F,G), _mm_add_ps(H,H));

    vgb     = _mm_mul_ps(qq, VV);
    *vgbtot = _mm_sub_ps(*vgbtot,vgb); /* Yes, the sign is correct */

    ftmp    = _mm_mul_ps(_mm_mul_ps(qq, FF), gbtabscale);

    dvdatmp = _mm_mul_ps(half, _mm_add_ps(vgb,_mm_mul_ps(ftmp,r)));

    *dvdasum = _mm_add_ps(*dvdasum,dvdatmp);

    dvdatmp  = _mm_mul_ps(_mm_mul_ps(dvdatmp,isaj), isaj);

    /* Update 3 dvda[j] values */
    Y    = _mm_load_ss(dvdaj1);
    F    = _mm_load_ss(dvdaj2);
    G    = _mm_load_ss(dvdaj3);
    t3   = _mm_movehl_ps(_mm_setzero_ps(),dvdatmp);
    t2   = _mm_shuffle_ps(dvdatmp,dvdatmp,_MM_SHUFFLE(0,0,0,1));

    _mm_store_ss( dvdaj1 , _mm_add_ss( Y, dvdatmp) );
    _mm_store_ss( dvdaj2 , _mm_add_ss( F, t2) );
    _mm_store_ss( dvdaj3 , _mm_add_ss( G, t3) );

    return ftmp;
}
/* Return force should be multiplied by +rinv to get fscal */
static inline __m128
gmx_mm_int_2_genborn_ps(__m128 r, __m128 isai,
                        float * isaj1, float *isaj2,
                        __m128 gbtabscale, float * GBtab, __m128 qq, __m128 *dvdasum,
                        float *dvdaj1, float *dvdaj2,
                        __m128 *vgbtot)
{
    const __m128 half = {0.5,0.5,0.5,0.5};

    __m128  rt,eps,eps2,Y,F,G,H,VV,FF,ftmp,isaprod,t2,isaj,vgb,dvdatmp;
    __m128i n0;
    int     n_a,n_b;

    /* Assemble isaj from the two scalar values */
    isaj = _mm_load_ss(isaj1);
    t2   = _mm_load_ss(isaj2);
    isaj = _mm_unpacklo_ps(isaj,t2);  /* - - t2 t1 */

    isaprod    = _mm_mul_ps(isai,isaj);
    qq         = _mm_mul_ps(qq,isaprod);
    gbtabscale = _mm_mul_ps( isaprod, gbtabscale );

    rt   = _mm_mul_ps(r,gbtabscale);
    n0   = _mm_cvttps_epi32(rt);
    eps  = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
    eps2 = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a  = gmx_mm_extract_epi32(n0,0);
    n_b  = gmx_mm_extract_epi32(n0,1);
    Y    = _mm_load_ps(GBtab + 4*n_a);
    F    = _mm_load_ps(GBtab + 4*n_b);
    G    = _mm_setzero_ps();
    H    = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Y,F,G,H);
    G    = _mm_mul_ps(G,eps);              /* Geps  */
    H    = _mm_mul_ps(H,eps2);             /* Heps2 */
    F    = _mm_add_ps(_mm_add_ps(F,G),H);  /* Fp    */

    VV   = _mm_add_ps(Y, _mm_mul_ps(eps,F));
    FF   = _mm_add_ps(_mm_add_ps(F,G), _mm_add_ps(H,H));

    vgb     = _mm_mul_ps(qq, VV);
    *vgbtot = _mm_sub_ps(*vgbtot,vgb); /* Yes, the sign is correct */

    ftmp    = _mm_mul_ps(_mm_mul_ps(qq, FF), gbtabscale);

    dvdatmp = _mm_mul_ps(half, _mm_add_ps(vgb,_mm_mul_ps(ftmp,r)));

    *dvdasum = _mm_add_ps(*dvdasum,dvdatmp);

    dvdatmp  = _mm_mul_ps(_mm_mul_ps(dvdatmp,isaj), isaj);

    /* Update 2 dvda[j] values */
    Y    = _mm_load_ss(dvdaj1);
    F    = _mm_load_ss(dvdaj2);
    t2   = _mm_shuffle_ps(dvdatmp,dvdatmp,_MM_SHUFFLE(0,0,0,1));

    _mm_store_ss( dvdaj1 , _mm_add_ss( Y, dvdatmp) );
    _mm_store_ss( dvdaj2 , _mm_add_ss( F, t2) );

    return ftmp;
}
/* Return force should be multiplied by +rinv to get fscal */
static inline __m128
gmx_mm_int_1_genborn_ps(__m128 r, __m128 isai,
                        float * isaj1,
                        __m128 gbtabscale, float * GBtab, __m128 qq, __m128 *dvdasum,
                        float *dvdaj1,
                        __m128 *vgbtot)
{
    const __m128 half = {0.5,0.5,0.5,0.5};

    __m128  rt,eps,eps2,Y,F,G,H,VV,FF,ftmp,isaprod,isaj,vgb,dvdatmp;
    __m128i n0;
    int     n_a;

    isaj = _mm_load_ss(isaj1);

    isaprod    = _mm_mul_ps(isai,isaj);
    qq         = _mm_mul_ps(qq,isaprod);
    gbtabscale = _mm_mul_ps( isaprod, gbtabscale );

    rt   = _mm_mul_ps(r,gbtabscale);
    n0   = _mm_cvttps_epi32(rt);
    eps  = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
    eps2 = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a  = gmx_mm_extract_epi32(n0,0);
    Y    = _mm_load_ps(GBtab + 4*n_a);
    F    = _mm_setzero_ps();
    G    = _mm_setzero_ps();
    H    = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Y,F,G,H);
    G    = _mm_mul_ps(G,eps);              /* Geps  */
    H    = _mm_mul_ps(H,eps2);             /* Heps2 */
    F    = _mm_add_ps(_mm_add_ps(F,G),H);  /* Fp    */

    VV   = _mm_add_ps(Y, _mm_mul_ps(eps,F));
    FF   = _mm_add_ps(_mm_add_ps(F,G), _mm_add_ps(H,H));

    vgb     = _mm_mul_ps(qq, VV);
    *vgbtot = _mm_sub_ps(*vgbtot,vgb); /* Yes, the sign is correct */

    ftmp    = _mm_mul_ps(_mm_mul_ps(qq, FF), gbtabscale);

    dvdatmp = _mm_mul_ps(half, _mm_add_ps(vgb,_mm_mul_ps(ftmp,r)));

    *dvdasum = _mm_add_ps(*dvdasum,dvdatmp);

    dvdatmp  = _mm_mul_ps(_mm_mul_ps(dvdatmp,isaj), isaj);

    /* Update 1 dvda[j] value */
    Y    = _mm_load_ss(dvdaj1);

    _mm_store_ss( dvdaj1 , _mm_add_ss( Y, dvdatmp) );

    return ftmp;
}
/* NOTE: GMX_SSE3 is assumed below as the SSE3 feature guard; it should match
 * the guard used for the pmmintrin.h include earlier in this header.
 */
static inline void
gmx_mm_update_iforce_1atom_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                              float *fptr, float *fshiftptr)
{
    __m128 t2,t3;

#ifdef GMX_SSE3
    /* SSE3 */
    fix1 = _mm_hadd_ps(fix1,fix1);
    fiy1 = _mm_hadd_ps(fiy1,fiz1);

    fix1 = _mm_hadd_ps(fix1,fiy1); /* fiz1 fiy1 fix1 fix1 */
#else
    /* SSE2 */
    __m128 t1 = _mm_setzero_ps();
    /* transpose data */
    _MM_TRANSPOSE4_PS(fix1,t1,fiy1,fiz1);
    fix1 = _mm_add_ps(_mm_add_ps(fix1,t1), _mm_add_ps(fiy1,fiz1));
#endif
    t2 = _mm_load_ss(fptr);
    t2 = _mm_loadh_pi(t2,(__m64 *)(fptr+1));
    t3 = _mm_load_ss(fshiftptr);
    t3 = _mm_loadh_pi(t3,(__m64 *)(fshiftptr+1));

    t2 = _mm_add_ps(t2,fix1);
    t3 = _mm_add_ps(t3,fix1);

    _mm_store_ss(fptr,t2);
    _mm_storeh_pi((__m64 *)(fptr+1),t2);
    _mm_store_ss(fshiftptr,t3);
    _mm_storeh_pi((__m64 *)(fshiftptr+1),t3);
}
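
/*
 * Usage sketch for the i-force reduction routines (illustrative; f, fshift,
 * ii and is are hypothetical kernel-local names for the force array, the
 * shift-force array and the i-atom / shift indices):
 *
 *     gmx_mm_update_iforce_1atom_ps(fix1,fiy1,fiz1, f+3*ii, fshift+3*is);
 *
 * The accumulated partial forces in fix1/fiy1/fiz1 are reduced across the
 * four SSE lanes and added to both the force and the shift-force arrays.
 */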
static inline void
gmx_mm_update_iforce_2atoms_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                               __m128 fix2, __m128 fiy2, __m128 fiz2,
                               float *fptr, float *fshiftptr)
{
    __m128 t1 = _mm_setzero_ps();
    __m128 t2,t4;

#ifdef GMX_SSE3
    /* SSE3 */
    fix1 = _mm_hadd_ps(fix1,fiy1);
    fiz1 = _mm_hadd_ps(fiz1,fix2);
    fiy2 = _mm_hadd_ps(fiy2,fiz2);

    fix1 = _mm_hadd_ps(fix1,fiz1); /* fix2 fiz1 fiy1 fix1 */
    fiy2 = _mm_hadd_ps(fiy2,fiy2); /* -    -    fiz2 fiy2 */
#else
    /* SSE2 */
    /* transpose data */
    _MM_TRANSPOSE4_PS(fix1,fiy1,fiz1,fix2);
    t1 = _mm_unpacklo_ps(fiy2,fiz2);
    t2 = _mm_unpackhi_ps(fiy2,fiz2);

    fix1 = _mm_add_ps(_mm_add_ps(fix1,fiy1), _mm_add_ps(fiz1,fix2));
    t1   = _mm_add_ps(t1,t2);
    t2   = _mm_movehl_ps(t2,t1);
    fiy2 = _mm_add_ps(t1,t2);
#endif
    _mm_storeu_ps(fptr, _mm_add_ps(fix1,_mm_loadu_ps(fptr)));
    t1 = _mm_loadl_pi(t1,(__m64 *)(fptr+4));
    _mm_storel_pi((__m64 *)(fptr+4), _mm_add_ps(fiy2,t1));

    t4 = _mm_load_ss(fshiftptr+2);
    t4 = _mm_loadh_pi(t4,(__m64 *)(fshiftptr));

    t1 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,2)); /* fiy2 -    fix2 fiz1 */
    t1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(3,1,0,0));     /* fiy2 fix2 -    fiz1 */
    t2 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(1,0,0,1)); /* fiy1 fix1 -    fiz2 */

    t1 = _mm_add_ps(t1,t2);
    t1 = _mm_add_ps(t1,t4); /* y x - z */

    _mm_store_ss(fshiftptr+2,t1);
    _mm_storeh_pi((__m64 *)(fshiftptr),t1);
}
static inline void
gmx_mm_update_iforce_3atoms_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                               __m128 fix2, __m128 fiy2, __m128 fiz2,
                               __m128 fix3, __m128 fiy3, __m128 fiz3,
                               float *fptr, float *fshiftptr)
{
    __m128 t1,t2,t3,t4;

#ifdef GMX_SSE3
    /* SSE3 */
    fix1 = _mm_hadd_ps(fix1,fiy1);
    fiz1 = _mm_hadd_ps(fiz1,fix2);
    fiy2 = _mm_hadd_ps(fiy2,fiz2);
    fix3 = _mm_hadd_ps(fix3,fiy3);
    fiz3 = _mm_hadd_ps(fiz3,fiz3);

    fix1 = _mm_hadd_ps(fix1,fiz1); /* fix2 fiz1 fiy1 fix1 */
    fiy2 = _mm_hadd_ps(fiy2,fix3); /* fiy3 fix3 fiz2 fiy2 */
    fiz3 = _mm_hadd_ps(fiz3,fiz3); /* -    -    -    fiz3 */
#else
    /* SSE2 */
    /* transpose data */
    _MM_TRANSPOSE4_PS(fix1,fiy1,fiz1,fix2);
    _MM_TRANSPOSE4_PS(fiy2,fiz2,fix3,fiy3);
    t2 = _mm_movehl_ps(_mm_setzero_ps(),fiz3);
    t1 = _mm_shuffle_ps(fiz3,fiz3,_MM_SHUFFLE(0,0,0,1));
    t3 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(0,0,0,1));

    fix1 = _mm_add_ps(_mm_add_ps(fix1,fiy1), _mm_add_ps(fiz1,fix2));
    fiy2 = _mm_add_ps(_mm_add_ps(fiy2,fiz2), _mm_add_ps(fix3,fiy3));
    fiz3 = _mm_add_ss(_mm_add_ps(fiz3,t1), _mm_add_ps(t2,t3));
#endif
    _mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)));
    _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));
    _mm_store_ss (fptr+8,_mm_add_ss(fiz3,_mm_load_ss(fptr+8)));

    t4 = _mm_load_ss(fshiftptr+2);
    t4 = _mm_loadh_pi(t4,(__m64 *)(fshiftptr));

    t1 = _mm_shuffle_ps(fiz3,fix1,_MM_SHUFFLE(1,0,0,0)); /* fiy1 fix1 -    fiz3 */
    t2 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(3,2,2,2)); /* fiy3 fix3 -    fiz1 */
    t3 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(3,3,0,1)); /* fix2 fix2 fiy2 fiz2 */
    t3 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(1,2,0,0));     /* fiy2 fix2 -    fiz2 */

    t1 = _mm_add_ps(t1,t2);
    t3 = _mm_add_ps(t3,t4);
    t1 = _mm_add_ps(t1,t3); /* y x - z */

    _mm_store_ss(fshiftptr+2,t1);
    _mm_storeh_pi((__m64 *)(fshiftptr),t1);
}
static inline void
gmx_mm_update_iforce_4atoms_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                               __m128 fix2, __m128 fiy2, __m128 fiz2,
                               __m128 fix3, __m128 fiy3, __m128 fiz3,
                               __m128 fix4, __m128 fiy4, __m128 fiz4,
                               float *fptr, float *fshiftptr)
{
    __m128 t1,t2,t3,t4,t5;

#ifdef GMX_SSE3
    /* SSE3 */
    fix1 = _mm_hadd_ps(fix1,fiy1);
    fiz1 = _mm_hadd_ps(fiz1,fix2);
    fiy2 = _mm_hadd_ps(fiy2,fiz2);
    fix3 = _mm_hadd_ps(fix3,fiy3);
    fiz3 = _mm_hadd_ps(fiz3,fix4);
    fiy4 = _mm_hadd_ps(fiy4,fiz4);

    fix1 = _mm_hadd_ps(fix1,fiz1); /* fix2 fiz1 fiy1 fix1 */
    fiy2 = _mm_hadd_ps(fiy2,fix3); /* fiy3 fix3 fiz2 fiy2 */
    fiz3 = _mm_hadd_ps(fiz3,fiy4); /* fiz4 fiy4 fix4 fiz3 */
#else
    /* SSE2 */
    /* transpose data */
    _MM_TRANSPOSE4_PS(fix1,fiy1,fiz1,fix2);
    _MM_TRANSPOSE4_PS(fiy2,fiz2,fix3,fiy3);
    _MM_TRANSPOSE4_PS(fiz3,fix4,fiy4,fiz4);

    fix1 = _mm_add_ps(_mm_add_ps(fix1,fiy1), _mm_add_ps(fiz1,fix2));
    fiy2 = _mm_add_ps(_mm_add_ps(fiy2,fiz2), _mm_add_ps(fix3,fiy3));
    fiz3 = _mm_add_ps(_mm_add_ps(fiz3,fix4), _mm_add_ps(fiy4,fiz4));
#endif
    _mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)));
    _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));
    _mm_storeu_ps(fptr+8,_mm_add_ps(fiz3,_mm_loadu_ps(fptr+8)));

    t5 = _mm_load_ss(fshiftptr+2);
    t5 = _mm_loadh_pi(t5,(__m64 *)(fshiftptr));

    t1 = _mm_shuffle_ps(fix1,fix1,_MM_SHUFFLE(1,0,2,2)); /* fiy1 fix1 -    fiz1 */
    t2 = _mm_shuffle_ps(fiy2,fiy2,_MM_SHUFFLE(3,2,1,1)); /* fiy3 fix3 -    fiz2 */
    t3 = _mm_shuffle_ps(fiz3,fiz3,_MM_SHUFFLE(2,1,0,0)); /* fiy4 fix4 -    fiz3 */
    t4 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,3)); /* fiy2 fiy2 fix2 fix2 */
    t4 = _mm_shuffle_ps(fiz3,t4,_MM_SHUFFLE(2,0,3,3));   /* fiy2 fix2 -    fiz4 */

    t1 = _mm_add_ps(t1,t2);
    t3 = _mm_add_ps(t3,t4);
    t1 = _mm_add_ps(t1,t3); /* y x - z */
    t5 = _mm_add_ps(t5,t1);

    _mm_store_ss(fshiftptr+2,t5);
    _mm_storeh_pi((__m64 *)(fshiftptr),t5);
}
static inline void
gmx_mm_update_1pot_ps(__m128 pot1, float *ptr1)
{
#ifdef GMX_SSE3
    /* SSE3 */
    pot1 = _mm_hadd_ps(pot1,pot1);
    pot1 = _mm_hadd_ps(pot1,pot1);
#else
    /* SSE2 */
    pot1 = _mm_add_ps(pot1,_mm_movehl_ps(pot1,pot1));
    pot1 = _mm_add_ps(pot1,_mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(0,0,0,1)));
#endif
    _mm_store_ss(ptr1,_mm_add_ss(pot1,_mm_load_ss(ptr1)));
}
static inline void
gmx_mm_update_2pot_ps(__m128 pot1, float *ptr1, __m128 pot2, float *ptr2)
{
#ifdef GMX_SSE3
    /* SSE3 */
    pot1 = _mm_hadd_ps(pot1,pot2);
    pot1 = _mm_hadd_ps(pot1,pot1);
    pot2 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(0,0,0,1));
#else
    /* SSE2 */
    __m128 t1,t2;
    t1   = _mm_movehl_ps(pot2,pot1);   /* 2d 2c 1d 1c */
    t2   = _mm_movelh_ps(pot1,pot2);   /* 2b 2a 1b 1a */
    t1   = _mm_add_ps(t1,t2);          /* 2  2  1  1  */
    t2   = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(3,3,1,1));
    pot1 = _mm_add_ps(t1,t2);          /* -  2  -  1  */
    pot2 = _mm_movehl_ps(t2,pot1);     /* -  -  -  2  */
#endif
    _mm_store_ss(ptr1,_mm_add_ss(pot1,_mm_load_ss(ptr1)));
    _mm_store_ss(ptr2,_mm_add_ss(pot2,_mm_load_ss(ptr2)));
}
static inline void
gmx_mm_update_4pot_ps(__m128 pot1, float *ptr1, __m128 pot2, float *ptr2, __m128 pot3, float *ptr3, __m128 pot4, float *ptr4)
{
    _MM_TRANSPOSE4_PS(pot1,pot2,pot3,pot4);

    pot1 = _mm_add_ps(_mm_add_ps(pot1,pot2),_mm_add_ps(pot3,pot4));
    pot2 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(1,1,1,1));
    pot3 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(2,2,2,2));
    pot4 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(3,3,3,3));

    _mm_store_ss(ptr1,_mm_add_ss(pot1,_mm_load_ss(ptr1)));
    _mm_store_ss(ptr2,_mm_add_ss(pot2,_mm_load_ss(ptr2)));
    _mm_store_ss(ptr3,_mm_add_ss(pot3,_mm_load_ss(ptr3)));
    _mm_store_ss(ptr4,_mm_add_ss(pot4,_mm_load_ss(ptr4)));
}