/* [gromacs/qmmm-gamess-us.git] src/mdlib/genborn_sse2_double.c */
1 #ifdef HAVE_CONFIG_H
2 #include <config.h>
3 #endif
5 #include <math.h>
6 #include <string.h>
8 #include "typedefs.h"
9 #include "smalloc.h"
10 #include "genborn.h"
11 #include "vec.h"
12 #include "grompp.h"
13 #include "pdbio.h"
14 #include "names.h"
15 #include "physics.h"
16 #include "domdec.h"
17 #include "partdec.h"
18 #include "network.h"
19 #include "gmx_fatal.h"
20 #include "mtop_util.h"
21 #include "genborn.h"
23 #ifdef GMX_LIB_MPI
24 #include <mpi.h>
25 #endif
26 #ifdef GMX_THREADS
27 #include "tmpi.h"
28 #endif
30 /* Only compile this file if SSE2 intrinsics are available */
31 #if ( (defined(GMX_IA32_SSE2) || defined(GMX_X86_64_SSE2) || defined(GMX_SSE2)) && defined(GMX_DOUBLE) )
32 #include <xmmintrin.h>
33 #include <emmintrin.h>
35 #if (defined (_MSC_VER) || defined(__INTEL_COMPILER))
36 #define gmx_castsi128_pd(a) _mm_castsi128_pd(a)
37 #define gmx_castpd_si128(a) _mm_castpd_si128(a)
38 #elif defined(__GNUC__)
39 #define gmx_castsi128_pd(a) ((__m128d)(a))
40 #define gmx_castpd_si128(a) ((__m128i)(a))
41 #else
42 static __m128d gmx_castsi128_pd(__m128i a) { return *(__m128d *) &a; }
43 static __m128i gmx_castpd_si128(__m128d a) { return *(__m128i *) &a; }
44 #endif
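/*
 * sincos_sse2double() below appears to follow the usual SIMD sincos recipe:
 * the argument is scaled by 2/pi and rounded to find the quadrant, the
 * remainder is recovered with the two-constant (Cody-Waite style) split
 * kc1/kc2, separate minimax polynomials are evaluated for sin and cos on the
 * reduced argument, and the quadrant offsets then select and sign-correct
 * the two results.
 */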
47 static inline void
48 sincos_sse2double(__m128d x, __m128d *sinval, __m128d *cosval)
50 const __m128d two_over_pi = {2.0/M_PI,2.0/M_PI};
51 const __m128d half = {0.5,0.5};
52 const __m128d one = {1.0,1.0};
53 const __m128i izero = _mm_set1_epi32(0);
54 const __m128i ione = _mm_set1_epi32(1);
55 const __m128i itwo = _mm_set1_epi32(2);
56 const __m128i ithree = _mm_set1_epi32(3);
57 const __m128d sincosd_kc1 = {(13176794.0 / 8388608.0),(13176794.0 / 8388608.0)};
58 const __m128d sincosd_kc2 = {7.5497899548918821691639751442098584e-8,7.5497899548918821691639751442098584e-8};
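/* kc1 (13176794/2^23) is a truncated pi/2 and kc2 carries the remaining
 * low-order bits, so q*pi/2 can be subtracted in two steps with extra
 * precision during the argument reduction below. */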
59 const __m128d sincosd_cc0 = {0.00000000206374484196,0.00000000206374484196};
60 const __m128d sincosd_cc1 = {-0.00000027555365134677,-0.00000027555365134677};
61 const __m128d sincosd_cc2 = {0.00002480157946764225,0.00002480157946764225};
62 const __m128d sincosd_cc3 = {-0.00138888888730525966,-0.00138888888730525966};
63 const __m128d sincosd_cc4 = {0.04166666666651986722,0.04166666666651986722};
64 const __m128d sincosd_cc5 = {-0.49999999999999547304,-0.49999999999999547304};
65 const __m128d sincosd_sc0 = {0.00000000015893606014,0.00000000015893606014};
66 const __m128d sincosd_sc1 = {-0.00000002505069049138,-0.00000002505069049138};
67 const __m128d sincosd_sc2 = {0.00000275573131527032,0.00000275573131527032};
68 const __m128d sincosd_sc3 = {-0.00019841269827816117,-0.00019841269827816117};
69 const __m128d sincosd_sc4 = {0.00833333333331908278,0.00833333333331908278};
70 const __m128d sincosd_sc5 = {-0.16666666666666612594,-0.16666666666666612594};
72 __m128d signbit = gmx_castsi128_pd(_mm_set1_epi64x(0x8000000000000000ULL));
73 __m128d tiny = gmx_castsi128_pd(_mm_set1_epi64x(0x3e40000000000000ULL));
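/* signbit selects only the IEEE sign bit; tiny corresponds to 2^-27, the
 * threshold below which the small-angle approximation further down is used. */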
75 __m128d xl,xl2,xl3,qd,absxl,p1,cx,sx,ts,tc,tsn,tcn;
76 __m128i q;
77 __m128i offsetSin,offsetCos;
78 __m128d sinMask,cosMask,isTiny;
79 __m128d ct0,ct1,ct2,ct3,ct4,ct5,ct6,st1,st2,st3,st4,st6;
81 /* Rescale the angle to the range 0..4, and find which quadrant it is in */
82 xl = _mm_mul_pd(x,two_over_pi);
84 /* q=integer part of xl, rounded _away_ from 0.0 */
85 /* Add 0.5 away from 0.0 */
86 xl = _mm_add_pd(xl,_mm_or_pd(_mm_and_pd(xl,signbit),half));
88 q = _mm_cvttpd_epi32(xl);
89 qd = _mm_cvtepi32_pd(q);
90 q = _mm_shuffle_epi32(q,_MM_SHUFFLE(1,1,0,0));
92 /* Compute offset based on quadrant the arg falls in */
93 offsetSin = _mm_and_si128(q,ithree);
94 offsetCos = _mm_add_epi32(offsetSin,ione);
96 /* Remainder in range [-pi/4..pi/4] */
97 p1 = _mm_mul_pd(qd,sincosd_kc1);
98 xl = _mm_mul_pd(qd,sincosd_kc2);
99 p1 = _mm_sub_pd(x,p1);
100 xl = _mm_sub_pd(p1,xl);
102 absxl = _mm_andnot_pd(signbit,xl);
103 isTiny = _mm_cmpgt_pd(tiny,absxl);
105 xl2 = _mm_mul_pd(xl,xl);
106 xl3 = _mm_mul_pd(xl2,xl);
108 ct0 = _mm_mul_pd(xl2,xl2);
109 ct1 = _mm_mul_pd(sincosd_cc0,xl2);
110 ct2 = _mm_mul_pd(sincosd_cc2,xl2);
111 ct3 = _mm_mul_pd(sincosd_cc4,xl2);
112 st1 = _mm_mul_pd(sincosd_sc0,xl2);
113 st2 = _mm_mul_pd(sincosd_sc2,xl2);
114 st3 = _mm_mul_pd(sincosd_sc4,xl2);
115 ct1 = _mm_add_pd(ct1,sincosd_cc1);
116 ct2 = _mm_add_pd(ct2,sincosd_cc3);
117 ct3 = _mm_add_pd(ct3,sincosd_cc5);
118 st1 = _mm_add_pd(st1,sincosd_sc1);
119 st2 = _mm_add_pd(st2,sincosd_sc3);
120 st3 = _mm_add_pd(st3,sincosd_sc5);
122 ct4 = _mm_mul_pd(ct2,ct0);
123 ct4 = _mm_add_pd(ct4,ct3);
125 st4 = _mm_mul_pd(st2,ct0);
126 st4 = _mm_add_pd(st4,st3);
127 ct5 = _mm_mul_pd(ct0,ct0);
129 ct6 = _mm_mul_pd(ct5,ct1);
130 ct6 = _mm_add_pd(ct6,ct4);
132 st6 = _mm_mul_pd(ct5,st1);
133 st6 = _mm_add_pd(st6,st4);
135 cx = _mm_mul_pd(ct6,xl2);
136 cx = _mm_add_pd(cx,one);
138 sx = _mm_mul_pd(st6,xl3);
139 sx = _mm_add_pd(sx,xl);
141 /* Small angle approximation, sin(tiny)=tiny, cos(tiny)=1.0 */
142 sx = _mm_or_pd( _mm_and_pd(isTiny,xl) , _mm_andnot_pd(isTiny,sx) );
143 cx = _mm_or_pd( _mm_and_pd(isTiny,one) , _mm_andnot_pd(isTiny,cx) );
145 sinMask = gmx_castsi128_pd(_mm_cmpeq_epi32( _mm_and_si128(offsetSin,ione), izero));
146 cosMask = gmx_castsi128_pd(_mm_cmpeq_epi32( _mm_and_si128(offsetCos,ione), izero));
148 ts = _mm_or_pd( _mm_and_pd(sinMask,sx) , _mm_andnot_pd(sinMask,cx) );
149 tc = _mm_or_pd( _mm_and_pd(cosMask,sx) , _mm_andnot_pd(cosMask,cx) );
152 /* Flip the sign of the result when (offset mod 4) = 2 or 3 */
152 sinMask = gmx_castsi128_pd(_mm_cmpeq_epi32( _mm_and_si128(offsetSin,itwo), izero));
153 tsn = _mm_xor_pd(signbit,ts);
154 ts = _mm_or_pd( _mm_and_pd(sinMask,ts) , _mm_andnot_pd(sinMask,tsn) );
156 cosMask = gmx_castsi128_pd(_mm_cmpeq_epi32( _mm_and_si128(offsetCos,itwo), izero));
157 tcn = _mm_xor_pd(signbit,tc);
158 tc = _mm_or_pd( _mm_and_pd(cosMask,tc) , _mm_andnot_pd(cosMask,tcn) );
160 *sinval = ts;
161 *cosval = tc;
163 return;
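/*
 * log_pd(): essentially log(x) = (e + log2(m))*ln(2), where e is the unbiased
 * exponent pulled out of the double and m is the mantissa forced into [1,2).
 * log2(m) is approximated by the rational form (m-1)*P(m)/Q(m), and 1/Q(m)
 * is obtained from _mm_rcp_ps() refined with two Newton-Raphson steps.
 */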
166 __m128d log_pd(__m128d x)
168 const __m128i exp_mask = _mm_set_epi32(0x7FF00000,0,0x7FF00000,0);
169 const __m128i exp_bias = _mm_set_epi32(0,1023,0,1023);
171 const __m128d const_loge = _mm_set1_pd(0.69314718055994529);
172 const __m128d const_one = _mm_set1_pd(1.0);
173 const __m128d const_two = _mm_set1_pd(2.0);
175 /* Almost full single precision accuracy (~20 bits worst case) */
176 const __m128d P0 = _mm_set1_pd(6.108179944792157749153050);
177 const __m128d P1 = _mm_set1_pd(52.43691313715523327631139);
178 const __m128d P2 = _mm_set1_pd(71.53664010795613671168440);
179 const __m128d P3 = _mm_set1_pd(18.92097516931559547548485);
180 const __m128d P4 = _mm_set1_pd(0.3504714784635941984522153);
181 const __m128d P5 = _mm_set1_pd(-0.007105890734229368515879);
182 const __m128d Q1 = _mm_set1_pd(17.73314231909420567454406);
183 const __m128d Q2 = _mm_set1_pd(48.82373085428713023213363);
184 const __m128d Q3 = _mm_set1_pd(31.65945943354513166309101);
185 const __m128d Q4 = _mm_set1_pd(4.302477477108162270199051);
187 __m128d xmm0,xmm1,xmm2,xmm3, xmm4;
188 __m128i xmmi,xmmj;
190 xmmi = gmx_castpd_si128(x);
191 xmm1 = _mm_cvtepi32_pd(_mm_shuffle_epi32(_mm_sub_epi64(_mm_srli_epi64(_mm_and_si128(xmmi, exp_mask), 52), exp_bias),_MM_SHUFFLE(3,1,2,0)));
192 xmm0 = _mm_or_pd(gmx_castsi128_pd(_mm_andnot_si128(exp_mask, xmmi)), const_one);
194 xmm2 = _mm_mul_pd(P5,xmm0);
195 xmm2 = _mm_add_pd(xmm2,P4);
196 xmm2 = _mm_mul_pd(xmm2,xmm0);
197 xmm2 = _mm_add_pd(xmm2,P3);
198 xmm2 = _mm_mul_pd(xmm2,xmm0);
199 xmm2 = _mm_add_pd(xmm2,P2);
200 xmm2 = _mm_mul_pd(xmm2,xmm0);
201 xmm2 = _mm_add_pd(xmm2,P1);
202 xmm2 = _mm_mul_pd(xmm2,xmm0);
203 xmm2 = _mm_add_pd(xmm2,P0);
205 xmm3 = _mm_mul_pd(Q4,xmm0);
206 xmm3 = _mm_add_pd(xmm3,Q3);
207 xmm3 = _mm_mul_pd(xmm3,xmm0);
208 xmm3 = _mm_add_pd(xmm3,Q2);
209 xmm3 = _mm_mul_pd(xmm3,xmm0);
210 xmm3 = _mm_add_pd(xmm3,Q1);
211 xmm3 = _mm_mul_pd(xmm3,xmm0);
212 xmm3 = _mm_add_pd(xmm3,const_one);
214 /* xmm4=1.0/xmm3 */
215 xmm4 = _mm_cvtps_pd(_mm_rcp_ps(_mm_cvtpd_ps(xmm3)));
216 xmm4 = _mm_mul_pd(xmm4,_mm_sub_pd(const_two,_mm_mul_pd(xmm3,xmm4)));
217 xmm4 = _mm_mul_pd(xmm4,_mm_sub_pd(const_two,_mm_mul_pd(xmm3,xmm4)));
218 xmm2 = _mm_mul_pd(xmm2,xmm4);
220 xmm0 = _mm_sub_pd(xmm0, const_one);
221 xmm0 = _mm_mul_pd(xmm0,xmm2);
223 xmm0 = _mm_add_pd(xmm0,xmm1);
225 return _mm_mul_pd(xmm0, const_loge);
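/*
 * exp_pd(): the argument is converted to base 2 (y = x*log2(e), clamped to
 * roughly [-1023,1025]), split into an integer part n and a remainder in
 * [0,1); 2^n is built by adding the exponent bias (1023) and shifting into
 * the exponent field, 2^remainder comes from the degree-9 polynomial below,
 * and the two factors are multiplied together.
 */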
228 /* This exp() routine provides accuracy of 10E-9 to 10E-11.
229 * The polynomial minimax coefficients are actually accurate to 10E-14,
230  * but we lose some accuracy in the polynomial evaluation. */
232 __m128d exp_pd(__m128d x)
234 const __m128d lim1 = _mm_set1_pd(1025.0); /* 1025.00000e+0d */
235 const __m128d lim2 = _mm_set1_pd(-1022.99999999); /* -1022.99999e+0f */
237 const __m128i base = _mm_set_epi32(0,0,1023,1023);
238 const __m128d half = _mm_set1_pd(0.5);
239 const __m128d log2e = _mm_set1_pd(1.4426950408889634);
241 const __m128d exp_P0 = _mm_set1_pd(1.00000000000001276211229749);
242 const __m128d exp_P1 = _mm_set1_pd(6.931471805598709708635169E-1);
243 const __m128d exp_P2 = _mm_set1_pd(2.402265069564965287455972E-1);
244 const __m128d exp_P3 = _mm_set1_pd(5.550410866868561155599683E-2);
245 const __m128d exp_P4 = _mm_set1_pd(9.618129192067246128919915E-3);
246 const __m128d exp_P5 = _mm_set1_pd(1.333355761760444302342084E-3);
247 const __m128d exp_P6 = _mm_set1_pd(1.540343494807179111289781E-4);
248 const __m128d exp_P7 = _mm_set1_pd(1.525298483865349629325421E-5);
249 const __m128d exp_P8 = _mm_set1_pd(1.325940560934510417818578E-6);
250 const __m128d exp_P9 = _mm_set1_pd(1.015033670529892589443421E-7);
252 __m128d xmm0,xmm1;
253 __m128i xmmi;
255 xmm0 = _mm_mul_pd(x,log2e);
256 xmm0 = _mm_min_pd(xmm0,lim1);
257 xmm0 = _mm_max_pd(xmm0,lim2);
258 xmm1 = _mm_sub_pd(xmm0,half);
260 xmmi = _mm_cvtpd_epi32(xmm1);
261 xmm1 = _mm_cvtepi32_pd(xmmi);
263 xmmi = _mm_add_epi32(xmmi,base);
264 xmmi = _mm_shuffle_epi32(xmmi,_MM_SHUFFLE(3,1,2,0));
265 xmmi = _mm_slli_epi64(xmmi,52);
267 xmm0 = _mm_sub_pd(xmm0,xmm1);
269 xmm1 = _mm_mul_pd(exp_P9,xmm0);
270 xmm1 = _mm_add_pd(xmm1,exp_P8);
271 xmm1 = _mm_mul_pd(xmm1,xmm0);
272 xmm1 = _mm_add_pd(xmm1,exp_P7);
273 xmm1 = _mm_mul_pd(xmm1,xmm0);
274 xmm1 = _mm_add_pd(xmm1,exp_P6);
275 xmm1 = _mm_mul_pd(xmm1,xmm0);
276 xmm1 = _mm_add_pd(xmm1,exp_P5);
277 xmm1 = _mm_mul_pd(xmm1,xmm0);
278 xmm1 = _mm_add_pd(xmm1,exp_P4);
279 xmm1 = _mm_mul_pd(xmm1,xmm0);
280 xmm1 = _mm_add_pd(xmm1,exp_P3);
281 xmm1 = _mm_mul_pd(xmm1,xmm0);
282 xmm1 = _mm_add_pd(xmm1,exp_P2);
283 xmm1 = _mm_mul_pd(xmm1,xmm0);
284 xmm1 = _mm_add_pd(xmm1,exp_P1);
285 xmm1 = _mm_mul_pd(xmm1,xmm0);
286 xmm1 = _mm_add_pd(xmm1,exp_P0);
287 xmm1 = _mm_mul_pd(xmm1,gmx_castsi128_pd(xmmi));
289 return xmm1;
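/* my_invrsq_pd(): 1/sqrt(x) is seeded from the ~12-bit single-precision
 * _mm_rsqrt_ps() estimate and refined with two Newton-Raphson iterations,
 * y' = 0.5*y*(3 - x*y*y). */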
292 static inline __m128d
293 my_invrsq_pd(__m128d x)
295 const __m128d three = {3.0, 3.0};
296 const __m128d half = {0.5, 0.5};
298 __m128 t = _mm_rsqrt_ps(_mm_cvtpd_ps(x)); /* Convert to single precision and do _mm_rsqrt_ps() */
299 __m128d t1 = _mm_cvtps_pd(t); /* Convert back to double precision */
301 /* First Newton-Raphson step, accuracy is now 24 bits */
302 __m128d t2 = _mm_mul_pd(half,_mm_mul_pd(t1,_mm_sub_pd(three,_mm_mul_pd(x,_mm_mul_pd(t1,t1)))));
304 /* Return second Newton-Raphson step, accuracy 48 bits */
305 return _mm_mul_pd(half,_mm_mul_pd(t2,_mm_sub_pd(three,_mm_mul_pd(x,_mm_mul_pd(t2,t2)))));
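/* my_inv_pd(): same approach for 1/x, seeded from _mm_rcp_ps() and refined
 * with two Newton-Raphson iterations, y' = y*(2 - x*y). */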
308 static inline __m128d
309 my_inv_pd(__m128d x)
311 const __m128d two = {2.0, 2.0};
313 __m128 t = _mm_rcp_ps(_mm_cvtpd_ps(x));
314 __m128d t1 = _mm_cvtps_pd(t);
315 __m128d t2 = _mm_mul_pd(t1,_mm_sub_pd(two,_mm_mul_pd(t1,x)));
317 return _mm_mul_pd(t2,_mm_sub_pd(two,_mm_mul_pd(t2,x)));
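/*
 * calc_gb_rad_still_sse2_double(): Born radii using the Still analytical
 * formula (see the STILL_* constants). For each i atom the GB neighbor list
 * is processed two j atoms at a time; each pair adds p4*vsolv_j*ccf/r^4 to
 * the i-atom sum and the symmetric term to gpol_still_work[j], where ccf is
 * a cosine-based switching function of r^2/(rai+raj)^2. Two chain-rule
 * prefactors per pair (ai->aj and aj->ai) are stored in fr->dadx for use in
 * calc_gb_chainrule_sse2_double() further down.
 */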
321 int
322 calc_gb_rad_still_sse2_double(t_commrec *cr, t_forcerec *fr,int natoms, gmx_localtop_t *top,
323 const t_atomtypes *atype, double *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md)
325 int i,k,n,ai,ai3,aj1,aj2,aj13,aj23;
326 int at0,at1,nj0,nj1,offset,taj1,taj2;
327 int shift;
329 double factor,gpi_ai,gpi_tmp,gpi2;
330 double shX,shY,shZ;
332 __m128d ix,iy,iz,jx,jy,jz,dx,dy,dz,sX,sY,sZ;
333 __m128d t1,t2,t3,rsq11,rinv,rinv2,rinv4,rinv6;
334 __m128d ratio,gpi,rai,raj,vaj,rvdw,mask_cmp;
335 __m128d ccf,dccf,theta,cosq,term,sinq,res,prod;
336 __m128d xmm1,xmm2,xmm3,xmm4,xmm7,vai,prod_ai,icf4,icf6;
338 const __m128d half = {0.5, 0.5};
339 const __m128d three = {3.0, 3.0};
340 const __m128d one = {1.0, 1.0};
341 const __m128d two = {2.0, 2.0};
342 const __m128d zero = {0.0, 0.0};
343 const __m128d four = {4.0, 4.0};
345 const __m128d p5inv = {STILL_P5INV, STILL_P5INV};
346 const __m128d pip5 = {STILL_PIP5, STILL_PIP5};
347 const __m128d p4 = {STILL_P4, STILL_P4};
349 factor = 0.5 * ONE_4PI_EPS0;
350 n = 0;
352 /* Keep the compiler happy */
353 raj = _mm_setzero_pd();
354 vaj = _mm_setzero_pd();
355 jx = _mm_setzero_pd();
356 jy = _mm_setzero_pd();
357 jz = _mm_setzero_pd();
358 xmm2 = _mm_setzero_pd();
359 xmm7 = _mm_setzero_pd();
361 for(i=0;i<born->nr;i++)
363 born->gpol_still_work[i]=0;
366 for(i=0;i<nl->nri;i++)
368 ai = nl->iinr[i];
369 ai3 = ai * 3;
371 nj0 = nl->jindex[i];
372 nj1 = nl->jindex[i+1];
374 /* Load shifts for this list */
375 shift = nl->shift[i];
376 shX = fr->shift_vec[shift][0];
377 shY = fr->shift_vec[shift][1];
378 shZ = fr->shift_vec[shift][2];
380 /* Splat the shifts */
381 sX = _mm_load1_pd(&shX);
382 sY = _mm_load1_pd(&shY);
383 sZ = _mm_load1_pd(&shZ);
385 offset = (nj1-nj0)%2;
387 /* Polarization energy for atom ai */
388 gpi = _mm_setzero_pd();
390 /* Load particle ai coordinates and add shifts */
391 ix = _mm_load1_pd(x+ai3);
392 iy = _mm_load1_pd(x+ai3+1);
393 iz = _mm_load1_pd(x+ai3+2);
395 ix = _mm_add_pd(sX,ix);
396 iy = _mm_add_pd(sY,iy);
397 iz = _mm_add_pd(sZ,iz);
399 /* Load particle ai gb_radius */
400 rai = _mm_set1_pd(top->atomtypes.gb_radius[md->typeA[ai]]);
401 vai = _mm_set1_pd(born->vsolv[ai]);
402 prod_ai = _mm_mul_pd(p4,vai);
404 for(k=nj0;k<nj1-offset;k+=2)
406 aj1 = nl->jjnr[k];
407 aj2 = nl->jjnr[k+1];
409 aj13 = aj1 * 3;
410 aj23 = aj2 * 3;
412 taj1 = md->typeA[aj1];
413 taj2 = md->typeA[aj2];
415 /* Load particle aj1-2 coordinates and compute ai->aj distances */
416 xmm1 = _mm_loadu_pd(x+aj13);
417 xmm2 = _mm_loadu_pd(x+aj23);
418 jx = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
419 jy = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
421 jz = _mm_loadl_pd(jz,x+aj13+2);
422 jz = _mm_loadh_pd(jz,x+aj23+2);
424 dx = _mm_sub_pd(ix,jx);
425 dy = _mm_sub_pd(iy,jy);
426 dz = _mm_sub_pd(iz,jz);
428 rsq11 = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
429 rinv = my_invrsq_pd(rsq11);
431 rinv2 = _mm_mul_pd(rinv,rinv);
432 rinv4 = _mm_mul_pd(rinv2,rinv2);
433 rinv6 = _mm_mul_pd(rinv4,rinv2);
435 vaj = _mm_loadl_pd(vaj,born->vsolv+aj1);
436 vaj = _mm_loadh_pd(vaj,born->vsolv+aj2);
438 raj = _mm_loadl_pd(raj, top->atomtypes.gb_radius+taj1);
439 raj = _mm_loadh_pd(raj, top->atomtypes.gb_radius+taj2);
441 rvdw = _mm_add_pd(rai,raj);
442 rvdw = _mm_mul_pd(rvdw,rvdw);
443 ratio = _mm_div_pd(rsq11,rvdw);
445 mask_cmp = _mm_cmpgt_pd(ratio,p5inv);
447 switch(_mm_movemask_pd(mask_cmp))
449 case 0x3: /* both lanes of the double-precision mask set */
450 ccf = one;
451 dccf = zero;
452 break;
453 default:
454 theta = _mm_mul_pd(ratio,pip5);
455 sincos_sse2double(theta,&sinq,&cosq);
457 term = _mm_sub_pd(one,cosq);
458 term = _mm_mul_pd(half,term);
459 ccf = _mm_mul_pd(term,term);
460 dccf = _mm_mul_pd(two,term);
461 dccf = _mm_mul_pd(dccf,sinq);
462 dccf = _mm_mul_pd(dccf,pip5);
463 dccf = _mm_mul_pd(dccf,ratio);
465 ccf = _mm_or_pd(_mm_and_pd(mask_cmp,one) ,_mm_andnot_pd(mask_cmp,ccf));
466 dccf = _mm_or_pd(_mm_and_pd(mask_cmp,zero) ,_mm_andnot_pd(mask_cmp,dccf));
469 prod = _mm_mul_pd(p4,vaj);
470 icf4 = _mm_mul_pd(ccf,rinv4);
471 xmm2 = _mm_mul_pd(icf4,prod);
472 xmm3 = _mm_mul_pd(icf4,prod_ai);
473 gpi = _mm_add_pd(gpi,xmm2);
475 /* Load, add and store atom aj gpol energy */
476 xmm7 = _mm_loadl_pd(xmm7,born->gpol_still_work+aj1);
477 xmm7 = _mm_loadh_pd(xmm7,born->gpol_still_work+aj2);
479 xmm3 = _mm_add_pd(xmm7,xmm3);
481 _mm_storel_pd(born->gpol_still_work+aj1,xmm3);
482 _mm_storeh_pd(born->gpol_still_work+aj2,xmm3);
484 /* Chain rule terms */
485 ccf = _mm_mul_pd(four,ccf);
486 xmm3 = _mm_sub_pd(ccf,dccf);
487 icf6 = _mm_mul_pd(xmm3,rinv6);
488 xmm1 = _mm_mul_pd(icf6,prod);
489 xmm2 = _mm_mul_pd(icf6,prod_ai);
491 /* As with single precision, we need to shuffle the data around, to change the order of
492  * the interactions from ai->aj1, ai->aj2 to ai->aj1, aj1->ai, ai->aj2, aj2->ai etc.,
493  * in order to do 2 instead of 4 store operations. */
495 xmm3 = _mm_unpacklo_pd(xmm1,xmm2);
496 xmm4 = _mm_unpackhi_pd(xmm1,xmm2);
498 _mm_storeu_pd(fr->dadx+n,xmm3);
499 n = n + 2;
500 _mm_storeu_pd(fr->dadx+n,xmm4);
501 n = n + 2;
504 /* Deal with odd elements */
505 if(offset!=0)
507 aj1 = nl->jjnr[k];
508 aj13 = aj1 * 3;
509 taj1 = md->typeA[aj1];
511 jx = _mm_load_sd(x+aj13);
512 jy = _mm_load_sd(x+aj13+1);
513 jz = _mm_load_sd(x+aj13+2);
515 raj = _mm_load_sd(top->atomtypes.gb_radius+taj1);
516 vaj = _mm_load_sd(born->vsolv+aj1);
518 dx = _mm_sub_sd(ix,jx);
519 dy = _mm_sub_pd(iy,jy);
520 dz = _mm_sub_pd(iz,jz);
522 rsq11 = _mm_add_sd( _mm_add_sd( _mm_mul_sd(dx,dx) , _mm_mul_sd(dy,dy) ) , _mm_mul_sd(dz,dz) );
523 rinv = my_invrsq_pd(rsq11);
525 rinv2 = _mm_mul_sd(rinv,rinv);
526 rinv4 = _mm_mul_sd(rinv2,rinv2);
527 rinv6 = _mm_mul_sd(rinv4,rinv2);
529 rvdw = _mm_add_sd(rai,raj);
530 rvdw = _mm_mul_sd(rvdw,rvdw);
531 ratio = _mm_div_sd(rsq11,rvdw);
533 mask_cmp = _mm_cmpgt_sd(ratio,p5inv);
535 switch(_mm_movemask_pd(mask_cmp))
537 case 0x3: /* both lanes of the double-precision mask set */
538 ccf = one;
539 dccf = zero;
540 break;
541 default:
542 theta = _mm_mul_sd(ratio,pip5);
543 sincos_sse2double(theta,&sinq,&cosq);
545 term = _mm_sub_sd(one,cosq);
546 term = _mm_mul_sd(half,term);
547 ccf = _mm_mul_sd(term,term);
548 dccf = _mm_mul_sd(two,term);
549 dccf = _mm_mul_sd(dccf,sinq);
550 dccf = _mm_mul_sd(dccf,pip5);
551 dccf = _mm_mul_sd(dccf,ratio);
553 ccf = _mm_or_pd(_mm_and_pd(mask_cmp,one) ,_mm_andnot_pd(mask_cmp,ccf));
554 dccf = _mm_or_pd(_mm_and_pd(mask_cmp,zero) ,_mm_andnot_pd(mask_cmp,dccf));
557 prod = _mm_mul_sd(p4,vaj);
558 icf4 = _mm_mul_sd(ccf,rinv4);
559 xmm2 = _mm_mul_sd(icf4,prod);
560 xmm3 = _mm_mul_sd(icf4,prod_ai);
561 gpi = _mm_add_sd(gpi,xmm2);
563 /* Load, add and store atom aj gpol energy */
564 xmm7 = _mm_load_sd(born->gpol_still_work+aj1);
565 xmm3 = _mm_add_sd(xmm7,xmm3);
566 _mm_store_sd(born->gpol_still_work+aj1,xmm3);
568 /* Chain rule terms */
569 ccf = _mm_mul_sd(four,ccf);
570 xmm3 = _mm_sub_sd(ccf,dccf);
571 icf6 = _mm_mul_sd(xmm3,rinv6);
572 xmm1 = _mm_mul_sd(icf6,prod);
573 xmm2 = _mm_mul_sd(icf6,prod_ai);
575 /* Here we only have ai->aj1 and aj1->ai, so we can store directly */
576 _mm_storel_pd(fr->dadx+n,xmm1);
577 n = n + 1;
578 _mm_storel_pd(fr->dadx+n,xmm2);
579 n = n + 1;
580 } /* End offset */
582 /* Do end processing ... */
583 xmm2 = _mm_unpacklo_pd(xmm2,gpi);
584 gpi = _mm_add_pd(gpi,xmm2);
585 gpi = _mm_shuffle_pd(gpi,gpi,_MM_SHUFFLE2(1,1));
587 /* Load, add and store atom ai polarisation energy */
588 xmm2 = _mm_load_sd(born->gpol_still_work+ai);
589 gpi = _mm_add_sd(gpi,xmm2);
590 _mm_store_sd(born->gpol_still_work+ai,gpi);
593 /* Parallel summations */
594 if(PARTDECOMP(cr))
596 gmx_sum(natoms,born->gpol_still_work, cr);
598 else if(DOMAINDECOMP(cr))
600 dd_atom_sum_real(cr->dd, born->gpol_still_work);
603 /* Compute the radii */
604 for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
606 if(born->use[i] != 0)
608 gpi_ai = born->gpol[i] + born->gpol_still_work[i];
609 gpi2 = gpi_ai*gpi_ai;
611 born->bRad[i]=factor*gmx_invsqrt(gpi2);
612 fr->invsqrta[i]=gmx_invsqrt(born->bRad[i]);
616 /* Extra (local) communication required for DD */
617 if(DOMAINDECOMP(cr))
619 dd_atom_spread_real(cr->dd, born->bRad);
620 dd_atom_spread_real(cr->dd, fr->invsqrta);
623 return 0;
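/*
 * calc_gb_rad_hct_sse2_double(): HCT (Hawkins-Cramer-Truhlar) Born radii.
 * Each pair contributes a descreening term bounded by lij = 1/max(rai,r-sk)
 * and uij = 1/(r+sk) to gpol_hct_work for both atoms; the radius is then
 * 1/(1/rai' - sum), with rai' the dielectric-offset radius, clamped from
 * below at the unmodified radius. As above, two chain-rule factors per pair
 * go into fr->dadx.
 */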
626 int
627 calc_gb_rad_hct_sse2_double(t_commrec *cr, t_forcerec *fr, int natoms, gmx_localtop_t *top,
628 const t_atomtypes *atype, double *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md)
630 int i,k,n,ai,ai3,aj1,aj2,aj13,aj23,nj0,nj1,at0,at1,offset;
631 int p1, p2, shift;
632 double rr,sum,sum_tmp,min_rad,rad,doff;
633 double shX,shY,shZ;
635 __m128d ix,iy,iz,jx,jy,jz,dx,dy,dz,sX,sY,sZ;
636 __m128d t1,t2,t3,rsq11,rinv,r,rai;
637 __m128d rai_inv,sk,sk2,lij,dlij,duij;
638 __m128d uij,lij2,uij2,lij3,uij3,diff2;
639 __m128d lij_inv,sk2_inv,prod,log_term,tmp,tmp_sum;
640 __m128d mask_cmp,mask_cmp2,mask_cmp3;
641 __m128d xmm1,xmm2,xmm3,xmm4,xmm7,xmm8,xmm9,doffset;
642 __m128d sum_ai,chrule,chrule_ai,tmp_ai;
643 __m128d sk_ai, sk2_ai,raj,raj_inv;
645 const __m128d neg = {-1.0, -1.0};
646 const __m128d zero = {0.0, 0.0};
647 const __m128d eigth = {0.125, 0.125};
648 const __m128d qrtr = {0.25, 0.25};
649 const __m128d half = {0.5, 0.5};
650 const __m128d one = {1.0, 1.0};
651 const __m128d two = {2.0, 2.0};
652 const __m128d three = {3.0, 3.0};
654 /* Keep the compiler happy */
655 tmp_ai = _mm_setzero_pd();
656 tmp = _mm_setzero_pd();
657 xmm7 = _mm_setzero_pd();
658 xmm8 = _mm_setzero_pd();
659 raj_inv = _mm_setzero_pd();
660 raj = _mm_setzero_pd();
661 sk = _mm_setzero_pd();
662 jx = _mm_setzero_pd();
663 jy = _mm_setzero_pd();
664 jz = _mm_setzero_pd();
666 /* Set the dielectric offset */
667 doff = born->gb_doffset;
668 doffset = _mm_load1_pd(&doff);
669 n = 0;
671 for(i=0;i<born->nr;i++)
673 born->gpol_hct_work[i] = 0;
676 for(i=0;i<nl->nri;i++)
678 ai = nl->iinr[i];
679 ai3 = ai * 3;
681 nj0 = nl->jindex[i];
682 nj1 = nl->jindex[i+1];
684 /* Load shifts for this list */
685 shift = nl->shift[i];
686 shX = fr->shift_vec[shift][0];
687 shY = fr->shift_vec[shift][1];
688 shZ = fr->shift_vec[shift][2];
690 /* Splat the shifts */
691 sX = _mm_load1_pd(&shX);
692 sY = _mm_load1_pd(&shY);
693 sZ = _mm_load1_pd(&shZ);
695 offset = (nj1-nj0)%2;
697 /* Load rai */
698 rr = top->atomtypes.gb_radius[md->typeA[ai]]-doff;
699 rai = _mm_load1_pd(&rr);
700 rr = 1.0/rr;
701 rai_inv = _mm_load1_pd(&rr);
703 /* Zero out sums for polarisation energies */
704 sum_ai = _mm_setzero_pd();
706 /* Load ai coordinates and add shifts */
707 ix = _mm_load1_pd(x+ai3);
708 iy = _mm_load1_pd(x+ai3+1);
709 iz = _mm_load1_pd(x+ai3+2);
711 ix = _mm_add_pd(sX,ix);
712 iy = _mm_add_pd(sY,iy);
713 iz = _mm_add_pd(sZ,iz);
715 sk_ai = _mm_load1_pd(born->param+ai);
716 sk2_ai = _mm_mul_pd(sk_ai,sk_ai);
718 for(k=nj0;k<nj1-offset;k+=2)
720 aj1 = nl->jjnr[k];
721 aj2 = nl->jjnr[k+1];
723 aj13 = aj1 * 3;
724 aj23 = aj2 * 3;
726 /* Load particle aj1-2 coordinates */
727 xmm1 = _mm_loadu_pd(x+aj13);
728 xmm2 = _mm_loadu_pd(x+aj23);
729 jx = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
730 jy = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
732 jz = _mm_loadl_pd(jz, x+aj13+2);
733 jz = _mm_loadh_pd(jz, x+aj23+2);
735 dx = _mm_sub_pd(ix,jx);
736 dy = _mm_sub_pd(iy,jy);
737 dz = _mm_sub_pd(iz,jz);
739 rsq11 = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
740 rinv = my_invrsq_pd(rsq11);
741 r = _mm_mul_pd(rinv,rsq11);
743 sk = _mm_loadl_pd(sk,born->param+aj1);
744 sk = _mm_loadh_pd(sk,born->param+aj2);
746 /* Load aj1,aj2 raj */
747 p1 = md->typeA[aj1];
748 p2 = md->typeA[aj2];
750 raj = _mm_loadl_pd(raj,top->atomtypes.gb_radius+p1);
751 raj = _mm_loadh_pd(raj,top->atomtypes.gb_radius+p2);
752 raj = _mm_sub_pd(raj,doffset);
754 /* Compute 1.0/raj */
755 raj_inv = my_inv_pd(raj);
757 /* INTERACTION aj->ai STARTS HERE */
758 /* conditional mask for rai<dr+sk */
759 xmm1 = _mm_add_pd(r,sk);
760 mask_cmp = _mm_cmplt_pd(rai,xmm1);
762 /* conditional for rai>dr-sk, ends with mask_cmp2 */
763 xmm2 = _mm_sub_pd(r,sk);
764 xmm3 = my_inv_pd(xmm2);
765 mask_cmp2 = _mm_cmpgt_pd(rai,xmm2);
767 lij = _mm_or_pd(_mm_and_pd(mask_cmp2,rai_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
768 dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));
770 uij = my_inv_pd(xmm1);
771 lij2 = _mm_mul_pd(lij,lij);
772 lij3 = _mm_mul_pd(lij2,lij);
773 uij2 = _mm_mul_pd(uij,uij);
774 uij3 = _mm_mul_pd(uij2,uij);
776 diff2 = _mm_sub_pd(uij2,lij2);
778 lij_inv = my_invrsq_pd(lij2);
779 sk2 = _mm_mul_pd(sk,sk);
780 sk2_inv = _mm_mul_pd(sk2,rinv);
781 prod = _mm_mul_pd(qrtr,sk2_inv);
783 log_term = _mm_mul_pd(uij,lij_inv);
784 log_term = log_pd(log_term);
786 xmm1 = _mm_sub_pd(lij,uij);
787 xmm2 = _mm_mul_pd(qrtr,r);
788 xmm2 = _mm_mul_pd(xmm2,diff2);
789 xmm1 = _mm_add_pd(xmm1,xmm2);
790 xmm2 = _mm_mul_pd(half,rinv);
791 xmm2 = _mm_mul_pd(xmm2,log_term);
792 xmm1 = _mm_add_pd(xmm1,xmm2);
793 xmm9 = _mm_mul_pd(neg,diff2);
794 xmm2 = _mm_mul_pd(xmm9,prod);
795 tmp_ai = _mm_add_pd(xmm1,xmm2);
797 /* conditional for rai<sk-dr */
798 xmm3 = _mm_sub_pd(sk,r);
799 mask_cmp3 = _mm_cmplt_pd(rai,xmm3); /* rai<sk-dr */
801 xmm4 = _mm_sub_pd(rai_inv,lij);
802 xmm4 = _mm_mul_pd(two,xmm4);
803 xmm4 = _mm_add_pd(tmp_ai,xmm4);
805 tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp_ai)); /*conditional as a mask*/
807 /* the tmp variable now holds two partial results, not all of which are to be used. Which */
808 /* ones are used is governed by the mask_cmp mask. */
809 tmp_ai = _mm_mul_pd(half,tmp_ai);
810 tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp,tmp_ai) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/
811 sum_ai = _mm_add_pd(sum_ai,tmp_ai);
813 xmm2 = _mm_mul_pd(half,lij2);
814 xmm3 = _mm_mul_pd(prod,lij3);
815 xmm2 = _mm_add_pd(xmm2,xmm3);
816 xmm3 = _mm_mul_pd(lij,rinv);
817 xmm4 = _mm_mul_pd(lij3,r);
818 xmm3 = _mm_add_pd(xmm3,xmm4);
819 xmm3 = _mm_mul_pd(qrtr,xmm3);
820 t1 = _mm_sub_pd(xmm2,xmm3);
822 xmm2 = _mm_mul_pd(half,uij2);
823 xmm2 = _mm_mul_pd(neg,xmm2);
824 xmm3 = _mm_mul_pd(qrtr,sk2_inv);
825 xmm3 = _mm_mul_pd(xmm3,uij3);
826 xmm2 = _mm_sub_pd(xmm2,xmm3);
827 xmm3 = _mm_mul_pd(uij,rinv);
828 xmm4 = _mm_mul_pd(uij3,r);
829 xmm3 = _mm_add_pd(xmm3,xmm4);
830 xmm3 = _mm_mul_pd(qrtr,xmm3);
831 t2 = _mm_add_pd(xmm2,xmm3);
833 xmm2 = _mm_mul_pd(sk2_inv,rinv);
834 xmm2 = _mm_add_pd(one,xmm2);
835 xmm2 = _mm_mul_pd(eigth,xmm2);
836 xmm2 = _mm_mul_pd(xmm2,xmm9);
837 xmm3 = _mm_mul_pd(log_term, rinv);
838 xmm3 = _mm_mul_pd(xmm3,rinv);
839 xmm3 = _mm_mul_pd(qrtr,xmm3);
840 t3 = _mm_add_pd(xmm2,xmm3);
842 /* chain rule terms */
843 xmm2 = _mm_mul_pd(dlij,t1);
844 xmm2 = _mm_add_pd(xmm2,t2);
845 xmm2 = _mm_add_pd(xmm2,t3);
847 /* temporary storage of chain rule terms, since we also have to compute
848  * the reciprocal (ai->aj) terms before storing them */
849 chrule = _mm_mul_pd(xmm2,rinv);
851 /* INTERACTION ai->aj starts here */
852 /* Conditional mask for raj<dr+sk_ai */
853 xmm1 = _mm_add_pd(r,sk_ai);
854 mask_cmp = _mm_cmplt_pd(raj,xmm1);
856 /* Conditional for raj>dr-sk_ai, ends with mask_cmp2 */
857 xmm2 = _mm_sub_pd(r,sk_ai);
858 xmm3 = my_inv_pd(xmm2);
859 mask_cmp2 = _mm_cmpgt_pd(raj,xmm2);
861 lij = _mm_or_pd(_mm_and_pd(mask_cmp2,raj_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
862 dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));
864 uij = my_inv_pd(xmm1);
865 lij2 = _mm_mul_pd(lij,lij);
866 lij3 = _mm_mul_pd(lij2,lij);
867 uij2 = _mm_mul_pd(uij,uij);
868 uij3 = _mm_mul_pd(uij2,uij);
870 diff2 = _mm_sub_pd(uij2,lij2);
872 lij_inv = my_invrsq_pd(lij2);
874 sk2 = sk2_ai;
875 sk2_inv = _mm_mul_pd(sk2,rinv);
876 prod = _mm_mul_pd(qrtr,sk2_inv);
878 log_term = _mm_mul_pd(uij,lij_inv);
879 log_term = log_pd(log_term);
881 xmm1 = _mm_sub_pd(lij,uij);
882 xmm2 = _mm_mul_pd(qrtr,r);
883 xmm2 = _mm_mul_pd(xmm2,diff2);
884 xmm1 = _mm_add_pd(xmm1,xmm2);
885 xmm2 = _mm_mul_pd(half,rinv);
886 xmm2 = _mm_mul_pd(xmm2,log_term);
887 xmm1 = _mm_add_pd(xmm1,xmm2);
888 xmm9 = _mm_mul_pd(neg,diff2);
889 xmm2 = _mm_mul_pd(xmm9,prod);
890 tmp = _mm_add_pd(xmm1,xmm2);
892 /* conditional for raj<sk_ai-dr */
893 xmm3 = _mm_sub_pd(sk_ai,r);
894 mask_cmp3 = _mm_cmplt_pd(raj,xmm3); /* raj<sk_ai-dr */
896 xmm4 = _mm_sub_pd(raj_inv,lij);
897 xmm4 = _mm_mul_pd(two,xmm4);
898 xmm4 = _mm_add_pd(tmp,xmm4);
900 tmp = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp)); /*conditional as a mask*/
902 /* the tmp variable now holds two partial results, not all of which are to be used. Which */
903 /* ones are used is governed by the mask_cmp mask. */
904 tmp = _mm_mul_pd(half,tmp);
905 tmp = _mm_or_pd(_mm_and_pd(mask_cmp,tmp) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/
907 /* Load, add and store ai->aj pol energy */
908 xmm7 = _mm_loadl_pd(xmm7,born->gpol_hct_work+aj1);
909 xmm7 = _mm_loadh_pd(xmm7,born->gpol_hct_work+aj2);
911 xmm7 = _mm_add_pd(xmm7,tmp);
913 _mm_storel_pd(born->gpol_hct_work+aj1,xmm7);
914 _mm_storeh_pd(born->gpol_hct_work+aj2,xmm7);
916 /* Start chain rule terms */
917 xmm2 = _mm_mul_pd(half,lij2);
918 xmm3 = _mm_mul_pd(prod,lij3);
919 xmm2 = _mm_add_pd(xmm2,xmm3);
920 xmm3 = _mm_mul_pd(lij,rinv);
921 xmm4 = _mm_mul_pd(lij3,r);
922 xmm3 = _mm_add_pd(xmm3,xmm4);
923 xmm3 = _mm_mul_pd(qrtr,xmm3);
924 t1 = _mm_sub_pd(xmm2,xmm3);
926 xmm2 = _mm_mul_pd(half,uij2);
927 xmm2 = _mm_mul_pd(neg,xmm2);
928 xmm3 = _mm_mul_pd(qrtr,sk2_inv);
929 xmm3 = _mm_mul_pd(xmm3,uij3);
930 xmm2 = _mm_sub_pd(xmm2,xmm3);
931 xmm3 = _mm_mul_pd(uij,rinv);
932 xmm4 = _mm_mul_pd(uij3,r);
933 xmm3 = _mm_add_pd(xmm3,xmm4);
934 xmm3 = _mm_mul_pd(qrtr,xmm3);
935 t2 = _mm_add_pd(xmm2,xmm3);
937 xmm2 = _mm_mul_pd(sk2_inv,rinv);
938 xmm2 = _mm_add_pd(one,xmm2);
939 xmm2 = _mm_mul_pd(eigth,xmm2);
940 xmm2 = _mm_mul_pd(xmm2,xmm9);
941 xmm3 = _mm_mul_pd(log_term, rinv);
942 xmm3 = _mm_mul_pd(xmm3,rinv);
943 xmm3 = _mm_mul_pd(qrtr,xmm3);
944 t3 = _mm_add_pd(xmm2,xmm3);
946 /* chain rule terms */
947 xmm2 = _mm_mul_pd(dlij,t1);
948 xmm2 = _mm_add_pd(xmm2,t2);
949 xmm2 = _mm_add_pd(xmm2,t3);
950 chrule_ai = _mm_mul_pd(xmm2,rinv);
952 /* Store chain rule terms
953  * same unpacking rationale as with Still above */
955 xmm3 = _mm_unpacklo_pd(chrule, chrule_ai);
956 xmm4 = _mm_unpackhi_pd(chrule, chrule_ai);
958 _mm_storeu_pd(fr->dadx+n, xmm3);
959 n = n + 2;
960 _mm_storeu_pd(fr->dadx+n, xmm4);
961 n = n + 2;
964 if(offset!=0)
966 aj1 = nl->jjnr[k];
967 aj13 = aj1 * 3;
968 p1 = md->typeA[aj1];
970 jx = _mm_load_sd(x+aj13);
971 jy = _mm_load_sd(x+aj13+1);
972 jz = _mm_load_sd(x+aj13+2);
974 sk = _mm_load_sd(born->param+aj1);
976 dx = _mm_sub_sd(ix,jx);
977 dy = _mm_sub_pd(iy,jy);
978 dz = _mm_sub_pd(iz,jz);
980 rsq11 = _mm_add_sd( _mm_add_sd( _mm_mul_sd(dx,dx) , _mm_mul_sd(dy,dy) ) , _mm_mul_sd(dz,dz) );
981 rinv = my_invrsq_pd(rsq11);
982 r = _mm_mul_sd(rinv,rsq11);
984 /* Load raj */
985 raj = _mm_load_sd(top->atomtypes.gb_radius+p1);
986 raj = _mm_sub_sd(raj,doffset);
987 raj_inv = my_inv_pd(raj);
989 /* OFFSET INTERACTION aj->ai STARTS HERE */
990 /* conditional mask for rai<dr+sk */
991 xmm1 = _mm_add_sd(r,sk);
992 mask_cmp = _mm_cmplt_sd(rai,xmm1);
994 /* conditional for rai>dr-sk, ends with mask_cmp2 */
995 xmm2 = _mm_sub_sd(r,sk);
996 xmm3 = my_inv_pd(xmm2);
997 mask_cmp2 = _mm_cmpgt_pd(rai,xmm2);
999 lij = _mm_or_pd(_mm_and_pd(mask_cmp2,rai_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
1000 dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));
1002 uij = my_inv_pd(xmm1);
1003 lij2 = _mm_mul_sd(lij,lij);
1004 lij3 = _mm_mul_sd(lij2,lij);
1005 uij2 = _mm_mul_sd(uij,uij);
1006 uij3 = _mm_mul_sd(uij2,uij);
1008 diff2 = _mm_sub_sd(uij2,lij2);
1010 lij_inv = my_invrsq_pd(lij2);
1011 sk2 = _mm_mul_sd(sk,sk);
1012 sk2_inv = _mm_mul_sd(sk2,rinv);
1013 prod = _mm_mul_sd(qrtr,sk2_inv);
1015 log_term = _mm_mul_pd(uij,lij_inv);
1016 log_term = log_pd(log_term);
1018 xmm1 = _mm_sub_sd(lij,uij);
1019 xmm2 = _mm_mul_sd(qrtr,r);
1020 xmm2 = _mm_mul_sd(xmm2,diff2);
1021 xmm1 = _mm_add_sd(xmm1,xmm2);
1022 xmm2 = _mm_mul_sd(half,rinv);
1023 xmm2 = _mm_mul_sd(xmm2,log_term);
1024 xmm1 = _mm_add_sd(xmm1,xmm2);
1025 xmm9 = _mm_mul_sd(neg,diff2);
1026 xmm2 = _mm_mul_sd(xmm9,prod);
1028 tmp_ai = _mm_add_sd(xmm1,xmm2);
1030 /* conditional for rai<sk-dr */
1031 xmm3 = _mm_sub_sd(sk,r);
1032 mask_cmp3 = _mm_cmplt_sd(rai,xmm3); /* rai<sk-dr */
1034 xmm4 = _mm_sub_sd(rai_inv,lij);
1035 xmm4 = _mm_mul_sd(two,xmm4);
1036 xmm4 = _mm_add_sd(tmp_ai,xmm4);
1038 tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp_ai)); /*conditional as a mask*/
1040 /* the tmp variable now holds two partial results, not all of which are to be used. Which */
1041 /* ones are used is governed by the mask_cmp mask. */
1042 tmp_ai = _mm_mul_pd(half,tmp_ai);
1043 tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp,tmp_ai) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/
1044 sum_ai = _mm_add_sd(sum_ai,tmp_ai);
1046 xmm2 = _mm_mul_sd(half,lij2);
1047 xmm3 = _mm_mul_sd(prod,lij3);
1048 xmm2 = _mm_add_sd(xmm2,xmm3);
1049 xmm3 = _mm_mul_sd(lij,rinv);
1050 xmm4 = _mm_mul_sd(lij3,r);
1051 xmm3 = _mm_add_sd(xmm3,xmm4);
1052 xmm3 = _mm_mul_sd(qrtr,xmm3);
1053 t1 = _mm_sub_sd(xmm2,xmm3);
1055 xmm2 = _mm_mul_sd(half,uij2);
1056 xmm2 = _mm_mul_sd(neg,xmm2);
1057 xmm3 = _mm_mul_sd(qrtr,sk2_inv);
1058 xmm3 = _mm_mul_sd(xmm3,uij3);
1059 xmm2 = _mm_sub_sd(xmm2,xmm3);
1060 xmm3 = _mm_mul_sd(uij,rinv);
1061 xmm4 = _mm_mul_sd(uij3,r);
1062 xmm3 = _mm_add_sd(xmm3,xmm4);
1063 xmm3 = _mm_mul_sd(qrtr,xmm3);
1064 t2 = _mm_add_sd(xmm2,xmm3);
1066 xmm2 = _mm_mul_sd(sk2_inv,rinv);
1067 xmm2 = _mm_add_sd(one,xmm2);
1068 xmm2 = _mm_mul_sd(eigth,xmm2);
1069 xmm2 = _mm_mul_sd(xmm2,xmm9);
1070 xmm3 = _mm_mul_sd(log_term, rinv);
1071 xmm3 = _mm_mul_sd(xmm3,rinv);
1072 xmm3 = _mm_mul_sd(qrtr,xmm3);
1073 t3 = _mm_add_sd(xmm2,xmm3);
1075 /* chain rule terms */
1076 xmm2 = _mm_mul_sd(dlij,t1);
1077 xmm2 = _mm_add_sd(xmm2,t2);
1078 xmm2 = _mm_add_sd(xmm2,t3);
1080 /* temporary storage of chain rule terms, since we also have to compute
1081  * the reciprocal (ai->aj) terms before storing them */
1082 chrule = _mm_mul_sd(xmm2,rinv);
1084 /* OFFSET INTERACTION ai->aj starts here */
1085 /* conditional mask for raj<dr+sk_ai */
1086 xmm1 = _mm_add_sd(r,sk_ai);
1087 mask_cmp = _mm_cmplt_sd(raj,xmm1);
1089 /* conditional for raj>dr-sk_ai, ends with mask_cmp2 */
1090 xmm2 = _mm_sub_sd(r,sk_ai);
1091 xmm3 = my_inv_pd(xmm2);
1092 mask_cmp2 = _mm_cmpgt_pd(raj,xmm2);
1094 lij = _mm_or_pd(_mm_and_pd(mask_cmp2,raj_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
1095 dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));
1097 uij = my_inv_pd(xmm1);
1098 lij2 = _mm_mul_sd(lij,lij);
1099 lij3 = _mm_mul_sd(lij2,lij);
1100 uij2 = _mm_mul_sd(uij,uij);
1101 uij3 = _mm_mul_sd(uij2,uij);
1103 diff2 = _mm_sub_sd(uij2,lij2);
1105 lij_inv = my_invrsq_pd(lij2);
1107 sk2 = sk2_ai;
1108 sk2_inv = _mm_mul_sd(sk2,rinv);
1109 prod = _mm_mul_sd(qrtr,sk2_inv);
1111 log_term = _mm_mul_pd(uij,lij_inv);
1112 log_term = log_pd(log_term);
1114 xmm1 = _mm_sub_sd(lij,uij);
1115 xmm2 = _mm_mul_sd(qrtr,r);
1116 xmm2 = _mm_mul_sd(xmm2,diff2);
1117 xmm1 = _mm_add_sd(xmm1,xmm2);
1118 xmm2 = _mm_mul_sd(half,rinv);
1119 xmm2 = _mm_mul_sd(xmm2,log_term);
1120 xmm1 = _mm_add_sd(xmm1,xmm2);
1121 xmm9 = _mm_mul_sd(neg,diff2);
1122 xmm2 = _mm_mul_sd(xmm9,prod);
1123 tmp = _mm_add_sd(xmm1,xmm2);
1125 /* conditional for raj<sk_ai-dr */
1126 xmm3 = _mm_sub_sd(sk_ai,r);
1127 mask_cmp3 = _mm_cmplt_sd(raj,xmm3); /* raj<sk_ai-dr */
1129 xmm4 = _mm_sub_sd(raj_inv,lij);
1130 xmm4 = _mm_mul_sd(two,xmm4);
1131 xmm4 = _mm_add_sd(tmp,xmm4);
1133 tmp = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp)); /*conditional as a mask*/
1135 /* the tmp variable now holds two partial results, not all of which are to be used. Which */
1136 /* ones are used is governed by the mask_cmp mask. */
1137 tmp = _mm_mul_pd(half,tmp);
1138 tmp = _mm_or_pd(_mm_and_pd(mask_cmp,tmp) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/
1140 /* Load, add and store gpol energy */
1141 xmm7 = _mm_load_sd(born->gpol_hct_work+aj1);
1142 xmm7 = _mm_add_sd(xmm7,tmp);
1143 _mm_store_sd(born->gpol_hct_work+aj1,xmm7);
1145 /* Start chain rule terms, t1 */
1146 xmm2 = _mm_mul_sd(half,lij2);
1147 xmm3 = _mm_mul_sd(prod,lij3);
1148 xmm2 = _mm_add_sd(xmm2,xmm3);
1149 xmm3 = _mm_mul_sd(lij,rinv);
1150 xmm4 = _mm_mul_sd(lij3,r);
1151 xmm3 = _mm_add_sd(xmm3,xmm4);
1152 xmm3 = _mm_mul_sd(qrtr,xmm3);
1153 t1 = _mm_sub_sd(xmm2,xmm3);
1155 xmm2 = _mm_mul_sd(half,uij2);
1156 xmm2 = _mm_mul_sd(neg,xmm2);
1157 xmm3 = _mm_mul_sd(qrtr,sk2_inv);
1158 xmm3 = _mm_mul_sd(xmm3,uij3);
1159 xmm2 = _mm_sub_sd(xmm2,xmm3);
1160 xmm3 = _mm_mul_sd(uij,rinv);
1161 xmm4 = _mm_mul_sd(uij3,r);
1162 xmm3 = _mm_add_sd(xmm3,xmm4);
1163 xmm3 = _mm_mul_sd(qrtr,xmm3);
1164 t2 = _mm_add_sd(xmm2,xmm3);
1166 xmm2 = _mm_mul_sd(sk2_inv,rinv);
1167 xmm2 = _mm_add_sd(one,xmm2);
1168 xmm2 = _mm_mul_sd(eigth,xmm2);
1169 xmm2 = _mm_mul_sd(xmm2,xmm9);
1170 xmm3 = _mm_mul_sd(log_term, rinv);
1171 xmm3 = _mm_mul_sd(xmm3,rinv);
1172 xmm3 = _mm_mul_sd(qrtr,xmm3);
1173 t3 = _mm_add_sd(xmm2,xmm3);
1175 /* chain rule terms */
1176 xmm2 = _mm_mul_sd(dlij,t1);
1177 xmm2 = _mm_add_sd(xmm2,t2);
1178 xmm2 = _mm_add_sd(xmm2,t3);
1179 chrule_ai = _mm_mul_sd(xmm2,rinv);
1181 _mm_store_sd(fr->dadx+n, chrule);
1182 n = n + 1;
1183 _mm_store_sd(fr->dadx+n, chrule_ai);
1184 n = n + 1;
1187 /* Do end processing ... */
1188 tmp_ai = _mm_unpacklo_pd(tmp_ai,sum_ai);
1189 sum_ai = _mm_add_pd(sum_ai,tmp_ai);
1190 sum_ai = _mm_shuffle_pd(sum_ai,sum_ai,_MM_SHUFFLE2(1,1));
1192 /* Load, add and store atom ai polarisation energy */
1193 xmm2 = _mm_load_sd(born->gpol_hct_work+ai);
1194 sum_ai = _mm_add_sd(sum_ai,xmm2);
1195 _mm_store_sd(born->gpol_hct_work+ai,sum_ai);
1198 /* Parallel summations */
1199 if(PARTDECOMP(cr))
1201 gmx_sum(natoms, born->gpol_hct_work, cr);
1203 else if(DOMAINDECOMP(cr))
1205 dd_atom_sum_real(cr->dd, born->gpol_hct_work);
1208 /* Compute the radii */
1209 for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
1211 if(born->use[i] != 0)
1213 rr = top->atomtypes.gb_radius[md->typeA[i]]-doff;
1214 sum = 1.0/rr - born->gpol_hct_work[i];
1215 min_rad = rr + doff;
1216 rad = 1.0/sum;
1218 born->bRad[i] = rad > min_rad ? rad : min_rad;
1219 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
1223 /* Extra (local) communication required for DD */
1224 if(DOMAINDECOMP(cr))
1226 dd_atom_spread_real(cr->dd, born->bRad);
1227 dd_atom_spread_real(cr->dd, fr->invsqrta);
1230 return 0;
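/*
 * calc_gb_rad_obc_sse2_double(): OBC Born radii. The pairwise descreening
 * loop is the same as in the HCT routine above; the difference is the final
 * step, where the scaled sum s is mapped through
 * tanh(obc_alpha*s - obc_beta*s^2 + obc_gamma*s^3) to give the radius, and
 * born->drobc keeps the corresponding derivative factor for the chain rule.
 */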
1233 int
1234 calc_gb_rad_obc_sse2_double(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
1235 const t_atomtypes *atype, double *x, t_nblist *nl, gmx_genborn_t *born,t_mdatoms *md)
1237 int i,k,n,ai,ai3,aj1,aj2,aj13,aj23,nj0,nj1,at0,at1,offset;
1238 int p1,p2,p3,p4,shift;
1239 double rr,sum,sum_tmp,sum2,sum3,min_rad,rad,doff;
1240 double tsum,tchain,rr_inv,rr_inv2,gbr;
1241 double shX,shY,shZ;
1243 __m128d ix,iy,iz,jx,jy,jz,dx,dy,dz,sX,sY,sZ;
1244 __m128d t1,t2,t3,rsq11,rinv,r,rai;
1245 __m128d rai_inv,sk,sk2,lij,dlij,duij;
1246 __m128d uij,lij2,uij2,lij3,uij3,diff2;
1247 __m128d lij_inv,sk2_inv,prod,log_term,tmp,tmp_sum;
1248 __m128d mask_cmp,mask_cmp2,mask_cmp3,doffset,raj,raj_inv;
1249 __m128d xmm1,xmm2,xmm3,xmm4,xmm7,xmm8,xmm9;
1250 __m128d sum_ai,chrule,chrule_ai,tmp_ai,sk_ai,sk2_ai;
1252 const __m128d neg = {-1.0, -1.0};
1253 const __m128d zero = {0.0, 0.0};
1254 const __m128d eigth = {0.125, 0.125};
1255 const __m128d qrtr = {0.25, 0.25};
1256 const __m128d half = {0.5, 0.5};
1257 const __m128d one = {1.0, 1.0};
1258 const __m128d two = {2.0, 2.0};
1259 const __m128d three = {3.0, 3.0};
1261 /* Keep the compiler happy */
1262 tmp_ai = _mm_setzero_pd();
1263 tmp = _mm_setzero_pd();
1264 raj_inv = _mm_setzero_pd();
1265 raj = _mm_setzero_pd();
1266 sk = _mm_setzero_pd();
1267 jx = _mm_setzero_pd();
1268 jy = _mm_setzero_pd();
1269 jz = _mm_setzero_pd();
1270 xmm7 = _mm_setzero_pd();
1271 xmm8 = _mm_setzero_pd();
1273 /* Set the dielectric offset */
1274 doff = born->gb_doffset;
1275 doffset = _mm_load1_pd(&doff);
1277 n = 0;
1279 for(i=0;i<born->nr;i++)
1281 born->gpol_hct_work[i] = 0;
1284 for(i=0;i<nl->nri;i++)
1286 ai = nl->iinr[i];
1287 ai3 = ai * 3;
1289 nj0 = nl->jindex[i];
1290 nj1 = nl->jindex[i+1];
1292 /* Load shifts for this list */
1293 shift = nl->shift[i];
1294 shX = fr->shift_vec[shift][0];
1295 shY = fr->shift_vec[shift][1];
1296 shZ = fr->shift_vec[shift][2];
1298 /* Splat the shifts */
1299 sX = _mm_load1_pd(&shX);
1300 sY = _mm_load1_pd(&shY);
1301 sZ = _mm_load1_pd(&shZ);
1303 offset = (nj1-nj0)%2;
1305 /* Load rai */
1306 rr = top->atomtypes.gb_radius[md->typeA[ai]]-doff;
1307 rai = _mm_load1_pd(&rr);
1308 rr = 1.0/rr;
1309 rai_inv = _mm_load1_pd(&rr);
1311 /* Load ai coordinates and add shifts */
1312 ix = _mm_load1_pd(x+ai3);
1313 iy = _mm_load1_pd(x+ai3+1);
1314 iz = _mm_load1_pd(x+ai3+2);
1316 ix = _mm_add_pd(sX,ix);
1317 iy = _mm_add_pd(sY,iy);
1318 iz = _mm_add_pd(sZ,iz);
1320 /* Zero out sums for polarisation energies */
1321 sum_ai = _mm_setzero_pd();
1323 sk_ai = _mm_load1_pd(born->param+ai);
1324 sk2_ai = _mm_mul_pd(sk_ai,sk_ai);
1326 for(k=nj0;k<nj1-offset;k+=2)
1328 aj1 = nl->jjnr[k];
1329 aj2 = nl->jjnr[k+1];
1331 aj13 = aj1 * 3;
1332 aj23 = aj2 * 3;
1334 /* Load particle aj1-2 coordinates */
1335 xmm1 = _mm_loadu_pd(x+aj13);
1336 xmm2 = _mm_loadu_pd(x+aj23);
1337 jx = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
1338 jy = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
1340 jz = _mm_loadl_pd(jz, x+aj13+2);
1341 jz = _mm_loadh_pd(jz, x+aj23+2);
1343 dx = _mm_sub_pd(ix,jx);
1344 dy = _mm_sub_pd(iy,jy);
1345 dz = _mm_sub_pd(iz,jz);
1347 rsq11 = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
1348 rinv = my_invrsq_pd(rsq11);
1349 r = _mm_mul_pd(rinv,rsq11);
1351 /* Load atom aj1,aj2 raj */
1352 p1 = md->typeA[aj1];
1353 p2 = md->typeA[aj2];
1355 raj = _mm_loadl_pd(raj,top->atomtypes.gb_radius+p1);
1356 raj = _mm_loadh_pd(raj,top->atomtypes.gb_radius+p2);
1357 raj = _mm_sub_pd(raj,doffset);
1359 /* Compute 1.0/raj */
1360 raj_inv = my_inv_pd(raj);
1362 sk = _mm_loadl_pd(sk,born->param+aj1);
1363 sk = _mm_loadh_pd(sk,born->param+aj2);
1365 /* INTERACTION aj->ai STARTS HERE */
1366 /* conditional mask for rai<dr+sk */
1367 xmm1 = _mm_add_pd(r,sk);
1368 mask_cmp = _mm_cmplt_pd(rai,xmm1);
1370 /* conditional for rai>dr-sk, ends with mask_cmp2 */
1371 xmm2 = _mm_sub_pd(r,sk);
1372 xmm3 = my_inv_pd(xmm2);
1373 mask_cmp2 = _mm_cmpgt_pd(rai,xmm2);
1375 lij = _mm_or_pd(_mm_and_pd(mask_cmp2,rai_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
1376 dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));
1378 uij = my_inv_pd(xmm1);
1379 lij2 = _mm_mul_pd(lij,lij);
1380 lij3 = _mm_mul_pd(lij2,lij);
1381 uij2 = _mm_mul_pd(uij,uij);
1382 uij3 = _mm_mul_pd(uij2,uij);
1384 diff2 = _mm_sub_pd(uij2,lij2);
1386 lij_inv = my_invrsq_pd(lij2);
1387 sk2 = _mm_mul_pd(sk,sk);
1388 sk2_inv = _mm_mul_pd(sk2,rinv);
1389 prod = _mm_mul_pd(qrtr,sk2_inv);
1391 log_term = _mm_mul_pd(uij,lij_inv);
1392 log_term = log_pd(log_term);
1394 xmm1 = _mm_sub_pd(lij,uij);
1395 xmm2 = _mm_mul_pd(qrtr,r);
1396 xmm2 = _mm_mul_pd(xmm2,diff2);
1397 xmm1 = _mm_add_pd(xmm1,xmm2);
1398 xmm2 = _mm_mul_pd(half,rinv);
1399 xmm2 = _mm_mul_pd(xmm2,log_term);
1400 xmm1 = _mm_add_pd(xmm1,xmm2);
1401 xmm9 = _mm_mul_pd(neg,diff2);
1402 xmm2 = _mm_mul_pd(xmm9,prod);
1403 tmp_ai = _mm_add_pd(xmm1,xmm2);
1405 /* conditional for rai<sk-dr */
1406 xmm3 = _mm_sub_pd(sk,r);
1407 mask_cmp3 = _mm_cmplt_pd(rai,xmm3); /* rai<sk-dr */
1409 xmm4 = _mm_sub_pd(rai_inv,lij);
1410 xmm4 = _mm_mul_pd(two,xmm4);
1411 xmm4 = _mm_add_pd(tmp_ai,xmm4);
1413 tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp_ai)); /*conditional as a mask*/
1415 /* the tmp variable now holds two partial results, not all of which are to be used. Which */
1416 /* ones are used is governed by the mask_cmp mask. */
1417 tmp_ai = _mm_mul_pd(half,tmp_ai);
1418 tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp,tmp_ai) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/
1419 sum_ai = _mm_add_pd(sum_ai,tmp_ai);
1421 /* Start the dadx chain rule terms */
1422 xmm2 = _mm_mul_pd(half,lij2);
1423 xmm3 = _mm_mul_pd(prod,lij3);
1424 xmm2 = _mm_add_pd(xmm2,xmm3);
1425 xmm3 = _mm_mul_pd(lij,rinv);
1426 xmm4 = _mm_mul_pd(lij3,r);
1427 xmm3 = _mm_add_pd(xmm3,xmm4);
1428 xmm3 = _mm_mul_pd(qrtr,xmm3);
1429 t1 = _mm_sub_pd(xmm2,xmm3);
1431 xmm2 = _mm_mul_pd(half,uij2);
1432 xmm2 = _mm_mul_pd(neg,xmm2);
1433 xmm3 = _mm_mul_pd(qrtr,sk2_inv);
1434 xmm3 = _mm_mul_pd(xmm3,uij3);
1435 xmm2 = _mm_sub_pd(xmm2,xmm3);
1436 xmm3 = _mm_mul_pd(uij,rinv);
1437 xmm4 = _mm_mul_pd(uij3,r);
1438 xmm3 = _mm_add_pd(xmm3,xmm4);
1439 xmm3 = _mm_mul_pd(qrtr,xmm3);
1440 t2 = _mm_add_pd(xmm2,xmm3);
1442 xmm2 = _mm_mul_pd(sk2_inv,rinv);
1443 xmm2 = _mm_add_pd(one,xmm2);
1444 xmm2 = _mm_mul_pd(eigth,xmm2);
1445 xmm2 = _mm_mul_pd(xmm2,xmm9);
1446 xmm3 = _mm_mul_pd(log_term, rinv);
1447 xmm3 = _mm_mul_pd(xmm3,rinv);
1448 xmm3 = _mm_mul_pd(qrtr,xmm3);
1449 t3 = _mm_add_pd(xmm2,xmm3);
1451 /* chain rule terms */
1452 xmm2 = _mm_mul_pd(dlij,t1);
1453 xmm2 = _mm_add_pd(xmm2,t2);
1454 xmm2 = _mm_add_pd(xmm2,t3);
1456 /* temporary storage of chain rule terms, since we also have to compute
1457  * the reciprocal (ai->aj) terms before storing them */
1458 chrule = _mm_mul_pd(xmm2,rinv);
1460 /* INTERACTION ai->aj STARTS HERE */
1461 /* conditional mask for raj<dr+sk_ai */
1462 xmm1 = _mm_add_pd(r,sk_ai);
1463 mask_cmp = _mm_cmplt_pd(raj,xmm1);
1465 /* conditional for raj>dr-sk_ai, ends with mask_cmp2 */
1466 xmm2 = _mm_sub_pd(r,sk_ai);
1467 xmm3 = my_inv_pd(xmm2);
1468 mask_cmp2 = _mm_cmpgt_pd(raj,xmm2);
1470 lij = _mm_or_pd(_mm_and_pd(mask_cmp2,raj_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
1471 dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));
1473 uij = my_inv_pd(xmm1);
1474 lij2 = _mm_mul_pd(lij,lij);
1475 lij3 = _mm_mul_pd(lij2,lij);
1476 uij2 = _mm_mul_pd(uij,uij);
1477 uij3 = _mm_mul_pd(uij2,uij);
1479 diff2 = _mm_sub_pd(uij2,lij2);
1481 lij_inv = my_invrsq_pd(lij2);
1483 sk2 = sk2_ai;
1484 sk2_inv = _mm_mul_pd(sk2,rinv);
1485 prod = _mm_mul_pd(qrtr,sk2_inv);
1487 log_term = _mm_mul_pd(uij,lij_inv);
1488 log_term = log_pd(log_term);
1490 xmm1 = _mm_sub_pd(lij,uij);
1491 xmm2 = _mm_mul_pd(qrtr,r);
1492 xmm2 = _mm_mul_pd(xmm2,diff2);
1493 xmm1 = _mm_add_pd(xmm1,xmm2);
1494 xmm2 = _mm_mul_pd(half,rinv);
1495 xmm2 = _mm_mul_pd(xmm2,log_term);
1496 xmm1 = _mm_add_pd(xmm1,xmm2);
1497 xmm9 = _mm_mul_pd(neg,diff2);
1498 xmm2 = _mm_mul_pd(xmm9,prod);
1499 tmp = _mm_add_pd(xmm1,xmm2);
1501 /* conditional for raj<sk_ai-dr */
1502 xmm3 = _mm_sub_pd(sk_ai,r);
1503 mask_cmp3 = _mm_cmplt_pd(raj,xmm3); /* raj<sk_ai-dr */
1505 xmm4 = _mm_sub_pd(raj_inv,lij);
1506 xmm4 = _mm_mul_pd(two,xmm4);
1507 xmm4 = _mm_add_pd(tmp,xmm4);
1509 tmp = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp)); /*conditional as a mask*/
1511 /* the tmp variable now holds two partial results, not all of which are to be used. Which */
1512 /* ones are used is governed by the mask_cmp mask. */
1513 tmp = _mm_mul_pd(half,tmp);
1514 tmp = _mm_or_pd(_mm_and_pd(mask_cmp,tmp) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/
1516 /* Load, add and store ai->aj pol energy */
1517 xmm7 = _mm_loadl_pd(xmm7,born->gpol_hct_work+aj1);
1518 xmm7 = _mm_loadh_pd(xmm7,born->gpol_hct_work+aj2);
1520 xmm7 = _mm_add_pd(xmm7,tmp);
1522 _mm_storel_pd(born->gpol_hct_work+aj1,xmm7);
1523 _mm_storeh_pd(born->gpol_hct_work+aj2,xmm7);
1525 /* Start dadx chain rule terms */
1526 xmm2 = _mm_mul_pd(half,lij2);
1527 xmm3 = _mm_mul_pd(prod,lij3);
1528 xmm2 = _mm_add_pd(xmm2,xmm3);
1529 xmm3 = _mm_mul_pd(lij,rinv);
1530 xmm4 = _mm_mul_pd(lij3,r);
1531 xmm3 = _mm_add_pd(xmm3,xmm4);
1532 xmm3 = _mm_mul_pd(qrtr,xmm3);
1533 t1 = _mm_sub_pd(xmm2,xmm3);
1535 xmm2 = _mm_mul_pd(half,uij2);
1536 xmm2 = _mm_mul_pd(neg,xmm2);
1537 xmm3 = _mm_mul_pd(qrtr,sk2_inv);
1538 xmm3 = _mm_mul_pd(xmm3,uij3);
1539 xmm2 = _mm_sub_pd(xmm2,xmm3);
1540 xmm3 = _mm_mul_pd(uij,rinv);
1541 xmm4 = _mm_mul_pd(uij3,r);
1542 xmm3 = _mm_add_pd(xmm3,xmm4);
1543 xmm3 = _mm_mul_pd(qrtr,xmm3);
1544 t2 = _mm_add_pd(xmm2,xmm3);
1546 xmm2 = _mm_mul_pd(sk2_inv,rinv);
1547 xmm2 = _mm_add_pd(one,xmm2);
1548 xmm2 = _mm_mul_pd(eigth,xmm2);
1549 xmm2 = _mm_mul_pd(xmm2,xmm9);
1550 xmm3 = _mm_mul_pd(log_term, rinv);
1551 xmm3 = _mm_mul_pd(xmm3,rinv);
1552 xmm3 = _mm_mul_pd(qrtr,xmm3);
1553 t3 = _mm_add_pd(xmm2,xmm3);
1555 /* chain rule terms */
1556 xmm2 = _mm_mul_pd(dlij,t1);
1557 xmm2 = _mm_add_pd(xmm2,t2);
1558 xmm2 = _mm_add_pd(xmm2,t3);
1559 chrule_ai = _mm_mul_pd(xmm2,rinv);
1561 /* Store chain rule terms
1562  * same unpacking rationale as with Still above */
1564 xmm3 = _mm_unpacklo_pd(chrule, chrule_ai);
1565 xmm4 = _mm_unpackhi_pd(chrule, chrule_ai);
1567 _mm_storeu_pd(fr->dadx+n, xmm3);
1568 n = n + 2;
1569 _mm_storeu_pd(fr->dadx+n, xmm4);
1570 n = n + 2;
1573 if(offset!=0)
1575 aj1 = nl->jjnr[k];
1576 aj13 = aj1 * 3;
1577 p1 = md->typeA[aj1];
1579 jx = _mm_load_sd(x+aj13);
1580 jy = _mm_load_sd(x+aj13+1);
1581 jz = _mm_load_sd(x+aj13+2);
1583 sk = _mm_load_sd(born->param+aj1);
1585 /* Load raj */
1586 raj = _mm_load_sd(top->atomtypes.gb_radius+p1);
1587 raj = _mm_sub_sd(raj,doffset);
1588 raj_inv = my_inv_pd(raj);
1590 dx = _mm_sub_sd(ix,jx);
1591 dy = _mm_sub_pd(iy,jy);
1592 dz = _mm_sub_pd(iz,jz);
1594 rsq11 = _mm_add_sd( _mm_add_sd( _mm_mul_sd(dx,dx) , _mm_mul_sd(dy,dy) ) , _mm_mul_sd(dz,dz) );
1595 rinv = my_invrsq_pd(rsq11);
1596 r = _mm_mul_sd(rinv,rsq11);
1598 /* OFFSET INTERACTION aj->ai STARTS HERE */
1599 /* conditional mask for rai<dr+sk */
1600 xmm1 = _mm_add_sd(r,sk);
1601 mask_cmp = _mm_cmplt_sd(rai,xmm1);
1603 /* conditional for rai>dr-sk, ends with mask_cmp2 */
1604 xmm2 = _mm_sub_sd(r,sk);
1605 xmm3 = my_inv_pd(xmm2);
1606 mask_cmp2 = _mm_cmpgt_pd(rai,xmm2);
1608 lij = _mm_or_pd(_mm_and_pd(mask_cmp2,rai_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
1609 dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));
1611 uij = my_inv_pd(xmm1);
1612 lij2 = _mm_mul_sd(lij,lij);
1613 lij3 = _mm_mul_sd(lij2,lij);
1614 uij2 = _mm_mul_sd(uij,uij);
1615 uij3 = _mm_mul_sd(uij2,uij);
1617 diff2 = _mm_sub_sd(uij2,lij2);
1619 lij_inv = my_invrsq_pd(lij2);
1620 sk2 = _mm_mul_sd(sk,sk);
1621 sk2_inv = _mm_mul_sd(sk2,rinv);
1622 prod = _mm_mul_sd(qrtr,sk2_inv);
1624 log_term = _mm_mul_pd(uij,lij_inv);
1625 log_term = log_pd(log_term);
1627 xmm1 = _mm_sub_sd(lij,uij);
1628 xmm2 = _mm_mul_sd(qrtr,r);
1629 xmm2 = _mm_mul_sd(xmm2,diff2);
1630 xmm1 = _mm_add_sd(xmm1,xmm2);
1631 xmm2 = _mm_mul_sd(half,rinv);
1632 xmm2 = _mm_mul_sd(xmm2,log_term);
1633 xmm1 = _mm_add_sd(xmm1,xmm2);
1634 xmm9 = _mm_mul_sd(neg,diff2);
1635 xmm2 = _mm_mul_sd(xmm9,prod);
1637 tmp_ai = _mm_add_sd(xmm1,xmm2);
1639 /* conditional for rai<sk-dr */
1640 xmm3 = _mm_sub_sd(sk,r);
1641 mask_cmp3 = _mm_cmplt_sd(rai,xmm3); /* rai<sk-dr */
1643 xmm4 = _mm_sub_sd(rai_inv,lij);
1644 xmm4 = _mm_mul_sd(two,xmm4);
1645 xmm4 = _mm_add_sd(tmp_ai,xmm4);
1647 tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp_ai)); /*conditional as a mask*/
1649 /* the tmp variable now holds two partial results, not all of which are to be used. Which */
1650 /* ones are used is governed by the mask_cmp mask. */
1651 tmp_ai = _mm_mul_pd(half,tmp_ai);
1652 tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp,tmp_ai) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/
1653 sum_ai = _mm_add_sd(sum_ai,tmp_ai);
1655 xmm2 = _mm_mul_sd(half,lij2);
1656 xmm3 = _mm_mul_sd(prod,lij3);
1657 xmm2 = _mm_add_sd(xmm2,xmm3);
1658 xmm3 = _mm_mul_sd(lij,rinv);
1659 xmm4 = _mm_mul_sd(lij3,r);
1660 xmm3 = _mm_add_sd(xmm3,xmm4);
1661 xmm3 = _mm_mul_sd(qrtr,xmm3);
1662 t1 = _mm_sub_sd(xmm2,xmm3);
1664 xmm2 = _mm_mul_sd(half,uij2);
1665 xmm2 = _mm_mul_sd(neg,xmm2);
1666 xmm3 = _mm_mul_sd(qrtr,sk2_inv);
1667 xmm3 = _mm_mul_sd(xmm3,uij3);
1668 xmm2 = _mm_sub_sd(xmm2,xmm3);
1669 xmm3 = _mm_mul_sd(uij,rinv);
1670 xmm4 = _mm_mul_sd(uij3,r);
1671 xmm3 = _mm_add_sd(xmm3,xmm4);
1672 xmm3 = _mm_mul_sd(qrtr,xmm3);
1673 t2 = _mm_add_sd(xmm2,xmm3);
1675 xmm2 = _mm_mul_sd(sk2_inv,rinv);
1676 xmm2 = _mm_add_sd(one,xmm2);
1677 xmm2 = _mm_mul_sd(eigth,xmm2);
1678 xmm2 = _mm_mul_sd(xmm2,xmm9);
1679 xmm3 = _mm_mul_sd(log_term, rinv);
1680 xmm3 = _mm_mul_sd(xmm3,rinv);
1681 xmm3 = _mm_mul_sd(qrtr,xmm3);
1682 t3 = _mm_add_sd(xmm2,xmm3);
1684 /* chain rule terms */
1685 xmm2 = _mm_mul_sd(dlij,t1);
1686 xmm2 = _mm_add_sd(xmm2,t2);
1687 xmm2 = _mm_add_sd(xmm2,t3);
1688 chrule = _mm_mul_sd(xmm2,rinv);
1690 /* OFFSET INTERACTION ai->aj STARTS HERE */
1691 /* conditional mask for raj<dr+sk_ai */
1692 xmm1 = _mm_add_sd(r,sk_ai);
1693 mask_cmp = _mm_cmplt_sd(raj,xmm1);
1695 /* conditional for raj>dr-sk_ai, ends with mask_cmp2 */
1696 xmm2 = _mm_sub_sd(r,sk_ai);
1697 xmm3 = my_inv_pd(xmm2);
1698 mask_cmp2 = _mm_cmpgt_pd(raj,xmm2);
1700 lij = _mm_or_pd(_mm_and_pd(mask_cmp2,raj_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
1701 dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));
1703 uij = my_inv_pd(xmm1);
1704 lij2 = _mm_mul_sd(lij,lij);
1705 lij3 = _mm_mul_sd(lij2,lij);
1706 uij2 = _mm_mul_sd(uij,uij);
1707 uij3 = _mm_mul_sd(uij2,uij);
1709 diff2 = _mm_sub_sd(uij2,lij2);
1711 lij_inv = my_invrsq_pd(lij2);
1713 sk2 = sk2_ai;
1714 sk2_inv = _mm_mul_sd(sk2,rinv);
1715 prod = _mm_mul_sd(qrtr,sk2_inv);
1717 log_term = _mm_mul_pd(uij,lij_inv);
1718 log_term = log_pd(log_term);
1720 xmm1 = _mm_sub_sd(lij,uij);
1721 xmm2 = _mm_mul_sd(qrtr,r);
1722 xmm2 = _mm_mul_sd(xmm2,diff2);
1723 xmm1 = _mm_add_sd(xmm1,xmm2);
1724 xmm2 = _mm_mul_sd(half,rinv);
1725 xmm2 = _mm_mul_sd(xmm2,log_term);
1726 xmm1 = _mm_add_sd(xmm1,xmm2);
1727 xmm9 = _mm_mul_sd(neg,diff2);
1728 xmm2 = _mm_mul_sd(xmm9,prod);
1729 tmp = _mm_add_sd(xmm1,xmm2);
1731 /* conditional for raj<sk_ai-dr */
1732 xmm3 = _mm_sub_sd(sk_ai,r);
1733 mask_cmp3 = _mm_cmplt_sd(raj,xmm3); /* raj<sk_ai-dr */
1735 xmm4 = _mm_sub_sd(raj_inv,lij);
1736 xmm4 = _mm_mul_sd(two,xmm4);
1737 xmm4 = _mm_add_sd(tmp,xmm4);
1739 tmp = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp)); /*conditional as a mask*/
1741 /* the tmp variable now holds two partial results, not all of which are to be used. Which */
1742 /* ones are used is governed by the mask_cmp mask. */
1743 tmp = _mm_mul_pd(half,tmp);
1744 tmp = _mm_or_pd(_mm_and_pd(mask_cmp,tmp) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/
1746 /* Load, add and store gpol energy */
1747 xmm7 = _mm_load_sd(born->gpol_hct_work+aj1);
1748 xmm7 = _mm_add_sd(xmm7,tmp);
1749 _mm_store_sd(born->gpol_hct_work+aj1,xmm7);
1751 /* Start chain rule terms, t1 */
1752 xmm2 = _mm_mul_sd(half,lij2);
1753 xmm3 = _mm_mul_sd(prod,lij3);
1754 xmm2 = _mm_add_sd(xmm2,xmm3);
1755 xmm3 = _mm_mul_sd(lij,rinv);
1756 xmm4 = _mm_mul_sd(lij3,r);
1757 xmm3 = _mm_add_sd(xmm3,xmm4);
1758 xmm3 = _mm_mul_sd(qrtr,xmm3);
1759 t1 = _mm_sub_sd(xmm2,xmm3);
1761 xmm2 = _mm_mul_sd(half,uij2);
1762 xmm2 = _mm_mul_sd(neg,xmm2);
1763 xmm3 = _mm_mul_sd(qrtr,sk2_inv);
1764 xmm3 = _mm_mul_sd(xmm3,uij3);
1765 xmm2 = _mm_sub_sd(xmm2,xmm3);
1766 xmm3 = _mm_mul_sd(uij,rinv);
1767 xmm4 = _mm_mul_sd(uij3,r);
1768 xmm3 = _mm_add_sd(xmm3,xmm4);
1769 xmm3 = _mm_mul_sd(qrtr,xmm3);
1770 t2 = _mm_add_sd(xmm2,xmm3);
1772 xmm2 = _mm_mul_sd(sk2_inv,rinv);
1773 xmm2 = _mm_add_sd(one,xmm2);
1774 xmm2 = _mm_mul_sd(eigth,xmm2);
1775 xmm2 = _mm_mul_sd(xmm2,xmm9);
1776 xmm3 = _mm_mul_sd(log_term, rinv);
1777 xmm3 = _mm_mul_sd(xmm3,rinv);
1778 xmm3 = _mm_mul_sd(qrtr,xmm3);
1779 t3 = _mm_add_sd(xmm2,xmm3);
1781 /* chain rule terms */
1782 xmm2 = _mm_mul_sd(dlij,t1);
1783 xmm2 = _mm_add_sd(xmm2,t2);
1784 xmm2 = _mm_add_sd(xmm2,t3);
1785 chrule_ai = _mm_mul_sd(xmm2,rinv);
1787 _mm_store_sd(fr->dadx+n, chrule);
1788 n = n + 1;
1789 _mm_store_sd(fr->dadx+n, chrule_ai);
1790 n = n + 1;
1793 /* Do end processing ... */
1794 tmp_ai = _mm_unpacklo_pd(tmp_ai,sum_ai);
1795 sum_ai = _mm_add_pd(sum_ai,tmp_ai);
1796 sum_ai = _mm_shuffle_pd(sum_ai,sum_ai,_MM_SHUFFLE2(1,1));
1798 /* Load, add and store atom ai polarisation energy */
1799 xmm2 = _mm_load_sd(born->gpol_hct_work+ai);
1800 sum_ai = _mm_add_sd(sum_ai,xmm2);
1801 _mm_store_sd(born->gpol_hct_work+ai,sum_ai);
1804 /* Parallel summations */
1805 if(PARTDECOMP(cr))
1807 gmx_sum(natoms, born->gpol_hct_work, cr);
1809 else if(DOMAINDECOMP(cr))
1811 dd_atom_sum_real(cr->dd, born->gpol_hct_work);
1814 /* Compute the radii */
1815 for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
1817 if(born->use[i] != 0)
1819 rr = top->atomtypes.gb_radius[md->typeA[i]];
1820 rr_inv2 = 1.0/rr;
1821 rr = rr-doff;
1822 rr_inv = 1.0/rr;
1823 sum = rr * born->gpol_hct_work[i];
1824 sum2 = sum * sum;
1825 sum3 = sum2 * sum;
1827 tsum = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
1828 born->bRad[i] = rr_inv - tsum*rr_inv2;
1829 born->bRad[i] = 1.0 / born->bRad[i];
1831 fr->invsqrta[i]=gmx_invsqrt(born->bRad[i]);
1833 tchain = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
1834 born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
1838 /* Extra (local) communication required for DD */
1839 if(DOMAINDECOMP(cr))
1841 dd_atom_spread_real(cr->dd, born->bRad);
1842 dd_atom_spread_real(cr->dd, fr->invsqrta);
1843 dd_atom_spread_real(cr->dd, born->drobc);
1846 return 0;
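/*
 * calc_gb_chainrule_sse2_double(): first converts dvda into the work array rb
 * with an algorithm-specific prefactor (Still/HCT/OBC), then walks the same
 * neighbor list again, consuming the two dadx entries stored per pair, to
 * accumulate the GB chain-rule forces on atom i, the j atoms and the shift
 * vectors.
 */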
1851 int
1852 calc_gb_chainrule_sse2_double(int natoms, t_nblist *nl, double *dadx, double *dvda, double *xd, double *f,
1853 double *fshift, double *shift_vec, int gb_algorithm, gmx_genborn_t *born)
1855 int i,k,n,ai,aj,ai3,aj1,aj2,aj13,aj23,aj4,nj0,nj1,offset;
1856 int shift;
1857 double rbi,shX,shY,shZ;
1858 double *rb;
1860 __m128d ix,iy,iz,jx,jy,jz,fix,fiy,fiz,sX,sY,sZ;
1861 __m128d dx,dy,dz,t1,t2,t3,dva,dvaj,dax,dax_ai,fgb,fgb_ai;
1862 __m128d xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8;
1864 t1 = t2 = t3 = _mm_setzero_pd();
1865 rb = born->work;
1867 /* Loop to get the proper form for the Born radius term */
1868 if(gb_algorithm==egbSTILL) {
1869 for(i=0;i<natoms;i++)
1871 rbi = born->bRad[i];
1872 rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
1876 else if(gb_algorithm==egbHCT) {
1877 for(i=0;i<natoms;i++)
1879 rbi = born->bRad[i];
1880 rb[i] = rbi * rbi * dvda[i];
1884 else if(gb_algorithm==egbOBC) {
1885 for(i=0;i<natoms;i++)
1887 rbi = born->bRad[i];
1888 rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
1892 n = 0;
1894 for(i=0;i<nl->nri;i++)
1896 ai = nl->iinr[i];
1897 ai3 = ai * 3;
1899 nj0 = nl->jindex[i];
1900 nj1 = nl->jindex[i+1];
1902 /* Load shifts for this list */
1903 shift = 3*nl->shift[i];
1904 shX = shift_vec[shift+0];
1905 shY = shift_vec[shift+1];
1906 shZ = shift_vec[shift+2];
1908 /* Splat the shifts */
1909 sX = _mm_load1_pd(&shX);
1910 sY = _mm_load1_pd(&shY);
1911 sZ = _mm_load1_pd(&shZ);
1913 offset = (nj1-nj0)%2;
1915 /* Load particle ai coordinates and add shifts */
1916 ix = _mm_load1_pd(xd+ai3);
1917 iy = _mm_load1_pd(xd+ai3+1);
1918 iz = _mm_load1_pd(xd+ai3+2);
1920 ix = _mm_add_pd(sX,ix);
1921 iy = _mm_add_pd(sY,iy);
1922 iz = _mm_add_pd(sZ,iz);
1924 /* Load particle ai dvda */
1925 dva = _mm_load1_pd(rb+ai);
1927 fix = _mm_setzero_pd();
1928 fiy = _mm_setzero_pd();
1929 fiz = _mm_setzero_pd();
1931 for(k=nj0;k<nj1-offset;k+=2)
1933 aj1 = nl->jjnr[k];
1934 aj2 = nl->jjnr[k+1];
1936 /* Load dvda_j */
1937 dvaj = _mm_set_pd(rb[aj2],rb[aj1]);
1939 aj13 = aj1 * 3;
1940 aj23 = aj2 * 3;
1942 /* Load particle aj1-2 coordinates */
1943 xmm1 = _mm_loadu_pd(xd+aj13);
1944 xmm2 = _mm_loadu_pd(xd+aj23);
1946 xmm5 = _mm_load_sd(xd+aj13+2);
1947 xmm6 = _mm_load_sd(xd+aj23+2);
1949 jx = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
1950 jy = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
1951 jz = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0));
1953 /* load derivative of born radii w.r.t. coordinates */
1954 xmm7 = _mm_loadu_pd(dadx+n);
1955 n = n + 2;
1956 xmm8 = _mm_loadu_pd(dadx+n);
1957 n = n + 2;
1959 /* Shuffle to get the ai and aj components right */
1960 dax = _mm_shuffle_pd(xmm7,xmm8,_MM_SHUFFLE2(2,0));
1961 dax_ai = _mm_shuffle_pd(xmm7,xmm8,_MM_SHUFFLE2(3,1));
1963 /* distances */
1964 dx = _mm_sub_pd(ix,jx);
1965 dy = _mm_sub_pd(iy,jy);
1966 dz = _mm_sub_pd(iz,jz);
1968 /* scalar force */
1969 fgb = _mm_mul_pd(dva,dax);
1970 fgb_ai = _mm_mul_pd(dvaj,dax_ai);
1971 fgb = _mm_add_pd(fgb,fgb_ai);
1973 /* partial forces */
1974 t1 = _mm_mul_pd(fgb,dx);
1975 t2 = _mm_mul_pd(fgb,dy);
1976 t3 = _mm_mul_pd(fgb,dz);
1978 /* update the i force */
1979 fix = _mm_add_pd(fix,t1);
1980 fiy = _mm_add_pd(fiy,t2);
1981 fiz = _mm_add_pd(fiz,t3);
1983 /* accumulate forces from memory */
1984 xmm1 = _mm_loadu_pd(f+aj13); /* fx1 fy1 */
1985 xmm2 = _mm_loadu_pd(f+aj23); /* fx2 fy2 */
1987 xmm5 = _mm_load1_pd(f+aj13+2); /* fz1 fz1 */
1988 xmm6 = _mm_load1_pd(f+aj23+2); /* fz2 fz2 */
1990 /* transpose */
1991 xmm7 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fz1 fz2 */
1992 xmm5 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* fx1 fx2 */
1993 xmm6 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */
1995 /* subtract partial forces */
1996 xmm5 = _mm_sub_pd(xmm5, t1);
1997 xmm6 = _mm_sub_pd(xmm6, t2);
1998 xmm7 = _mm_sub_pd(xmm7, t3);
2000 /* transpose back fx and fy */
2001 xmm1 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0));
2002 xmm2 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(1,1));
2004 /* store the force, first fx's and fy's */
2005 _mm_storeu_pd(f+aj13,xmm1);
2006 _mm_storeu_pd(f+aj23,xmm2);
2008 /* then fz */
2009 _mm_storel_pd(f+aj13+2,xmm7);
2010 _mm_storeh_pd(f+aj23+2,xmm7);
2013 if(offset!=0)
2015 aj1 = nl->jjnr[k];
2017 dvaj = _mm_load_sd(rb+aj1);
2019 aj13 = aj1 * 3;
2021 jx = _mm_load_sd(xd+aj13);
2022 jy = _mm_load_sd(xd+aj13+1);
2023 jz = _mm_load_sd(xd+aj13+2);
2025 dax = _mm_load_sd(dadx+n);
2026 n = n + 1;
2027 dax_ai = _mm_load_sd(dadx+n);
2028 n = n + 1;
2030 dx = _mm_sub_sd(ix,jx);
2031 dy = _mm_sub_sd(iy,jy);
2032 dz = _mm_sub_sd(iz,jz);
2034 fgb = _mm_mul_sd(dva,dax);
2035 fgb_ai = _mm_mul_sd(dvaj,dax_ai);
2036 fgb = _mm_add_pd(fgb,fgb_ai);
2038 t1 = _mm_mul_sd(fgb,dx);
2039 t2 = _mm_mul_sd(fgb,dy);
2040 t3 = _mm_mul_sd(fgb,dz);
2042 fix = _mm_add_sd(fix,t1);
2043 fiy = _mm_add_sd(fiy,t2);
2044 fiz = _mm_add_sd(fiz,t3);
2046 /* accumulate forces from memory */
2047 xmm5 = _mm_load_sd(f+aj13);
2048 xmm6 = _mm_load_sd(f+aj13+1);
2049 xmm7 = _mm_load_sd(f+aj13+2);
2051 /* subtract partial forces */
2052 xmm5 = _mm_sub_sd(xmm5, t1);
2053 xmm6 = _mm_sub_sd(xmm6, t2);
2054 xmm7 = _mm_sub_sd(xmm7, t3);
2056 /* store the force */
2057 _mm_store_sd(f+aj13,xmm5);
2058 _mm_store_sd(f+aj13+1,xmm6);
2059 _mm_store_sd(f+aj13+2,xmm7);
2062 /* fix/fiy/fiz now contain two partial terms, all of which should be
2063  * added to the i particle forces */
2065 t1 = _mm_unpacklo_pd(t1,fix);
2066 t2 = _mm_unpacklo_pd(t2,fiy);
2067 t3 = _mm_unpacklo_pd(t3,fiz);
2069 fix = _mm_add_pd(fix,t1);
2070 fiy = _mm_add_pd(fiy,t2);
2071 fiz = _mm_add_pd(fiz,t3);
2073 fix = _mm_shuffle_pd(fix,fix,_MM_SHUFFLE2(1,1));
2074 fiy = _mm_shuffle_pd(fiy,fiy,_MM_SHUFFLE2(1,1));
2075 fiz = _mm_shuffle_pd(fiz,fiz,_MM_SHUFFLE2(1,1));
2077 /* load, add and store i forces */
2078 xmm1 = _mm_load_sd(f+ai3);
2079 xmm2 = _mm_load_sd(f+ai3+1);
2080 xmm3 = _mm_load_sd(f+ai3+2);
2082 fix = _mm_add_sd(fix,xmm1);
2083 fiy = _mm_add_sd(fiy,xmm2);
2084 fiz = _mm_add_sd(fiz,xmm3);
2086 _mm_store_sd(f+ai3,fix);
2087 _mm_store_sd(f+ai3+1,fiy);
2088 _mm_store_sd(f+ai3+2,fiz);
2090 /* load, add and store i shift forces */
2091 xmm1 = _mm_load_sd(fshift+shift);
2092 xmm2 = _mm_load_sd(fshift+shift+1);
2093 xmm3 = _mm_load_sd(fshift+shift+2);
2095 fix = _mm_add_sd(fix,xmm1);
2096 fiy = _mm_add_sd(fiy,xmm2);
2097 fiz = _mm_add_sd(fiz,xmm3);
2099 _mm_store_sd(fshift+shift,fix);
2100 _mm_store_sd(fshift+shift+1,fiy);
2101 _mm_store_sd(fshift+shift+2,fiz);
2105 return 0;
2108 #else
2109 /* keep compiler happy */
2110 int genborn_sse_dummy;
2112 #endif /* SSE2 intrinsics available */