19 #include "gmx_fatal.h"
20 #include "mtop_util.h"
30 /* Only compile this file if SSE2 intrinsics are available */
31 #if ( (defined(GMX_IA32_SSE2) || defined(GMX_X86_64_SSE2) || defined(GMX_SSE2)) && defined(GMX_DOUBLE) )
32 #include <xmmintrin.h>
33 #include <emmintrin.h>
35 #if (defined (_MSC_VER) || defined(__INTEL_COMPILER))
36 #define gmx_castsi128_pd(a) _mm_castsi128_pd(a)
37 #define gmx_castpd_si128(a) _mm_castpd_si128(a)
38 #elif defined(__GNUC__)
39 #define gmx_castsi128_pd(a) ((__m128d)(a))
40 #define gmx_castpd_si128(a) ((__m128i)(a))
42 static __m128d
gmx_castsi128_pd(__m128i a
) { return *(__m128d
*) &a
; }
43 static __m128i
gmx_castpd_si128(__m128d a
) { return *(__m128i
*) &a
; }
48 sincos_sse2double(__m128d x
, __m128d
*sinval
, __m128d
*cosval
)
50 const __m128d two_over_pi
= {2.0/M_PI
,2.0/M_PI
};
51 const __m128d half
= {0.5,0.5};
52 const __m128d one
= {1.0,1.0};
53 const __m128i izero
= _mm_set1_epi32(0);
54 const __m128i ione
= _mm_set1_epi32(1);
55 const __m128i itwo
= _mm_set1_epi32(2);
56 const __m128i ithree
= _mm_set1_epi32(3);
57 const __m128d sincosd_kc1
= {(13176794.0 / 8388608.0),(13176794.0 / 8388608.0)};
58 const __m128d sincosd_kc2
= {7.5497899548918821691639751442098584e-8,7.5497899548918821691639751442098584e-8};
59 const __m128d sincosd_cc0
= {0.00000000206374484196,0.00000000206374484196};
60 const __m128d sincosd_cc1
= {-0.00000027555365134677,-0.00000027555365134677};
61 const __m128d sincosd_cc2
= {0.00002480157946764225,0.00002480157946764225};
62 const __m128d sincosd_cc3
= {-0.00138888888730525966,-0.00138888888730525966};
63 const __m128d sincosd_cc4
= {0.04166666666651986722,0.04166666666651986722};
64 const __m128d sincosd_cc5
= {-0.49999999999999547304,-0.49999999999999547304};
65 const __m128d sincosd_sc0
= {0.00000000015893606014,0.00000000015893606014};
66 const __m128d sincosd_sc1
= {-0.00000002505069049138,-0.00000002505069049138};
67 const __m128d sincosd_sc2
= {0.00000275573131527032,0.00000275573131527032};
68 const __m128d sincosd_sc3
= {-0.00019841269827816117,-0.00019841269827816117};
69 const __m128d sincosd_sc4
= {0.00833333333331908278,0.00833333333331908278};
70 const __m128d sincosd_sc5
= {-0.16666666666666612594,-0.16666666666666612594};
72 __m128d signbit
= gmx_castsi128_pd(_mm_set1_epi64x(0x8000000000000000ULL
));
73 __m128d tiny
= gmx_castsi128_pd(_mm_set1_epi64x(0x3e40000000000000ULL
));
75 __m128d xl
,xl2
,xl3
,qd
,absxl
,p1
,cx
,sx
,ts
,tc
,tsn
,tcn
;
77 __m128i offsetSin
,offsetCos
;
78 __m128d sinMask
,cosMask
,isTiny
;
79 __m128d ct0
,ct1
,ct2
,ct3
,ct4
,ct5
,ct6
,st1
,st2
,st3
,st4
,st6
;
81 /* Rescale the angle to the range 0..4, and find which quadrant it is in */
82 xl
= _mm_mul_pd(x
,two_over_pi
);
84 /* q=integer part of xl, rounded _away_ from 0.0 */
85 /* Add 0.5 away from 0.0 */
86 xl
= _mm_add_pd(xl
,_mm_or_pd(_mm_and_pd(xl
,signbit
),half
));
88 q
= _mm_cvttpd_epi32(xl
);
89 qd
= _mm_cvtepi32_pd(q
);
90 q
= _mm_shuffle_epi32(q
,_MM_SHUFFLE(1,1,0,0));
92 /* Compute offset based on quadrant the arg falls in */
93 offsetSin
= _mm_and_si128(q
,ithree
);
94 offsetCos
= _mm_add_epi32(offsetSin
,ione
);
96 /* Remainder in range [-pi/4..pi/4] */
97 p1
= _mm_mul_pd(qd
,sincosd_kc1
);
98 xl
= _mm_mul_pd(qd
,sincosd_kc2
);
99 p1
= _mm_sub_pd(x
,p1
);
100 xl
= _mm_sub_pd(p1
,xl
);
102 absxl
= _mm_andnot_pd(signbit
,xl
);
103 isTiny
= _mm_cmpgt_pd(tiny
,absxl
);
105 xl2
= _mm_mul_pd(xl
,xl
);
106 xl3
= _mm_mul_pd(xl2
,xl
);
108 ct0
= _mm_mul_pd(xl2
,xl2
);
109 ct1
= _mm_mul_pd(sincosd_cc0
,xl2
);
110 ct2
= _mm_mul_pd(sincosd_cc2
,xl2
);
111 ct3
= _mm_mul_pd(sincosd_cc4
,xl2
);
112 st1
= _mm_mul_pd(sincosd_sc0
,xl2
);
113 st2
= _mm_mul_pd(sincosd_sc2
,xl2
);
114 st3
= _mm_mul_pd(sincosd_sc4
,xl2
);
115 ct1
= _mm_add_pd(ct1
,sincosd_cc1
);
116 ct2
= _mm_add_pd(ct2
,sincosd_cc3
);
117 ct3
= _mm_add_pd(ct3
,sincosd_cc5
);
118 st1
= _mm_add_pd(st1
,sincosd_sc1
);
119 st2
= _mm_add_pd(st2
,sincosd_sc3
);
120 st3
= _mm_add_pd(st3
,sincosd_sc5
);
122 ct4
= _mm_mul_pd(ct2
,ct0
);
123 ct4
= _mm_add_pd(ct4
,ct3
);
125 st4
= _mm_mul_pd(st2
,ct0
);
126 st4
= _mm_add_pd(st4
,st3
);
127 ct5
= _mm_mul_pd(ct0
,ct0
);
129 ct6
= _mm_mul_pd(ct5
,ct1
);
130 ct6
= _mm_add_pd(ct6
,ct4
);
132 st6
= _mm_mul_pd(ct5
,st1
);
133 st6
= _mm_add_pd(st6
,st4
);
135 cx
= _mm_mul_pd(ct6
,xl2
);
136 cx
= _mm_add_pd(cx
,one
);
138 sx
= _mm_mul_pd(st6
,xl3
);
139 sx
= _mm_add_pd(sx
,xl
);
141 /* Small angle approximation, sin(tiny)=tiny, cos(tiny)=1.0 */
142 sx
= _mm_or_pd( _mm_and_pd(isTiny
,xl
) , _mm_andnot_pd(isTiny
,sx
) );
143 cx
= _mm_or_pd( _mm_and_pd(isTiny
,one
) , _mm_andnot_pd(isTiny
,cx
) );
145 sinMask
= gmx_castsi128_pd(_mm_cmpeq_epi32( _mm_and_si128(offsetSin
,ione
), izero
));
146 cosMask
= gmx_castsi128_pd(_mm_cmpeq_epi32( _mm_and_si128(offsetCos
,ione
), izero
));
148 ts
= _mm_or_pd( _mm_and_pd(sinMask
,sx
) , _mm_andnot_pd(sinMask
,cx
) );
149 tc
= _mm_or_pd( _mm_and_pd(cosMask
,sx
) , _mm_andnot_pd(cosMask
,cx
) );
151 /* Flip the sign of the result when (offset mod 4) = 1 or 2 */
152 sinMask
= gmx_castsi128_pd(_mm_cmpeq_epi32( _mm_and_si128(offsetSin
,itwo
), izero
));
153 tsn
= _mm_xor_pd(signbit
,ts
);
154 ts
= _mm_or_pd( _mm_and_pd(sinMask
,ts
) , _mm_andnot_pd(sinMask
,tsn
) );
156 cosMask
= gmx_castsi128_pd(_mm_cmpeq_epi32( _mm_and_si128(offsetCos
,itwo
), izero
));
157 tcn
= _mm_xor_pd(signbit
,tc
);
158 tc
= _mm_or_pd( _mm_and_pd(cosMask
,tc
) , _mm_andnot_pd(cosMask
,tcn
) );
166 __m128d
log_pd(__m128d x
)
168 const __m128i exp_mask
= _mm_set_epi32(0x7FF00000,0,0x7FF00000,0);
169 const __m128i exp_bias
= _mm_set_epi32(0,1023,0,1023);
171 const __m128d const_loge
= _mm_set1_pd(0.69314718055994529);
172 const __m128d const_one
= _mm_set1_pd(1.0);
173 const __m128d const_two
= _mm_set1_pd(2.0);
175 /* Almost full single precision accuracy (~20 bits worst case) */
176 const __m128d P0
= _mm_set1_pd(6.108179944792157749153050);
177 const __m128d P1
= _mm_set1_pd(52.43691313715523327631139);
178 const __m128d P2
= _mm_set1_pd(71.53664010795613671168440);
179 const __m128d P3
= _mm_set1_pd(18.92097516931559547548485);
180 const __m128d P4
= _mm_set1_pd(0.3504714784635941984522153);
181 const __m128d P5
= _mm_set1_pd(-0.007105890734229368515879);
182 const __m128d Q1
= _mm_set1_pd(17.73314231909420567454406);
183 const __m128d Q2
= _mm_set1_pd(48.82373085428713023213363);
184 const __m128d Q3
= _mm_set1_pd(31.65945943354513166309101);
185 const __m128d Q4
= _mm_set1_pd(4.302477477108162270199051);
187 __m128d xmm0
,xmm1
,xmm2
,xmm3
, xmm4
;
190 xmmi
= gmx_castpd_si128(x
);
191 xmm1
= _mm_cvtepi32_pd(_mm_shuffle_epi32(_mm_sub_epi64(_mm_srli_epi64(_mm_and_si128(xmmi
, exp_mask
), 52), exp_bias
),_MM_SHUFFLE(3,1,2,0)));
192 xmm0
= _mm_or_pd(gmx_castsi128_pd(_mm_andnot_si128(exp_mask
, xmmi
)), const_one
);
194 xmm2
= _mm_mul_pd(P5
,xmm0
);
195 xmm2
= _mm_add_pd(xmm2
,P4
);
196 xmm2
= _mm_mul_pd(xmm2
,xmm0
);
197 xmm2
= _mm_add_pd(xmm2
,P3
);
198 xmm2
= _mm_mul_pd(xmm2
,xmm0
);
199 xmm2
= _mm_add_pd(xmm2
,P2
);
200 xmm2
= _mm_mul_pd(xmm2
,xmm0
);
201 xmm2
= _mm_add_pd(xmm2
,P1
);
202 xmm2
= _mm_mul_pd(xmm2
,xmm0
);
203 xmm2
= _mm_add_pd(xmm2
,P0
);
205 xmm3
= _mm_mul_pd(Q4
,xmm0
);
206 xmm3
= _mm_add_pd(xmm3
,Q3
);
207 xmm3
= _mm_mul_pd(xmm3
,xmm0
);
208 xmm3
= _mm_add_pd(xmm3
,Q2
);
209 xmm3
= _mm_mul_pd(xmm3
,xmm0
);
210 xmm3
= _mm_add_pd(xmm3
,Q1
);
211 xmm3
= _mm_mul_pd(xmm3
,xmm0
);
212 xmm3
= _mm_add_pd(xmm3
,const_one
);
215 xmm4
= _mm_cvtps_pd(_mm_rcp_ps(_mm_cvtpd_ps(xmm3
)));
216 xmm4
= _mm_mul_pd(xmm4
,_mm_sub_pd(const_two
,_mm_mul_pd(xmm3
,xmm4
)));
217 xmm4
= _mm_mul_pd(xmm4
,_mm_sub_pd(const_two
,_mm_mul_pd(xmm3
,xmm4
)));
218 xmm2
= _mm_mul_pd(xmm2
,xmm4
);
220 xmm0
= _mm_sub_pd(xmm0
, const_one
);
221 xmm0
= _mm_mul_pd(xmm0
,xmm2
);
223 xmm0
= _mm_add_pd(xmm0
,xmm1
);
225 return _mm_mul_pd(xmm0
, const_loge
);
228 /* This exp() routine provides accuracy of 10E-9 to 10E-11.
229 * The polynomial minimax coefficients are actually accurate to 10E-14,
230 * but we lose some accuracy in the polynomial evaluation.
232 __m128d
exp_pd(__m128d x
)
234 const __m128d lim1
= _mm_set1_pd(1025.0); /* 1025.00000e+0d */
235 const __m128d lim2
= _mm_set1_pd(-1022.99999999); /* -1022.99999e+0f */
237 const __m128i base
= _mm_set_epi32(0,0,1023,1023);
238 const __m128d half
= _mm_set1_pd(0.5);
239 const __m128d log2e
= _mm_set1_pd(1.4426950408889634);
241 const __m128d exp_P0
= _mm_set1_pd(1.00000000000001276211229749);
242 const __m128d exp_P1
= _mm_set1_pd(6.931471805598709708635169E-1);
243 const __m128d exp_P2
= _mm_set1_pd(2.402265069564965287455972E-1);
244 const __m128d exp_P3
= _mm_set1_pd(5.550410866868561155599683E-2);
245 const __m128d exp_P4
= _mm_set1_pd(9.618129192067246128919915E-3);
246 const __m128d exp_P5
= _mm_set1_pd(1.333355761760444302342084E-3);
247 const __m128d exp_P6
= _mm_set1_pd(1.540343494807179111289781E-4);
248 const __m128d exp_P7
= _mm_set1_pd(1.525298483865349629325421E-5);
249 const __m128d exp_P8
= _mm_set1_pd(1.325940560934510417818578E-6);
250 const __m128d exp_P9
= _mm_set1_pd(1.015033670529892589443421E-7);
255 xmm0
= _mm_mul_pd(x
,log2e
);
256 xmm0
= _mm_min_pd(xmm0
,lim1
);
257 xmm0
= _mm_max_pd(xmm0
,lim2
);
258 xmm1
= _mm_sub_pd(xmm0
,half
);
260 xmmi
= _mm_cvtpd_epi32(xmm1
);
261 xmm1
= _mm_cvtepi32_pd(xmmi
);
263 xmmi
= _mm_add_epi32(xmmi
,base
);
264 xmmi
= _mm_shuffle_epi32(xmmi
,_MM_SHUFFLE(3,1,2,0));
265 xmmi
= _mm_slli_epi64(xmmi
,52);
267 xmm0
= _mm_sub_pd(xmm0
,xmm1
);
269 xmm1
= _mm_mul_pd(exp_P9
,xmm0
);
270 xmm1
= _mm_add_pd(xmm1
,exp_P8
);
271 xmm1
= _mm_mul_pd(xmm1
,xmm0
);
272 xmm1
= _mm_add_pd(xmm1
,exp_P7
);
273 xmm1
= _mm_mul_pd(xmm1
,xmm0
);
274 xmm1
= _mm_add_pd(xmm1
,exp_P6
);
275 xmm1
= _mm_mul_pd(xmm1
,xmm0
);
276 xmm1
= _mm_add_pd(xmm1
,exp_P5
);
277 xmm1
= _mm_mul_pd(xmm1
,xmm0
);
278 xmm1
= _mm_add_pd(xmm1
,exp_P4
);
279 xmm1
= _mm_mul_pd(xmm1
,xmm0
);
280 xmm1
= _mm_add_pd(xmm1
,exp_P3
);
281 xmm1
= _mm_mul_pd(xmm1
,xmm0
);
282 xmm1
= _mm_add_pd(xmm1
,exp_P2
);
283 xmm1
= _mm_mul_pd(xmm1
,xmm0
);
284 xmm1
= _mm_add_pd(xmm1
,exp_P1
);
285 xmm1
= _mm_mul_pd(xmm1
,xmm0
);
286 xmm1
= _mm_add_pd(xmm1
,exp_P0
);
287 xmm1
= _mm_mul_pd(xmm1
,gmx_castsi128_pd(xmmi
));
292 static inline __m128d
293 my_invrsq_pd(__m128d x
)
295 const __m128d three
= {3.0, 3.0};
296 const __m128d half
= {0.5, 0.5};
298 __m128 t
= _mm_rsqrt_ps(_mm_cvtpd_ps(x
)); /* Convert to single precision and do _mm_rsqrt_ps() */
299 __m128d t1
= _mm_cvtps_pd(t
); /* Convert back to double precision */
301 /* First Newton-Rapson step, accuracy is now 24 bits */
302 __m128d t2
= _mm_mul_pd(half
,_mm_mul_pd(t1
,_mm_sub_pd(three
,_mm_mul_pd(x
,_mm_mul_pd(t1
,t1
)))));
304 /* Return second Newton-Rapson step, accuracy 48 bits */
305 return _mm_mul_pd(half
,_mm_mul_pd(t2
,_mm_sub_pd(three
,_mm_mul_pd(x
,_mm_mul_pd(t2
,t2
)))));
308 static inline __m128d
311 const __m128d two
= {2.0, 2.0};
313 __m128 t
= _mm_rcp_ps(_mm_cvtpd_ps(x
));
314 __m128d t1
= _mm_cvtps_pd(t
);
315 __m128d t2
= _mm_mul_pd(t1
,_mm_sub_pd(two
,_mm_mul_pd(t1
,x
)));
317 return _mm_mul_pd(t2
,_mm_sub_pd(two
,_mm_mul_pd(t2
,x
)));
322 calc_gb_rad_still_sse2_double(t_commrec
*cr
, t_forcerec
*fr
,int natoms
, gmx_localtop_t
*top
,
323 const t_atomtypes
*atype
, double *x
, t_nblist
*nl
, gmx_genborn_t
*born
, t_mdatoms
*md
)
325 int i
,k
,n
,ai
,ai3
,aj1
,aj2
,aj13
,aj23
;
326 int at0
,at1
,nj0
,nj1
,offset
,taj1
,taj2
;
329 double factor
,gpi_ai
,gpi_tmp
,gpi2
;
332 __m128d ix
,iy
,iz
,jx
,jy
,jz
,dx
,dy
,dz
,sX
,sY
,sZ
;
333 __m128d t1
,t2
,t3
,rsq11
,rinv
,rinv2
,rinv4
,rinv6
;
334 __m128d ratio
,gpi
,rai
,raj
,vaj
,rvdw
,mask_cmp
;
335 __m128d ccf
,dccf
,theta
,cosq
,term
,sinq
,res
,prod
;
336 __m128d xmm1
,xmm2
,xmm3
,xmm4
,xmm7
,vai
,prod_ai
,icf4
,icf6
;
338 const __m128d half
= {0.5, 0.5};
339 const __m128d three
= {3.0, 3.0};
340 const __m128d one
= {1.0, 1.0};
341 const __m128d two
= {2.0, 2.0};
342 const __m128d zero
= {0.0, 0.0};
343 const __m128d four
= {4.0, 4.0};
345 const __m128d p5inv
= {STILL_P5INV
, STILL_P5INV
};
346 const __m128d pip5
= {STILL_PIP5
, STILL_PIP5
};
347 const __m128d p4
= {STILL_P4
, STILL_P4
};
349 factor
= 0.5 * ONE_4PI_EPS0
;
352 /* Keep the compiler happy */
353 raj
= _mm_setzero_pd();
354 vaj
= _mm_setzero_pd();
355 jx
= _mm_setzero_pd();
356 jy
= _mm_setzero_pd();
357 jz
= _mm_setzero_pd();
358 xmm2
= _mm_setzero_pd();
359 xmm7
= _mm_setzero_pd();
361 for(i
=0;i
<born
->nr
;i
++)
363 born
->gpol_still_work
[i
]=0;
366 for(i
=0;i
<nl
->nri
;i
++)
372 nj1
= nl
->jindex
[i
+1];
374 /* Load shifts for this list */
375 shift
= nl
->shift
[i
];
376 shX
= fr
->shift_vec
[shift
][0];
377 shY
= fr
->shift_vec
[shift
][1];
378 shZ
= fr
->shift_vec
[shift
][2];
380 /* Splat the shifts */
381 sX
= _mm_load1_pd(&shX
);
382 sY
= _mm_load1_pd(&shY
);
383 sZ
= _mm_load1_pd(&shZ
);
385 offset
= (nj1
-nj0
)%2;
387 /* Polarization energy for atom ai */
388 gpi
= _mm_setzero_pd();
390 /* Load particle ai coordinates and add shifts */
391 ix
= _mm_load1_pd(x
+ai3
);
392 iy
= _mm_load1_pd(x
+ai3
+1);
393 iz
= _mm_load1_pd(x
+ai3
+2);
395 ix
= _mm_add_pd(sX
,ix
);
396 iy
= _mm_add_pd(sY
,iy
);
397 iz
= _mm_add_pd(sZ
,iz
);
399 /* Load particle ai gb_radius */
400 rai
= _mm_set1_pd(top
->atomtypes
.gb_radius
[md
->typeA
[ai
]]);
401 vai
= _mm_set1_pd(born
->vsolv
[ai
]);
402 prod_ai
= _mm_mul_pd(p4
,vai
);
404 for(k
=nj0
;k
<nj1
-offset
;k
+=2)
412 taj1
= md
->typeA
[aj1
];
413 taj2
= md
->typeA
[aj2
];
415 /* Load particle aj1-2 coordinates and compute ai->aj distances */
416 xmm1
= _mm_loadu_pd(x
+aj13
);
417 xmm2
= _mm_loadu_pd(x
+aj23
);
418 jx
= _mm_shuffle_pd(xmm1
,xmm2
,_MM_SHUFFLE2(0,0));
419 jy
= _mm_shuffle_pd(xmm1
,xmm2
,_MM_SHUFFLE2(1,1));
421 jz
= _mm_loadl_pd(jz
,x
+aj13
+2);
422 jz
= _mm_loadh_pd(jz
,x
+aj23
+2);
424 dx
= _mm_sub_pd(ix
,jx
);
425 dy
= _mm_sub_pd(iy
,jy
);
426 dz
= _mm_sub_pd(iz
,jz
);
428 rsq11
= _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx
,dx
) , _mm_mul_pd(dy
,dy
) ) , _mm_mul_pd(dz
,dz
) );
429 rinv
= my_invrsq_pd(rsq11
);
431 rinv2
= _mm_mul_pd(rinv
,rinv
);
432 rinv4
= _mm_mul_pd(rinv2
,rinv2
);
433 rinv6
= _mm_mul_pd(rinv4
,rinv2
);
435 vaj
= _mm_loadl_pd(vaj
,born
->vsolv
+aj1
);
436 vaj
= _mm_loadh_pd(vaj
,born
->vsolv
+aj2
);
438 raj
= _mm_loadl_pd(raj
, top
->atomtypes
.gb_radius
+taj1
);
439 raj
= _mm_loadh_pd(raj
, top
->atomtypes
.gb_radius
+taj2
);
441 rvdw
= _mm_add_pd(rai
,raj
);
442 rvdw
= _mm_mul_pd(rvdw
,rvdw
);
443 ratio
= _mm_div_pd(rsq11
,rvdw
);
445 mask_cmp
= _mm_cmpgt_pd(ratio
,p5inv
);
447 switch(_mm_movemask_pd(mask_cmp
))
454 theta
= _mm_mul_pd(ratio
,pip5
);
455 sincos_sse2double(theta
,&sinq
,&cosq
);
457 term
= _mm_sub_pd(one
,cosq
);
458 term
= _mm_mul_pd(half
,term
);
459 ccf
= _mm_mul_pd(term
,term
);
460 dccf
= _mm_mul_pd(two
,term
);
461 dccf
= _mm_mul_pd(dccf
,sinq
);
462 dccf
= _mm_mul_pd(dccf
,pip5
);
463 dccf
= _mm_mul_pd(dccf
,ratio
);
465 ccf
= _mm_or_pd(_mm_and_pd(mask_cmp
,one
) ,_mm_andnot_pd(mask_cmp
,ccf
));
466 dccf
= _mm_or_pd(_mm_and_pd(mask_cmp
,zero
) ,_mm_andnot_pd(mask_cmp
,dccf
));
469 prod
= _mm_mul_pd(p4
,vaj
);
470 icf4
= _mm_mul_pd(ccf
,rinv4
);
471 xmm2
= _mm_mul_pd(icf4
,prod
);
472 xmm3
= _mm_mul_pd(icf4
,prod_ai
);
473 gpi
= _mm_add_pd(gpi
,xmm2
);
475 /* Load, subtract and store atom aj gpol energy */
476 xmm7
= _mm_loadl_pd(xmm7
,born
->gpol_still_work
+aj1
);
477 xmm7
= _mm_loadh_pd(xmm7
,born
->gpol_still_work
+aj2
);
479 xmm3
= _mm_add_pd(xmm7
,xmm3
);
481 _mm_storel_pd(born
->gpol_still_work
+aj1
,xmm3
);
482 _mm_storeh_pd(born
->gpol_still_work
+aj2
,xmm3
);
484 /* Chain rule terms */
485 ccf
= _mm_mul_pd(four
,ccf
);
486 xmm3
= _mm_sub_pd(ccf
,dccf
);
487 icf6
= _mm_mul_pd(xmm3
,rinv6
);
488 xmm1
= _mm_mul_pd(icf6
,prod
);
489 xmm2
= _mm_mul_pd(icf6
,prod_ai
);
491 /* As with single precision, we need to shift stuff around, to change the order of
492 * the interactions from ai->aj1, ai->aj2 to ai->aj1, aj1->ai, ai->aj2, aj2->ai etc,
493 to do 2 instead of 4 store operations
495 xmm3
= _mm_unpacklo_pd(xmm1
,xmm2
);
496 xmm4
= _mm_unpackhi_pd(xmm1
,xmm2
);
498 _mm_storeu_pd(fr
->dadx
+n
,xmm3
);
500 _mm_storeu_pd(fr
->dadx
+n
,xmm4
);
504 /* Deal with odd elements */
509 taj1
= md
->typeA
[aj1
];
511 jx
= _mm_load_sd(x
+aj13
);
512 jy
= _mm_load_sd(x
+aj13
+1);
513 jz
= _mm_load_sd(x
+aj13
+2);
515 raj
= _mm_load_sd(top
->atomtypes
.gb_radius
+taj1
);
516 vaj
= _mm_load_sd(born
->vsolv
+aj1
);
518 dx
= _mm_sub_sd(ix
,jx
);
519 dy
= _mm_sub_pd(iy
,jy
);
520 dz
= _mm_sub_pd(iz
,jz
);
522 rsq11
= _mm_add_sd( _mm_add_sd( _mm_mul_sd(dx
,dx
) , _mm_mul_sd(dy
,dy
) ) , _mm_mul_sd(dz
,dz
) );
523 rinv
= my_invrsq_pd(rsq11
);
525 rinv2
= _mm_mul_sd(rinv
,rinv
);
526 rinv4
= _mm_mul_sd(rinv2
,rinv2
);
527 rinv6
= _mm_mul_sd(rinv4
,rinv2
);
529 rvdw
= _mm_add_sd(rai
,raj
);
530 rvdw
= _mm_mul_sd(rvdw
,rvdw
);
531 ratio
= _mm_div_sd(rsq11
,rvdw
);
533 mask_cmp
= _mm_cmpgt_sd(ratio
,p5inv
);
535 switch(_mm_movemask_pd(mask_cmp
))
542 theta
= _mm_mul_sd(ratio
,pip5
);
543 sincos_sse2double(theta
,&sinq
,&cosq
);
545 term
= _mm_sub_sd(one
,cosq
);
546 term
= _mm_mul_sd(half
,term
);
547 ccf
= _mm_mul_sd(term
,term
);
548 dccf
= _mm_mul_sd(two
,term
);
549 dccf
= _mm_mul_sd(dccf
,sinq
);
550 dccf
= _mm_mul_sd(dccf
,pip5
);
551 dccf
= _mm_mul_sd(dccf
,ratio
);
553 ccf
= _mm_or_pd(_mm_and_pd(mask_cmp
,one
) ,_mm_andnot_pd(mask_cmp
,ccf
));
554 dccf
= _mm_or_pd(_mm_and_pd(mask_cmp
,zero
) ,_mm_andnot_pd(mask_cmp
,dccf
));
557 prod
= _mm_mul_sd(p4
,vaj
);
558 icf4
= _mm_mul_sd(ccf
,rinv4
);
559 xmm2
= _mm_mul_sd(icf4
,prod
);
560 xmm3
= _mm_mul_sd(icf4
,prod_ai
);
561 gpi
= _mm_add_sd(gpi
,xmm2
);
563 /* Load, subtract and store atom aj gpol energy */
564 xmm7
= _mm_load_sd(born
->gpol_still_work
+aj1
);
565 xmm3
= _mm_add_sd(xmm7
,xmm3
);
566 _mm_store_sd(born
->gpol_still_work
+aj1
,xmm3
);
568 /* Chain rule terms */
569 ccf
= _mm_mul_sd(four
,ccf
);
570 xmm3
= _mm_sub_sd(ccf
,dccf
);
571 icf6
= _mm_mul_sd(xmm3
,rinv6
);
572 xmm1
= _mm_mul_sd(icf6
,prod
);
573 xmm2
= _mm_mul_sd(icf6
,prod_ai
);
575 /* Here we only have ai->aj1 and aj1->ai, so we can store directly */
576 _mm_storel_pd(fr
->dadx
+n
,xmm1
);
578 _mm_storel_pd(fr
->dadx
+n
,xmm2
);
582 /* Do end processing ... */
583 xmm2
= _mm_unpacklo_pd(xmm2
,gpi
);
584 gpi
= _mm_add_pd(gpi
,xmm2
);
585 gpi
= _mm_shuffle_pd(gpi
,gpi
,_MM_SHUFFLE2(1,1));
587 /* Load, add and store atom ai polarisation energy */
588 xmm2
= _mm_load_sd(born
->gpol_still_work
+ai
);
589 gpi
= _mm_add_sd(gpi
,xmm2
);
590 _mm_store_sd(born
->gpol_still_work
+ai
,gpi
);
593 /* Parallell summations */
596 gmx_sum(natoms
,born
->gpol_still_work
, cr
);
598 else if(DOMAINDECOMP(cr
))
600 dd_atom_sum_real(cr
->dd
, born
->gpol_still_work
);
603 /* Compute the radii */
604 for(i
=0;i
<fr
->natoms_force
;i
++) /* PELA born->nr */
606 if(born
->use
[i
] != 0)
608 gpi_ai
= born
->gpol
[i
] + born
->gpol_still_work
[i
];
609 gpi2
= gpi_ai
*gpi_ai
;
611 born
->bRad
[i
]=factor
*gmx_invsqrt(gpi2
);
612 fr
->invsqrta
[i
]=gmx_invsqrt(born
->bRad
[ai
]);
616 /* Extra (local) communication reqiured for DD */
619 dd_atom_spread_real(cr
->dd
, born
->bRad
);
620 dd_atom_spread_real(cr
->dd
, fr
->invsqrta
);
627 calc_gb_rad_hct_sse2_double(t_commrec
*cr
, t_forcerec
*fr
, int natoms
, gmx_localtop_t
*top
,
628 const t_atomtypes
*atype
, double *x
, t_nblist
*nl
, gmx_genborn_t
*born
, t_mdatoms
*md
)
630 int i
,k
,n
,ai
,ai3
,aj1
,aj2
,aj13
,aj23
,nj0
,nj1
,at0
,at1
,offset
;
632 double rr
,sum
,sum_tmp
,min_rad
,rad
,doff
;
635 __m128d ix
,iy
,iz
,jx
,jy
,jz
,dx
,dy
,dz
,sX
,sY
,sZ
;
636 __m128d t1
,t2
,t3
,rsq11
,rinv
,r
,rai
;
637 __m128d rai_inv
,sk
,sk2
,lij
,dlij
,duij
;
638 __m128d uij
,lij2
,uij2
,lij3
,uij3
,diff2
;
639 __m128d lij_inv
,sk2_inv
,prod
,log_term
,tmp
,tmp_sum
;
640 __m128d mask_cmp
,mask_cmp2
,mask_cmp3
;
641 __m128d xmm1
,xmm2
,xmm3
,xmm4
,xmm7
,xmm8
,xmm9
,doffset
;
642 __m128d sum_ai
,chrule
,chrule_ai
,tmp_ai
;
643 __m128d sk_ai
, sk2_ai
,raj
,raj_inv
;
645 const __m128d neg
= {-1.0, -1.0};
646 const __m128d zero
= {0.0, 0.0};
647 const __m128d eigth
= {0.125, 0.125};
648 const __m128d qrtr
= {0.25, 0.25};
649 const __m128d half
= {0.5, 0.5};
650 const __m128d one
= {1.0, 1.0};
651 const __m128d two
= {2.0, 2.0};
652 const __m128d three
= {3.0, 3.0};
654 /* Keep the compiler happy */
655 tmp_ai
= _mm_setzero_pd();
656 tmp
= _mm_setzero_pd();
657 xmm7
= _mm_setzero_pd();
658 xmm8
= _mm_setzero_pd();
659 raj_inv
= _mm_setzero_pd();
660 raj
= _mm_setzero_pd();
661 sk
= _mm_setzero_pd();
662 jx
= _mm_setzero_pd();
663 jy
= _mm_setzero_pd();
664 jz
= _mm_setzero_pd();
666 /* Set the dielectric offset */
667 doff
= born
->gb_doffset
;
668 doffset
= _mm_load1_pd(&doff
);
671 for(i
=0;i
<born
->nr
;i
++)
673 born
->gpol_hct_work
[i
] = 0;
676 for(i
=0;i
<nl
->nri
;i
++)
682 nj1
= nl
->jindex
[i
+1];
684 /* Load shifts for this list */
685 shift
= nl
->shift
[i
];
686 shX
= fr
->shift_vec
[shift
][0];
687 shY
= fr
->shift_vec
[shift
][1];
688 shZ
= fr
->shift_vec
[shift
][2];
690 /* Splat the shifts */
691 sX
= _mm_load1_pd(&shX
);
692 sY
= _mm_load1_pd(&shY
);
693 sZ
= _mm_load1_pd(&shZ
);
695 offset
= (nj1
-nj0
)%2;
698 rr
= top
->atomtypes
.gb_radius
[md
->typeA
[ai
]]-doff
;
699 rai
= _mm_load1_pd(&rr
);
701 rai_inv
= _mm_load1_pd(&rr
);
703 /* Zero out sums for polarisation energies */
704 sum_ai
= _mm_setzero_pd();
706 /* Load ai coordinates and add shifts */
707 ix
= _mm_load1_pd(x
+ai3
);
708 iy
= _mm_load1_pd(x
+ai3
+1);
709 iz
= _mm_load1_pd(x
+ai3
+2);
711 ix
= _mm_add_pd(sX
,ix
);
712 iy
= _mm_add_pd(sY
,iy
);
713 iz
= _mm_add_pd(sZ
,iz
);
715 sk_ai
= _mm_load1_pd(born
->param
+ai
);
716 sk2_ai
= _mm_mul_pd(sk_ai
,sk_ai
);
718 for(k
=nj0
;k
<nj1
-offset
;k
+=2)
726 /* Load particle aj1-2 coordinates */
727 xmm1
= _mm_loadu_pd(x
+aj13
);
728 xmm2
= _mm_loadu_pd(x
+aj23
);
729 jx
= _mm_shuffle_pd(xmm1
,xmm2
,_MM_SHUFFLE2(0,0));
730 jy
= _mm_shuffle_pd(xmm1
,xmm2
,_MM_SHUFFLE2(1,1));
732 jz
= _mm_loadl_pd(jz
, x
+aj13
+2);
733 jz
= _mm_loadh_pd(jz
, x
+aj23
+2);
735 dx
= _mm_sub_pd(ix
,jx
);
736 dy
= _mm_sub_pd(iy
,jy
);
737 dz
= _mm_sub_pd(iz
,jz
);
739 rsq11
= _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx
,dx
) , _mm_mul_pd(dy
,dy
) ) , _mm_mul_pd(dz
,dz
) );
740 rinv
= my_invrsq_pd(rsq11
);
741 r
= _mm_mul_pd(rinv
,rsq11
);
743 sk
= _mm_loadl_pd(sk
,born
->param
+aj1
);
744 sk
= _mm_loadh_pd(sk
,born
->param
+aj2
);
746 /* Load aj1,aj2 raj */
750 raj
= _mm_loadl_pd(raj
,top
->atomtypes
.gb_radius
+p1
);
751 raj
= _mm_loadh_pd(raj
,top
->atomtypes
.gb_radius
+p2
);
752 raj
= _mm_sub_pd(raj
,doffset
);
754 /* Compute 1.0/raj */
755 raj_inv
= my_inv_pd(raj
);
757 /* INTERACTION aj->ai STARS HERE */
758 /* conditional mask for rai<dr+sk */
759 xmm1
= _mm_add_pd(r
,sk
);
760 mask_cmp
= _mm_cmplt_pd(rai
,xmm1
);
762 /* conditional for rai>dr-sk, ends with mask_cmp2 */
763 xmm2
= _mm_sub_pd(r
,sk
);
764 xmm3
= my_inv_pd(xmm2
);
765 mask_cmp2
= _mm_cmpgt_pd(rai
,xmm2
);
767 lij
= _mm_or_pd(_mm_and_pd(mask_cmp2
,rai_inv
) ,_mm_andnot_pd(mask_cmp2
,xmm3
)); /*conditional as a mask*/
768 dlij
= _mm_or_pd(_mm_and_pd(mask_cmp2
,zero
) ,_mm_andnot_pd(mask_cmp2
,one
));
770 uij
= my_inv_pd(xmm1
);
771 lij2
= _mm_mul_pd(lij
,lij
);
772 lij3
= _mm_mul_pd(lij2
,lij
);
773 uij2
= _mm_mul_pd(uij
,uij
);
774 uij3
= _mm_mul_pd(uij2
,uij
);
776 diff2
= _mm_sub_pd(uij2
,lij2
);
778 lij_inv
= my_invrsq_pd(lij2
);
779 sk2
= _mm_mul_pd(sk
,sk
);
780 sk2_inv
= _mm_mul_pd(sk2
,rinv
);
781 prod
= _mm_mul_pd(qrtr
,sk2_inv
);
783 log_term
= _mm_mul_pd(uij
,lij_inv
);
784 log_term
= log_pd(log_term
);
786 xmm1
= _mm_sub_pd(lij
,uij
);
787 xmm2
= _mm_mul_pd(qrtr
,r
);
788 xmm2
= _mm_mul_pd(xmm2
,diff2
);
789 xmm1
= _mm_add_pd(xmm1
,xmm2
);
790 xmm2
= _mm_mul_pd(half
,rinv
);
791 xmm2
= _mm_mul_pd(xmm2
,log_term
);
792 xmm1
= _mm_add_pd(xmm1
,xmm2
);
793 xmm9
= _mm_mul_pd(neg
,diff2
);
794 xmm2
= _mm_mul_pd(xmm9
,prod
);
795 tmp_ai
= _mm_add_pd(xmm1
,xmm2
);
797 /* contitional for rai<sk-dr */
798 xmm3
= _mm_sub_pd(sk
,r
);
799 mask_cmp3
= _mm_cmplt_pd(rai
,xmm3
); /* rai<sk-dr */
801 xmm4
= _mm_sub_pd(rai_inv
,lij
);
802 xmm4
= _mm_mul_pd(two
,xmm4
);
803 xmm4
= _mm_add_pd(tmp_ai
,xmm4
);
805 tmp_ai
= _mm_or_pd(_mm_and_pd(mask_cmp3
,xmm4
) ,_mm_andnot_pd(mask_cmp3
,tmp_ai
)); /*conditional as a mask*/
807 /* the tmp will now contain two partial values, that not all are to be used. Which */
808 /* ones are governed by the mask_cmp mask. */
809 tmp_ai
= _mm_mul_pd(half
,tmp_ai
);
810 tmp_ai
= _mm_or_pd(_mm_and_pd(mask_cmp
,tmp_ai
) ,_mm_andnot_pd(mask_cmp
,zero
)); /*conditional as a mask*/
811 sum_ai
= _mm_add_pd(sum_ai
,tmp_ai
);
813 xmm2
= _mm_mul_pd(half
,lij2
);
814 xmm3
= _mm_mul_pd(prod
,lij3
);
815 xmm2
= _mm_add_pd(xmm2
,xmm3
);
816 xmm3
= _mm_mul_pd(lij
,rinv
);
817 xmm4
= _mm_mul_pd(lij3
,r
);
818 xmm3
= _mm_add_pd(xmm3
,xmm4
);
819 xmm3
= _mm_mul_pd(qrtr
,xmm3
);
820 t1
= _mm_sub_pd(xmm2
,xmm3
);
822 xmm2
= _mm_mul_pd(half
,uij2
);
823 xmm2
= _mm_mul_pd(neg
,xmm2
);
824 xmm3
= _mm_mul_pd(qrtr
,sk2_inv
);
825 xmm3
= _mm_mul_pd(xmm3
,uij3
);
826 xmm2
= _mm_sub_pd(xmm2
,xmm3
);
827 xmm3
= _mm_mul_pd(uij
,rinv
);
828 xmm4
= _mm_mul_pd(uij3
,r
);
829 xmm3
= _mm_add_pd(xmm3
,xmm4
);
830 xmm3
= _mm_mul_pd(qrtr
,xmm3
);
831 t2
= _mm_add_pd(xmm2
,xmm3
);
833 xmm2
= _mm_mul_pd(sk2_inv
,rinv
);
834 xmm2
= _mm_add_pd(one
,xmm2
);
835 xmm2
= _mm_mul_pd(eigth
,xmm2
);
836 xmm2
= _mm_mul_pd(xmm2
,xmm9
);
837 xmm3
= _mm_mul_pd(log_term
, rinv
);
838 xmm3
= _mm_mul_pd(xmm3
,rinv
);
839 xmm3
= _mm_mul_pd(qrtr
,xmm3
);
840 t3
= _mm_add_pd(xmm2
,xmm3
);
842 /* chain rule terms */
843 xmm2
= _mm_mul_pd(dlij
,t1
);
844 xmm2
= _mm_add_pd(xmm2
,t2
);
845 xmm2
= _mm_add_pd(xmm2
,t3
);
847 /* temporary storage of chain rule terms, since we have to compute
848 the reciprocal terms also before storing them */
849 chrule
= _mm_mul_pd(xmm2
,rinv
);
851 /* INTERACTION ai->aj starts here */
852 /* Conditional mask for raj<dr+sk_ai */
853 xmm1
= _mm_add_pd(r
,sk_ai
);
854 mask_cmp
= _mm_cmplt_pd(raj
,xmm1
);
856 /* Conditional for rai>dr-sk, ends with mask_cmp2 */
857 xmm2
= _mm_sub_pd(r
,sk_ai
);
858 xmm3
= my_inv_pd(xmm2
);
859 mask_cmp2
= _mm_cmpgt_pd(raj
,xmm2
);
861 lij
= _mm_or_pd(_mm_and_pd(mask_cmp2
,raj_inv
) ,_mm_andnot_pd(mask_cmp2
,xmm3
)); /*conditional as a mask*/
862 dlij
= _mm_or_pd(_mm_and_pd(mask_cmp2
,zero
) ,_mm_andnot_pd(mask_cmp2
,one
));
864 uij
= my_inv_pd(xmm1
);
865 lij2
= _mm_mul_pd(lij
,lij
);
866 lij3
= _mm_mul_pd(lij2
,lij
);
867 uij2
= _mm_mul_pd(uij
,uij
);
868 uij3
= _mm_mul_pd(uij2
,uij
);
870 diff2
= _mm_sub_pd(uij2
,lij2
);
872 lij_inv
= my_invrsq_pd(lij2
);
875 sk2_inv
= _mm_mul_pd(sk2
,rinv
);
876 prod
= _mm_mul_pd(qrtr
,sk2_inv
);
878 log_term
= _mm_mul_pd(uij
,lij_inv
);
879 log_term
= log_pd(log_term
);
881 xmm1
= _mm_sub_pd(lij
,uij
);
882 xmm2
= _mm_mul_pd(qrtr
,r
);
883 xmm2
= _mm_mul_pd(xmm2
,diff2
);
884 xmm1
= _mm_add_pd(xmm1
,xmm2
);
885 xmm2
= _mm_mul_pd(half
,rinv
);
886 xmm2
= _mm_mul_pd(xmm2
,log_term
);
887 xmm1
= _mm_add_pd(xmm1
,xmm2
);
888 xmm9
= _mm_mul_pd(neg
,diff2
);
889 xmm2
= _mm_mul_pd(xmm9
,prod
);
890 tmp
= _mm_add_pd(xmm1
,xmm2
);
892 /* contitional for rai<sk_ai-dr */
893 xmm3
= _mm_sub_pd(sk_ai
,r
);
894 mask_cmp3
= _mm_cmplt_pd(raj
,xmm3
); /* rai<sk-dr */
896 xmm4
= _mm_sub_pd(raj_inv
,lij
);
897 xmm4
= _mm_mul_pd(two
,xmm4
);
898 xmm4
= _mm_add_pd(tmp
,xmm4
);
900 tmp
= _mm_or_pd(_mm_and_pd(mask_cmp3
,xmm4
) ,_mm_andnot_pd(mask_cmp3
,tmp
)); /*conditional as a mask*/
902 /* the tmp will now contain two partial values, that not all are to be used. Which */
903 /* ones are governed by the mask_cmp mask. */
904 tmp
= _mm_mul_pd(half
,tmp
);
905 tmp
= _mm_or_pd(_mm_and_pd(mask_cmp
,tmp
) ,_mm_andnot_pd(mask_cmp
,zero
)); /*conditional as a mask*/
907 /* Load, add and store ai->aj pol energy */
908 xmm7
= _mm_loadl_pd(xmm7
,born
->gpol_hct_work
+aj1
);
909 xmm7
= _mm_loadh_pd(xmm7
,born
->gpol_hct_work
+aj2
);
911 xmm7
= _mm_add_pd(xmm7
,tmp
);
913 _mm_storel_pd(born
->gpol_hct_work
+aj1
,xmm7
);
914 _mm_storeh_pd(born
->gpol_hct_work
+aj2
,xmm7
);
916 /* Start chain rule terms */
917 xmm2
= _mm_mul_pd(half
,lij2
);
918 xmm3
= _mm_mul_pd(prod
,lij3
);
919 xmm2
= _mm_add_pd(xmm2
,xmm3
);
920 xmm3
= _mm_mul_pd(lij
,rinv
);
921 xmm4
= _mm_mul_pd(lij3
,r
);
922 xmm3
= _mm_add_pd(xmm3
,xmm4
);
923 xmm3
= _mm_mul_pd(qrtr
,xmm3
);
924 t1
= _mm_sub_pd(xmm2
,xmm3
);
926 xmm2
= _mm_mul_pd(half
,uij2
);
927 xmm2
= _mm_mul_pd(neg
,xmm2
);
928 xmm3
= _mm_mul_pd(qrtr
,sk2_inv
);
929 xmm3
= _mm_mul_pd(xmm3
,uij3
);
930 xmm2
= _mm_sub_pd(xmm2
,xmm3
);
931 xmm3
= _mm_mul_pd(uij
,rinv
);
932 xmm4
= _mm_mul_pd(uij3
,r
);
933 xmm3
= _mm_add_pd(xmm3
,xmm4
);
934 xmm3
= _mm_mul_pd(qrtr
,xmm3
);
935 t2
= _mm_add_pd(xmm2
,xmm3
);
937 xmm2
= _mm_mul_pd(sk2_inv
,rinv
);
938 xmm2
= _mm_add_pd(one
,xmm2
);
939 xmm2
= _mm_mul_pd(eigth
,xmm2
);
940 xmm2
= _mm_mul_pd(xmm2
,xmm9
);
941 xmm3
= _mm_mul_pd(log_term
, rinv
);
942 xmm3
= _mm_mul_pd(xmm3
,rinv
);
943 xmm3
= _mm_mul_pd(qrtr
,xmm3
);
944 t3
= _mm_add_pd(xmm2
,xmm3
);
946 /* chain rule terms */
947 xmm2
= _mm_mul_pd(dlij
,t1
);
948 xmm2
= _mm_add_pd(xmm2
,t2
);
949 xmm2
= _mm_add_pd(xmm2
,t3
);
950 chrule_ai
= _mm_mul_pd(xmm2
,rinv
);
952 /* Store chain rule terms
953 * same unpacking rationale as with Still above
955 xmm3
= _mm_unpacklo_pd(chrule
, chrule_ai
);
956 xmm4
= _mm_unpackhi_pd(chrule
, chrule_ai
);
958 _mm_storeu_pd(fr
->dadx
+n
, xmm3
);
960 _mm_storeu_pd(fr
->dadx
+n
, xmm4
);
970 jx
= _mm_load_sd(x
+aj13
);
971 jy
= _mm_load_sd(x
+aj13
+1);
972 jz
= _mm_load_sd(x
+aj13
+2);
974 sk
= _mm_load_sd(born
->param
+aj1
);
976 dx
= _mm_sub_sd(ix
,jx
);
977 dy
= _mm_sub_pd(iy
,jy
);
978 dz
= _mm_sub_pd(iz
,jz
);
980 rsq11
= _mm_add_sd( _mm_add_sd( _mm_mul_sd(dx
,dx
) , _mm_mul_sd(dy
,dy
) ) , _mm_mul_sd(dz
,dz
) );
981 rinv
= my_invrsq_pd(rsq11
);
982 r
= _mm_mul_sd(rinv
,rsq11
);
985 raj
= _mm_load_sd(top
->atomtypes
.gb_radius
+p1
);
986 raj
= _mm_sub_sd(raj
,doffset
);
987 raj_inv
= my_inv_pd(raj
);
989 /* OFFSET INTERATIONS aj->ai STARTS HERE */
990 /* conditional mask for rai<dr+sk */
991 xmm1
= _mm_add_sd(r
,sk
);
992 mask_cmp
= _mm_cmplt_sd(rai
,xmm1
);
994 /* conditional for rai>dr-sk, ends with mask_cmp2 */
995 xmm2
= _mm_sub_sd(r
,sk
);
996 xmm3
= my_inv_pd(xmm2
);
997 mask_cmp2
= _mm_cmpgt_pd(rai
,xmm2
);
999 lij
= _mm_or_pd(_mm_and_pd(mask_cmp2
,rai_inv
) ,_mm_andnot_pd(mask_cmp2
,xmm3
)); /*conditional as a mask*/
1000 dlij
= _mm_or_pd(_mm_and_pd(mask_cmp2
,zero
) ,_mm_andnot_pd(mask_cmp2
,one
));
1002 uij
= my_inv_pd(xmm1
);
1003 lij2
= _mm_mul_sd(lij
,lij
);
1004 lij3
= _mm_mul_sd(lij2
,lij
);
1005 uij2
= _mm_mul_sd(uij
,uij
);
1006 uij3
= _mm_mul_sd(uij2
,uij
);
1008 diff2
= _mm_sub_sd(uij2
,lij2
);
1010 lij_inv
= my_invrsq_pd(lij2
);
1011 sk2
= _mm_mul_sd(sk
,sk
);
1012 sk2_inv
= _mm_mul_sd(sk2
,rinv
);
1013 prod
= _mm_mul_sd(qrtr
,sk2_inv
);
1015 log_term
= _mm_mul_pd(uij
,lij_inv
);
1016 log_term
= log_pd(log_term
);
1018 xmm1
= _mm_sub_sd(lij
,uij
);
1019 xmm2
= _mm_mul_sd(qrtr
,r
);
1020 xmm2
= _mm_mul_sd(xmm2
,diff2
);
1021 xmm1
= _mm_add_sd(xmm1
,xmm2
);
1022 xmm2
= _mm_mul_sd(half
,rinv
);
1023 xmm2
= _mm_mul_sd(xmm2
,log_term
);
1024 xmm1
= _mm_add_sd(xmm1
,xmm2
);
1025 xmm9
= _mm_mul_sd(neg
,diff2
);
1026 xmm2
= _mm_mul_sd(xmm9
,prod
);
1028 tmp_ai
= _mm_add_sd(xmm1
,xmm2
);
1030 /* contitional for rai<sk-dr */
1031 xmm3
= _mm_sub_sd(sk
,r
);
1032 mask_cmp3
= _mm_cmplt_sd(rai
,xmm3
); /* rai<sk-dr */
1034 xmm4
= _mm_sub_sd(rai_inv
,lij
);
1035 xmm4
= _mm_mul_sd(two
,xmm4
);
1036 xmm4
= _mm_add_sd(tmp_ai
,xmm4
);
1038 tmp_ai
= _mm_or_pd(_mm_and_pd(mask_cmp3
,xmm4
) ,_mm_andnot_pd(mask_cmp3
,tmp_ai
)); /*conditional as a mask*/
1040 /* the tmp will now contain two partial values, that not all are to be used. Which */
1041 /* ones are governed by the mask_cmp mask. */
1042 tmp_ai
= _mm_mul_pd(half
,tmp_ai
);
1043 tmp_ai
= _mm_or_pd(_mm_and_pd(mask_cmp
,tmp_ai
) ,_mm_andnot_pd(mask_cmp
,zero
)); /*conditional as a mask*/
1044 sum_ai
= _mm_add_sd(sum_ai
,tmp_ai
);
1046 xmm2
= _mm_mul_sd(half
,lij2
);
1047 xmm3
= _mm_mul_sd(prod
,lij3
);
1048 xmm2
= _mm_add_sd(xmm2
,xmm3
);
1049 xmm3
= _mm_mul_sd(lij
,rinv
);
1050 xmm4
= _mm_mul_sd(lij3
,r
);
1051 xmm3
= _mm_add_sd(xmm3
,xmm4
);
1052 xmm3
= _mm_mul_sd(qrtr
,xmm3
);
1053 t1
= _mm_sub_sd(xmm2
,xmm3
);
1055 xmm2
= _mm_mul_sd(half
,uij2
);
1056 xmm2
= _mm_mul_sd(neg
,xmm2
);
1057 xmm3
= _mm_mul_sd(qrtr
,sk2_inv
);
1058 xmm3
= _mm_mul_sd(xmm3
,uij3
);
1059 xmm2
= _mm_sub_sd(xmm2
,xmm3
);
1060 xmm3
= _mm_mul_sd(uij
,rinv
);
1061 xmm4
= _mm_mul_sd(uij3
,r
);
1062 xmm3
= _mm_add_sd(xmm3
,xmm4
);
1063 xmm3
= _mm_mul_sd(qrtr
,xmm3
);
1064 t2
= _mm_add_sd(xmm2
,xmm3
);
1066 xmm2
= _mm_mul_sd(sk2_inv
,rinv
);
1067 xmm2
= _mm_add_sd(one
,xmm2
);
1068 xmm2
= _mm_mul_sd(eigth
,xmm2
);
1069 xmm2
= _mm_mul_sd(xmm2
,xmm9
);
1070 xmm3
= _mm_mul_sd(log_term
, rinv
);
1071 xmm3
= _mm_mul_sd(xmm3
,rinv
);
1072 xmm3
= _mm_mul_sd(qrtr
,xmm3
);
1073 t3
= _mm_add_sd(xmm2
,xmm3
);
1075 /* chain rule terms */
1076 xmm2
= _mm_mul_sd(dlij
,t1
);
1077 xmm2
= _mm_add_sd(xmm2
,t2
);
1078 xmm2
= _mm_add_sd(xmm2
,t3
);
1080 /* temporary storage of chain rule terms, since we have to compute
1081 the reciprocal terms also before storing them */
1082 chrule
= _mm_mul_sd(xmm2
,rinv
);
1084 /* OFFSET INTERACTION ai->aj starts here */
1085 /* conditional mask for raj<dr+sk */
1086 xmm1
= _mm_add_sd(r
,sk_ai
);
1087 mask_cmp
= _mm_cmplt_sd(raj
,xmm1
);
1089 /* conditional for rai>dr-sk, ends with mask_cmp2 */
1090 xmm2
= _mm_sub_sd(r
,sk_ai
);
1091 xmm3
= my_inv_pd(xmm2
);
1092 mask_cmp2
= _mm_cmpgt_pd(raj
,xmm2
);
1094 lij
= _mm_or_pd(_mm_and_pd(mask_cmp2
,raj_inv
) ,_mm_andnot_pd(mask_cmp2
,xmm3
)); /*conditional as a mask*/
1095 dlij
= _mm_or_pd(_mm_and_pd(mask_cmp2
,zero
) ,_mm_andnot_pd(mask_cmp2
,one
));
1097 uij
= my_inv_pd(xmm1
);
1098 lij2
= _mm_mul_sd(lij
,lij
);
1099 lij3
= _mm_mul_sd(lij2
,lij
);
1100 uij2
= _mm_mul_sd(uij
,uij
);
1101 uij3
= _mm_mul_sd(uij2
,uij
);
1103 diff2
= _mm_sub_sd(uij2
,lij2
);
1105 lij_inv
= my_invrsq_pd(lij2
);
1108 sk2_inv
= _mm_mul_sd(sk2
,rinv
);
1109 prod
= _mm_mul_sd(qrtr
,sk2_inv
);
1111 log_term
= _mm_mul_pd(uij
,lij_inv
);
1112 log_term
= log_pd(log_term
);
1114 xmm1
= _mm_sub_sd(lij
,uij
);
1115 xmm2
= _mm_mul_sd(qrtr
,r
);
1116 xmm2
= _mm_mul_sd(xmm2
,diff2
);
1117 xmm1
= _mm_add_sd(xmm1
,xmm2
);
1118 xmm2
= _mm_mul_sd(half
,rinv
);
1119 xmm2
= _mm_mul_sd(xmm2
,log_term
);
1120 xmm1
= _mm_add_sd(xmm1
,xmm2
);
1121 xmm9
= _mm_mul_sd(neg
,diff2
);
1122 xmm2
= _mm_mul_sd(xmm9
,prod
);
1123 tmp
= _mm_add_sd(xmm1
,xmm2
);
1125 /* contitional for rai<sk-dr */
1126 xmm3
= _mm_sub_sd(sk_ai
,r
);
1127 mask_cmp3
= _mm_cmplt_sd(raj
,xmm3
); /* rai<sk-dr */
1129 xmm4
= _mm_sub_sd(raj_inv
,lij
);
1130 xmm4
= _mm_mul_sd(two
,xmm4
);
1131 xmm4
= _mm_add_sd(tmp
,xmm4
);
1133 tmp
= _mm_or_pd(_mm_and_pd(mask_cmp3
,xmm4
) ,_mm_andnot_pd(mask_cmp3
,tmp
)); /*conditional as a mask*/
1135 /* the tmp will now contain two partial values, that not all are to be used. Which */
1136 /* ones are governed by the mask_cmp mask. */
1137 tmp
= _mm_mul_pd(half
,tmp
);
1138 tmp
= _mm_or_pd(_mm_and_pd(mask_cmp
,tmp
) ,_mm_andnot_pd(mask_cmp
,zero
)); /*conditional as a mask*/
1140 /* Load, add and store gpol energy */
1141 xmm7
= _mm_load_sd(born
->gpol_hct_work
+aj1
);
1142 xmm7
= _mm_add_sd(xmm7
,tmp
);
1143 _mm_store_sd(born
->gpol_hct_work
+aj1
,xmm7
);
1145 /* Start chain rule terms, t1 */
1146 xmm2
= _mm_mul_sd(half
,lij2
);
1147 xmm3
= _mm_mul_sd(prod
,lij3
);
1148 xmm2
= _mm_add_sd(xmm2
,xmm3
);
1149 xmm3
= _mm_mul_sd(lij
,rinv
);
1150 xmm4
= _mm_mul_sd(lij3
,r
);
1151 xmm3
= _mm_add_sd(xmm3
,xmm4
);
1152 xmm3
= _mm_mul_sd(qrtr
,xmm3
);
1153 t1
= _mm_sub_sd(xmm2
,xmm3
);
1155 xmm2
= _mm_mul_sd(half
,uij2
);
1156 xmm2
= _mm_mul_sd(neg
,xmm2
);
1157 xmm3
= _mm_mul_sd(qrtr
,sk2_inv
);
1158 xmm3
= _mm_mul_sd(xmm3
,uij3
);
1159 xmm2
= _mm_sub_sd(xmm2
,xmm3
);
1160 xmm3
= _mm_mul_sd(uij
,rinv
);
1161 xmm4
= _mm_mul_sd(uij3
,r
);
1162 xmm3
= _mm_add_sd(xmm3
,xmm4
);
1163 xmm3
= _mm_mul_sd(qrtr
,xmm3
);
1164 t2
= _mm_add_sd(xmm2
,xmm3
);
1166 xmm2
= _mm_mul_sd(sk2_inv
,rinv
);
1167 xmm2
= _mm_add_sd(one
,xmm2
);
1168 xmm2
= _mm_mul_sd(eigth
,xmm2
);
1169 xmm2
= _mm_mul_sd(xmm2
,xmm9
);
1170 xmm3
= _mm_mul_sd(log_term
, rinv
);
1171 xmm3
= _mm_mul_sd(xmm3
,rinv
);
1172 xmm3
= _mm_mul_sd(qrtr
,xmm3
);
1173 t3
= _mm_add_sd(xmm2
,xmm3
);
1175 /* chain rule terms */
1176 xmm2
= _mm_mul_sd(dlij
,t1
);
1177 xmm2
= _mm_add_sd(xmm2
,t2
);
1178 xmm2
= _mm_add_sd(xmm2
,t3
);
1179 chrule_ai
= _mm_mul_sd(xmm2
,rinv
);
1181 _mm_store_sd(fr
->dadx
+n
, chrule
);
1183 _mm_store_sd(fr
->dadx
+n
, chrule_ai
);
1187 /* Do end processing ... */
1188 tmp_ai
= _mm_unpacklo_pd(tmp_ai
,sum_ai
);
1189 sum_ai
= _mm_add_pd(sum_ai
,tmp_ai
);
1190 sum_ai
= _mm_shuffle_pd(sum_ai
,sum_ai
,_MM_SHUFFLE2(1,1));
1192 /* Load, add and store atom ai polarisation energy */
1193 xmm2
= _mm_load_sd(born
->gpol_hct_work
+ai
);
1194 sum_ai
= _mm_add_sd(sum_ai
,xmm2
);
1195 _mm_store_sd(born
->gpol_hct_work
+ai
,sum_ai
);
1198 /* Parallel summations */
1201 gmx_sum(natoms
, born
->gpol_hct_work
, cr
);
1203 else if(DOMAINDECOMP(cr
))
1205 dd_atom_sum_real(cr
->dd
, born
->gpol_hct_work
);
1208 /* Compute the radii */
1209 for(i
=0;i
<fr
->natoms_force
;i
++) /* PELA born->nr */
1211 if(born
->use
[i
] != 0)
1213 rr
= top
->atomtypes
.gb_radius
[md
->typeA
[i
]]-doff
;
1214 sum
= 1.0/rr
- born
->gpol_hct_work
[i
];
1215 min_rad
= rr
+ doff
;
1218 born
->bRad
[i
] = rad
> min_rad
? rad
: min_rad
;
1219 fr
->invsqrta
[i
] = gmx_invsqrt(born
->bRad
[i
]);
1223 /* Extra (local) communication required for DD */
1224 if(DOMAINDECOMP(cr
))
1226 dd_atom_spread_real(cr
->dd
, born
->bRad
);
1227 dd_atom_spread_real(cr
->dd
, fr
->invsqrta
);
1234 calc_gb_rad_obc_sse2_double(t_commrec
*cr
, t_forcerec
* fr
, int natoms
, gmx_localtop_t
*top
,
1235 const t_atomtypes
*atype
, double *x
, t_nblist
*nl
, gmx_genborn_t
*born
,t_mdatoms
*md
)
1237 int i
,k
,n
,ai
,ai3
,aj1
,aj2
,aj13
,aj23
,nj0
,nj1
,at0
,at1
,offset
;
1238 int p1
,p2
,p3
,p4
,shift
;
1239 double rr
,sum
,sum_tmp
,sum2
,sum3
,min_rad
,rad
,doff
;
1240 double tsum
,tchain
,rr_inv
,rr_inv2
,gbr
;
1243 __m128d ix
,iy
,iz
,jx
,jy
,jz
,dx
,dy
,dz
,sX
,sY
,sZ
;
1244 __m128d t1
,t2
,t3
,rsq11
,rinv
,r
,rai
;
1245 __m128d rai_inv
,sk
,sk2
,lij
,dlij
,duij
;
1246 __m128d uij
,lij2
,uij2
,lij3
,uij3
,diff2
;
1247 __m128d lij_inv
,sk2_inv
,prod
,log_term
,tmp
,tmp_sum
;
1248 __m128d mask_cmp
,mask_cmp2
,mask_cmp3
,doffset
,raj
,raj_inv
;
1249 __m128d xmm1
,xmm2
,xmm3
,xmm4
,xmm7
,xmm8
,xmm9
;
1250 __m128d sum_ai
,chrule
,chrule_ai
,tmp_ai
,sk_ai
,sk2_ai
;
1252 const __m128d neg
= {-1.0, -1.0};
1253 const __m128d zero
= {0.0, 0.0};
1254 const __m128d eigth
= {0.125, 0.125};
1255 const __m128d qrtr
= {0.25, 0.25};
1256 const __m128d half
= {0.5, 0.5};
1257 const __m128d one
= {1.0, 1.0};
1258 const __m128d two
= {2.0, 2.0};
1259 const __m128d three
= {3.0, 3.0};
1261 /* Keep the compiler happy */
1262 tmp_ai
= _mm_setzero_pd();
1263 tmp
= _mm_setzero_pd();
1264 raj_inv
= _mm_setzero_pd();
1265 raj
= _mm_setzero_pd();
1266 sk
= _mm_setzero_pd();
1267 jx
= _mm_setzero_pd();
1268 jy
= _mm_setzero_pd();
1269 jz
= _mm_setzero_pd();
1270 xmm7
= _mm_setzero_pd();
1271 xmm8
= _mm_setzero_pd();
1273 /* Set the dielectric offset */
1274 doff
= born
->gb_doffset
;
1275 doffset
= _mm_load1_pd(&doff
);
1279 for(i
=0;i
<born
->nr
;i
++)
1281 born
->gpol_hct_work
[i
] = 0;
1284 for(i
=0;i
<nl
->nri
;i
++)
1289 nj0
= nl
->jindex
[i
];
1290 nj1
= nl
->jindex
[i
+1];
1292 /* Load shifts for this list */
1293 shift
= nl
->shift
[i
];
1294 shX
= fr
->shift_vec
[shift
][0];
1295 shY
= fr
->shift_vec
[shift
][1];
1296 shZ
= fr
->shift_vec
[shift
][2];
1298 /* Splat the shifts */
1299 sX
= _mm_load1_pd(&shX
);
1300 sY
= _mm_load1_pd(&shY
);
1301 sZ
= _mm_load1_pd(&shZ
);
1303 offset
= (nj1
-nj0
)%2;
1306 rr
= top
->atomtypes
.gb_radius
[md
->typeA
[ai
]]-doff
;
1307 rai
= _mm_load1_pd(&rr
);
1309 rai_inv
= _mm_load1_pd(&rr
);
1311 /* Load ai coordinates and add shifts */
1312 ix
= _mm_load1_pd(x
+ai3
);
1313 iy
= _mm_load1_pd(x
+ai3
+1);
1314 iz
= _mm_load1_pd(x
+ai3
+2);
1316 ix
= _mm_add_pd(sX
,ix
);
1317 iy
= _mm_add_pd(sY
,iy
);
1318 iz
= _mm_add_pd(sZ
,iz
);
1320 /* Zero out sums for polarisation energies */
1321 sum_ai
= _mm_setzero_pd();
1323 sk_ai
= _mm_load1_pd(born
->param
+ai
);
1324 sk2_ai
= _mm_mul_pd(sk_ai
,sk_ai
);
1326 for(k
=nj0
;k
<nj1
-offset
;k
+=2)
1329 aj2
= nl
->jjnr
[k
+1];
1334 /* Load particle aj1-2 coordinates */
1335 xmm1
= _mm_loadu_pd(x
+aj13
);
1336 xmm2
= _mm_loadu_pd(x
+aj23
);
1337 jx
= _mm_shuffle_pd(xmm1
,xmm2
,_MM_SHUFFLE2(0,0));
1338 jy
= _mm_shuffle_pd(xmm1
,xmm2
,_MM_SHUFFLE2(1,1));
1340 jz
= _mm_loadl_pd(jz
, x
+aj13
+2);
1341 jz
= _mm_loadh_pd(jz
, x
+aj23
+2);
1343 dx
= _mm_sub_pd(ix
,jx
);
1344 dy
= _mm_sub_pd(iy
,jy
);
1345 dz
= _mm_sub_pd(iz
,jz
);
1347 rsq11
= _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx
,dx
) , _mm_mul_pd(dy
,dy
) ) , _mm_mul_pd(dz
,dz
) );
1348 rinv
= my_invrsq_pd(rsq11
);
1349 r
= _mm_mul_pd(rinv
,rsq11
);
1351 /* Load atom aj1,aj2 raj */
1352 p1
= md
->typeA
[aj1
];
1353 p2
= md
->typeA
[aj2
];
1355 raj
= _mm_loadl_pd(raj
,top
->atomtypes
.gb_radius
+p1
);
1356 raj
= _mm_loadh_pd(raj
,top
->atomtypes
.gb_radius
+p2
);
1357 raj
= _mm_sub_pd(raj
,doffset
);
1359 /* Compute 1.0/raj */
1360 raj_inv
= my_inv_pd(raj
);
1362 sk
= _mm_loadl_pd(sk
,born
->param
+aj1
);
1363 sk
= _mm_loadh_pd(sk
,born
->param
+aj2
);
1365 /* INTERACTION aj->ai STARTS HERE */
1366 /* conditional mask for rai<dr+sk */
1367 xmm1
= _mm_add_pd(r
,sk
);
1368 mask_cmp
= _mm_cmplt_pd(rai
,xmm1
);
1370 /* conditional for rai>dr-sk, ends with mask_cmp2 */
1371 xmm2
= _mm_sub_pd(r
,sk
);
1372 xmm3
= my_inv_pd(xmm2
);
1373 mask_cmp2
= _mm_cmpgt_pd(rai
,xmm2
);
1375 lij
= _mm_or_pd(_mm_and_pd(mask_cmp2
,rai_inv
) ,_mm_andnot_pd(mask_cmp2
,xmm3
)); /*conditional as a mask*/
1376 dlij
= _mm_or_pd(_mm_and_pd(mask_cmp2
,zero
) ,_mm_andnot_pd(mask_cmp2
,one
));
1378 uij
= my_inv_pd(xmm1
);
1379 lij2
= _mm_mul_pd(lij
,lij
);
1380 lij3
= _mm_mul_pd(lij2
,lij
);
1381 uij2
= _mm_mul_pd(uij
,uij
);
1382 uij3
= _mm_mul_pd(uij2
,uij
);
1384 diff2
= _mm_sub_pd(uij2
,lij2
);
1386 lij_inv
= my_invrsq_pd(lij2
);
1387 sk2
= _mm_mul_pd(sk
,sk
);
1388 sk2_inv
= _mm_mul_pd(sk2
,rinv
);
1389 prod
= _mm_mul_pd(qrtr
,sk2_inv
);
1391 log_term
= _mm_mul_pd(uij
,lij_inv
);
1392 log_term
= log_pd(log_term
);
1394 xmm1
= _mm_sub_pd(lij
,uij
);
1395 xmm2
= _mm_mul_pd(qrtr
,r
);
1396 xmm2
= _mm_mul_pd(xmm2
,diff2
);
1397 xmm1
= _mm_add_pd(xmm1
,xmm2
);
1398 xmm2
= _mm_mul_pd(half
,rinv
);
1399 xmm2
= _mm_mul_pd(xmm2
,log_term
);
1400 xmm1
= _mm_add_pd(xmm1
,xmm2
);
1401 xmm9
= _mm_mul_pd(neg
,diff2
);
1402 xmm2
= _mm_mul_pd(xmm9
,prod
);
1403 tmp_ai
= _mm_add_pd(xmm1
,xmm2
);
1405 /* contitional for rai<sk-dr */
1406 xmm3
= _mm_sub_pd(sk
,r
);
1407 mask_cmp3
= _mm_cmplt_pd(rai
,xmm3
); /* rai<sk-dr */
1409 xmm4
= _mm_sub_pd(rai_inv
,lij
);
1410 xmm4
= _mm_mul_pd(two
,xmm4
);
1411 xmm4
= _mm_add_pd(tmp_ai
,xmm4
);
1413 tmp_ai
= _mm_or_pd(_mm_and_pd(mask_cmp3
,xmm4
) ,_mm_andnot_pd(mask_cmp3
,tmp_ai
)); /*conditional as a mask*/
1415 /* the tmp will now contain two partial values, that not all are to be used. Which */
1416 /* ones are governed by the mask_cmp mask. */
1417 tmp_ai
= _mm_mul_pd(half
,tmp_ai
);
1418 tmp_ai
= _mm_or_pd(_mm_and_pd(mask_cmp
,tmp_ai
) ,_mm_andnot_pd(mask_cmp
,zero
)); /*conditional as a mask*/
1419 sum_ai
= _mm_add_pd(sum_ai
,tmp_ai
);
1421 /* Start the dadx chain rule terms */
1422 xmm2
= _mm_mul_pd(half
,lij2
);
1423 xmm3
= _mm_mul_pd(prod
,lij3
);
1424 xmm2
= _mm_add_pd(xmm2
,xmm3
);
1425 xmm3
= _mm_mul_pd(lij
,rinv
);
1426 xmm4
= _mm_mul_pd(lij3
,r
);
1427 xmm3
= _mm_add_pd(xmm3
,xmm4
);
1428 xmm3
= _mm_mul_pd(qrtr
,xmm3
);
1429 t1
= _mm_sub_pd(xmm2
,xmm3
);
1431 xmm2
= _mm_mul_pd(half
,uij2
);
1432 xmm2
= _mm_mul_pd(neg
,xmm2
);
1433 xmm3
= _mm_mul_pd(qrtr
,sk2_inv
);
1434 xmm3
= _mm_mul_pd(xmm3
,uij3
);
1435 xmm2
= _mm_sub_pd(xmm2
,xmm3
);
1436 xmm3
= _mm_mul_pd(uij
,rinv
);
1437 xmm4
= _mm_mul_pd(uij3
,r
);
1438 xmm3
= _mm_add_pd(xmm3
,xmm4
);
1439 xmm3
= _mm_mul_pd(qrtr
,xmm3
);
1440 t2
= _mm_add_pd(xmm2
,xmm3
);
1442 xmm2
= _mm_mul_pd(sk2_inv
,rinv
);
1443 xmm2
= _mm_add_pd(one
,xmm2
);
1444 xmm2
= _mm_mul_pd(eigth
,xmm2
);
1445 xmm2
= _mm_mul_pd(xmm2
,xmm9
);
1446 xmm3
= _mm_mul_pd(log_term
, rinv
);
1447 xmm3
= _mm_mul_pd(xmm3
,rinv
);
1448 xmm3
= _mm_mul_pd(qrtr
,xmm3
);
1449 t3
= _mm_add_pd(xmm2
,xmm3
);
1451 /* chain rule terms */
1452 xmm2
= _mm_mul_pd(dlij
,t1
);
1453 xmm2
= _mm_add_pd(xmm2
,t2
);
1454 xmm2
= _mm_add_pd(xmm2
,t3
);
1456 /* temporary storage of chain rule terms, since we have to compute
1457 the reciprocal terms also before storing them */
1458 chrule
= _mm_mul_pd(xmm2
,rinv
);
1460 /* INTERACTION ai->aj STARTS HERE */
1461 /* conditional mask for raj<dr+sk_ai */
1462 xmm1
= _mm_add_pd(r
,sk_ai
);
1463 mask_cmp
= _mm_cmplt_pd(raj
,xmm1
);
1465 /* conditional for rai>dr-sk, ends with mask_cmp2 */
1466 xmm2
= _mm_sub_pd(r
,sk_ai
);
1467 xmm3
= my_inv_pd(xmm2
);
1468 mask_cmp2
= _mm_cmpgt_pd(raj
,xmm2
);
1470 lij
= _mm_or_pd(_mm_and_pd(mask_cmp2
,raj_inv
) ,_mm_andnot_pd(mask_cmp2
,xmm3
)); /*conditional as a mask*/
1471 dlij
= _mm_or_pd(_mm_and_pd(mask_cmp2
,zero
) ,_mm_andnot_pd(mask_cmp2
,one
));
1473 uij
= my_inv_pd(xmm1
);
1474 lij2
= _mm_mul_pd(lij
,lij
);
1475 lij3
= _mm_mul_pd(lij2
,lij
);
1476 uij2
= _mm_mul_pd(uij
,uij
);
1477 uij3
= _mm_mul_pd(uij2
,uij
);
1479 diff2
= _mm_sub_pd(uij2
,lij2
);
1481 lij_inv
= my_invrsq_pd(lij2
);
1484 sk2_inv
= _mm_mul_pd(sk2
,rinv
);
1485 prod
= _mm_mul_pd(qrtr
,sk2_inv
);
1487 log_term
= _mm_mul_pd(uij
,lij_inv
);
1488 log_term
= log_pd(log_term
);
1490 xmm1
= _mm_sub_pd(lij
,uij
);
1491 xmm2
= _mm_mul_pd(qrtr
,r
);
1492 xmm2
= _mm_mul_pd(xmm2
,diff2
);
1493 xmm1
= _mm_add_pd(xmm1
,xmm2
);
1494 xmm2
= _mm_mul_pd(half
,rinv
);
1495 xmm2
= _mm_mul_pd(xmm2
,log_term
);
1496 xmm1
= _mm_add_pd(xmm1
,xmm2
);
1497 xmm9
= _mm_mul_pd(neg
,diff2
);
1498 xmm2
= _mm_mul_pd(xmm9
,prod
);
1499 tmp
= _mm_add_pd(xmm1
,xmm2
);
1501 /* contitional for raj<sk_ai-dr */
1502 xmm3
= _mm_sub_pd(sk_ai
,r
);
1503 mask_cmp3
= _mm_cmplt_pd(raj
,xmm3
); /* rai<sk-dr */
1505 xmm4
= _mm_sub_pd(raj_inv
,lij
);
1506 xmm4
= _mm_mul_pd(two
,xmm4
);
1507 xmm4
= _mm_add_pd(tmp
,xmm4
);
1509 tmp
= _mm_or_pd(_mm_and_pd(mask_cmp3
,xmm4
) ,_mm_andnot_pd(mask_cmp3
,tmp
)); /*conditional as a mask*/
1511 /* the tmp will now contain two partial values, that not all are to be used. Which */
1512 /* ones are governed by the mask_cmp mask. */
1513 tmp
= _mm_mul_pd(half
,tmp
);
1514 tmp
= _mm_or_pd(_mm_and_pd(mask_cmp
,tmp
) ,_mm_andnot_pd(mask_cmp
,zero
)); /*conditional as a mask*/
1516 /* Load, add and store ai->aj pol energy */
1517 xmm7
= _mm_loadl_pd(xmm7
,born
->gpol_hct_work
+aj1
);
1518 xmm7
= _mm_loadh_pd(xmm7
,born
->gpol_hct_work
+aj2
);
1520 xmm7
= _mm_add_pd(xmm7
,tmp
);
1522 _mm_storel_pd(born
->gpol_hct_work
+aj1
,xmm7
);
1523 _mm_storeh_pd(born
->gpol_hct_work
+aj2
,xmm7
);
1525 /* Start dadx chain rule terms */
1526 xmm2
= _mm_mul_pd(half
,lij2
);
1527 xmm3
= _mm_mul_pd(prod
,lij3
);
1528 xmm2
= _mm_add_pd(xmm2
,xmm3
);
1529 xmm3
= _mm_mul_pd(lij
,rinv
);
1530 xmm4
= _mm_mul_pd(lij3
,r
);
1531 xmm3
= _mm_add_pd(xmm3
,xmm4
);
1532 xmm3
= _mm_mul_pd(qrtr
,xmm3
);
1533 t1
= _mm_sub_pd(xmm2
,xmm3
);
1535 xmm2
= _mm_mul_pd(half
,uij2
);
1536 xmm2
= _mm_mul_pd(neg
,xmm2
);
1537 xmm3
= _mm_mul_pd(qrtr
,sk2_inv
);
1538 xmm3
= _mm_mul_pd(xmm3
,uij3
);
1539 xmm2
= _mm_sub_pd(xmm2
,xmm3
);
1540 xmm3
= _mm_mul_pd(uij
,rinv
);
1541 xmm4
= _mm_mul_pd(uij3
,r
);
1542 xmm3
= _mm_add_pd(xmm3
,xmm4
);
1543 xmm3
= _mm_mul_pd(qrtr
,xmm3
);
1544 t2
= _mm_add_pd(xmm2
,xmm3
);
1546 xmm2
= _mm_mul_pd(sk2_inv
,rinv
);
1547 xmm2
= _mm_add_pd(one
,xmm2
);
1548 xmm2
= _mm_mul_pd(eigth
,xmm2
);
1549 xmm2
= _mm_mul_pd(xmm2
,xmm9
);
1550 xmm3
= _mm_mul_pd(log_term
, rinv
);
1551 xmm3
= _mm_mul_pd(xmm3
,rinv
);
1552 xmm3
= _mm_mul_pd(qrtr
,xmm3
);
1553 t3
= _mm_add_pd(xmm2
,xmm3
);
1555 /* chain rule terms */
1556 xmm2
= _mm_mul_pd(dlij
,t1
);
1557 xmm2
= _mm_add_pd(xmm2
,t2
);
1558 xmm2
= _mm_add_pd(xmm2
,t3
);
1559 chrule_ai
= _mm_mul_pd(xmm2
,rinv
);
1561 /* Store chain rule terms
1562 * same unpacking rationale as with Still above
1564 xmm3
= _mm_unpacklo_pd(chrule
, chrule_ai
);
1565 xmm4
= _mm_unpackhi_pd(chrule
, chrule_ai
);
1567 _mm_storeu_pd(fr
->dadx
+n
, xmm3
);
1569 _mm_storeu_pd(fr
->dadx
+n
, xmm4
);
1577 p1
= md
->typeA
[aj1
];
1579 jx
= _mm_load_sd(x
+aj13
);
1580 jy
= _mm_load_sd(x
+aj13
+1);
1581 jz
= _mm_load_sd(x
+aj13
+2);
1583 sk
= _mm_load_sd(born
->param
+aj1
);
1586 raj
= _mm_load_sd(top
->atomtypes
.gb_radius
+p1
);
1587 raj
= _mm_sub_sd(raj
,doffset
);
1588 raj_inv
= my_inv_pd(raj
);
1590 dx
= _mm_sub_sd(ix
,jx
);
1591 dy
= _mm_sub_pd(iy
,jy
);
1592 dz
= _mm_sub_pd(iz
,jz
);
1594 rsq11
= _mm_add_sd( _mm_add_sd( _mm_mul_sd(dx
,dx
) , _mm_mul_sd(dy
,dy
) ) , _mm_mul_sd(dz
,dz
) );
1595 rinv
= my_invrsq_pd(rsq11
);
1596 r
= _mm_mul_sd(rinv
,rsq11
);
1598 /* OFFSET INTERACTION aj->ai STARTS HERE */
1599 /* conditional mask for rai<dr+sk */
1600 xmm1
= _mm_add_sd(r
,sk
);
1601 mask_cmp
= _mm_cmplt_sd(rai
,xmm1
);
1603 /* conditional for rai>dr-sk, ends with mask_cmp2 */
1604 xmm2
= _mm_sub_sd(r
,sk
);
1605 xmm3
= my_inv_pd(xmm2
);
1606 mask_cmp2
= _mm_cmpgt_pd(rai
,xmm2
);
1608 lij
= _mm_or_pd(_mm_and_pd(mask_cmp2
,rai_inv
) ,_mm_andnot_pd(mask_cmp2
,xmm3
)); /*conditional as a mask*/
1609 dlij
= _mm_or_pd(_mm_and_pd(mask_cmp2
,zero
) ,_mm_andnot_pd(mask_cmp2
,one
));
1611 uij
= my_inv_pd(xmm1
);
1612 lij2
= _mm_mul_sd(lij
,lij
);
1613 lij3
= _mm_mul_sd(lij2
,lij
);
1614 uij2
= _mm_mul_sd(uij
,uij
);
1615 uij3
= _mm_mul_sd(uij2
,uij
);
1617 diff2
= _mm_sub_sd(uij2
,lij2
);
1619 lij_inv
= my_invrsq_pd(lij2
);
1620 sk2
= _mm_mul_sd(sk
,sk
);
1621 sk2_inv
= _mm_mul_sd(sk2
,rinv
);
1622 prod
= _mm_mul_sd(qrtr
,sk2_inv
);
1624 log_term
= _mm_mul_pd(uij
,lij_inv
);
1625 log_term
= log_pd(log_term
);
1627 xmm1
= _mm_sub_sd(lij
,uij
);
1628 xmm2
= _mm_mul_sd(qrtr
,r
);
1629 xmm2
= _mm_mul_sd(xmm2
,diff2
);
1630 xmm1
= _mm_add_sd(xmm1
,xmm2
);
1631 xmm2
= _mm_mul_sd(half
,rinv
);
1632 xmm2
= _mm_mul_sd(xmm2
,log_term
);
1633 xmm1
= _mm_add_sd(xmm1
,xmm2
);
1634 xmm9
= _mm_mul_sd(neg
,diff2
);
1635 xmm2
= _mm_mul_sd(xmm9
,prod
);
1637 tmp_ai
= _mm_add_sd(xmm1
,xmm2
);
1639 /* contitional for rai<sk-dr */
1640 xmm3
= _mm_sub_sd(sk
,r
);
1641 mask_cmp3
= _mm_cmplt_sd(rai
,xmm3
); /* rai<sk-dr */
1643 xmm4
= _mm_sub_sd(rai_inv
,lij
);
1644 xmm4
= _mm_mul_sd(two
,xmm4
);
1645 xmm4
= _mm_add_sd(tmp_ai
,xmm4
);
1647 tmp_ai
= _mm_or_pd(_mm_and_pd(mask_cmp3
,xmm4
) ,_mm_andnot_pd(mask_cmp3
,tmp_ai
)); /*conditional as a mask*/
1649 /* the tmp will now contain two partial values, that not all are to be used. Which */
1650 /* ones are governed by the mask_cmp mask. */
1651 tmp_ai
= _mm_mul_pd(half
,tmp_ai
);
1652 tmp_ai
= _mm_or_pd(_mm_and_pd(mask_cmp
,tmp_ai
) ,_mm_andnot_pd(mask_cmp
,zero
)); /*conditional as a mask*/
1653 sum_ai
= _mm_add_sd(sum_ai
,tmp_ai
);
1655 xmm2
= _mm_mul_sd(half
,lij2
);
1656 xmm3
= _mm_mul_sd(prod
,lij3
);
1657 xmm2
= _mm_add_sd(xmm2
,xmm3
);
1658 xmm3
= _mm_mul_sd(lij
,rinv
);
1659 xmm4
= _mm_mul_sd(lij3
,r
);
1660 xmm3
= _mm_add_sd(xmm3
,xmm4
);
1661 xmm3
= _mm_mul_sd(qrtr
,xmm3
);
1662 t1
= _mm_sub_sd(xmm2
,xmm3
);
1664 xmm2
= _mm_mul_sd(half
,uij2
);
1665 xmm2
= _mm_mul_sd(neg
,xmm2
);
1666 xmm3
= _mm_mul_sd(qrtr
,sk2_inv
);
1667 xmm3
= _mm_mul_sd(xmm3
,uij3
);
1668 xmm2
= _mm_sub_sd(xmm2
,xmm3
);
1669 xmm3
= _mm_mul_sd(uij
,rinv
);
1670 xmm4
= _mm_mul_sd(uij3
,r
);
1671 xmm3
= _mm_add_sd(xmm3
,xmm4
);
1672 xmm3
= _mm_mul_sd(qrtr
,xmm3
);
1673 t2
= _mm_add_sd(xmm2
,xmm3
);
1675 xmm2
= _mm_mul_sd(sk2_inv
,rinv
);
1676 xmm2
= _mm_add_sd(one
,xmm2
);
1677 xmm2
= _mm_mul_sd(eigth
,xmm2
);
1678 xmm2
= _mm_mul_sd(xmm2
,xmm9
);
1679 xmm3
= _mm_mul_sd(log_term
, rinv
);
1680 xmm3
= _mm_mul_sd(xmm3
,rinv
);
1681 xmm3
= _mm_mul_sd(qrtr
,xmm3
);
1682 t3
= _mm_add_sd(xmm2
,xmm3
);
1684 /* chain rule terms */
1685 xmm2
= _mm_mul_sd(dlij
,t1
);
1686 xmm2
= _mm_add_sd(xmm2
,t2
);
1687 xmm2
= _mm_add_sd(xmm2
,t3
);
1688 chrule
= _mm_mul_sd(xmm2
,rinv
);
1690 /* OFFSET INTERACTION ai->aj STARTS HERE */
1691 /* conditional mask for raj<dr+sk_ai */
1692 xmm1
= _mm_add_sd(r
,sk_ai
);
1693 mask_cmp
= _mm_cmplt_sd(raj
,xmm1
);
1695 /* conditional for rai>dr-sk, ends with mask_cmp2 */
1696 xmm2
= _mm_sub_sd(r
,sk_ai
);
1697 xmm3
= my_inv_pd(xmm2
);
1698 mask_cmp2
= _mm_cmpgt_pd(raj
,xmm2
);
1700 lij
= _mm_or_pd(_mm_and_pd(mask_cmp2
,raj_inv
) ,_mm_andnot_pd(mask_cmp2
,xmm3
)); /*conditional as a mask*/
1701 dlij
= _mm_or_pd(_mm_and_pd(mask_cmp2
,zero
) ,_mm_andnot_pd(mask_cmp2
,one
));
1703 uij
= my_inv_pd(xmm1
);
1704 lij2
= _mm_mul_sd(lij
,lij
);
1705 lij3
= _mm_mul_sd(lij2
,lij
);
1706 uij2
= _mm_mul_sd(uij
,uij
);
1707 uij3
= _mm_mul_sd(uij2
,uij
);
1709 diff2
= _mm_sub_sd(uij2
,lij2
);
1711 lij_inv
= my_invrsq_pd(lij2
);
1714 sk2_inv
= _mm_mul_sd(sk2
,rinv
);
1715 prod
= _mm_mul_sd(qrtr
,sk2_inv
);
1717 log_term
= _mm_mul_pd(uij
,lij_inv
);
1718 log_term
= log_pd(log_term
);
1720 xmm1
= _mm_sub_sd(lij
,uij
);
1721 xmm2
= _mm_mul_sd(qrtr
,r
);
1722 xmm2
= _mm_mul_sd(xmm2
,diff2
);
1723 xmm1
= _mm_add_sd(xmm1
,xmm2
);
1724 xmm2
= _mm_mul_sd(half
,rinv
);
1725 xmm2
= _mm_mul_sd(xmm2
,log_term
);
1726 xmm1
= _mm_add_sd(xmm1
,xmm2
);
1727 xmm9
= _mm_mul_sd(neg
,diff2
);
1728 xmm2
= _mm_mul_sd(xmm9
,prod
);
1729 tmp
= _mm_add_sd(xmm1
,xmm2
);
1731 /* contitional for raj<sk_ai-dr */
1732 xmm3
= _mm_sub_sd(sk_ai
,r
);
1733 mask_cmp3
= _mm_cmplt_sd(raj
,xmm3
); /* rai<sk-dr */
1735 xmm4
= _mm_sub_sd(raj_inv
,lij
);
1736 xmm4
= _mm_mul_sd(two
,xmm4
);
1737 xmm4
= _mm_add_sd(tmp
,xmm4
);
1739 tmp
= _mm_or_pd(_mm_and_pd(mask_cmp3
,xmm4
) ,_mm_andnot_pd(mask_cmp3
,tmp
)); /*conditional as a mask*/
1741 /* the tmp will now contain two partial values, that not all are to be used. Which */
1742 /* ones are governed by the mask_cmp mask. */
1743 tmp
= _mm_mul_pd(half
,tmp
);
1744 tmp
= _mm_or_pd(_mm_and_pd(mask_cmp
,tmp
) ,_mm_andnot_pd(mask_cmp
,zero
)); /*conditional as a mask*/
1746 /* Load, add and store gpol energy */
1747 xmm7
= _mm_load_sd(born
->gpol_hct_work
+aj1
);
1748 xmm7
= _mm_add_sd(xmm7
,tmp
);
1749 _mm_store_sd(born
->gpol_hct_work
+aj1
,xmm7
);
1751 /* Start chain rule terms, t1 */
1752 xmm2
= _mm_mul_sd(half
,lij2
);
1753 xmm3
= _mm_mul_sd(prod
,lij3
);
1754 xmm2
= _mm_add_sd(xmm2
,xmm3
);
1755 xmm3
= _mm_mul_sd(lij
,rinv
);
1756 xmm4
= _mm_mul_sd(lij3
,r
);
1757 xmm3
= _mm_add_sd(xmm3
,xmm4
);
1758 xmm3
= _mm_mul_sd(qrtr
,xmm3
);
1759 t1
= _mm_sub_sd(xmm2
,xmm3
);
1761 xmm2
= _mm_mul_sd(half
,uij2
);
1762 xmm2
= _mm_mul_sd(neg
,xmm2
);
1763 xmm3
= _mm_mul_sd(qrtr
,sk2_inv
);
1764 xmm3
= _mm_mul_sd(xmm3
,uij3
);
1765 xmm2
= _mm_sub_sd(xmm2
,xmm3
);
1766 xmm3
= _mm_mul_sd(uij
,rinv
);
1767 xmm4
= _mm_mul_sd(uij3
,r
);
1768 xmm3
= _mm_add_sd(xmm3
,xmm4
);
1769 xmm3
= _mm_mul_sd(qrtr
,xmm3
);
1770 t2
= _mm_add_sd(xmm2
,xmm3
);
1772 xmm2
= _mm_mul_sd(sk2_inv
,rinv
);
1773 xmm2
= _mm_add_sd(one
,xmm2
);
1774 xmm2
= _mm_mul_sd(eigth
,xmm2
);
1775 xmm2
= _mm_mul_sd(xmm2
,xmm9
);
1776 xmm3
= _mm_mul_sd(log_term
, rinv
);
1777 xmm3
= _mm_mul_sd(xmm3
,rinv
);
1778 xmm3
= _mm_mul_sd(qrtr
,xmm3
);
1779 t3
= _mm_add_sd(xmm2
,xmm3
);
1781 /* chain rule terms */
1782 xmm2
= _mm_mul_sd(dlij
,t1
);
1783 xmm2
= _mm_add_sd(xmm2
,t2
);
1784 xmm2
= _mm_add_sd(xmm2
,t3
);
1785 chrule_ai
= _mm_mul_sd(xmm2
,rinv
);
1787 _mm_store_sd(fr
->dadx
+n
, chrule
);
1789 _mm_store_sd(fr
->dadx
+n
, chrule_ai
);
1793 /* Do end processing ... */
1794 tmp_ai
= _mm_unpacklo_pd(tmp_ai
,sum_ai
);
1795 sum_ai
= _mm_add_pd(sum_ai
,tmp_ai
);
1796 sum_ai
= _mm_shuffle_pd(sum_ai
,sum_ai
,_MM_SHUFFLE2(1,1));
1798 /* Load, add and store atom ai polarisation energy */
1799 xmm2
= _mm_load_sd(born
->gpol_hct_work
+ai
);
1800 sum_ai
= _mm_add_sd(sum_ai
,xmm2
);
1801 _mm_store_sd(born
->gpol_hct_work
+ai
,sum_ai
);
1804 /* Parallel summations */
1807 gmx_sum(natoms
, born
->gpol_hct_work
, cr
);
1809 else if(DOMAINDECOMP(cr
))
1811 dd_atom_sum_real(cr
->dd
, born
->gpol_hct_work
);
1814 /* Compute the radii */
1815 for(i
=0;i
<fr
->natoms_force
;i
++) /* PELA born->nr */
1817 if(born
->use
[i
] != 0)
1819 rr
= top
->atomtypes
.gb_radius
[md
->typeA
[i
]];
1823 sum
= rr
* born
->gpol_hct_work
[i
];
1827 tsum
= tanh(born
->obc_alpha
*sum
-born
->obc_beta
*sum2
+born
->obc_gamma
*sum3
);
1828 born
->bRad
[i
] = rr_inv
- tsum
*rr_inv2
;
1829 born
->bRad
[i
] = 1.0 / born
->bRad
[i
];
1831 fr
->invsqrta
[ai
]=gmx_invsqrt(born
->bRad
[ai
]);
1833 tchain
= rr
* (born
->obc_alpha
-2*born
->obc_beta
*sum
+3*born
->obc_gamma
*sum2
);
1834 born
->drobc
[i
] = (1.0-tsum
*tsum
)*tchain
*rr_inv2
;
1838 /* Extra (local) communication required for DD */
1839 if(DOMAINDECOMP(cr
))
1841 dd_atom_spread_real(cr
->dd
, born
->bRad
);
1842 dd_atom_spread_real(cr
->dd
, fr
->invsqrta
);
1843 dd_atom_spread_real(cr
->dd
, born
->drobc
);
1852 calc_gb_chainrule_sse2_double(int natoms
, t_nblist
*nl
, double *dadx
, double *dvda
, double *xd
, double *f
,
1853 double *fshift
, double *shift_vec
, int gb_algorithm
, gmx_genborn_t
*born
)
1855 int i
,k
,n
,ai
,aj
,ai3
,aj1
,aj2
,aj13
,aj23
,aj4
,nj0
,nj1
,offset
;
1857 double rbi
,shX
,shY
,shZ
;
1860 __m128d ix
,iy
,iz
,jx
,jy
,jz
,fix
,fiy
,fiz
,sX
,sY
,sZ
;
1861 __m128d dx
,dy
,dz
,t1
,t2
,t3
,dva
,dvaj
,dax
,dax_ai
,fgb
,fgb_ai
;
1862 __m128d xmm1
,xmm2
,xmm3
,xmm4
,xmm5
,xmm6
,xmm7
,xmm8
;
1864 t1
= t2
= t3
= _mm_setzero_pd();
1867 /* Loop to get the proper form for the Born radius term */
1868 if(gb_algorithm
==egbSTILL
) {
1869 for(i
=0;i
<natoms
;i
++)
1871 rbi
= born
->bRad
[i
];
1872 rb
[i
] = (2 * rbi
* rbi
* dvda
[i
])/ONE_4PI_EPS0
;
1876 else if(gb_algorithm
==egbHCT
) {
1877 for(i
=0;i
<natoms
;i
++)
1879 rbi
= born
->bRad
[i
];
1880 rb
[i
] = rbi
* rbi
* dvda
[i
];
1884 else if(gb_algorithm
==egbOBC
) {
1885 for(i
=0;i
<natoms
;i
++)
1887 rbi
= born
->bRad
[i
];
1888 rb
[i
] = rbi
* rbi
* born
->drobc
[i
] * dvda
[i
];
1894 for(i
=0;i
<nl
->nri
;i
++)
1899 nj0
= nl
->jindex
[i
];
1900 nj1
= nl
->jindex
[i
+1];
1902 /* Load shifts for this list */
1903 shift
= 3*nl
->shift
[i
];
1904 shX
= shift_vec
[shift
+0];
1905 shY
= shift_vec
[shift
+1];
1906 shZ
= shift_vec
[shift
+2];
1908 /* Splat the shifts */
1909 sX
= _mm_load1_pd(&shX
);
1910 sY
= _mm_load1_pd(&shY
);
1911 sZ
= _mm_load1_pd(&shZ
);
1913 offset
= (nj1
-nj0
)%2;
1915 /* Load particle ai coordinates and add shifts */
1916 ix
= _mm_load1_pd(xd
+ai3
);
1917 iy
= _mm_load1_pd(xd
+ai3
+1);
1918 iz
= _mm_load1_pd(xd
+ai3
+2);
1920 ix
= _mm_add_pd(sX
,ix
);
1921 iy
= _mm_add_pd(sY
,iy
);
1922 iz
= _mm_add_pd(sZ
,iz
);
1924 /* Load particle ai dvda */
1925 dva
= _mm_load1_pd(rb
+ai
);
1927 fix
= _mm_setzero_pd();
1928 fiy
= _mm_setzero_pd();
1929 fiz
= _mm_setzero_pd();
1931 for(k
=nj0
;k
<nj1
-offset
;k
+=2)
1934 aj2
= nl
->jjnr
[k
+1];
1937 dvaj
= _mm_set_pd(rb
[aj2
],rb
[aj1
]);
1942 /* Load particle aj1-2 coordinates */
1943 xmm1
= _mm_loadu_pd(xd
+aj13
);
1944 xmm2
= _mm_loadu_pd(xd
+aj23
);
1946 xmm5
= _mm_load_sd(xd
+aj13
+2);
1947 xmm6
= _mm_load_sd(xd
+aj23
+2);
1949 jx
= _mm_shuffle_pd(xmm1
,xmm2
,_MM_SHUFFLE2(0,0));
1950 jy
= _mm_shuffle_pd(xmm1
,xmm2
,_MM_SHUFFLE2(1,1));
1951 jz
= _mm_shuffle_pd(xmm5
,xmm6
,_MM_SHUFFLE2(0,0));
1953 /* load derivative of born radii w.r.t. coordinates */
1954 xmm7
= _mm_loadu_pd(dadx
+n
);
1956 xmm8
= _mm_loadu_pd(dadx
+n
);
1959 /* Shuffle to get the ai and aj components right */
1960 dax
= _mm_shuffle_pd(xmm7
,xmm8
,_MM_SHUFFLE2(2,0));
1961 dax_ai
= _mm_shuffle_pd(xmm7
,xmm8
,_MM_SHUFFLE2(3,1));
1964 dx
= _mm_sub_pd(ix
,jx
);
1965 dy
= _mm_sub_pd(iy
,jy
);
1966 dz
= _mm_sub_pd(iz
,jz
);
1969 fgb
= _mm_mul_pd(dva
,dax
);
1970 fgb_ai
= _mm_mul_pd(dvaj
,dax_ai
);
1971 fgb
= _mm_add_pd(fgb
,fgb_ai
);
1973 /* partial forces */
1974 t1
= _mm_mul_pd(fgb
,dx
);
1975 t2
= _mm_mul_pd(fgb
,dy
);
1976 t3
= _mm_mul_pd(fgb
,dz
);
1978 /* update the i force */
1979 fix
= _mm_add_pd(fix
,t1
);
1980 fiy
= _mm_add_pd(fiy
,t2
);
1981 fiz
= _mm_add_pd(fiz
,t3
);
1983 /* accumulate forces from memory */
1984 xmm1
= _mm_loadu_pd(f
+aj13
); /* fx1 fx2 */
1985 xmm2
= _mm_loadu_pd(f
+aj23
); /* fy1 fy2 */
1987 xmm5
= _mm_load1_pd(f
+aj13
+2); /* fz1 fz1 */
1988 xmm6
= _mm_load1_pd(f
+aj23
+2); /* fz2 fz2 */
1991 xmm7
= _mm_shuffle_pd(xmm5
,xmm6
,_MM_SHUFFLE2(0,0)); /* fz1 fz2 */
1992 xmm5
= _mm_shuffle_pd(xmm1
,xmm2
,_MM_SHUFFLE2(0,0)); /* fx1 fx2 */
1993 xmm6
= _mm_shuffle_pd(xmm1
,xmm2
,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */
1995 /* subtract partial forces */
1996 xmm5
= _mm_sub_pd(xmm5
, t1
);
1997 xmm6
= _mm_sub_pd(xmm6
, t2
);
1998 xmm7
= _mm_sub_pd(xmm7
, t3
);
2000 /* transpose back fx and fy */
2001 xmm1
= _mm_shuffle_pd(xmm5
,xmm6
,_MM_SHUFFLE2(0,0));
2002 xmm2
= _mm_shuffle_pd(xmm5
,xmm6
,_MM_SHUFFLE2(1,1));
2004 /* store the force, first fx's and fy's */
2005 _mm_storeu_pd(f
+aj13
,xmm1
);
2006 _mm_storeu_pd(f
+aj23
,xmm2
);
2009 _mm_storel_pd(f
+aj13
+2,xmm7
);
2010 _mm_storeh_pd(f
+aj23
+2,xmm7
);
2017 dvaj
= _mm_load_sd(rb
+aj1
);
2021 jx
= _mm_load_sd(xd
+aj13
);
2022 jy
= _mm_load_sd(xd
+aj13
+1);
2023 jz
= _mm_load_sd(xd
+aj13
+2);
2025 dax
= _mm_load_sd(dadx
+n
);
2027 dax_ai
= _mm_load_sd(dadx
+n
);
2030 dx
= _mm_sub_sd(ix
,jx
);
2031 dy
= _mm_sub_sd(iy
,jy
);
2032 dz
= _mm_sub_sd(iz
,jz
);
2034 fgb
= _mm_mul_sd(dva
,dax
);
2035 fgb_ai
= _mm_mul_sd(dvaj
,dax_ai
);
2036 fgb
= _mm_add_pd(fgb
,fgb_ai
);
2038 t1
= _mm_mul_sd(fgb
,dx
);
2039 t2
= _mm_mul_sd(fgb
,dy
);
2040 t3
= _mm_mul_sd(fgb
,dz
);
2042 fix
= _mm_add_sd(fix
,t1
);
2043 fiy
= _mm_add_sd(fiy
,t2
);
2044 fiz
= _mm_add_sd(fiz
,t3
);
2046 /* accumulate forces from memory */
2047 xmm5
= _mm_load_sd(f
+aj13
);
2048 xmm6
= _mm_load_sd(f
+aj13
+1);
2049 xmm7
= _mm_load_sd(f
+aj13
+2);
2051 /* subtract partial forces */
2052 xmm5
= _mm_sub_sd(xmm5
, t1
);
2053 xmm6
= _mm_sub_sd(xmm6
, t2
);
2054 xmm7
= _mm_sub_sd(xmm7
, t3
);
2056 /* store the force */
2057 _mm_store_sd(f
+aj13
,xmm5
);
2058 _mm_store_sd(f
+aj13
+1,xmm6
);
2059 _mm_store_sd(f
+aj13
+2,xmm7
);
2062 /* fix/fiy/fiz now contain two partial terms, that all should be
2063 * added to the i particle forces
2065 t1
= _mm_unpacklo_pd(t1
,fix
);
2066 t2
= _mm_unpacklo_pd(t2
,fiy
);
2067 t3
= _mm_unpacklo_pd(t3
,fiz
);
2069 fix
= _mm_add_pd(fix
,t1
);
2070 fiy
= _mm_add_pd(fiy
,t2
);
2071 fiz
= _mm_add_pd(fiz
,t3
);
2073 fix
= _mm_shuffle_pd(fix
,fix
,_MM_SHUFFLE2(1,1));
2074 fiy
= _mm_shuffle_pd(fiy
,fiy
,_MM_SHUFFLE2(1,1));
2075 fiz
= _mm_shuffle_pd(fiz
,fiz
,_MM_SHUFFLE2(1,1));
2077 /* load, add and store i forces */
2078 xmm1
= _mm_load_sd(f
+ai3
);
2079 xmm2
= _mm_load_sd(f
+ai3
+1);
2080 xmm3
= _mm_load_sd(f
+ai3
+2);
2082 fix
= _mm_add_sd(fix
,xmm1
);
2083 fiy
= _mm_add_sd(fiy
,xmm2
);
2084 fiz
= _mm_add_sd(fiz
,xmm3
);
2086 _mm_store_sd(f
+ai3
,fix
);
2087 _mm_store_sd(f
+ai3
+1,fiy
);
2088 _mm_store_sd(f
+ai3
+2,fiz
);
2090 /* load, add and store i shift forces */
2091 xmm1
= _mm_load_sd(fshift
+shift
);
2092 xmm2
= _mm_load_sd(fshift
+shift
+1);
2093 xmm3
= _mm_load_sd(fshift
+shift
+2);
2095 fix
= _mm_add_sd(fix
,xmm1
);
2096 fiy
= _mm_add_sd(fiy
,xmm2
);
2097 fiz
= _mm_add_sd(fiz
,xmm3
);
2099 _mm_store_sd(fshift
+shift
,fix
);
2100 _mm_store_sd(fshift
+shift
+1,fiy
);
2101 _mm_store_sd(fshift
+shift
+2,fiz
);
2109 /* keep compiler happy */
2110 int genborn_sse_dummy
;
2112 #endif /* SSE2 intrinsics available */