src/mdlib/genborn_sse2_single.c (gromacs/qmmm-gamess-us.git)
1 /*
2 *
3 * This source code is part of
4 *
5 * G R O M A C S
6 *
7 * GROningen MAchine for Chemical Simulations
8 *
9 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
10 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
11 * Copyright (c) 2001-2008, The GROMACS development team,
12 * check out http://www.gromacs.org for more information.
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License
16 * as published by the Free Software Foundation; either version 2
17 * of the License, or (at your option) any later version.
19 * If you want to redistribute modifications, please consider that
20 * scientific software is very special. Version control is crucial -
21 * bugs must be traceable. We will be happy to consider code for
22 * inclusion in the official distribution, but derived work must not
23 * be called official GROMACS. Details are found in the README & COPYING
24 * files - if they are missing, get the official version at www.gromacs.org.
26 * To help us fund GROMACS development, we humbly ask that you cite
27 * the papers on the package - you can find them in the top README file.
29 * For more info, check our website at http://www.gromacs.org
31 * And Hey:
32 * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
33 */
34 #ifdef HAVE_CONFIG_H
35 #include <config.h>
36 #endif
38 #include <math.h>
39 #include <string.h>
41 #include "typedefs.h"
42 #include "smalloc.h"
43 #include "genborn.h"
44 #include "vec.h"
45 #include "grompp.h"
46 #include "pdbio.h"
47 #include "names.h"
48 #include "physics.h"
49 #include "partdec.h"
50 #include "domdec.h"
51 #include "network.h"
52 #include "gmx_fatal.h"
53 #include "mtop_util.h"
56 #ifdef GMX_LIB_MPI
57 #include <mpi.h>
58 #endif
59 #ifdef GMX_THREADS
60 #include "tmpi.h"
61 #endif
64 /* Only compile this file if SSE intrinsics are available */
65 #if ( (defined(GMX_IA32_SSE) || defined(GMX_X86_64_SSE) || defined(GMX_SSE2)) && !defined(GMX_DOUBLE) )
67 #include <gmx_sse2_single.h>
68 #include <xmmintrin.h>
69 #include <emmintrin.h>
72 int
73 calc_gb_rad_still_sse(t_commrec *cr, t_forcerec *fr,
74 int natoms, gmx_localtop_t *top,
75 const t_atomtypes *atype, float *x, t_nblist *nl,
76 gmx_genborn_t *born)
78 int i,k,n,ii,is3,ii3,nj0,nj1,offset;
79 int jnrA,jnrB,jnrC,jnrD,j3A,j3B,j3C,j3D;
80 int jnrE,jnrF,jnrG,jnrH,j3E,j3F,j3G,j3H;
81 int shift;
82 int *mdtype;
83 real shX,shY,shZ;
84 int *jjnr;
85 real *shiftvec;
87 float gpi_ai,gpi2;
88 float factor;
89 float *gb_radius;
90 float *vsolv;
91 float *work;
92 float *dadx;
94 __m128 ix,iy,iz;
95 __m128 jx,jy,jz;
96 __m128 dx,dy,dz;
97 __m128 tx,ty,tz;
98 __m128 jxB,jyB,jzB;
99 __m128 dxB,dyB,dzB;
100 __m128 txB,tyB,tzB;
101 __m128 rsq,rinv,rinv2,rinv4,rinv6;
102 __m128 rsqB,rinvB,rinv2B,rinv4B,rinv6B;
103 __m128 ratio,gpi,rai,raj,vai,vaj,rvdw;
104 __m128 ratioB,rajB,vajB,rvdwB;
105 __m128 ccf,dccf,theta,cosq,term,sinq,res,prod,prod_ai,tmp;
106 __m128 ccfB,dccfB,thetaB,cosqB,termB,sinqB,resB,prodB;
107 __m128 mask,icf4,icf6,mask_cmp;
108 __m128 icf4B,icf6B,mask_cmpB;
110 __m128 mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
111 __m128 mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
112 __m128 mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
114 const __m128 half = {0.5f , 0.5f , 0.5f , 0.5f };
115 const __m128 three = {3.0f , 3.0f , 3.0f , 3.0f };
116 const __m128 one = {1.0f, 1.0f , 1.0f , 1.0f };
117 const __m128 two = {2.0f , 2.0f , 2.0f, 2.0f };
118 const __m128 zero = {0.0f , 0.0f , 0.0f , 0.0f };
119 const __m128 four = {4.0f , 4.0f , 4.0f , 4.0f };
121 const __m128 still_p5inv = {STILL_P5INV, STILL_P5INV, STILL_P5INV, STILL_P5INV};
122 const __m128 still_pip5 = {STILL_PIP5, STILL_PIP5, STILL_PIP5, STILL_PIP5};
123 const __m128 still_p4 = {STILL_P4, STILL_P4, STILL_P4, STILL_P4};
125 factor = 0.5 * ONE_4PI_EPS0;
127 gb_radius = born->gb_radius;
128 vsolv = born->vsolv;
129 work = born->gpol_still_work;
130 jjnr = nl->jjnr;
131 shiftvec = fr->shift_vec[0];
132 dadx = fr->dadx;
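/* Note: dadx is filled sequentially in 4-float chunks as the neighbour lists are
 * walked below (prod*icf6 followed by prod_ai*icf6 for each quartet of j atoms),
 * and calc_gb_chainrule_sse() further down reads the chunks back in exactly the
 * same order. */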
134 jnrA = jnrB = jnrC = jnrD = 0;
135 jx = _mm_setzero_ps();
136 jy = _mm_setzero_ps();
137 jz = _mm_setzero_ps();
139 n = 0;
141 for(i=0;i<natoms;i++)
143 work[i]=0;
146 for(i=0;i<nl->nri;i++)
148 ii = nl->iinr[i];
149 ii3 = ii*3;
150 is3 = 3*nl->shift[i];
151 shX = shiftvec[is3];
152 shY = shiftvec[is3+1];
153 shZ = shiftvec[is3+2];
154 nj0 = nl->jindex[i];
155 nj1 = nl->jindex[i+1];
157 ix = _mm_set1_ps(shX+x[ii3+0]);
158 iy = _mm_set1_ps(shY+x[ii3+1]);
159 iz = _mm_set1_ps(shZ+x[ii3+2]);
161 offset = (nj1-nj0)%4;
163 /* Polarization energy for atom ai */
164 gpi = _mm_setzero_ps();
166 rai = _mm_load1_ps(gb_radius+ii);
167 prod_ai = _mm_set1_ps(STILL_P4*vsolv[ii]);
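/* Rough scalar sketch of what the vectorized loops below accumulate for each
 * neighbour j (illustrative only; the SSE code processes four or eight j atoms
 * per pass and does the branching with compare masks):
 *
 *   ratio = rsq/((rai+raj)*(rai+raj));
 *   if(ratio <= STILL_P5INV) {
 *       theta = ratio*STILL_PIP5;
 *       term  = 0.5*(1.0-cos(theta));
 *       ccf   = term*term;
 *       dccf  = 2.0*term*sin(theta)*theta;
 *   } else {
 *       ccf   = 1.0;
 *       dccf  = 0.0;
 *   }
 *   gpi     += STILL_P4*vsolv[j]*ccf/(rsq*rsq);
 *   work[j] += STILL_P4*vsolv[i]*ccf/(rsq*rsq);
 *
 * and dadx records the corresponding (4*ccf-dccf)/r^6 terms for the chain rule. */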
169 for(k=nj0;k<nj1-4-offset;k+=8)
171 jnrA = jjnr[k];
172 jnrB = jjnr[k+1];
173 jnrC = jjnr[k+2];
174 jnrD = jjnr[k+3];
175 jnrE = jjnr[k+4];
176 jnrF = jjnr[k+5];
177 jnrG = jjnr[k+6];
178 jnrH = jjnr[k+7];
180 j3A = 3*jnrA;
181 j3B = 3*jnrB;
182 j3C = 3*jnrC;
183 j3D = 3*jnrD;
184 j3E = 3*jnrE;
185 j3F = 3*jnrF;
186 j3G = 3*jnrG;
187 j3H = 3*jnrH;
189 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A,x+j3B,x+j3C,x+j3D,jx,jy,jz);
190 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E,x+j3F,x+j3G,x+j3H,jxB,jyB,jzB);
192 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,gb_radius+jnrD,raj);
193 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE,gb_radius+jnrF,gb_radius+jnrG,gb_radius+jnrH,rajB);
194 GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA,vsolv+jnrB,vsolv+jnrC,vsolv+jnrD,vaj);
195 GMX_MM_LOAD_4VALUES_PS(vsolv+jnrE,vsolv+jnrF,vsolv+jnrG,vsolv+jnrH,vajB);
197 dx = _mm_sub_ps(ix,jx);
198 dy = _mm_sub_ps(iy,jy);
199 dz = _mm_sub_ps(iz,jz);
200 dxB = _mm_sub_ps(ix,jxB);
201 dyB = _mm_sub_ps(iy,jyB);
202 dzB = _mm_sub_ps(iz,jzB);
204 rsq = gmx_mm_calc_rsq_ps(dx,dy,dz);
205 rsqB = gmx_mm_calc_rsq_ps(dxB,dyB,dzB);
206 rinv = gmx_mm_invsqrt_ps(rsq);
207 rinvB = gmx_mm_invsqrt_ps(rsqB);
208 rinv2 = _mm_mul_ps(rinv,rinv);
209 rinv2B = _mm_mul_ps(rinvB,rinvB);
210 rinv4 = _mm_mul_ps(rinv2,rinv2);
211 rinv4B = _mm_mul_ps(rinv2B,rinv2B);
212 rinv6 = _mm_mul_ps(rinv4,rinv2);
213 rinv6B = _mm_mul_ps(rinv4B,rinv2B);
215 rvdw = _mm_add_ps(rai,raj);
216 rvdwB = _mm_add_ps(rai,rajB);
217 ratio = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw,rvdw)));
218 ratioB = _mm_mul_ps(rsqB, gmx_mm_inv_ps( _mm_mul_ps(rvdwB,rvdwB)));
220 mask_cmp = _mm_cmple_ps(ratio,still_p5inv);
221 mask_cmpB = _mm_cmple_ps(ratioB,still_p5inv);
223 /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
224 if( 0 == _mm_movemask_ps(mask_cmp) )
226 /* if ratio>still_p5inv for ALL elements */
227 ccf = one;
228 dccf = _mm_setzero_ps();
230 else
232 ratio = _mm_min_ps(ratio,still_p5inv);
233 theta = _mm_mul_ps(ratio,still_pip5);
234 GMX_MM_SINCOS_PS(theta,sinq,cosq);
235 term = _mm_mul_ps(half,_mm_sub_ps(one,cosq));
236 ccf = _mm_mul_ps(term,term);
237 dccf = _mm_mul_ps(_mm_mul_ps(two,term),
238 _mm_mul_ps(sinq,theta));
240 if( 0 == _mm_movemask_ps(mask_cmpB) )
242 /* if ratio>still_p5inv for ALL elements */
243 ccfB = one;
244 dccfB = _mm_setzero_ps();
246 else
248 ratioB = _mm_min_ps(ratioB,still_p5inv);
249 thetaB = _mm_mul_ps(ratioB,still_pip5);
250 GMX_MM_SINCOS_PS(thetaB,sinqB,cosqB);
251 termB = _mm_mul_ps(half,_mm_sub_ps(one,cosqB));
252 ccfB = _mm_mul_ps(termB,termB);
253 dccfB = _mm_mul_ps(_mm_mul_ps(two,termB),
254 _mm_mul_ps(sinqB,thetaB));
257 prod = _mm_mul_ps(still_p4,vaj);
258 prodB = _mm_mul_ps(still_p4,vajB);
259 icf4 = _mm_mul_ps(ccf,rinv4);
260 icf4B = _mm_mul_ps(ccfB,rinv4B);
261 icf6 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four,ccf),dccf), rinv6);
262 icf6B = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four,ccfB),dccfB), rinv6B);
264 GMX_MM_INCREMENT_4VALUES_PS(work+jnrA,work+jnrB,work+jnrC,work+jnrD,_mm_mul_ps(prod_ai,icf4));
265 GMX_MM_INCREMENT_4VALUES_PS(work+jnrE,work+jnrF,work+jnrG,work+jnrH,_mm_mul_ps(prod_ai,icf4B));
267 gpi = _mm_add_ps(gpi, _mm_add_ps( _mm_mul_ps(prod,icf4) , _mm_mul_ps(prodB,icf4B) ) );
269 _mm_store_ps(dadx,_mm_mul_ps(prod,icf6));
270 dadx+=4;
271 _mm_store_ps(dadx,_mm_mul_ps(prod_ai,icf6));
272 dadx+=4;
273 _mm_store_ps(dadx,_mm_mul_ps(prodB,icf6B));
274 dadx+=4;
275 _mm_store_ps(dadx,_mm_mul_ps(prod_ai,icf6B));
276 dadx+=4;
279 for(;k<nj1-offset;k+=4)
281 jnrA = jjnr[k];
282 jnrB = jjnr[k+1];
283 jnrC = jjnr[k+2];
284 jnrD = jjnr[k+3];
286 j3A = 3*jnrA;
287 j3B = 3*jnrB;
288 j3C = 3*jnrC;
289 j3D = 3*jnrD;
291 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A,x+j3B,x+j3C,x+j3D,jx,jy,jz);
293 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,gb_radius+jnrD,raj);
294 GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA,vsolv+jnrB,vsolv+jnrC,vsolv+jnrD,vaj);
296 dx = _mm_sub_ps(ix,jx);
297 dy = _mm_sub_ps(iy,jy);
298 dz = _mm_sub_ps(iz,jz);
300 rsq = gmx_mm_calc_rsq_ps(dx,dy,dz);
301 rinv = gmx_mm_invsqrt_ps(rsq);
302 rinv2 = _mm_mul_ps(rinv,rinv);
303 rinv4 = _mm_mul_ps(rinv2,rinv2);
304 rinv6 = _mm_mul_ps(rinv4,rinv2);
306 rvdw = _mm_add_ps(rai,raj);
307 ratio = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw,rvdw)));
309 mask_cmp = _mm_cmple_ps(ratio,still_p5inv);
311 /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
312 if(0 == _mm_movemask_ps(mask_cmp))
314 /* if ratio>still_p5inv for ALL elements */
315 ccf = one;
316 dccf = _mm_setzero_ps();
318 else
320 ratio = _mm_min_ps(ratio,still_p5inv);
321 theta = _mm_mul_ps(ratio,still_pip5);
322 GMX_MM_SINCOS_PS(theta,sinq,cosq);
323 term = _mm_mul_ps(half,_mm_sub_ps(one,cosq));
324 ccf = _mm_mul_ps(term,term);
325 dccf = _mm_mul_ps(_mm_mul_ps(two,term),
326 _mm_mul_ps(sinq,theta));
329 prod = _mm_mul_ps(still_p4,vaj);
330 icf4 = _mm_mul_ps(ccf,rinv4);
331 icf6 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four,ccf),dccf), rinv6);
334 GMX_MM_INCREMENT_4VALUES_PS(work+jnrA,work+jnrB,work+jnrC,work+jnrD,_mm_mul_ps(prod_ai,icf4));
336 gpi = _mm_add_ps(gpi, _mm_mul_ps(prod,icf4));
338 _mm_store_ps(dadx,_mm_mul_ps(prod,icf6));
339 dadx+=4;
340 _mm_store_ps(dadx,_mm_mul_ps(prod_ai,icf6));
341 dadx+=4;
344 if(offset!=0)
346 if(offset==1)
348 jnrA = jjnr[k];
349 j3A = 3*jnrA;
350 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A,jx,jy,jz);
351 GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA,raj);
352 GMX_MM_LOAD_1VALUE_PS(vsolv+jnrA,vaj);
353 mask = mask1;
355 else if(offset==2)
357 jnrA = jjnr[k];
358 jnrB = jjnr[k+1];
359 j3A = 3*jnrA;
360 j3B = 3*jnrB;
361 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A,x+j3B,jx,jy,jz);
362 GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,raj);
363 GMX_MM_LOAD_2VALUES_PS(vsolv+jnrA,vsolv+jnrB,vaj);
364 mask = mask2;
366 else
368 /* offset must be 3 */
369 jnrA = jjnr[k];
370 jnrB = jjnr[k+1];
371 jnrC = jjnr[k+2];
372 j3A = 3*jnrA;
373 j3B = 3*jnrB;
374 j3C = 3*jnrC;
375 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A,x+j3B,x+j3C,jx,jy,jz);
376 GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,raj);
377 GMX_MM_LOAD_3VALUES_PS(vsolv+jnrA,vsolv+jnrB,vsolv+jnrC,vaj);
378 mask = mask3;
381 dx = _mm_sub_ps(ix,jx);
382 dy = _mm_sub_ps(iy,jy);
383 dz = _mm_sub_ps(iz,jz);
385 rsq = gmx_mm_calc_rsq_ps(dx,dy,dz);
386 rinv = gmx_mm_invsqrt_ps(rsq);
387 rinv2 = _mm_mul_ps(rinv,rinv);
388 rinv4 = _mm_mul_ps(rinv2,rinv2);
389 rinv6 = _mm_mul_ps(rinv4,rinv2);
391 rvdw = _mm_add_ps(rai,raj);
392 ratio = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw,rvdw)));
394 mask_cmp = _mm_cmple_ps(ratio,still_p5inv);
396 if(0 == _mm_movemask_ps(mask_cmp))
398 /* if ratio>still_p5inv for ALL elements */
399 ccf = one;
400 dccf = _mm_setzero_ps();
402 else
404 ratio = _mm_min_ps(ratio,still_p5inv);
405 theta = _mm_mul_ps(ratio,still_pip5);
406 GMX_MM_SINCOS_PS(theta,sinq,cosq);
407 term = _mm_mul_ps(half,_mm_sub_ps(one,cosq));
408 ccf = _mm_mul_ps(term,term);
409 dccf = _mm_mul_ps(_mm_mul_ps(two,term),
410 _mm_mul_ps(sinq,theta));
413 prod = _mm_mul_ps(still_p4,vaj);
414 icf4 = _mm_mul_ps(ccf,rinv4);
415 icf6 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four,ccf),dccf), rinv6);
417 gpi = _mm_add_ps(gpi, _mm_mul_ps(prod,icf4));
419 _mm_store_ps(dadx,_mm_mul_ps(prod,icf6));
420 dadx+=4;
421 _mm_store_ps(dadx,_mm_mul_ps(prod_ai,icf6));
422 dadx+=4;
424 tmp = _mm_mul_ps(prod_ai,icf4);
426 if(offset==1)
428 GMX_MM_INCREMENT_1VALUE_PS(work+jnrA,tmp);
430 else if(offset==2)
432 GMX_MM_INCREMENT_2VALUES_PS(work+jnrA,work+jnrB,tmp);
434 else
436 /* offset must be 3 */
437 GMX_MM_INCREMENT_3VALUES_PS(work+jnrA,work+jnrB,work+jnrC,tmp);
440 gmx_mm_update_1pot_ps(gpi,work+ii);
443 /* Sum up the polarization energy from other nodes */
444 if(PARTDECOMP(cr))
446 gmx_sum(natoms, work, cr);
448 else if(DOMAINDECOMP(cr))
450 dd_atom_sum_real(cr->dd, work);
453 /* Compute the radii */
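/* With factor = 0.5*ONE_4PI_EPS0 set above, this reduces to
 * bRad[i] = factor/|gpol[i] + work[i]|; invsqrta caches 1/sqrt(bRad[i])
 * for the GB interaction kernels. */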
454 for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
456 if(born->use[i] != 0)
458 gpi_ai = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai*/
459 gpi2 = gpi_ai * gpi_ai;
460 born->bRad[i] = factor*gmx_invsqrt(gpi2);
461 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
465 /* Extra (local) communication required for DD */
466 if(DOMAINDECOMP(cr))
468 dd_atom_spread_real(cr->dd, born->bRad);
469 dd_atom_spread_real(cr->dd, fr->invsqrta);
472 return 0;
476 int
477 calc_gb_rad_hct_obc_sse(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
478 const t_atomtypes *atype, float *x, t_nblist *nl, gmx_genborn_t *born,t_mdatoms *md,int gb_algorithm)
480 int i,ai,k,n,ii,ii3,is3,nj0,nj1,at0,at1,offset;
481 int jnrA,jnrB,jnrC,jnrD;
482 int j3A,j3B,j3C,j3D;
483 int jnrE,jnrF,jnrG,jnrH;
484 int j3E,j3F,j3G,j3H;
485 float shX,shY,shZ;
486 float rr,rr_inv,rr_inv2,sum_tmp,sum,sum2,sum3,gbr;
487 float sum_ai2, sum_ai3,tsum,tchain,doffset;
488 float *obc_param;
489 float *gb_radius;
490 float *work;
491 int * jjnr;
492 float *dadx;
493 float *shiftvec;
494 float min_rad,rad;
496 __m128 ix,iy,iz,jx,jy,jz;
497 __m128 dx,dy,dz,t1,t2,t3,t4;
498 __m128 rsq,rinv,r;
499 __m128 rai,rai_inv,raj, raj_inv,rai_inv2,sk,sk2,lij,dlij,duij;
500 __m128 uij,lij2,uij2,lij3,uij3,diff2;
501 __m128 lij_inv,sk2_inv,prod,log_term,tmp,tmp_sum;
502 __m128 sum_ai, tmp_ai,sk_ai,sk_aj,sk2_ai,sk2_aj,sk2_rinv;
503 __m128 dadx1,dadx2;
504 __m128 logterm;
505 __m128 mask;
506 __m128 obc_mask1,obc_mask2,obc_mask3;
507 __m128 jxB,jyB,jzB,t1B,t2B,t3B,t4B;
508 __m128 dxB,dyB,dzB,rsqB,rinvB,rB;
509 __m128 rajB, raj_invB,rai_inv2B,sk2B,lijB,dlijB,duijB;
510 __m128 uijB,lij2B,uij2B,lij3B,uij3B,diff2B;
511 __m128 lij_invB,sk2_invB,prodB;
512 __m128 sk_ajB,sk2_ajB,sk2_rinvB;
513 __m128 dadx1B,dadx2B;
514 __m128 logtermB;
515 __m128 obc_mask1B,obc_mask2B,obc_mask3B;
517 __m128 mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
518 __m128 mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
519 __m128 mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
521 __m128 oneeighth = _mm_set1_ps(0.125);
522 __m128 onefourth = _mm_set1_ps(0.25);
524 const __m128 neg = {-1.0f , -1.0f , -1.0f , -1.0f };
525 const __m128 zero = {0.0f , 0.0f , 0.0f , 0.0f };
526 const __m128 half = {0.5f , 0.5f , 0.5f , 0.5f };
527 const __m128 one = {1.0f , 1.0f , 1.0f , 1.0f };
528 const __m128 two = {2.0f , 2.0f , 2.0f , 2.0f };
529 const __m128 three = {3.0f , 3.0f , 3.0f , 3.0f };
531 /* Set the dielectric offset */
532 doffset = born->gb_doffset;
533 gb_radius = born->gb_radius;
534 obc_param = born->param;
535 work = born->gpol_hct_work;
536 jjnr = nl->jjnr;
537 dadx = fr->dadx;
538 shiftvec = fr->shift_vec[0];
540 jx = _mm_setzero_ps();
541 jy = _mm_setzero_ps();
542 jz = _mm_setzero_ps();
544 jnrA = jnrB = jnrC = jnrD = 0;
546 for(i=0;i<born->nr;i++)
548 work[i] = 0;
551 for(i=0;i<nl->nri;i++)
553 ii = nl->iinr[i];
554 ii3 = ii*3;
555 is3 = 3*nl->shift[i];
556 shX = shiftvec[is3];
557 shY = shiftvec[is3+1];
558 shZ = shiftvec[is3+2];
559 nj0 = nl->jindex[i];
560 nj1 = nl->jindex[i+1];
562 ix = _mm_set1_ps(shX+x[ii3+0]);
563 iy = _mm_set1_ps(shY+x[ii3+1]);
564 iz = _mm_set1_ps(shZ+x[ii3+2]);
566 offset = (nj1-nj0)%4;
568 rai = _mm_load1_ps(gb_radius+ii);
569 rai_inv= gmx_mm_inv_ps(rai);
571 sum_ai = _mm_setzero_ps();
573 sk_ai = _mm_load1_ps(born->param+ii);
574 sk2_ai = _mm_mul_ps(sk_ai,sk_ai);
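/* Rough scalar form of the HCT/OBC pair descreening term evaluated in the loops
 * below (only counted when rai < r + sk_aj; the SSE code replaces the branches
 * with compare masks):
 *
 *   uij  = 1.0/(r + sk_aj);
 *   lij  = (rai < r - sk_aj) ? 1.0/(r - sk_aj) : 1.0/rai;
 *   term = lij - uij
 *        + 0.25*(uij*uij - lij*lij)*(r - sk_aj*sk_aj/r)
 *        + 0.5*log(uij/lij)/r
 *        + ((rai < sk_aj - r) ? 2.0*(1.0/rai - lij) : 0.0);
 *   sum_ai += 0.5*term;
 *
 * The symmetric ai->aj contribution is accumulated into work[aj], and dadx stores
 * the r-derivatives of these terms for the chain-rule pass. */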
576 for(k=nj0;k<nj1-4-offset;k+=8)
578 jnrA = jjnr[k];
579 jnrB = jjnr[k+1];
580 jnrC = jjnr[k+2];
581 jnrD = jjnr[k+3];
582 jnrE = jjnr[k+4];
583 jnrF = jjnr[k+5];
584 jnrG = jjnr[k+6];
585 jnrH = jjnr[k+7];
587 j3A = 3*jnrA;
588 j3B = 3*jnrB;
589 j3C = 3*jnrC;
590 j3D = 3*jnrD;
591 j3E = 3*jnrE;
592 j3F = 3*jnrF;
593 j3G = 3*jnrG;
594 j3H = 3*jnrH;
596 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A,x+j3B,x+j3C,x+j3D,jx,jy,jz);
597 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E,x+j3F,x+j3G,x+j3H,jxB,jyB,jzB);
598 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,gb_radius+jnrD,raj);
599 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE,gb_radius+jnrF,gb_radius+jnrG,gb_radius+jnrH,rajB);
600 GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA,obc_param+jnrB,obc_param+jnrC,obc_param+jnrD,sk_aj);
601 GMX_MM_LOAD_4VALUES_PS(obc_param+jnrE,obc_param+jnrF,obc_param+jnrG,obc_param+jnrH,sk_ajB);
603 dx = _mm_sub_ps(ix, jx);
604 dy = _mm_sub_ps(iy, jy);
605 dz = _mm_sub_ps(iz, jz);
606 dxB = _mm_sub_ps(ix, jxB);
607 dyB = _mm_sub_ps(iy, jyB);
608 dzB = _mm_sub_ps(iz, jzB);
610 rsq = gmx_mm_calc_rsq_ps(dx,dy,dz);
611 rsqB = gmx_mm_calc_rsq_ps(dxB,dyB,dzB);
613 rinv = gmx_mm_invsqrt_ps(rsq);
614 r = _mm_mul_ps(rsq,rinv);
615 rinvB = gmx_mm_invsqrt_ps(rsqB);
616 rB = _mm_mul_ps(rsqB,rinvB);
618 /* Compute raj_inv aj1-4 */
619 raj_inv = gmx_mm_inv_ps(raj);
620 raj_invB = gmx_mm_inv_ps(rajB);
622 /* Evaluate influence of atom aj -> ai */
623 t1 = _mm_add_ps(r,sk_aj);
624 t2 = _mm_sub_ps(r,sk_aj);
625 t3 = _mm_sub_ps(sk_aj,r);
626 t1B = _mm_add_ps(rB,sk_ajB);
627 t2B = _mm_sub_ps(rB,sk_ajB);
628 t3B = _mm_sub_ps(sk_ajB,rB);
629 obc_mask1 = _mm_cmplt_ps(rai, t1);
630 obc_mask2 = _mm_cmplt_ps(rai, t2);
631 obc_mask3 = _mm_cmplt_ps(rai, t3);
632 obc_mask1B = _mm_cmplt_ps(rai, t1B);
633 obc_mask2B = _mm_cmplt_ps(rai, t2B);
634 obc_mask3B = _mm_cmplt_ps(rai, t3B);
636 uij = gmx_mm_inv_ps(t1);
637 lij = _mm_or_ps( _mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
638 _mm_andnot_ps(obc_mask2,rai_inv));
639 dlij = _mm_and_ps(one,obc_mask2);
640 uij2 = _mm_mul_ps(uij, uij);
641 uij3 = _mm_mul_ps(uij2,uij);
642 lij2 = _mm_mul_ps(lij, lij);
643 lij3 = _mm_mul_ps(lij2,lij);
645 uijB = gmx_mm_inv_ps(t1B);
646 lijB = _mm_or_ps( _mm_and_ps(obc_mask2B,gmx_mm_inv_ps(t2B)),
647 _mm_andnot_ps(obc_mask2B,rai_inv));
648 dlijB = _mm_and_ps(one,obc_mask2B);
649 uij2B = _mm_mul_ps(uijB, uijB);
650 uij3B = _mm_mul_ps(uij2B,uijB);
651 lij2B = _mm_mul_ps(lijB, lijB);
652 lij3B = _mm_mul_ps(lij2B,lijB);
654 diff2 = _mm_sub_ps(uij2,lij2);
655 lij_inv = gmx_mm_invsqrt_ps(lij2);
656 sk2_aj = _mm_mul_ps(sk_aj,sk_aj);
657 sk2_rinv = _mm_mul_ps(sk2_aj,rinv);
658 prod = _mm_mul_ps(onefourth,sk2_rinv);
660 diff2B = _mm_sub_ps(uij2B,lij2B);
661 lij_invB = gmx_mm_invsqrt_ps(lij2B);
662 sk2_ajB = _mm_mul_ps(sk_ajB,sk_ajB);
663 sk2_rinvB = _mm_mul_ps(sk2_ajB,rinvB);
664 prodB = _mm_mul_ps(onefourth,sk2_rinvB);
666 logterm = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
667 logtermB = gmx_mm_log_ps(_mm_mul_ps(uijB,lij_invB));
669 t1 = _mm_sub_ps(lij,uij);
670 t2 = _mm_mul_ps(diff2,
671 _mm_sub_ps(_mm_mul_ps(onefourth,r),
672 prod));
673 t3 = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
674 t1 = _mm_add_ps(t1,_mm_add_ps(t2,t3));
675 t4 = _mm_mul_ps(two,_mm_sub_ps(rai_inv,lij));
676 t4 = _mm_and_ps(t4,obc_mask3);
677 t1 = _mm_mul_ps(half,_mm_add_ps(t1,t4));
679 t1B = _mm_sub_ps(lijB,uijB);
680 t2B = _mm_mul_ps(diff2B,
681 _mm_sub_ps(_mm_mul_ps(onefourth,rB),
682 prodB));
683 t3B = _mm_mul_ps(half,_mm_mul_ps(rinvB,logtermB));
684 t1B = _mm_add_ps(t1B,_mm_add_ps(t2B,t3B));
685 t4B = _mm_mul_ps(two,_mm_sub_ps(rai_inv,lijB));
686 t4B = _mm_and_ps(t4B,obc_mask3B);
687 t1B = _mm_mul_ps(half,_mm_add_ps(t1B,t4B));
689 sum_ai = _mm_add_ps(sum_ai, _mm_add_ps( _mm_and_ps(t1,obc_mask1), _mm_and_ps(t1B,obc_mask1B) ));
691 t1 = _mm_add_ps(_mm_mul_ps(half,lij2),
692 _mm_mul_ps(prod,lij3));
693 t1 = _mm_sub_ps(t1,
694 _mm_mul_ps(onefourth,
695 _mm_add_ps(_mm_mul_ps(lij,rinv),
696 _mm_mul_ps(lij3,r))));
697 t2 = _mm_mul_ps(onefourth,
698 _mm_add_ps(_mm_mul_ps(uij,rinv),
699 _mm_mul_ps(uij3,r)));
700 t2 = _mm_sub_ps(t2,
701 _mm_add_ps(_mm_mul_ps(half,uij2),
702 _mm_mul_ps(prod,uij3)));
703 t3 = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
704 _mm_mul_ps(rinv,rinv));
705 t3 = _mm_sub_ps(t3,
706 _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
707 _mm_add_ps(one,
708 _mm_mul_ps(sk2_rinv,rinv))));
709 t1 = _mm_mul_ps(rinv,
710 _mm_add_ps(_mm_mul_ps(dlij,t1),
711 _mm_add_ps(t2,t3)));
715 t1B = _mm_add_ps(_mm_mul_ps(half,lij2B),
716 _mm_mul_ps(prodB,lij3B));
717 t1B = _mm_sub_ps(t1B,
718 _mm_mul_ps(onefourth,
719 _mm_add_ps(_mm_mul_ps(lijB,rinvB),
720 _mm_mul_ps(lij3B,rB))));
721 t2B = _mm_mul_ps(onefourth,
722 _mm_add_ps(_mm_mul_ps(uijB,rinvB),
723 _mm_mul_ps(uij3B,rB)));
724 t2B = _mm_sub_ps(t2B,
725 _mm_add_ps(_mm_mul_ps(half,uij2B),
726 _mm_mul_ps(prodB,uij3B)));
727 t3B = _mm_mul_ps(_mm_mul_ps(onefourth,logtermB),
728 _mm_mul_ps(rinvB,rinvB));
729 t3B = _mm_sub_ps(t3B,
730 _mm_mul_ps(_mm_mul_ps(diff2B,oneeighth),
731 _mm_add_ps(one,
732 _mm_mul_ps(sk2_rinvB,rinvB))));
733 t1B = _mm_mul_ps(rinvB,
734 _mm_add_ps(_mm_mul_ps(dlijB,t1B),
735 _mm_add_ps(t2B,t3B)));
737 dadx1 = _mm_and_ps(t1,obc_mask1);
738 dadx1B = _mm_and_ps(t1B,obc_mask1B);
741 /* Evaluate influence of atom ai -> aj */
742 t1 = _mm_add_ps(r,sk_ai);
743 t2 = _mm_sub_ps(r,sk_ai);
744 t3 = _mm_sub_ps(sk_ai,r);
745 t1B = _mm_add_ps(rB,sk_ai);
746 t2B = _mm_sub_ps(rB,sk_ai);
747 t3B = _mm_sub_ps(sk_ai,rB);
748 obc_mask1 = _mm_cmplt_ps(raj, t1);
749 obc_mask2 = _mm_cmplt_ps(raj, t2);
750 obc_mask3 = _mm_cmplt_ps(raj, t3);
751 obc_mask1B = _mm_cmplt_ps(rajB, t1B);
752 obc_mask2B = _mm_cmplt_ps(rajB, t2B);
753 obc_mask3B = _mm_cmplt_ps(rajB, t3B);
755 uij = gmx_mm_inv_ps(t1);
756 lij = _mm_or_ps( _mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
757 _mm_andnot_ps(obc_mask2,raj_inv));
758 dlij = _mm_and_ps(one,obc_mask2);
759 uij2 = _mm_mul_ps(uij, uij);
760 uij3 = _mm_mul_ps(uij2,uij);
761 lij2 = _mm_mul_ps(lij, lij);
762 lij3 = _mm_mul_ps(lij2,lij);
764 uijB = gmx_mm_inv_ps(t1B);
765 lijB = _mm_or_ps( _mm_and_ps(obc_mask2B,gmx_mm_inv_ps(t2B)),
766 _mm_andnot_ps(obc_mask2B,raj_invB));
767 dlijB = _mm_and_ps(one,obc_mask2B);
768 uij2B = _mm_mul_ps(uijB, uijB);
769 uij3B = _mm_mul_ps(uij2B,uijB);
770 lij2B = _mm_mul_ps(lijB, lijB);
771 lij3B = _mm_mul_ps(lij2B,lijB);
773 diff2 = _mm_sub_ps(uij2,lij2);
774 lij_inv = gmx_mm_invsqrt_ps(lij2);
775 sk2_rinv = _mm_mul_ps(sk2_ai,rinv);
776 prod = _mm_mul_ps(onefourth,sk2_rinv);
778 diff2B = _mm_sub_ps(uij2B,lij2B);
779 lij_invB = gmx_mm_invsqrt_ps(lij2B);
780 sk2_rinvB = _mm_mul_ps(sk2_ai,rinvB);
781 prodB = _mm_mul_ps(onefourth,sk2_rinvB);
783 logterm = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
784 logtermB = gmx_mm_log_ps(_mm_mul_ps(uijB,lij_invB));
786 t1 = _mm_sub_ps(lij,uij);
787 t2 = _mm_mul_ps(diff2,
788 _mm_sub_ps(_mm_mul_ps(onefourth,r),
789 prod));
790 t3 = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
791 t1 = _mm_add_ps(t1,_mm_add_ps(t2,t3));
792 t4 = _mm_mul_ps(two,_mm_sub_ps(raj_inv,lij));
793 t4 = _mm_and_ps(t4,obc_mask3);
794 t1 = _mm_mul_ps(half,_mm_add_ps(t1,t4));
796 t1B = _mm_sub_ps(lijB,uijB);
797 t2B = _mm_mul_ps(diff2B,
798 _mm_sub_ps(_mm_mul_ps(onefourth,rB),
799 prodB));
800 t3B = _mm_mul_ps(half,_mm_mul_ps(rinvB,logtermB));
801 t1B = _mm_add_ps(t1B,_mm_add_ps(t2B,t3B));
802 t4B = _mm_mul_ps(two,_mm_sub_ps(raj_invB,lijB));
803 t4B = _mm_and_ps(t4B,obc_mask3B);
804 t1B = _mm_mul_ps(half,_mm_add_ps(t1B,t4B));
806 GMX_MM_INCREMENT_4VALUES_PS(work+jnrA,work+jnrB,work+jnrC,work+jnrD,_mm_and_ps(t1,obc_mask1));
807 GMX_MM_INCREMENT_4VALUES_PS(work+jnrE,work+jnrF,work+jnrG,work+jnrH,_mm_and_ps(t1B,obc_mask1B));
809 t1 = _mm_add_ps(_mm_mul_ps(half,lij2),
810 _mm_mul_ps(prod,lij3));
811 t1 = _mm_sub_ps(t1,
812 _mm_mul_ps(onefourth,
813 _mm_add_ps(_mm_mul_ps(lij,rinv),
814 _mm_mul_ps(lij3,r))));
815 t2 = _mm_mul_ps(onefourth,
816 _mm_add_ps(_mm_mul_ps(uij,rinv),
817 _mm_mul_ps(uij3,r)));
818 t2 = _mm_sub_ps(t2,
819 _mm_add_ps(_mm_mul_ps(half,uij2),
820 _mm_mul_ps(prod,uij3)));
821 t3 = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
822 _mm_mul_ps(rinv,rinv));
823 t3 = _mm_sub_ps(t3,
824 _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
825 _mm_add_ps(one,
826 _mm_mul_ps(sk2_rinv,rinv))));
827 t1 = _mm_mul_ps(rinv,
828 _mm_add_ps(_mm_mul_ps(dlij,t1),
829 _mm_add_ps(t2,t3)));
832 t1B = _mm_add_ps(_mm_mul_ps(half,lij2B),
833 _mm_mul_ps(prodB,lij3B));
834 t1B = _mm_sub_ps(t1B,
835 _mm_mul_ps(onefourth,
836 _mm_add_ps(_mm_mul_ps(lijB,rinvB),
837 _mm_mul_ps(lij3B,rB))));
838 t2B = _mm_mul_ps(onefourth,
839 _mm_add_ps(_mm_mul_ps(uijB,rinvB),
840 _mm_mul_ps(uij3B,rB)));
841 t2B = _mm_sub_ps(t2B,
842 _mm_add_ps(_mm_mul_ps(half,uij2B),
843 _mm_mul_ps(prodB,uij3B)));
844 t3B = _mm_mul_ps(_mm_mul_ps(onefourth,logtermB),
845 _mm_mul_ps(rinvB,rinvB));
846 t3B = _mm_sub_ps(t3B,
847 _mm_mul_ps(_mm_mul_ps(diff2B,oneeighth),
848 _mm_add_ps(one,
849 _mm_mul_ps(sk2_rinvB,rinvB))));
850 t1B = _mm_mul_ps(rinvB,
851 _mm_add_ps(_mm_mul_ps(dlijB,t1B),
852 _mm_add_ps(t2B,t3B)));
855 dadx2 = _mm_and_ps(t1,obc_mask1);
856 dadx2B = _mm_and_ps(t1B,obc_mask1B);
858 _mm_store_ps(dadx,dadx1);
859 dadx += 4;
860 _mm_store_ps(dadx,dadx2);
861 dadx += 4;
862 _mm_store_ps(dadx,dadx1B);
863 dadx += 4;
864 _mm_store_ps(dadx,dadx2B);
865 dadx += 4;
867 } /* end normal inner loop */
869 for(;k<nj1-offset;k+=4)
871 jnrA = jjnr[k];
872 jnrB = jjnr[k+1];
873 jnrC = jjnr[k+2];
874 jnrD = jjnr[k+3];
876 j3A = 3*jnrA;
877 j3B = 3*jnrB;
878 j3C = 3*jnrC;
879 j3D = 3*jnrD;
881 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A,x+j3B,x+j3C,x+j3D,jx,jy,jz);
882 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,gb_radius+jnrD,raj);
883 GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA,obc_param+jnrB,obc_param+jnrC,obc_param+jnrD,sk_aj);
885 dx = _mm_sub_ps(ix, jx);
886 dy = _mm_sub_ps(iy, jy);
887 dz = _mm_sub_ps(iz, jz);
889 rsq = gmx_mm_calc_rsq_ps(dx,dy,dz);
891 rinv = gmx_mm_invsqrt_ps(rsq);
892 r = _mm_mul_ps(rsq,rinv);
894 /* Compute raj_inv aj1-4 */
895 raj_inv = gmx_mm_inv_ps(raj);
897 /* Evaluate influence of atom aj -> ai */
898 t1 = _mm_add_ps(r,sk_aj);
899 obc_mask1 = _mm_cmplt_ps(rai, t1);
901 if(_mm_movemask_ps(obc_mask1))
903 /* If any of the elements has rai<dr+sk, this is executed */
904 t2 = _mm_sub_ps(r,sk_aj);
905 t3 = _mm_sub_ps(sk_aj,r);
907 obc_mask2 = _mm_cmplt_ps(rai, t2);
908 obc_mask3 = _mm_cmplt_ps(rai, t3);
910 uij = gmx_mm_inv_ps(t1);
911 lij = _mm_or_ps( _mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
912 _mm_andnot_ps(obc_mask2,rai_inv));
913 dlij = _mm_and_ps(one,obc_mask2);
914 uij2 = _mm_mul_ps(uij, uij);
915 uij3 = _mm_mul_ps(uij2,uij);
916 lij2 = _mm_mul_ps(lij, lij);
917 lij3 = _mm_mul_ps(lij2,lij);
918 diff2 = _mm_sub_ps(uij2,lij2);
919 lij_inv = gmx_mm_invsqrt_ps(lij2);
920 sk2_aj = _mm_mul_ps(sk_aj,sk_aj);
921 sk2_rinv = _mm_mul_ps(sk2_aj,rinv);
922 prod = _mm_mul_ps(onefourth,sk2_rinv);
923 logterm = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
924 t1 = _mm_sub_ps(lij,uij);
925 t2 = _mm_mul_ps(diff2,
926 _mm_sub_ps(_mm_mul_ps(onefourth,r),
927 prod));
928 t3 = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
929 t1 = _mm_add_ps(t1,_mm_add_ps(t2,t3));
930 t4 = _mm_mul_ps(two,_mm_sub_ps(rai_inv,lij));
931 t4 = _mm_and_ps(t4,obc_mask3);
932 t1 = _mm_mul_ps(half,_mm_add_ps(t1,t4));
933 sum_ai = _mm_add_ps(sum_ai,_mm_and_ps(t1,obc_mask1));
934 t1 = _mm_add_ps(_mm_mul_ps(half,lij2),
935 _mm_mul_ps(prod,lij3));
936 t1 = _mm_sub_ps(t1,
937 _mm_mul_ps(onefourth,
938 _mm_add_ps(_mm_mul_ps(lij,rinv),
939 _mm_mul_ps(lij3,r))));
940 t2 = _mm_mul_ps(onefourth,
941 _mm_add_ps(_mm_mul_ps(uij,rinv),
942 _mm_mul_ps(uij3,r)));
943 t2 = _mm_sub_ps(t2,
944 _mm_add_ps(_mm_mul_ps(half,uij2),
945 _mm_mul_ps(prod,uij3)));
946 t3 = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
947 _mm_mul_ps(rinv,rinv));
948 t3 = _mm_sub_ps(t3,
949 _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
950 _mm_add_ps(one,
951 _mm_mul_ps(sk2_rinv,rinv))));
952 t1 = _mm_mul_ps(rinv,
953 _mm_add_ps(_mm_mul_ps(dlij,t1),
954 _mm_add_ps(t2,t3)));
956 dadx1 = _mm_and_ps(t1,obc_mask1);
958 else
960 dadx1 = _mm_setzero_ps();
963 /* Evaluate influence of atom ai -> aj */
964 t1 = _mm_add_ps(r,sk_ai);
965 obc_mask1 = _mm_cmplt_ps(raj, t1);
967 if(_mm_movemask_ps(obc_mask1))
969 t2 = _mm_sub_ps(r,sk_ai);
970 t3 = _mm_sub_ps(sk_ai,r);
971 obc_mask2 = _mm_cmplt_ps(raj, t2);
972 obc_mask3 = _mm_cmplt_ps(raj, t3);
974 uij = gmx_mm_inv_ps(t1);
975 lij = _mm_or_ps( _mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
976 _mm_andnot_ps(obc_mask2,raj_inv));
977 dlij = _mm_and_ps(one,obc_mask2);
978 uij2 = _mm_mul_ps(uij, uij);
979 uij3 = _mm_mul_ps(uij2,uij);
980 lij2 = _mm_mul_ps(lij, lij);
981 lij3 = _mm_mul_ps(lij2,lij);
982 diff2 = _mm_sub_ps(uij2,lij2);
983 lij_inv = gmx_mm_invsqrt_ps(lij2);
984 sk2_rinv = _mm_mul_ps(sk2_ai,rinv);
985 prod = _mm_mul_ps(onefourth,sk2_rinv);
986 logterm = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
987 t1 = _mm_sub_ps(lij,uij);
988 t2 = _mm_mul_ps(diff2,
989 _mm_sub_ps(_mm_mul_ps(onefourth,r),
990 prod));
991 t3 = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
992 t1 = _mm_add_ps(t1,_mm_add_ps(t2,t3));
993 t4 = _mm_mul_ps(two,_mm_sub_ps(raj_inv,lij));
994 t4 = _mm_and_ps(t4,obc_mask3);
995 t1 = _mm_mul_ps(half,_mm_add_ps(t1,t4));
997 GMX_MM_INCREMENT_4VALUES_PS(work+jnrA,work+jnrB,work+jnrC,work+jnrD,_mm_and_ps(t1,obc_mask1));
999 t1 = _mm_add_ps(_mm_mul_ps(half,lij2),
1000 _mm_mul_ps(prod,lij3));
1001 t1 = _mm_sub_ps(t1,
1002 _mm_mul_ps(onefourth,
1003 _mm_add_ps(_mm_mul_ps(lij,rinv),
1004 _mm_mul_ps(lij3,r))));
1005 t2 = _mm_mul_ps(onefourth,
1006 _mm_add_ps(_mm_mul_ps(uij,rinv),
1007 _mm_mul_ps(uij3,r)));
1008 t2 = _mm_sub_ps(t2,
1009 _mm_add_ps(_mm_mul_ps(half,uij2),
1010 _mm_mul_ps(prod,uij3)));
1011 t3 = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
1012 _mm_mul_ps(rinv,rinv));
1013 t3 = _mm_sub_ps(t3,
1014 _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
1015 _mm_add_ps(one,
1016 _mm_mul_ps(sk2_rinv,rinv))));
1017 t1 = _mm_mul_ps(rinv,
1018 _mm_add_ps(_mm_mul_ps(dlij,t1),
1019 _mm_add_ps(t2,t3)));
1020 dadx2 = _mm_and_ps(t1,obc_mask1);
1022 else
1024 dadx2 = _mm_setzero_ps();
1027 _mm_store_ps(dadx,dadx1);
1028 dadx += 4;
1029 _mm_store_ps(dadx,dadx2);
1030 dadx += 4;
1031 } /* end normal inner loop */
1033 if(offset!=0)
1035 if(offset==1)
1037 jnrA = jjnr[k];
1038 j3A = 3*jnrA;
1039 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A,jx,jy,jz);
1040 GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA,raj);
1041 GMX_MM_LOAD_1VALUE_PS(obc_param+jnrA,sk_aj);
1042 mask = mask1;
1044 else if(offset==2)
1046 jnrA = jjnr[k];
1047 jnrB = jjnr[k+1];
1048 j3A = 3*jnrA;
1049 j3B = 3*jnrB;
1050 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A,x+j3B,jx,jy,jz);
1051 GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,raj);
1052 GMX_MM_LOAD_2VALUES_PS(obc_param+jnrA,obc_param+jnrB,sk_aj);
1053 mask = mask2;
1055 else
1057 /* offset must be 3 */
1058 jnrA = jjnr[k];
1059 jnrB = jjnr[k+1];
1060 jnrC = jjnr[k+2];
1061 j3A = 3*jnrA;
1062 j3B = 3*jnrB;
1063 j3C = 3*jnrC;
1064 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A,x+j3B,x+j3C,jx,jy,jz);
1065 GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,raj);
1066 GMX_MM_LOAD_3VALUES_PS(obc_param+jnrA,obc_param+jnrB,obc_param+jnrC,sk_aj);
1067 mask = mask3;
1070 dx = _mm_sub_ps(ix, jx);
1071 dy = _mm_sub_ps(iy, jy);
1072 dz = _mm_sub_ps(iz, jz);
1074 rsq = gmx_mm_calc_rsq_ps(dx,dy,dz);
1076 rinv = gmx_mm_invsqrt_ps(rsq);
1077 r = _mm_mul_ps(rsq,rinv);
1079 /* Compute raj_inv aj1-4 */
1080 raj_inv = gmx_mm_inv_ps(raj);
1082 /* Evaluate influence of atom aj -> ai */
1083 t1 = _mm_add_ps(r,sk_aj);
1084 obc_mask1 = _mm_cmplt_ps(rai, t1);
1085 obc_mask1 = _mm_and_ps(obc_mask1,mask);
1087 if(_mm_movemask_ps(obc_mask1))
1089 t2 = _mm_sub_ps(r,sk_aj);
1090 t3 = _mm_sub_ps(sk_aj,r);
1091 obc_mask2 = _mm_cmplt_ps(rai, t2);
1092 obc_mask3 = _mm_cmplt_ps(rai, t3);
1094 uij = gmx_mm_inv_ps(t1);
1095 lij = _mm_or_ps( _mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
1096 _mm_andnot_ps(obc_mask2,rai_inv));
1097 dlij = _mm_and_ps(one,obc_mask2);
1098 uij2 = _mm_mul_ps(uij, uij);
1099 uij3 = _mm_mul_ps(uij2,uij);
1100 lij2 = _mm_mul_ps(lij, lij);
1101 lij3 = _mm_mul_ps(lij2,lij);
1102 diff2 = _mm_sub_ps(uij2,lij2);
1103 lij_inv = gmx_mm_invsqrt_ps(lij2);
1104 sk2_aj = _mm_mul_ps(sk_aj,sk_aj);
1105 sk2_rinv = _mm_mul_ps(sk2_aj,rinv);
1106 prod = _mm_mul_ps(onefourth,sk2_rinv);
1107 logterm = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
1108 t1 = _mm_sub_ps(lij,uij);
1109 t2 = _mm_mul_ps(diff2,
1110 _mm_sub_ps(_mm_mul_ps(onefourth,r),
1111 prod));
1112 t3 = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
1113 t1 = _mm_add_ps(t1,_mm_add_ps(t2,t3));
1114 t4 = _mm_mul_ps(two,_mm_sub_ps(rai_inv,lij));
1115 t4 = _mm_and_ps(t4,obc_mask3);
1116 t1 = _mm_mul_ps(half,_mm_add_ps(t1,t4));
1117 sum_ai = _mm_add_ps(sum_ai,_mm_and_ps(t1,obc_mask1));
1118 t1 = _mm_add_ps(_mm_mul_ps(half,lij2),
1119 _mm_mul_ps(prod,lij3));
1120 t1 = _mm_sub_ps(t1,
1121 _mm_mul_ps(onefourth,
1122 _mm_add_ps(_mm_mul_ps(lij,rinv),
1123 _mm_mul_ps(lij3,r))));
1124 t2 = _mm_mul_ps(onefourth,
1125 _mm_add_ps(_mm_mul_ps(uij,rinv),
1126 _mm_mul_ps(uij3,r)));
1127 t2 = _mm_sub_ps(t2,
1128 _mm_add_ps(_mm_mul_ps(half,uij2),
1129 _mm_mul_ps(prod,uij3)));
1130 t3 = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
1131 _mm_mul_ps(rinv,rinv));
1132 t3 = _mm_sub_ps(t3,
1133 _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
1134 _mm_add_ps(one,
1135 _mm_mul_ps(sk2_rinv,rinv))));
1136 t1 = _mm_mul_ps(rinv,
1137 _mm_add_ps(_mm_mul_ps(dlij,t1),
1138 _mm_add_ps(t2,t3)));
1139 dadx1 = _mm_and_ps(t1,obc_mask1);
1141 else
1143 dadx1 = _mm_setzero_ps();
1146 /* Evaluate influence of atom ai -> aj */
1147 t1 = _mm_add_ps(r,sk_ai);
1148 obc_mask1 = _mm_cmplt_ps(raj, t1);
1149 obc_mask1 = _mm_and_ps(obc_mask1,mask);
1151 if(_mm_movemask_ps(obc_mask1))
1153 t2 = _mm_sub_ps(r,sk_ai);
1154 t3 = _mm_sub_ps(sk_ai,r);
1155 obc_mask2 = _mm_cmplt_ps(raj, t2);
1156 obc_mask3 = _mm_cmplt_ps(raj, t3);
1158 uij = gmx_mm_inv_ps(t1);
1159 lij = _mm_or_ps(_mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
1160 _mm_andnot_ps(obc_mask2,raj_inv));
1161 dlij = _mm_and_ps(one,obc_mask2);
1162 uij2 = _mm_mul_ps(uij, uij);
1163 uij3 = _mm_mul_ps(uij2,uij);
1164 lij2 = _mm_mul_ps(lij, lij);
1165 lij3 = _mm_mul_ps(lij2,lij);
1166 diff2 = _mm_sub_ps(uij2,lij2);
1167 lij_inv = gmx_mm_invsqrt_ps(lij2);
1168 sk2_rinv = _mm_mul_ps(sk2_ai,rinv);
1169 prod = _mm_mul_ps(onefourth,sk2_rinv);
1170 logterm = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
1171 t1 = _mm_sub_ps(lij,uij);
1172 t2 = _mm_mul_ps(diff2,
1173 _mm_sub_ps(_mm_mul_ps(onefourth,r),
1174 prod));
1175 t3 = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
1176 t1 = _mm_add_ps(t1,_mm_add_ps(t2,t3));
1177 t4 = _mm_mul_ps(two,_mm_sub_ps(raj_inv,lij));
1178 t4 = _mm_and_ps(t4,obc_mask3);
1179 t1 = _mm_mul_ps(half,_mm_add_ps(t1,t4));
1181 tmp = _mm_and_ps(t1,obc_mask1);
1183 t1 = _mm_add_ps(_mm_mul_ps(half,lij2),
1184 _mm_mul_ps(prod,lij3));
1185 t1 = _mm_sub_ps(t1,
1186 _mm_mul_ps(onefourth,
1187 _mm_add_ps(_mm_mul_ps(lij,rinv),
1188 _mm_mul_ps(lij3,r))));
1189 t2 = _mm_mul_ps(onefourth,
1190 _mm_add_ps(_mm_mul_ps(uij,rinv),
1191 _mm_mul_ps(uij3,r)));
1192 t2 = _mm_sub_ps(t2,
1193 _mm_add_ps(_mm_mul_ps(half,uij2),
1194 _mm_mul_ps(prod,uij3)));
1195 t3 = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
1196 _mm_mul_ps(rinv,rinv));
1197 t3 = _mm_sub_ps(t3,
1198 _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
1199 _mm_add_ps(one,
1200 _mm_mul_ps(sk2_rinv,rinv))));
1201 t1 = _mm_mul_ps(rinv,
1202 _mm_add_ps(_mm_mul_ps(dlij,t1),
1203 _mm_add_ps(t2,t3)));
1204 dadx2 = _mm_and_ps(t1,obc_mask1);
1206 else
1208 dadx2 = _mm_setzero_ps();
1209 tmp = _mm_setzero_ps();
1212 _mm_store_ps(dadx,dadx1);
1213 dadx += 4;
1214 _mm_store_ps(dadx,dadx2);
1215 dadx += 4;
1217 if(offset==1)
1219 GMX_MM_INCREMENT_1VALUE_PS(work+jnrA,tmp);
1221 else if(offset==2)
1223 GMX_MM_INCREMENT_2VALUES_PS(work+jnrA,work+jnrB,tmp);
1225 else
1227 /* offset must be 3 */
1228 GMX_MM_INCREMENT_3VALUES_PS(work+jnrA,work+jnrB,work+jnrC,tmp);
1232 gmx_mm_update_1pot_ps(sum_ai,work+ii);
1236 /* Parallel summations */
1237 if(PARTDECOMP(cr))
1239 gmx_sum(natoms, work, cr);
1241 else if(DOMAINDECOMP(cr))
1243 dd_atom_sum_real(cr->dd, work);
1246 if(gb_algorithm==egbHCT)
1248 /* HCT */
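/* HCT: bRad[i] = 1/(1/(r_i - doffset) - work[i]), clamped from below by the
 * full (un-offset) atom-type radius. */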
1249 for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
1251 if(born->use[i] != 0)
1253 rr = top->atomtypes.gb_radius[md->typeA[i]]-doffset;
1254 sum = 1.0/rr - work[i];
1255 min_rad = rr + doffset;
1256 rad = 1.0/sum;
1258 born->bRad[i] = rad > min_rad ? rad : min_rad;
1259 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
1263 /* Extra communication required for DD */
1264 if(DOMAINDECOMP(cr))
1266 dd_atom_spread_real(cr->dd, born->bRad);
1267 dd_atom_spread_real(cr->dd, fr->invsqrta);
1270 else
1272 /* OBC */
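/* OBC(II) rescaling, roughly: with psi = (r_i - doffset)*work[i],
 *   1/bRad[i] = 1/(r_i - doffset) - tanh(alpha*psi - beta*psi^2 + gamma*psi^3)/r_i
 * and drobc[i] caches the tanh chain-rule factor needed when the forces are
 * propagated through the radii later on. */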
1273 for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
1275 if(born->use[i] != 0)
1277 rr = top->atomtypes.gb_radius[md->typeA[i]];
1278 rr_inv2 = 1.0/rr;
1279 rr = rr-doffset;
1280 rr_inv = 1.0/rr;
1281 sum = rr * work[i];
1282 sum2 = sum * sum;
1283 sum3 = sum2 * sum;
1285 tsum = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
1286 born->bRad[i] = rr_inv - tsum*rr_inv2;
1287 born->bRad[i] = 1.0 / born->bRad[i];
1289 fr->invsqrta[i]=gmx_invsqrt(born->bRad[i]);
1291 tchain = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
1292 born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
1295 /* Extra (local) communication required for DD */
1296 if(DOMAINDECOMP(cr))
1298 dd_atom_spread_real(cr->dd, born->bRad);
1299 dd_atom_spread_real(cr->dd, fr->invsqrta);
1300 dd_atom_spread_real(cr->dd, born->drobc);
1306 return 0;
1311 float calc_gb_chainrule_sse(int natoms, t_nblist *nl, float *dadx, float *dvda,
1312 float *x, float *f, float *fshift, float *shiftvec,
1313 int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
1315 int i,k,n,ii,jnr,ii3,is3,nj0,nj1,offset,n0,n1;
1316 int jnrA,jnrB,jnrC,jnrD;
1317 int j3A,j3B,j3C,j3D;
1318 int jnrE,jnrF,jnrG,jnrH;
1319 int j3E,j3F,j3G,j3H;
1320 int * jjnr;
1322 float rbi,shX,shY,shZ;
1323 float *rb;
1325 __m128 ix,iy,iz;
1326 __m128 jx,jy,jz;
1327 __m128 jxB,jyB,jzB;
1328 __m128 fix,fiy,fiz;
1329 __m128 dx,dy,dz;
1330 __m128 tx,ty,tz;
1331 __m128 dxB,dyB,dzB;
1332 __m128 txB,tyB,tzB;
1334 __m128 rbai,rbaj,rbajB, f_gb, f_gb_ai,f_gbB,f_gb_aiB;
1335 __m128 xmm1,xmm2,xmm3;
1337 const __m128 two = {2.0f , 2.0f , 2.0f , 2.0f };
1339 rb = born->work;
1341 jjnr = nl->jjnr;
1343 /* Loop to get the proper form for the Born radius term, sse style */
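/* rb[k] converts the accumulated dvda term into the factor that multiplies the
 * dadx chunks stored earlier: Still uses 2*bRad^2*dvda/ONE_4PI_EPS0 (undoing the
 * ONE_4PI_EPS0 factor built into the Still radii), HCT uses bRad^2*dvda, and OBC
 * additionally multiplies by drobc. */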
1344 offset=natoms%4;
1346 n0 = md->start;
1347 n1 = md->start+md->homenr+1+natoms/2;
1349 if(gb_algorithm==egbSTILL)
1351 for(i=n0;i<n1;i++)
1353 k = i % natoms;
1354 rbi = born->bRad[k];
1355 rb[k] = (2 * rbi * rbi * dvda[k])/ONE_4PI_EPS0;
1358 else if(gb_algorithm==egbHCT)
1360 for(i=n0;i<n1;i++)
1362 k = i % natoms;
1363 rbi = born->bRad[k];
1364 rb[k] = rbi * rbi * dvda[k];
1367 else if(gb_algorithm==egbOBC)
1369 for(i=n0;i<n1;i++)
1371 k = i % natoms;
1372 rbi = born->bRad[k];
1373 rb[k] = rbi * rbi * born->drobc[k] * dvda[k];
1377 jz = _mm_setzero_ps();
1379 n = j3A = j3B = j3C = j3D = 0;
1381 for(i=0;i<nl->nri;i++)
1383 ii = nl->iinr[i];
1384 ii3 = ii*3;
1385 is3 = 3*nl->shift[i];
1386 shX = shiftvec[is3];
1387 shY = shiftvec[is3+1];
1388 shZ = shiftvec[is3+2];
1389 nj0 = nl->jindex[i];
1390 nj1 = nl->jindex[i+1];
1392 ix = _mm_set1_ps(shX+x[ii3+0]);
1393 iy = _mm_set1_ps(shY+x[ii3+1]);
1394 iz = _mm_set1_ps(shZ+x[ii3+2]);
1396 offset = (nj1-nj0)%4;
1398 rbai = _mm_load1_ps(rb+ii);
1399 fix = _mm_setzero_ps();
1400 fiy = _mm_setzero_ps();
1401 fiz = _mm_setzero_ps();
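/* Inner loops: each pair force is (dadx_ij*rb[i] + dadx_ji*rb[j])*(xi - xj),
 * accumulated into fix/fiy/fiz for atom i and subtracted from the j atoms. */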
1404 for(k=nj0;k<nj1-offset;k+=4)
1406 jnrA = jjnr[k];
1407 jnrB = jjnr[k+1];
1408 jnrC = jjnr[k+2];
1409 jnrD = jjnr[k+3];
1411 j3A = 3*jnrA;
1412 j3B = 3*jnrB;
1413 j3C = 3*jnrC;
1414 j3D = 3*jnrD;
1416 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A,x+j3B,x+j3C,x+j3D,jx,jy,jz);
1418 dx = _mm_sub_ps(ix,jx);
1419 dy = _mm_sub_ps(iy,jy);
1420 dz = _mm_sub_ps(iz,jz);
1422 GMX_MM_LOAD_4VALUES_PS(rb+jnrA,rb+jnrB,rb+jnrC,rb+jnrD,rbaj);
1424 /* load chain rule terms for j1-4 */
1425 f_gb = _mm_load_ps(dadx);
1426 dadx += 4;
1427 f_gb_ai = _mm_load_ps(dadx);
1428 dadx += 4;
1430 /* calculate scalar force */
1431 f_gb = _mm_mul_ps(f_gb,rbai);
1432 f_gb_ai = _mm_mul_ps(f_gb_ai,rbaj);
1433 f_gb = _mm_add_ps(f_gb,f_gb_ai);
1435 tx = _mm_mul_ps(f_gb,dx);
1436 ty = _mm_mul_ps(f_gb,dy);
1437 tz = _mm_mul_ps(f_gb,dz);
1439 fix = _mm_add_ps(fix,tx);
1440 fiy = _mm_add_ps(fiy,ty);
1441 fiz = _mm_add_ps(fiz,tz);
1443 GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(f+j3A,f+j3B,f+j3C,f+j3D,tx,ty,tz);
1446 /*deal with odd elements */
1447 if(offset!=0)
1449 if(offset==1)
1451 jnrA = jjnr[k];
1452 j3A = 3*jnrA;
1453 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A,jx,jy,jz);
1454 GMX_MM_LOAD_1VALUE_PS(rb+jnrA,rbaj);
1456 else if(offset==2)
1458 jnrA = jjnr[k];
1459 jnrB = jjnr[k+1];
1460 j3A = 3*jnrA;
1461 j3B = 3*jnrB;
1462 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A,x+j3B,jx,jy,jz);
1463 GMX_MM_LOAD_2VALUES_PS(rb+jnrA,rb+jnrB,rbaj);
1465 else
1467 /* offset must be 3 */
1468 jnrA = jjnr[k];
1469 jnrB = jjnr[k+1];
1470 jnrC = jjnr[k+2];
1471 j3A = 3*jnrA;
1472 j3B = 3*jnrB;
1473 j3C = 3*jnrC;
1474 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A,x+j3B,x+j3C,jx,jy,jz);
1475 GMX_MM_LOAD_3VALUES_PS(rb+jnrA,rb+jnrB,rb+jnrC,rbaj);
1478 dx = _mm_sub_ps(ix,jx);
1479 dy = _mm_sub_ps(iy,jy);
1480 dz = _mm_sub_ps(iz,jz);
1482 /* load chain rule terms for j1-4 */
1483 f_gb = _mm_load_ps(dadx);
1484 dadx += 4;
1485 f_gb_ai = _mm_load_ps(dadx);
1486 dadx += 4;
1488 /* calculate scalar force */
1489 f_gb = _mm_mul_ps(f_gb,rbai);
1490 f_gb_ai = _mm_mul_ps(f_gb_ai,rbaj);
1491 f_gb = _mm_add_ps(f_gb,f_gb_ai);
1493 tx = _mm_mul_ps(f_gb,dx);
1494 ty = _mm_mul_ps(f_gb,dy);
1495 tz = _mm_mul_ps(f_gb,dz);
1497 fix = _mm_add_ps(fix,tx);
1498 fiy = _mm_add_ps(fiy,ty);
1499 fiz = _mm_add_ps(fiz,tz);
1501 if(offset==1)
1503 GMX_MM_DECREMENT_1RVEC_1POINTER_PS(f+j3A,tx,ty,tz);
1505 else if(offset==2)
1507 GMX_MM_DECREMENT_1RVEC_2POINTERS_PS(f+j3A,f+j3B,tx,ty,tz);
1509 else
1511 /* offset must be 3 */
1512 GMX_MM_DECREMENT_1RVEC_3POINTERS_PS(f+j3A,f+j3B,f+j3C,tx,ty,tz);
1516 /* fix/fiy/fiz now contain four partial force terms that should all be
1517 * added to the i particle forces and shift forces.
1518 */
1519 gmx_mm_update_iforce_1atom_ps(fix,fiy,fiz,f+ii3,fshift+is3);
1522 return 0;
1526 float gb_bonds_analytic(real *x, real *f, real *charge, real *bRad, real *dvda,
1527 t_idef *idef, real epsilon_r, real gb_epsilon_solvent, real facel)
1530 int i,j,nral,nri;
1531 int type1,type2,type3,type4,ai1,ai2,ai3,ai4,aj1,aj2,aj3,aj4;
1532 int ai13,ai23,ai33,ai43,aj13,aj23,aj33,aj43;
1533 int offset;
1535 float vctot;
1537 __m128 ix,iy,iz,jx,jy,jz,dx,dy,dz;
1538 __m128 rsq11,rinv,r,t1,t2,t3;
1539 __m128 isai,isaj,isaprod,inv_isaprod,expterm;
1540 __m128 ci,cj,qq,fac,dva,dva_i,dva_j,di,dj;
1541 __m128 vgb,vgb_tot,fgb,fgb2,fijC,fscal;
1542 __m128 tx,ty,tz,fix1,fiy1,fiz1;
1543 __m128 xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8;
1544 __m128 mask;
1546 const __m128 neg = {-1.0f , -1.0f , -1.0f , -1.0f };
1547 const __m128 qrtr = {0.25f , 0.25f , 0.25f , 0.25f };
1548 const __m128 eigth = {0.125f , 0.125f , 0.125f , 0.125f };
1549 const __m128 half = {0.5f , 0.5f , 0.5f , 0.5f };
1550 const __m128 three = {3.0f , 3.0f , 3.0f , 3.0f };
1552 t_iatom *forceatoms;
1554 /* Keep the compiler happy */
1555 vgb_tot = _mm_setzero_ps();
1556 vgb = _mm_setzero_ps();
1557 xmm1 = _mm_setzero_ps();
1558 xmm2 = _mm_setzero_ps();
1559 xmm3 = _mm_setzero_ps();
1560 xmm4 = _mm_setzero_ps();
1561 ai1 = ai2 = ai3 = ai4 = 0;
1562 aj1 = aj2 = aj3 = aj4 = 0;
1563 ai13 = ai23 = ai33 = ai43 = 0;
1564 aj13 = aj23 = aj33 = aj43 = 0;
1566 /* Scale the electrostatics by gb_epsilon_solvent */
1567 facel = (-1.0) * facel * ((1.0/epsilon_r) - 1.0/gb_epsilon_solvent);
1568 fac = _mm_load1_ps(&facel);
1570 vctot = 0.0;
1572 for(j=F_GB12;j<=F_GB14;j++)
1574 forceatoms = idef->il[j].iatoms;
1576 /* Number of atoms involved in each GB interaction plus the interaction type */
1577 nral = NRAL(j)+1;
1578 nri = idef->il[j].nr/nral;
1579 offset = nri%4;
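/* Each forceatoms entry is a (type, ai, aj) triplet; the main loop below consumes
 * four triplets per pass (one SSE width) and the remaining 1-3 pairs are handled
 * in the offset block at the end. */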
1581 /* In the for loop, i is updated for every element in the forceatoms list, so we have to stop
1582 * the loop at offset*nral before the maximum idef->il[j].nr number, instead of just offset.
1583 */
1584 for(i=0;i<idef->il[j].nr-(offset*nral); )
1586 /* Load everything separately for now; test with sse load and shuffles later.
1587 * Also, to avoid reading in the interaction type, we just increment i to pass over
1588 * the types in the forceatoms array; this saves some memory accesses.
1589 */
1590 i++;
1591 ai1 = forceatoms[i++];
1592 aj1 = forceatoms[i++];
1594 i++;
1595 ai2 = forceatoms[i++];
1596 aj2 = forceatoms[i++];
1598 i++;
1599 ai3 = forceatoms[i++];
1600 aj3 = forceatoms[i++];
1602 i++;
1603 ai4 = forceatoms[i++];
1604 aj4 = forceatoms[i++];
1606 ai13 = ai1*3;
1607 ai23 = ai2*3;
1608 ai33 = ai3*3;
1609 ai43 = ai4*3;
1611 aj13 = aj1*3;
1612 aj23 = aj2*3;
1613 aj33 = aj3*3;
1614 aj43 = aj4*3;
1616 /* Load particle ai1-4 and transpose */
1617 xmm1 = _mm_loadh_pi(xmm1,(__m64 *) (x+ai13));
1618 xmm2 = _mm_loadh_pi(xmm2,(__m64 *) (x+ai23));
1619 xmm3 = _mm_loadh_pi(xmm3,(__m64 *) (x+ai33));
1620 xmm4 = _mm_loadh_pi(xmm4,(__m64 *) (x+ai43));
1622 xmm5 = _mm_load1_ps(x+ai13+2);
1623 xmm6 = _mm_load1_ps(x+ai23+2);
1624 xmm7 = _mm_load1_ps(x+ai33+2);
1625 xmm8 = _mm_load1_ps(x+ai43+2);
1627 xmm5 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(0,0,0,0));
1628 xmm6 = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(0,0,0,0));
1629 iz = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(2,0,2,0));
1631 xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,2,3,2));
1632 xmm2 = _mm_shuffle_ps(xmm3,xmm4,_MM_SHUFFLE(3,2,3,2));
1633 ix = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(2,0,2,0));
1634 iy = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,1,3,1));
1636 /* Load particle aj1-4 and transpose */
1637 xmm1 = _mm_loadh_pi(xmm1,(__m64 *) (x+aj13));
1638 xmm2 = _mm_loadh_pi(xmm2,(__m64 *) (x+aj23));
1639 xmm3 = _mm_loadh_pi(xmm3,(__m64 *) (x+aj33));
1640 xmm4 = _mm_loadh_pi(xmm4,(__m64 *) (x+aj43));
1642 xmm5 = _mm_load1_ps(x+aj13+2);
1643 xmm6 = _mm_load1_ps(x+aj23+2);
1644 xmm7 = _mm_load1_ps(x+aj33+2);
1645 xmm8 = _mm_load1_ps(x+aj43+2);
1647 xmm5 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(0,0,0,0));
1648 xmm6 = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(0,0,0,0));
1649 jz = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(2,0,2,0));
1651 xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,2,3,2));
1652 xmm2 = _mm_shuffle_ps(xmm3,xmm4,_MM_SHUFFLE(3,2,3,2));
1653 jx = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(2,0,2,0));
1654 jy = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,1,3,1));
1656 /* Distances */
1657 dx = _mm_sub_ps(ix, jx);
1658 dy = _mm_sub_ps(iy, jy);
1659 dz = _mm_sub_ps(iz, jz);
1661 rsq11 = _mm_add_ps( _mm_add_ps( _mm_mul_ps(dx,dx) , _mm_mul_ps(dy,dy) ) , _mm_mul_ps(dz,dz) );
1663 rinv = gmx_mm_inv_ps(rsq11);
1664 r = _mm_mul_ps(rsq11,rinv);
1666 /* Load Born radii for ai's and aj's */
1667 xmm1 = _mm_load_ss(bRad+ai1);
1668 xmm2 = _mm_load_ss(bRad+ai2);
1669 xmm3 = _mm_load_ss(bRad+ai3);
1670 xmm4 = _mm_load_ss(bRad+ai4);
1672 xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0));
1673 xmm3 = _mm_shuffle_ps(xmm3,xmm4,_MM_SHUFFLE(0,0,0,0));
1674 isai = _mm_shuffle_ps(xmm1,xmm3,_MM_SHUFFLE(2,0,2,0));
1676 xmm1 = _mm_load_ss(bRad+aj1);
1677 xmm2 = _mm_load_ss(bRad+aj2);
1678 xmm3 = _mm_load_ss(bRad+aj3);
1679 xmm4 = _mm_load_ss(bRad+aj4);
1681 xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0));
1682 xmm3 = _mm_shuffle_ps(xmm3,xmm4,_MM_SHUFFLE(0,0,0,0));
1683 isaj = _mm_shuffle_ps(xmm1,xmm3,_MM_SHUFFLE(2,0,2,0));
1685 isaprod = _mm_mul_ps(isai,isaj); // rb2 in tinker
1686 inv_isaprod = _mm_mul_ps(isaprod,isaprod);
1687 inv_isaprod = gmx_mm_inv_ps(inv_isaprod); //1/rb2 in tinker
1689 /* Load charges for ai's and aj's */
1690 xmm1 = _mm_load_ss(charge+ai1);
1691 xmm2 = _mm_load_ss(charge+ai2);
1692 xmm3 = _mm_load_ss(charge+ai3);
1693 xmm4 = _mm_load_ss(charge+ai4);
1695 xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0));
1696 xmm3 = _mm_shuffle_ps(xmm3,xmm4,_MM_SHUFFLE(0,0,0,0));
1697 ci = _mm_shuffle_ps(xmm1,xmm3,_MM_SHUFFLE(2,0,2,0));
1699 xmm1 = _mm_load_ss(charge+aj1);
1700 xmm2 = _mm_load_ss(charge+aj2);
1701 xmm3 = _mm_load_ss(charge+aj3);
1702 xmm4 = _mm_load_ss(charge+aj4);
1704 xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0));
1705 xmm3 = _mm_shuffle_ps(xmm3,xmm4,_MM_SHUFFLE(0,0,0,0));
1706 cj = _mm_shuffle_ps(xmm1,xmm3,_MM_SHUFFLE(2,0,2,0));
1708 qq = _mm_mul_ps(ci,fac);
1709 qq = _mm_mul_ps(cj,qq);
1711 /* Calculate scalar GB force */
1712 expterm = _mm_mul_ps(rsq11,inv_isaprod);
1713 expterm = _mm_mul_ps(expterm,qrtr);
1714 expterm = _mm_mul_ps(expterm,neg);
1715 expterm = gmx_mm_exp_ps(expterm);
1717 fgb2 = _mm_mul_ps(isaprod,expterm);
1718 fgb2 = _mm_add_ps(fgb2,rsq11);
1720 fgb = gmx_mm_inv_ps(fgb2);
1722 /* Potential energy */
1723 vgb = _mm_mul_ps(qq,fgb);
1724 vgb_tot = _mm_add_ps(vgb_tot,vgb);
1726 fijC = _mm_mul_ps(qrtr,r);
1727 fijC = _mm_mul_ps(fijC,expterm);
1728 fijC = _mm_sub_ps(r,fijC);
1729 fijC = _mm_mul_ps(fijC,vgb);
1730 fijC = _mm_mul_ps(fijC,neg);
1731 fijC = _mm_mul_ps(fijC,fgb);
1732 fijC = _mm_mul_ps(fijC,fgb); /* fijC = fijC in tab code, de */
1734 /* Chain rule terms */
1735 dva = _mm_mul_ps(rsq11,inv_isaprod);
1736 dva = _mm_mul_ps(dva,eigth);
1737 dva = _mm_add_ps(dva,half);
1738 dva = _mm_mul_ps(dva,expterm);
1739 dva = _mm_mul_ps(dva,vgb);
1740 dva = _mm_mul_ps(dva,neg);
1741 dva = _mm_mul_ps(dva,fgb);
1742 dva = _mm_mul_ps(dva,fgb); /* dva * isaj = dvatmp*isai*isai */
1744 /* Calculate vectorial force */
1745 fscal = _mm_mul_ps(fijC,rinv);
1746 fscal = _mm_mul_ps(fscal,neg);
1748 tx = _mm_mul_ps(fscal,dx);
1749 ty = _mm_mul_ps(fscal,dy);
1750 tz = _mm_mul_ps(fscal,dz);
1752 /* Load, update and store chain rule terms for ai and aj atoms */
1753 dva_i = _mm_mul_ps(dva,isaj);
1754 dva_j = _mm_mul_ps(dva,isai);
1756 xmm1 = _mm_load_ss(dvda+ai1);
1757 xmm2 = _mm_load_ss(dvda+aj1);
1758 xmm1 = _mm_add_ss(xmm1,dva_i);
1759 xmm2 = _mm_add_ss(xmm2,dva_j);
1760 _mm_store_ss(dvda+ai1,xmm1);
1761 _mm_store_ss(dvda+aj1,xmm2);
1763 /* Need to shuffle dva_i and dva_j */
1764 dva_i = _mm_shuffle_ps(dva_i,dva_i,_MM_SHUFFLE(0,3,2,1));
1765 dva_j = _mm_shuffle_ps(dva_j,dva_j,_MM_SHUFFLE(0,3,2,1));
1767 xmm1 = _mm_load_ss(dvda+ai2);
1768 xmm2 = _mm_load_ss(dvda+aj2);
1769 xmm1 = _mm_add_ss(xmm1,dva_i);
1770 xmm2 = _mm_add_ss(xmm2,dva_j);
1771 _mm_store_ss(dvda+ai2,xmm1);
1772 _mm_store_ss(dvda+aj2,xmm2);
1774 /* Need to shuffle dva_i and dva_j */
1775 dva_i = _mm_shuffle_ps(dva_i,dva_i,_MM_SHUFFLE(0,3,2,1));
1776 dva_j = _mm_shuffle_ps(dva_j,dva_j,_MM_SHUFFLE(0,3,2,1));
1778 xmm1 = _mm_load_ss(dvda+ai3);
1779 xmm2 = _mm_load_ss(dvda+aj3);
1780 xmm1 = _mm_add_ss(xmm1,dva_i);
1781 xmm2 = _mm_add_ss(xmm2,dva_j);
1782 _mm_store_ss(dvda+ai3,xmm1);
1783 _mm_store_ss(dvda+aj3,xmm2);
1785 /* Need to shuffle dva_i and dva_j */
1786 dva_i = _mm_shuffle_ps(dva_i,dva_i,_MM_SHUFFLE(0,3,2,1));
1787 dva_j = _mm_shuffle_ps(dva_j,dva_j,_MM_SHUFFLE(0,3,2,1));
1789 xmm1 = _mm_load_ss(dvda+ai4);
1790 xmm2 = _mm_load_ss(dvda+aj4);
1791 xmm1 = _mm_add_ss(xmm1,dva_i);
1792 xmm2 = _mm_add_ss(xmm2,dva_j);
1793 _mm_store_ss(dvda+ai4,xmm1);
1794 _mm_store_ss(dvda+aj4,xmm2);
1796 /* Load, update and store partial forces on ai and aj atoms.
1797 * This needs to be done in the order ai1+aj1, ai2+aj2 etc...,
1798 * since loading all four values at once would mean the force
1799 * is not updated properly.
1800 */
1801 /* ai1 and aj1 */
1802 xmm1 = _mm_load_ss(f+ai13);
1803 xmm2 = _mm_load_ss(f+ai13+1);
1804 xmm3 = _mm_load_ss(f+ai13+2);
1806 xmm4 = _mm_load_ss(f+aj13);
1807 xmm5 = _mm_load_ss(f+aj13+1);
1808 xmm6 = _mm_load_ss(f+aj13+2);
1810 xmm1 = _mm_add_ss(xmm1,tx);
1811 xmm2 = _mm_add_ss(xmm2,ty);
1812 xmm3 = _mm_add_ss(xmm3,tz);
1814 xmm4 = _mm_sub_ss(xmm4,tx);
1815 xmm5 = _mm_sub_ss(xmm5,ty);
1816 xmm6 = _mm_sub_ss(xmm6,tz);
1818 _mm_store_ss(f+ai13,xmm1);
1819 _mm_store_ss(f+ai13+1,xmm2);
1820 _mm_store_ss(f+ai13+2,xmm3);
1822 _mm_store_ss(f+aj13,xmm4);
1823 _mm_store_ss(f+aj13+1,xmm5);
1824 _mm_store_ss(f+aj13+2,xmm6);
1826 /* ai2 and aj2 */
1827 xmm1 = _mm_load_ss(f+ai23);
1828 xmm2 = _mm_load_ss(f+ai23+1);
1829 xmm3 = _mm_load_ss(f+ai23+2);
1831 xmm4 = _mm_load_ss(f+aj23);
1832 xmm5 = _mm_load_ss(f+aj23+1);
1833 xmm6 = _mm_load_ss(f+aj23+2);
1835 /* Need to shuffle tx/ty/tz */
1836 tx = _mm_shuffle_ps(tx,tx,_MM_SHUFFLE(0,3,2,1));
1837 ty = _mm_shuffle_ps(ty,ty,_MM_SHUFFLE(0,3,2,1));
1838 tz = _mm_shuffle_ps(tz,tz,_MM_SHUFFLE(0,3,2,1));
1840 xmm1 = _mm_add_ss(xmm1,tx);
1841 xmm2 = _mm_add_ss(xmm2,ty);
1842 xmm3 = _mm_add_ss(xmm3,tz);
1844 xmm4 = _mm_sub_ss(xmm4,tx);
1845 xmm5 = _mm_sub_ss(xmm5,ty);
1846 xmm6 = _mm_sub_ss(xmm6,tz);
1848 _mm_store_ss(f+ai23,xmm1);
1849 _mm_store_ss(f+ai23+1,xmm2);
1850 _mm_store_ss(f+ai23+2,xmm3);
1852 _mm_store_ss(f+aj23,xmm4);
1853 _mm_store_ss(f+aj23+1,xmm5);
1854 _mm_store_ss(f+aj23+2,xmm6);
1856 /* ai3 and aj3 */
1857 xmm1 = _mm_load_ss(f+ai33);
1858 xmm2 = _mm_load_ss(f+ai33+1);
1859 xmm3 = _mm_load_ss(f+ai33+2);
1861 xmm4 = _mm_load_ss(f+aj33);
1862 xmm5 = _mm_load_ss(f+aj33+1);
1863 xmm6 = _mm_load_ss(f+aj33+2);
1865 /* Need to shuffle tx/ty/tz */
1866 tx = _mm_shuffle_ps(tx,tx,_MM_SHUFFLE(0,3,2,1));
1867 ty = _mm_shuffle_ps(ty,ty,_MM_SHUFFLE(0,3,2,1));
1868 tz = _mm_shuffle_ps(tz,tz,_MM_SHUFFLE(0,3,2,1));
1870 xmm1 = _mm_add_ss(xmm1,tx);
1871 xmm2 = _mm_add_ss(xmm2,ty);
1872 xmm3 = _mm_add_ss(xmm3,tz);
1874 xmm4 = _mm_sub_ss(xmm4,tx);
1875 xmm5 = _mm_sub_ss(xmm5,ty);
1876 xmm6 = _mm_sub_ss(xmm6,tz);
1878 _mm_store_ss(f+ai33,xmm1);
1879 _mm_store_ss(f+ai33+1,xmm2);
1880 _mm_store_ss(f+ai33+2,xmm3);
1882 _mm_store_ss(f+aj33,xmm4);
1883 _mm_store_ss(f+aj33+1,xmm5);
1884 _mm_store_ss(f+aj33+2,xmm6);
1886 /* ai4 and aj4 */
1887 xmm1 = _mm_load_ss(f+ai43);
1888 xmm2 = _mm_load_ss(f+ai43+1);
1889 xmm3 = _mm_load_ss(f+ai43+2);
1891 xmm4 = _mm_load_ss(f+aj43);
1892 xmm5 = _mm_load_ss(f+aj43+1);
1893 xmm6 = _mm_load_ss(f+aj43+2);
1895 /* Need to shuffle tx/ty/tz */
1896 tx = _mm_shuffle_ps(tx,tx,_MM_SHUFFLE(0,3,2,1));
1897 ty = _mm_shuffle_ps(ty,ty,_MM_SHUFFLE(0,3,2,1));
1898 tz = _mm_shuffle_ps(tz,tz,_MM_SHUFFLE(0,3,2,1));
1900 xmm1 = _mm_add_ss(xmm1,tx);
1901 xmm2 = _mm_add_ss(xmm2,ty);
1902 xmm3 = _mm_add_ss(xmm3,tz);
1904 xmm4 = _mm_sub_ss(xmm4,tx);
1905 xmm5 = _mm_sub_ss(xmm5,ty);
1906 xmm6 = _mm_sub_ss(xmm6,tz);
1908 _mm_store_ss(f+ai43,xmm1);
1909 _mm_store_ss(f+ai43+1,xmm2);
1910 _mm_store_ss(f+ai43+2,xmm3);
1912 _mm_store_ss(f+aj43,xmm4);
1913 _mm_store_ss(f+aj43+1,xmm5);
1914 _mm_store_ss(f+aj43+2,xmm6);
1917 if(offset!=0)
1919 if(offset==1)
1921 i++;
1922 ai1 = forceatoms[i++];
1923 aj1 = forceatoms[i++];
1925 ai13 = ai1*3;
1926 aj13 = aj1*3;
1928 /* Load ai and aj coordinates */
1929 xmm1 = _mm_loadl_pi(xmm1,(__m64 *) (x+ai13));
1930 iz = _mm_load1_ps(x+ai13+2);
1932 ix = _mm_shuffle_ps(xmm1, xmm1, _MM_SHUFFLE(0,0,0,0));
1933 iy = _mm_shuffle_ps(xmm1, xmm1, _MM_SHUFFLE(1,1,1,1));
1935 xmm1 = _mm_loadl_pi(xmm1,(__m64 *) (x+aj13));
1936 jz = _mm_load1_ps(x+aj13+2);
1938 jx = _mm_shuffle_ps(xmm1, xmm1, _MM_SHUFFLE(0,0,0,0));
1939 jy = _mm_shuffle_ps(xmm1, xmm1, _MM_SHUFFLE(1,1,1,1));
1941 /* Load Born radii for ai1 and aj1 */
1942 isai = _mm_set_ps(0.0f, 0.0f, 0.0f, bRad[ai1]);
1943 isaj = _mm_set_ps(0.0f, 0.0f, 0.0f, bRad[aj1]);
1945 /* Load charges for ai1 and aj1 */
1946 ci = _mm_set_ps(0.0f, 0.0f, 0.0f, charge[ai1]);
1947 cj = _mm_set_ps(0.0f, 0.0f, 0.0f, charge[aj1]);
1949 /* Load chain rule terms for ai1 and aj1 */
1950 di = _mm_set_ps(0.0f,0.0f,0.0f,dvda[ai1]);
1951 dj = _mm_set_ps(0.0f,0.0f,0.0f,dvda[aj1]);
1953 mask = gmx_mm_castsi128_ps( _mm_set_epi32(0,0,0,0xffffffff) );
1955 else if(offset==2)
1957 i++;
1958 ai1 = forceatoms[i++];
1959 aj1 = forceatoms[i++];
1961 i++;
1962 ai2 = forceatoms[i++];
1963 aj2 = forceatoms[i++];
1965 ai13 = ai1 * 3;
1966 ai23 = ai2 * 3;
1967 aj13 = aj1 * 3;
1968 aj23 = aj2 * 3;
1970 /* Load ai1-2 and aj1-2 coordinates */
1971 xmm1 = _mm_loadh_pi(xmm1,(__m64 *) (x+ai13));
1972 xmm2 = _mm_loadh_pi(xmm2,(__m64 *) (x+ai23));
1974 xmm5 = _mm_load1_ps(x+ai13+2);
1975 xmm6 = _mm_load1_ps(x+ai23+2);
1977 xmm5 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(0,0,0,0));
1978 iz = _mm_shuffle_ps(xmm5,xmm5,_MM_SHUFFLE(2,0,2,0));
1980 xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,2,3,2));
1981 ix = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(2,0,2,0));
1982 iy = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(3,1,3,1));
1984 xmm1 = _mm_loadh_pi(xmm1,(__m64 *) (x+aj13));
1985 xmm2 = _mm_loadh_pi(xmm2,(__m64 *) (x+aj23));
1987 xmm5 = _mm_load1_ps(x+aj13+2);
1988 xmm6 = _mm_load1_ps(x+aj23+2);
1990 xmm5 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(0,0,0,0));
1991 jz = _mm_shuffle_ps(xmm5,xmm5,_MM_SHUFFLE(2,0,2,0));
1993 xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,2,3,2));
1994 jx = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(2,0,2,0));
1995 jy = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(3,1,3,1));
1997 /* Load Born radii for ai1-2 and aj1-2 */
1998 isai = _mm_set_ps(0.0f, 0.0f, bRad[ai2], bRad[ai1]);
1999 isaj = _mm_set_ps(0.0f, 0.0f, bRad[aj2], bRad[aj1]);
2001 /* Load charges for ai1-2 and aj1-2 */
2002 ci = _mm_set_ps(0.0f, 0.0f, charge[ai2], charge[ai1]);
2003 cj = _mm_set_ps(0.0f, 0.0f, charge[aj2], charge[aj1]);
2005 /* Load chain rule terms for ai1-2 and aj1-2 */
2006 di = _mm_set_ps(0.0f,0.0f,dvda[ai2],dvda[ai1]);
2007 dj = _mm_set_ps(0.0f,0.0f,dvda[aj2],dvda[aj1]);
2009 mask = gmx_mm_castsi128_ps( _mm_set_epi32(0,0,0xffffffff,0xffffffff) );
2010 }
2011 else
2012 {
2013 i++;
2014 ai1 = forceatoms[i++];
2015 aj1 = forceatoms[i++];
2017 i++;
2018 ai2 = forceatoms[i++];
2019 aj2 = forceatoms[i++];
2021 i++;
2022 ai3 = forceatoms[i++];
2023 aj3 = forceatoms[i++];
2025 ai13 = ai1 * 3;
2026 ai23 = ai2 * 3;
2027 ai33 = ai3 * 3;
2029 aj13 = aj1 * 3;
2030 aj23 = aj2 * 3;
2031 aj33 = aj3 * 3;
2033 /* Load ai1-3 and aj1-3 coordinates */
2034 xmm1 = _mm_loadh_pi(xmm1,(__m64 *) (x+ai13));
2035 xmm2 = _mm_loadh_pi(xmm2,(__m64 *) (x+ai23));
2036 xmm3 = _mm_loadh_pi(xmm3,(__m64 *) (x+ai33));
2038 xmm5 = _mm_load1_ps(x+ai13+2);
2039 xmm6 = _mm_load1_ps(x+ai23+2);
2040 xmm7 = _mm_load1_ps(x+ai33+2);
2042 xmm5 = _mm_shuffle_ps(xmm5,xmm6, _MM_SHUFFLE(0,0,0,0));
2043 iz = _mm_shuffle_ps(xmm5,xmm7, _MM_SHUFFLE(3,1,3,1));
2045 xmm1 = _mm_shuffle_ps(xmm1,xmm2, _MM_SHUFFLE(3,2,3,2));
2046 xmm2 = _mm_shuffle_ps(xmm3,xmm3, _MM_SHUFFLE(3,2,3,2));
2048 ix = _mm_shuffle_ps(xmm1,xmm2, _MM_SHUFFLE(2,0,2,0));
2049 iy = _mm_shuffle_ps(xmm1,xmm2, _MM_SHUFFLE(3,1,3,1));
2051 xmm1 = _mm_loadh_pi(xmm1,(__m64 *) (x+aj13));
2052 xmm2 = _mm_loadh_pi(xmm2,(__m64 *) (x+aj23));
2053 xmm3 = _mm_loadh_pi(xmm3,(__m64 *) (x+aj33));
2055 xmm5 = _mm_load1_ps(x+aj13+2);
2056 xmm6 = _mm_load1_ps(x+aj23+2);
2057 xmm7 = _mm_load1_ps(x+aj33+2);
2059 xmm5 = _mm_shuffle_ps(xmm5,xmm6, _MM_SHUFFLE(0,0,0,0));
2060 jz = _mm_shuffle_ps(xmm5,xmm7, _MM_SHUFFLE(3,1,3,1));
2062 xmm1 = _mm_shuffle_ps(xmm1,xmm2, _MM_SHUFFLE(3,2,3,2));
2063 xmm2 = _mm_shuffle_ps(xmm3,xmm3, _MM_SHUFFLE(3,2,3,2));
2065 jx = _mm_shuffle_ps(xmm1,xmm2, _MM_SHUFFLE(2,0,2,0));
2066 jy = _mm_shuffle_ps(xmm1,xmm2, _MM_SHUFFLE(3,1,3,1));
2068 /* Load Born radii for ai1-3 and aj1-3 */
2069 isai = _mm_set_ps(0.0f, bRad[ai3], bRad[ai2], bRad[ai1]);
2070 isaj = _mm_set_ps(0.0f, bRad[aj3], bRad[aj2], bRad[aj1]);
2072 /* Load charges for ai1-3 and aj1-3 */
2073 ci = _mm_set_ps(0.0f, charge[ai3], charge[ai2], charge[ai1]);
2074 cj = _mm_set_ps(0.0f, charge[aj3], charge[aj2], charge[aj1]);
2076 /* Load chain rule terms for ai1-3 and aj1-3 */
2077 di = _mm_set_ps(0.0f,dvda[ai3],dvda[ai2],dvda[ai1]);
2078 dj = _mm_set_ps(0.0f,dvda[aj3],dvda[aj2],dvda[aj1]);
2080 mask = gmx_mm_castsi128_ps( _mm_set_epi32(0,0xffffffff,0xffffffff,0xffffffff) );
2081 }
2083 ix = _mm_and_ps( mask, ix);
2084 iy = _mm_and_ps( mask, iy);
2085 iz = _mm_and_ps( mask, iz);
2087 jx = _mm_and_ps( mask, jx);
2088 jy = _mm_and_ps( mask, jy);
2089 jz = _mm_and_ps( mask, jz);
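/* The mask zeroes the unused SSE elements so the full-width GB arithmetic
 * below can be reused for the 1-3 leftover pairs; only the masked energy and
 * the element-0 (shuffled) force/dvda updates are actually stored. */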
2091 /* Distances */
2092 dx = _mm_sub_ps(ix, jx);
2093 dy = _mm_sub_ps(iy, jy);
2094 dz = _mm_sub_ps(iz, jz);
2096 rsq11 = _mm_add_ps( _mm_add_ps( _mm_mul_ps(dx,dx) , _mm_mul_ps(dy,dy) ) , _mm_mul_ps(dz,dz) );
2098 rinv = gmx_mm_invsqrt_ps(rsq11);
2099 r = _mm_mul_ps(rsq11,rinv);
2101 isaprod = _mm_mul_ps(isai,isaj);
2102 inv_isaprod = gmx_mm_inv_ps(isaprod);
2105 qq = _mm_mul_ps(ci,fac);
2106 qq = _mm_mul_ps(cj,qq);
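/* qq = fac*q_i*q_j; fac is assumed to carry the electrostatic prefactor
 * (including the dielectric/solvent term) set up earlier in this routine. */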
2108 /* Calculate scalar GB force */
2109 expterm = _mm_mul_ps(rsq11,inv_isaprod);
2110 expterm = _mm_mul_ps(expterm,qrtr);
2111 expterm = _mm_mul_ps(expterm,neg);
2112 expterm = gmx_mm_exp_ps(expterm);
2114 fgb2 = _mm_mul_ps(isaprod,expterm);
2115 fgb2 = _mm_add_ps(fgb2,rsq11);
2117 fgb = gmx_mm_invsqrt_ps(fgb2);
2119 /* Potential energy */
2120 vgb = _mm_mul_ps(qq,fgb);
2121 vgb = _mm_and_ps(mask,vgb);
2122 vgb_tot = _mm_add_ps(vgb_tot,vgb);
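/* vgb = qq/f_GB with f_GB = sqrt(r^2 + b_i*b_j*exp(-r^2/(4*b_i*b_j))),
 * i.e. the Still pairwise GB energy (assuming bRad[] holds the Born radii);
 * unused elements are masked before accumulation into vgb_tot. */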
2124 fijC = _mm_mul_ps(qrtr,r);
2125 fijC = _mm_mul_ps(fijC,expterm);
2126 fijC = _mm_sub_ps(r,fijC);
2127 fijC = _mm_mul_ps(fijC,vgb);
2128 fijC = _mm_mul_ps(fijC,neg);
2129 fijC = _mm_mul_ps(fijC,fgb);
2130 fijC = _mm_mul_ps(fijC,fgb); /* fscal = fijC */
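/* fijC now holds dV_GB/dr = -qq*(r - r*exp(...)/4)/f_GB^3; combined with the
 * -rinv factor applied below it yields the Cartesian force components. */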
2132 /* Chain rule terms */
2133 dva = _mm_mul_ps(rsq11,inv_isaprod);
2134 dva = _mm_mul_ps(dva,eigth);
2135 dva = _mm_add_ps(dva,half);
2136 dva = _mm_mul_ps(dva,expterm);
2137 dva = _mm_mul_ps(dva,vgb);
2138 dva = _mm_mul_ps(dva,neg);
2139 dva = _mm_mul_ps(dva,fgb);
2140 dva = _mm_mul_ps(dva,fgb);
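/* dva is the common chain-rule factor; multiplied by the partner Born radius
 * (isaj or isai) below it gives dV_GB/db_i and dV_GB/db_j, which are
 * accumulated into dvda[] for the later Born-radii chain-rule pass. */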
2142 /* Calculate vectorial force */
2143 fscal = _mm_mul_ps(fijC,rinv);
2144 fscal = _mm_mul_ps(fscal,neg);
2146 tx = _mm_mul_ps(fscal,dx);
2147 ty = _mm_mul_ps(fscal,dy);
2148 tz = _mm_mul_ps(fscal,dz);
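/* tx/ty/tz hold fscal*dx etc. for all elements; each leftover pair is rotated
 * into element 0 (the shuffles below) before the scalar load/add/store
 * force updates. */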
2150 /* Calculate chain rule terms */
2151 dva_i = _mm_mul_ps(dva,isaj);
2152 dva_j = _mm_mul_ps(dva,isai);
2154 if(offset==1)
2155 {
2156 /* Load, update and store chain rule terms for ai1 and aj1 */
2157 xmm1 = _mm_load_ss(dvda+ai1);
2158 xmm2 = _mm_load_ss(dvda+aj1);
2159 xmm1 = _mm_add_ss(xmm1,dva_i);
2160 xmm2 = _mm_add_ss(xmm2,dva_j);
2161 _mm_store_ss(dvda+ai1,xmm1);
2162 _mm_store_ss(dvda+aj1,xmm2);
2164 /* Load, update and store partial forces on ai1 and aj1 */
2165 xmm1 = _mm_load_ss(f+ai13);
2166 xmm2 = _mm_load_ss(f+ai13+1);
2167 xmm3 = _mm_load_ss(f+ai13+2);
2169 xmm4 = _mm_load_ss(f+aj13);
2170 xmm5 = _mm_load_ss(f+aj13+1);
2171 xmm6 = _mm_load_ss(f+aj13+2);
2173 xmm1 = _mm_add_ss(xmm1,tx);
2174 xmm2 = _mm_add_ss(xmm2,ty);
2175 xmm3 = _mm_add_ss(xmm3,tz);
2177 xmm4 = _mm_sub_ss(xmm4,tx);
2178 xmm5 = _mm_sub_ss(xmm5,ty);
2179 xmm6 = _mm_sub_ss(xmm6,tz);
2181 _mm_store_ss(f+ai13,xmm1);
2182 _mm_store_ss(f+ai13+1,xmm2);
2183 _mm_store_ss(f+ai13+2,xmm3);
2185 _mm_store_ss(f+aj13,xmm4);
2186 _mm_store_ss(f+aj13+1,xmm5);
2187 _mm_store_ss(f+aj13+2,xmm6);
2188 }
2189 else if(offset==2)
2190 {
2191 /* Load, update and store chain rule terms for ai1-2 and aj1-2 atoms */
2192 dva_i = _mm_mul_ps(dva,isaj);
2193 dva_j = _mm_mul_ps(dva,isai);
2195 xmm1 = _mm_load_ss(dvda+ai1);
2196 xmm2 = _mm_load_ss(dvda+aj1);
2197 xmm1 = _mm_add_ss(xmm1,dva_i);
2198 xmm2 = _mm_add_ss(xmm2,dva_j);
2199 _mm_store_ss(dvda+ai1,xmm1);
2200 _mm_store_ss(dvda+aj1,xmm2);
2202 /* Need to shuffle dva_i and dva_j */
2203 dva_i = _mm_shuffle_ps(dva_i,dva_i,_MM_SHUFFLE(0,3,2,1));
2204 dva_j = _mm_shuffle_ps(dva_j,dva_j,_MM_SHUFFLE(0,3,2,1));
2206 xmm1 = _mm_load_ss(dvda+ai2);
2207 xmm2 = _mm_load_ss(dvda+aj2);
2208 xmm1 = _mm_add_ss(xmm1,dva_i);
2209 xmm2 = _mm_add_ss(xmm2,dva_j);
2210 _mm_store_ss(dvda+ai2,xmm1);
2211 _mm_store_ss(dvda+aj2,xmm2);
2213 /* Load, update and store partial forces on ai1-2 and aj1-2 */
2214 xmm1 = _mm_load_ss(f+ai13);
2215 xmm2 = _mm_load_ss(f+ai13+1);
2216 xmm3 = _mm_load_ss(f+ai13+2);
2218 xmm4 = _mm_load_ss(f+aj13);
2219 xmm5 = _mm_load_ss(f+aj13+1);
2220 xmm6 = _mm_load_ss(f+aj13+2);
2222 xmm1 = _mm_add_ss(xmm1,tx);
2223 xmm2 = _mm_add_ss(xmm2,ty);
2224 xmm3 = _mm_add_ss(xmm3,tz);
2226 xmm4 = _mm_sub_ss(xmm4,tx);
2227 xmm5 = _mm_sub_ss(xmm5,ty);
2228 xmm6 = _mm_sub_ss(xmm6,tz);
2230 _mm_store_ss(f+ai13,xmm1);
2231 _mm_store_ss(f+ai13+1,xmm2);
2232 _mm_store_ss(f+ai13+2,xmm3);
2234 _mm_store_ss(f+aj13,xmm4);
2235 _mm_store_ss(f+aj13+1,xmm5);
2236 _mm_store_ss(f+aj13+2,xmm6);
2238 /* ai2 and aj2 */
2239 xmm1 = _mm_load_ss(f+ai23);
2240 xmm2 = _mm_load_ss(f+ai23+1);
2241 xmm3 = _mm_load_ss(f+ai23+2);
2243 xmm4 = _mm_load_ss(f+aj23);
2244 xmm5 = _mm_load_ss(f+aj23+1);
2245 xmm6 = _mm_load_ss(f+aj23+2);
2247 /* Need to shuffle tx/ty/tz */
2248 tx = _mm_shuffle_ps(tx,tx,_MM_SHUFFLE(0,3,2,1));
2249 ty = _mm_shuffle_ps(ty,ty,_MM_SHUFFLE(0,3,2,1));
2250 tz = _mm_shuffle_ps(tz,tz,_MM_SHUFFLE(0,3,2,1));
2252 xmm1 = _mm_add_ss(xmm1,tx);
2253 xmm2 = _mm_add_ss(xmm2,ty);
2254 xmm3 = _mm_add_ss(xmm3,tz);
2256 xmm4 = _mm_sub_ss(xmm4,tx);
2257 xmm5 = _mm_sub_ss(xmm5,ty);
2258 xmm6 = _mm_sub_ss(xmm6,tz);
2260 _mm_store_ss(f+ai23,xmm1);
2261 _mm_store_ss(f+ai23+1,xmm2);
2262 _mm_store_ss(f+ai23+2,xmm3);
2264 _mm_store_ss(f+aj23,xmm4);
2265 _mm_store_ss(f+aj23+1,xmm5);
2266 _mm_store_ss(f+aj23+2,xmm6);
2267 }
2268 else
2269 {
2270 /* Load, update and store chain rule terms for ai1-3 and aj1-3 atoms */
2271 dva_i = _mm_mul_ps(dva,isaj);
2272 dva_j = _mm_mul_ps(dva,isai);
2274 xmm1 = _mm_load_ss(dvda+ai1);
2275 xmm2 = _mm_load_ss(dvda+aj1);
2276 xmm1 = _mm_add_ss(xmm1,dva_i);
2277 xmm2 = _mm_add_ss(xmm2,dva_j);
2278 _mm_store_ss(dvda+ai1,xmm1);
2279 _mm_store_ss(dvda+aj1,xmm2);
2281 /* Need to shuffle dva_i and dva_j */
2282 dva_i = _mm_shuffle_ps(dva_i,dva_i,_MM_SHUFFLE(0,3,2,1));
2283 dva_j = _mm_shuffle_ps(dva_j,dva_j,_MM_SHUFFLE(0,3,2,1));
2285 xmm1 = _mm_load_ss(dvda+ai2);
2286 xmm2 = _mm_load_ss(dvda+aj2);
2287 xmm1 = _mm_add_ss(xmm1,dva_i);
2288 xmm2 = _mm_add_ss(xmm2,dva_j);
2289 _mm_store_ss(dvda+ai2,xmm1);
2290 _mm_store_ss(dvda+aj2,xmm2);
2292 /* Need to shuffle dva_i and dva_j */
2293 dva_i = _mm_shuffle_ps(dva_i,dva_i,_MM_SHUFFLE(0,3,2,1));
2294 dva_j = _mm_shuffle_ps(dva_j,dva_j,_MM_SHUFFLE(0,3,2,1));
2296 xmm1 = _mm_load_ss(dvda+ai3);
2297 xmm2 = _mm_load_ss(dvda+aj3);
2298 xmm1 = _mm_add_ss(xmm1,dva_i);
2299 xmm2 = _mm_add_ss(xmm2,dva_j);
2300 _mm_store_ss(dvda+ai3,xmm1);
2301 _mm_store_ss(dvda+aj3,xmm2);
2303 /* Load, update and store partial forces on ai1-3 and aj1-3 */
2304 xmm1 = _mm_load_ss(f+ai13);
2305 xmm2 = _mm_load_ss(f+ai13+1);
2306 xmm3 = _mm_load_ss(f+ai13+2);
2308 xmm4 = _mm_load_ss(f+aj13);
2309 xmm5 = _mm_load_ss(f+aj13+1);
2310 xmm6 = _mm_load_ss(f+aj13+2);
2312 xmm1 = _mm_add_ss(xmm1,tx);
2313 xmm2 = _mm_add_ss(xmm2,ty);
2314 xmm3 = _mm_add_ss(xmm3,tz);
2316 xmm4 = _mm_sub_ss(xmm4,tx);
2317 xmm5 = _mm_sub_ss(xmm5,ty);
2318 xmm6 = _mm_sub_ss(xmm6,tz);
2320 _mm_store_ss(f+ai13,xmm1);
2321 _mm_store_ss(f+ai13+1,xmm2);
2322 _mm_store_ss(f+ai13+2,xmm3);
2324 _mm_store_ss(f+aj13,xmm4);
2325 _mm_store_ss(f+aj13+1,xmm5);
2326 _mm_store_ss(f+aj13+2,xmm6);
2328 /* ai2 and aj2 */
2329 xmm1 = _mm_load_ss(f+ai23);
2330 xmm2 = _mm_load_ss(f+ai23+1);
2331 xmm3 = _mm_load_ss(f+ai23+2);
2333 xmm4 = _mm_load_ss(f+aj23);
2334 xmm5 = _mm_load_ss(f+aj23+1);
2335 xmm6 = _mm_load_ss(f+aj23+2);
2337 /* Need to shuffle tx/ty/tz */
2338 tx = _mm_shuffle_ps(tx,tx,_MM_SHUFFLE(0,3,2,1));
2339 ty = _mm_shuffle_ps(ty,ty,_MM_SHUFFLE(0,3,2,1));
2340 tz = _mm_shuffle_ps(tz,tz,_MM_SHUFFLE(0,3,2,1));
2342 xmm1 = _mm_add_ss(xmm1,tx);
2343 xmm2 = _mm_add_ss(xmm2,ty);
2344 xmm3 = _mm_add_ss(xmm3,tz);
2346 xmm4 = _mm_sub_ss(xmm4,tx);
2347 xmm5 = _mm_sub_ss(xmm5,ty);
2348 xmm6 = _mm_sub_ss(xmm6,tz);
2350 _mm_store_ss(f+ai23,xmm1);
2351 _mm_store_ss(f+ai23+1,xmm2);
2352 _mm_store_ss(f+ai23+2,xmm3);
2354 _mm_store_ss(f+aj23,xmm4);
2355 _mm_store_ss(f+aj23+1,xmm5);
2356 _mm_store_ss(f+aj23+2,xmm6);
2358 /* ai3 and aj3 */
2359 xmm1 = _mm_load_ss(f+ai33);
2360 xmm2 = _mm_load_ss(f+ai33+1);
2361 xmm3 = _mm_load_ss(f+ai33+2);
2363 xmm4 = _mm_load_ss(f+aj33);
2364 xmm5 = _mm_load_ss(f+aj33+1);
2365 xmm6 = _mm_load_ss(f+aj33+2);
2367 /* Need to shuffle tx/ty/tz */
2368 tx = _mm_shuffle_ps(tx,tx,_MM_SHUFFLE(0,3,2,1));
2369 ty = _mm_shuffle_ps(ty,ty,_MM_SHUFFLE(0,3,2,1));
2370 tz = _mm_shuffle_ps(tz,tz,_MM_SHUFFLE(0,3,2,1));
2372 xmm1 = _mm_add_ss(xmm1,tx);
2373 xmm2 = _mm_add_ss(xmm2,ty);
2374 xmm3 = _mm_add_ss(xmm3,tz);
2376 xmm4 = _mm_sub_ss(xmm4,tx);
2377 xmm5 = _mm_sub_ss(xmm5,ty);
2378 xmm6 = _mm_sub_ss(xmm6,tz);
2380 _mm_store_ss(f+ai33,xmm1);
2381 _mm_store_ss(f+ai33+1,xmm2);
2382 _mm_store_ss(f+ai33+2,xmm3);
2384 _mm_store_ss(f+aj33,xmm4);
2385 _mm_store_ss(f+aj33+1,xmm5);
2386 _mm_store_ss(f+aj33+2,xmm6);
2387 }
2389 }
2391 /* End processing for potential energy */
2392 vgb = _mm_movehl_ps(vgb,vgb_tot);
2393 vgb_tot = _mm_add_ps(vgb_tot,vgb);
2394 vgb = _mm_shuffle_ps(vgb_tot,vgb_tot,_MM_SHUFFLE(1,1,1,1));
2395 vgb_tot = _mm_add_ss(vgb_tot,vgb);
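/* Horizontal add: fold the four elements of vgb_tot into element 0 so the
 * total GB energy can be stored and returned as a scalar. */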
2397 _mm_store_ss(&vctot,vgb_tot);
2399 return vctot;
2401 }
2405 #else
2406 /* keep compiler happy */
2407 int genborn_sse_dummy;
2409 #endif /* SSE intrinsics available */