3 * This source code is part of
7 * GROningen MAchine for Chemical Simulations
9 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
10 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
11 * Copyright (c) 2001-2008, The GROMACS development team,
12 * check out http://www.gromacs.org for more information.
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License
16 * as published by the Free Software Foundation; either version 2
17 * of the License, or (at your option) any later version.
19 * If you want to redistribute modifications, please consider that
20 * scientific software is very special. Version control is crucial -
21 * bugs must be traceable. We will be happy to consider code for
22 * inclusion in the official distribution, but derived work must not
23 * be called official GROMACS. Details are found in the README & COPYING
24 * files - if they are missing, get the official version at www.gromacs.org.
26 * To help us fund GROMACS development, we humbly ask that you cite
27 * the papers on the package - you can find them in the top README file.
29 * For more info, check our website at http://www.gromacs.org
32 * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
52 #include "gmx_fatal.h"
53 #include "mtop_util.h"
64 /* Only compile this file if SSE intrinsics are available */
65 #if ( (defined(GMX_IA32_SSE) || defined(GMX_X86_64_SSE) || defined(GMX_SSE2)) && !defined(GMX_DOUBLE) )
67 #include <gmx_sse2_single.h>
68 #include <xmmintrin.h>
69 #include <emmintrin.h>
73 calc_gb_rad_still_sse(t_commrec
*cr
, t_forcerec
*fr
,
74 int natoms
, gmx_localtop_t
*top
,
75 const t_atomtypes
*atype
, float *x
, t_nblist
*nl
,
78 int i
,k
,n
,ii
,is3
,ii3
,nj0
,nj1
,offset
;
79 int jnrA
,jnrB
,jnrC
,jnrD
,j3A
,j3B
,j3C
,j3D
;
80 int jnrE
,jnrF
,jnrG
,jnrH
,j3E
,j3F
,j3G
,j3H
;
101 __m128 rsq
,rinv
,rinv2
,rinv4
,rinv6
;
102 __m128 rsqB
,rinvB
,rinv2B
,rinv4B
,rinv6B
;
103 __m128 ratio
,gpi
,rai
,raj
,vai
,vaj
,rvdw
;
104 __m128 ratioB
,rajB
,vajB
,rvdwB
;
105 __m128 ccf
,dccf
,theta
,cosq
,term
,sinq
,res
,prod
,prod_ai
,tmp
;
106 __m128 ccfB
,dccfB
,thetaB
,cosqB
,termB
,sinqB
,resB
,prodB
;
107 __m128 mask
,icf4
,icf6
,mask_cmp
;
108 __m128 icf4B
,icf6B
,mask_cmpB
;
110 __m128 mask1
= gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
111 __m128 mask2
= gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
112 __m128 mask3
= gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
114 const __m128 half
= {0.5f
, 0.5f
, 0.5f
, 0.5f
};
115 const __m128 three
= {3.0f
, 3.0f
, 3.0f
, 3.0f
};
116 const __m128 one
= {1.0f
, 1.0f
, 1.0f
, 1.0f
};
117 const __m128 two
= {2.0f
, 2.0f
, 2.0f
, 2.0f
};
118 const __m128 zero
= {0.0f
, 0.0f
, 0.0f
, 0.0f
};
119 const __m128 four
= {4.0f
, 4.0f
, 4.0f
, 4.0f
};
121 const __m128 still_p5inv
= {STILL_P5INV
, STILL_P5INV
, STILL_P5INV
, STILL_P5INV
};
122 const __m128 still_pip5
= {STILL_PIP5
, STILL_PIP5
, STILL_PIP5
, STILL_PIP5
};
123 const __m128 still_p4
= {STILL_P4
, STILL_P4
, STILL_P4
, STILL_P4
};
125 factor
= 0.5 * ONE_4PI_EPS0
;
127 gb_radius
= born
->gb_radius
;
129 work
= born
->gpol_still_work
;
131 shiftvec
= fr
->shift_vec
[0];
134 jnrA
= jnrB
= jnrC
= jnrD
= 0;
135 jx
= _mm_setzero_ps();
136 jy
= _mm_setzero_ps();
137 jz
= _mm_setzero_ps();
141 for(i
=0;i
<natoms
;i
++)
146 for(i
=0;i
<nl
->nri
;i
++)
150 is3
= 3*nl
->shift
[i
];
152 shY
= shiftvec
[is3
+1];
153 shZ
= shiftvec
[is3
+2];
155 nj1
= nl
->jindex
[i
+1];
157 ix
= _mm_set1_ps(shX
+x
[ii3
+0]);
158 iy
= _mm_set1_ps(shY
+x
[ii3
+1]);
159 iz
= _mm_set1_ps(shZ
+x
[ii3
+2]);
161 offset
= (nj1
-nj0
)%4;
163 /* Polarization energy for atom ai */
164 gpi
= _mm_setzero_ps();
166 rai
= _mm_load1_ps(gb_radius
+ii
);
167 prod_ai
= _mm_set1_ps(STILL_P4
*vsolv
[ii
]);
169 for(k
=nj0
;k
<nj1
-4-offset
;k
+=8)
189 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x
+j3A
,x
+j3B
,x
+j3C
,x
+j3D
,jx
,jy
,jz
);
190 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x
+j3E
,x
+j3F
,x
+j3G
,x
+j3H
,jxB
,jyB
,jzB
);
192 GMX_MM_LOAD_4VALUES_PS(gb_radius
+jnrA
,gb_radius
+jnrB
,gb_radius
+jnrC
,gb_radius
+jnrD
,raj
);
193 GMX_MM_LOAD_4VALUES_PS(gb_radius
+jnrE
,gb_radius
+jnrF
,gb_radius
+jnrG
,gb_radius
+jnrH
,rajB
);
194 GMX_MM_LOAD_4VALUES_PS(vsolv
+jnrA
,vsolv
+jnrB
,vsolv
+jnrC
,vsolv
+jnrD
,vaj
);
195 GMX_MM_LOAD_4VALUES_PS(vsolv
+jnrE
,vsolv
+jnrF
,vsolv
+jnrG
,vsolv
+jnrH
,vajB
);
197 dx
= _mm_sub_ps(ix
,jx
);
198 dy
= _mm_sub_ps(iy
,jy
);
199 dz
= _mm_sub_ps(iz
,jz
);
200 dxB
= _mm_sub_ps(ix
,jxB
);
201 dyB
= _mm_sub_ps(iy
,jyB
);
202 dzB
= _mm_sub_ps(iz
,jzB
);
204 rsq
= gmx_mm_calc_rsq_ps(dx
,dy
,dz
);
205 rsqB
= gmx_mm_calc_rsq_ps(dxB
,dyB
,dzB
);
206 rinv
= gmx_mm_invsqrt_ps(rsq
);
207 rinvB
= gmx_mm_invsqrt_ps(rsqB
);
208 rinv2
= _mm_mul_ps(rinv
,rinv
);
209 rinv2B
= _mm_mul_ps(rinvB
,rinvB
);
210 rinv4
= _mm_mul_ps(rinv2
,rinv2
);
211 rinv4B
= _mm_mul_ps(rinv2B
,rinv2B
);
212 rinv6
= _mm_mul_ps(rinv4
,rinv2
);
213 rinv6B
= _mm_mul_ps(rinv4B
,rinv2B
);
215 rvdw
= _mm_add_ps(rai
,raj
);
216 rvdwB
= _mm_add_ps(rai
,rajB
);
217 ratio
= _mm_mul_ps(rsq
, gmx_mm_inv_ps( _mm_mul_ps(rvdw
,rvdw
)));
218 ratioB
= _mm_mul_ps(rsqB
, gmx_mm_inv_ps( _mm_mul_ps(rvdwB
,rvdwB
)));
220 mask_cmp
= _mm_cmple_ps(ratio
,still_p5inv
);
221 mask_cmpB
= _mm_cmple_ps(ratioB
,still_p5inv
);
223 /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
224 if( 0 == _mm_movemask_ps(mask_cmp
) )
226 /* if ratio>still_p5inv for ALL elements */
228 dccf
= _mm_setzero_ps();
232 ratio
= _mm_min_ps(ratio
,still_p5inv
);
233 theta
= _mm_mul_ps(ratio
,still_pip5
);
234 GMX_MM_SINCOS_PS(theta
,sinq
,cosq
);
235 term
= _mm_mul_ps(half
,_mm_sub_ps(one
,cosq
));
236 ccf
= _mm_mul_ps(term
,term
);
237 dccf
= _mm_mul_ps(_mm_mul_ps(two
,term
),
238 _mm_mul_ps(sinq
,theta
));
240 if( 0 == _mm_movemask_ps(mask_cmpB
) )
242 /* if ratio>still_p5inv for ALL elements */
244 dccfB
= _mm_setzero_ps();
248 ratioB
= _mm_min_ps(ratioB
,still_p5inv
);
249 thetaB
= _mm_mul_ps(ratioB
,still_pip5
);
250 GMX_MM_SINCOS_PS(thetaB
,sinqB
,cosqB
);
251 termB
= _mm_mul_ps(half
,_mm_sub_ps(one
,cosqB
));
252 ccfB
= _mm_mul_ps(termB
,termB
);
253 dccfB
= _mm_mul_ps(_mm_mul_ps(two
,termB
),
254 _mm_mul_ps(sinqB
,thetaB
));
257 prod
= _mm_mul_ps(still_p4
,vaj
);
258 prodB
= _mm_mul_ps(still_p4
,vajB
);
259 icf4
= _mm_mul_ps(ccf
,rinv4
);
260 icf4B
= _mm_mul_ps(ccfB
,rinv4B
);
261 icf6
= _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four
,ccf
),dccf
), rinv6
);
262 icf6B
= _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four
,ccfB
),dccfB
), rinv6B
);
264 GMX_MM_INCREMENT_4VALUES_PS(work
+jnrA
,work
+jnrB
,work
+jnrC
,work
+jnrD
,_mm_mul_ps(prod_ai
,icf4
));
265 GMX_MM_INCREMENT_4VALUES_PS(work
+jnrE
,work
+jnrF
,work
+jnrG
,work
+jnrH
,_mm_mul_ps(prod_ai
,icf4B
));
267 gpi
= _mm_add_ps(gpi
, _mm_add_ps( _mm_mul_ps(prod
,icf4
) , _mm_mul_ps(prodB
,icf4B
) ) );
269 _mm_store_ps(dadx
,_mm_mul_ps(prod
,icf6
));
271 _mm_store_ps(dadx
,_mm_mul_ps(prod_ai
,icf6
));
273 _mm_store_ps(dadx
,_mm_mul_ps(prodB
,icf6B
));
275 _mm_store_ps(dadx
,_mm_mul_ps(prod_ai
,icf6B
));
279 for(;k
<nj1
-offset
;k
+=4)
291 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x
+j3A
,x
+j3B
,x
+j3C
,x
+j3D
,jx
,jy
,jz
);
293 GMX_MM_LOAD_4VALUES_PS(gb_radius
+jnrA
,gb_radius
+jnrB
,gb_radius
+jnrC
,gb_radius
+jnrD
,raj
);
294 GMX_MM_LOAD_4VALUES_PS(vsolv
+jnrA
,vsolv
+jnrB
,vsolv
+jnrC
,vsolv
+jnrD
,vaj
);
296 dx
= _mm_sub_ps(ix
,jx
);
297 dy
= _mm_sub_ps(iy
,jy
);
298 dz
= _mm_sub_ps(iz
,jz
);
300 rsq
= gmx_mm_calc_rsq_ps(dx
,dy
,dz
);
301 rinv
= gmx_mm_invsqrt_ps(rsq
);
302 rinv2
= _mm_mul_ps(rinv
,rinv
);
303 rinv4
= _mm_mul_ps(rinv2
,rinv2
);
304 rinv6
= _mm_mul_ps(rinv4
,rinv2
);
306 rvdw
= _mm_add_ps(rai
,raj
);
307 ratio
= _mm_mul_ps(rsq
, gmx_mm_inv_ps( _mm_mul_ps(rvdw
,rvdw
)));
309 mask_cmp
= _mm_cmple_ps(ratio
,still_p5inv
);
311 /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
312 if(0 == _mm_movemask_ps(mask_cmp
))
314 /* if ratio>still_p5inv for ALL elements */
316 dccf
= _mm_setzero_ps();
320 ratio
= _mm_min_ps(ratio
,still_p5inv
);
321 theta
= _mm_mul_ps(ratio
,still_pip5
);
322 GMX_MM_SINCOS_PS(theta
,sinq
,cosq
);
323 term
= _mm_mul_ps(half
,_mm_sub_ps(one
,cosq
));
324 ccf
= _mm_mul_ps(term
,term
);
325 dccf
= _mm_mul_ps(_mm_mul_ps(two
,term
),
326 _mm_mul_ps(sinq
,theta
));
329 prod
= _mm_mul_ps(still_p4
,vaj
);
330 icf4
= _mm_mul_ps(ccf
,rinv4
);
331 icf6
= _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four
,ccf
),dccf
), rinv6
);
334 GMX_MM_INCREMENT_4VALUES_PS(work
+jnrA
,work
+jnrB
,work
+jnrC
,work
+jnrD
,_mm_mul_ps(prod_ai
,icf4
));
336 gpi
= _mm_add_ps(gpi
, _mm_mul_ps(prod
,icf4
));
338 _mm_store_ps(dadx
,_mm_mul_ps(prod
,icf6
));
340 _mm_store_ps(dadx
,_mm_mul_ps(prod_ai
,icf6
));
350 GMX_MM_LOAD_1RVEC_1POINTER_PS(x
+j3A
,jx
,jy
,jz
);
351 GMX_MM_LOAD_1VALUE_PS(gb_radius
+jnrA
,raj
);
352 GMX_MM_LOAD_1VALUE_PS(vsolv
+jnrA
,vaj
);
361 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x
+j3A
,x
+j3B
,jx
,jy
,jz
);
362 GMX_MM_LOAD_2VALUES_PS(gb_radius
+jnrA
,gb_radius
+jnrB
,raj
);
363 GMX_MM_LOAD_2VALUES_PS(vsolv
+jnrA
,vsolv
+jnrB
,vaj
);
368 /* offset must be 3 */
375 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x
+j3A
,x
+j3B
,x
+j3C
,jx
,jy
,jz
);
376 GMX_MM_LOAD_3VALUES_PS(gb_radius
+jnrA
,gb_radius
+jnrB
,gb_radius
+jnrC
,raj
);
377 GMX_MM_LOAD_3VALUES_PS(vsolv
+jnrA
,vsolv
+jnrB
,vsolv
+jnrC
,vaj
);
381 dx
= _mm_sub_ps(ix
,jx
);
382 dy
= _mm_sub_ps(iy
,jy
);
383 dz
= _mm_sub_ps(iz
,jz
);
385 rsq
= gmx_mm_calc_rsq_ps(dx
,dy
,dz
);
386 rinv
= gmx_mm_invsqrt_ps(rsq
);
387 rinv2
= _mm_mul_ps(rinv
,rinv
);
388 rinv4
= _mm_mul_ps(rinv2
,rinv2
);
389 rinv6
= _mm_mul_ps(rinv4
,rinv2
);
391 rvdw
= _mm_add_ps(rai
,raj
);
392 ratio
= _mm_mul_ps(rsq
, gmx_mm_inv_ps( _mm_mul_ps(rvdw
,rvdw
)));
394 mask_cmp
= _mm_cmple_ps(ratio
,still_p5inv
);
396 if(0 == _mm_movemask_ps(mask_cmp
))
398 /* if ratio>still_p5inv for ALL elements */
400 dccf
= _mm_setzero_ps();
404 ratio
= _mm_min_ps(ratio
,still_p5inv
);
405 theta
= _mm_mul_ps(ratio
,still_pip5
);
406 GMX_MM_SINCOS_PS(theta
,sinq
,cosq
);
407 term
= _mm_mul_ps(half
,_mm_sub_ps(one
,cosq
));
408 ccf
= _mm_mul_ps(term
,term
);
409 dccf
= _mm_mul_ps(_mm_mul_ps(two
,term
),
410 _mm_mul_ps(sinq
,theta
));
413 prod
= _mm_mul_ps(still_p4
,vaj
);
414 icf4
= _mm_mul_ps(ccf
,rinv4
);
415 icf6
= _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four
,ccf
),dccf
), rinv6
);
417 gpi
= _mm_add_ps(gpi
, _mm_mul_ps(prod
,icf4
));
419 _mm_store_ps(dadx
,_mm_mul_ps(prod
,icf6
));
421 _mm_store_ps(dadx
,_mm_mul_ps(prod_ai
,icf6
));
424 tmp
= _mm_mul_ps(prod_ai
,icf4
);
428 GMX_MM_INCREMENT_1VALUE_PS(work
+jnrA
,tmp
);
432 GMX_MM_INCREMENT_2VALUES_PS(work
+jnrA
,work
+jnrB
,tmp
);
436 /* offset must be 3 */
437 GMX_MM_INCREMENT_3VALUES_PS(work
+jnrA
,work
+jnrB
,work
+jnrC
,tmp
);
440 gmx_mm_update_1pot_ps(gpi
,work
+ii
);
443 /* Sum up the polarization energy from other nodes */
446 gmx_sum(natoms
, work
, cr
);
448 else if(DOMAINDECOMP(cr
))
450 dd_atom_sum_real(cr
->dd
, work
);
453 /* Compute the radii */
454 for(i
=0;i
<fr
->natoms_force
;i
++) /* PELA born->nr */
456 if(born
->use
[i
] != 0)
458 gpi_ai
= born
->gpol
[i
] + work
[i
]; /* add gpi to the initial pol energy gpi_ai*/
459 gpi2
= gpi_ai
* gpi_ai
;
460 born
->bRad
[i
] = factor
*gmx_invsqrt(gpi2
);
461 fr
->invsqrta
[i
] = gmx_invsqrt(born
->bRad
[i
]);
465 /* Extra (local) communication required for DD */
468 dd_atom_spread_real(cr
->dd
, born
->bRad
);
469 dd_atom_spread_real(cr
->dd
, fr
->invsqrta
);
477 calc_gb_rad_hct_obc_sse(t_commrec
*cr
, t_forcerec
* fr
, int natoms
, gmx_localtop_t
*top
,
478 const t_atomtypes
*atype
, float *x
, t_nblist
*nl
, gmx_genborn_t
*born
,t_mdatoms
*md
,int gb_algorithm
)
480 int i
,ai
,k
,n
,ii
,ii3
,is3
,nj0
,nj1
,at0
,at1
,offset
;
481 int jnrA
,jnrB
,jnrC
,jnrD
;
483 int jnrE
,jnrF
,jnrG
,jnrH
;
486 float rr
,rr_inv
,rr_inv2
,sum_tmp
,sum
,sum2
,sum3
,gbr
;
487 float sum_ai2
, sum_ai3
,tsum
,tchain
,doffset
;
496 __m128 ix
,iy
,iz
,jx
,jy
,jz
;
497 __m128 dx
,dy
,dz
,t1
,t2
,t3
,t4
;
499 __m128 rai
,rai_inv
,raj
, raj_inv
,rai_inv2
,sk
,sk2
,lij
,dlij
,duij
;
500 __m128 uij
,lij2
,uij2
,lij3
,uij3
,diff2
;
501 __m128 lij_inv
,sk2_inv
,prod
,log_term
,tmp
,tmp_sum
;
502 __m128 sum_ai
, tmp_ai
,sk_ai
,sk_aj
,sk2_ai
,sk2_aj
,sk2_rinv
;
506 __m128 obc_mask1
,obc_mask2
,obc_mask3
;
507 __m128 jxB
,jyB
,jzB
,t1B
,t2B
,t3B
,t4B
;
508 __m128 dxB
,dyB
,dzB
,rsqB
,rinvB
,rB
;
509 __m128 rajB
, raj_invB
,rai_inv2B
,sk2B
,lijB
,dlijB
,duijB
;
510 __m128 uijB
,lij2B
,uij2B
,lij3B
,uij3B
,diff2B
;
511 __m128 lij_invB
,sk2_invB
,prodB
;
512 __m128 sk_ajB
,sk2_ajB
,sk2_rinvB
;
513 __m128 dadx1B
,dadx2B
;
515 __m128 obc_mask1B
,obc_mask2B
,obc_mask3B
;
517 __m128 mask1
= gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
518 __m128 mask2
= gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
519 __m128 mask3
= gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
521 __m128 oneeighth
= _mm_set1_ps(0.125);
522 __m128 onefourth
= _mm_set1_ps(0.25);
524 const __m128 neg
= {-1.0f
, -1.0f
, -1.0f
, -1.0f
};
525 const __m128 zero
= {0.0f
, 0.0f
, 0.0f
, 0.0f
};
526 const __m128 half
= {0.5f
, 0.5f
, 0.5f
, 0.5f
};
527 const __m128 one
= {1.0f
, 1.0f
, 1.0f
, 1.0f
};
528 const __m128 two
= {2.0f
, 2.0f
, 2.0f
, 2.0f
};
529 const __m128 three
= {3.0f
, 3.0f
, 3.0f
, 3.0f
};
531 /* Set the dielectric offset */
532 doffset
= born
->gb_doffset
;
533 gb_radius
= born
->gb_radius
;
534 obc_param
= born
->param
;
535 work
= born
->gpol_hct_work
;
538 shiftvec
= fr
->shift_vec
[0];
540 jx
= _mm_setzero_ps();
541 jy
= _mm_setzero_ps();
542 jz
= _mm_setzero_ps();
544 jnrA
= jnrB
= jnrC
= jnrD
= 0;
546 for(i
=0;i
<born
->nr
;i
++)
551 for(i
=0;i
<nl
->nri
;i
++)
555 is3
= 3*nl
->shift
[i
];
557 shY
= shiftvec
[is3
+1];
558 shZ
= shiftvec
[is3
+2];
560 nj1
= nl
->jindex
[i
+1];
562 ix
= _mm_set1_ps(shX
+x
[ii3
+0]);
563 iy
= _mm_set1_ps(shY
+x
[ii3
+1]);
564 iz
= _mm_set1_ps(shZ
+x
[ii3
+2]);
566 offset
= (nj1
-nj0
)%4;
568 rai
= _mm_load1_ps(gb_radius
+ii
);
569 rai_inv
= gmx_mm_inv_ps(rai
);
571 sum_ai
= _mm_setzero_ps();
573 sk_ai
= _mm_load1_ps(born
->param
+ii
);
574 sk2_ai
= _mm_mul_ps(sk_ai
,sk_ai
);
576 for(k
=nj0
;k
<nj1
-4-offset
;k
+=8)
596 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x
+j3A
,x
+j3B
,x
+j3C
,x
+j3D
,jx
,jy
,jz
);
597 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x
+j3E
,x
+j3F
,x
+j3G
,x
+j3H
,jxB
,jyB
,jzB
);
598 GMX_MM_LOAD_4VALUES_PS(gb_radius
+jnrA
,gb_radius
+jnrB
,gb_radius
+jnrC
,gb_radius
+jnrD
,raj
);
599 GMX_MM_LOAD_4VALUES_PS(gb_radius
+jnrE
,gb_radius
+jnrF
,gb_radius
+jnrG
,gb_radius
+jnrH
,rajB
);
600 GMX_MM_LOAD_4VALUES_PS(obc_param
+jnrA
,obc_param
+jnrB
,obc_param
+jnrC
,obc_param
+jnrD
,sk_aj
);
601 GMX_MM_LOAD_4VALUES_PS(obc_param
+jnrE
,obc_param
+jnrF
,obc_param
+jnrG
,obc_param
+jnrH
,sk_ajB
);
603 dx
= _mm_sub_ps(ix
, jx
);
604 dy
= _mm_sub_ps(iy
, jy
);
605 dz
= _mm_sub_ps(iz
, jz
);
606 dxB
= _mm_sub_ps(ix
, jxB
);
607 dyB
= _mm_sub_ps(iy
, jyB
);
608 dzB
= _mm_sub_ps(iz
, jzB
);
610 rsq
= gmx_mm_calc_rsq_ps(dx
,dy
,dz
);
611 rsqB
= gmx_mm_calc_rsq_ps(dxB
,dyB
,dzB
);
613 rinv
= gmx_mm_invsqrt_ps(rsq
);
614 r
= _mm_mul_ps(rsq
,rinv
);
615 rinvB
= gmx_mm_invsqrt_ps(rsqB
);
616 rB
= _mm_mul_ps(rsqB
,rinvB
);
618 /* Compute raj_inv aj1-4 */
619 raj_inv
= gmx_mm_inv_ps(raj
);
620 raj_invB
= gmx_mm_inv_ps(rajB
);
622 /* Evaluate influence of atom aj -> ai */
623 t1
= _mm_add_ps(r
,sk_aj
);
624 t2
= _mm_sub_ps(r
,sk_aj
);
625 t3
= _mm_sub_ps(sk_aj
,r
);
626 t1B
= _mm_add_ps(rB
,sk_ajB
);
627 t2B
= _mm_sub_ps(rB
,sk_ajB
);
628 t3B
= _mm_sub_ps(sk_ajB
,rB
);
629 obc_mask1
= _mm_cmplt_ps(rai
, t1
);
630 obc_mask2
= _mm_cmplt_ps(rai
, t2
);
631 obc_mask3
= _mm_cmplt_ps(rai
, t3
);
632 obc_mask1B
= _mm_cmplt_ps(rai
, t1B
);
633 obc_mask2B
= _mm_cmplt_ps(rai
, t2B
);
634 obc_mask3B
= _mm_cmplt_ps(rai
, t3B
);
636 uij
= gmx_mm_inv_ps(t1
);
637 lij
= _mm_or_ps( _mm_and_ps(obc_mask2
,gmx_mm_inv_ps(t2
)),
638 _mm_andnot_ps(obc_mask2
,rai_inv
));
639 dlij
= _mm_and_ps(one
,obc_mask2
);
640 uij2
= _mm_mul_ps(uij
, uij
);
641 uij3
= _mm_mul_ps(uij2
,uij
);
642 lij2
= _mm_mul_ps(lij
, lij
);
643 lij3
= _mm_mul_ps(lij2
,lij
);
645 uijB
= gmx_mm_inv_ps(t1B
);
646 lijB
= _mm_or_ps( _mm_and_ps(obc_mask2B
,gmx_mm_inv_ps(t2B
)),
647 _mm_andnot_ps(obc_mask2B
,rai_inv
));
648 dlijB
= _mm_and_ps(one
,obc_mask2B
);
649 uij2B
= _mm_mul_ps(uijB
, uijB
);
650 uij3B
= _mm_mul_ps(uij2B
,uijB
);
651 lij2B
= _mm_mul_ps(lijB
, lijB
);
652 lij3B
= _mm_mul_ps(lij2B
,lijB
);
654 diff2
= _mm_sub_ps(uij2
,lij2
);
655 lij_inv
= gmx_mm_invsqrt_ps(lij2
);
656 sk2_aj
= _mm_mul_ps(sk_aj
,sk_aj
);
657 sk2_rinv
= _mm_mul_ps(sk2_aj
,rinv
);
658 prod
= _mm_mul_ps(onefourth
,sk2_rinv
);
660 diff2B
= _mm_sub_ps(uij2B
,lij2B
);
661 lij_invB
= gmx_mm_invsqrt_ps(lij2B
);
662 sk2_ajB
= _mm_mul_ps(sk_ajB
,sk_ajB
);
663 sk2_rinvB
= _mm_mul_ps(sk2_ajB
,rinvB
);
664 prodB
= _mm_mul_ps(onefourth
,sk2_rinvB
);
666 logterm
= gmx_mm_log_ps(_mm_mul_ps(uij
,lij_inv
));
667 logtermB
= gmx_mm_log_ps(_mm_mul_ps(uijB
,lij_invB
));
669 t1
= _mm_sub_ps(lij
,uij
);
670 t2
= _mm_mul_ps(diff2
,
671 _mm_sub_ps(_mm_mul_ps(onefourth
,r
),
673 t3
= _mm_mul_ps(half
,_mm_mul_ps(rinv
,logterm
));
674 t1
= _mm_add_ps(t1
,_mm_add_ps(t2
,t3
));
675 t4
= _mm_mul_ps(two
,_mm_sub_ps(rai_inv
,lij
));
676 t4
= _mm_and_ps(t4
,obc_mask3
);
677 t1
= _mm_mul_ps(half
,_mm_add_ps(t1
,t4
));
679 t1B
= _mm_sub_ps(lijB
,uijB
);
680 t2B
= _mm_mul_ps(diff2B
,
681 _mm_sub_ps(_mm_mul_ps(onefourth
,rB
),
683 t3B
= _mm_mul_ps(half
,_mm_mul_ps(rinvB
,logtermB
));
684 t1B
= _mm_add_ps(t1B
,_mm_add_ps(t2B
,t3B
));
685 t4B
= _mm_mul_ps(two
,_mm_sub_ps(rai_inv
,lijB
));
686 t4B
= _mm_and_ps(t4B
,obc_mask3B
);
687 t1B
= _mm_mul_ps(half
,_mm_add_ps(t1B
,t4B
));
689 sum_ai
= _mm_add_ps(sum_ai
, _mm_add_ps( _mm_and_ps(t1
,obc_mask1
), _mm_and_ps(t1B
,obc_mask1B
) ));
691 t1
= _mm_add_ps(_mm_mul_ps(half
,lij2
),
692 _mm_mul_ps(prod
,lij3
));
694 _mm_mul_ps(onefourth
,
695 _mm_add_ps(_mm_mul_ps(lij
,rinv
),
696 _mm_mul_ps(lij3
,r
))));
697 t2
= _mm_mul_ps(onefourth
,
698 _mm_add_ps(_mm_mul_ps(uij
,rinv
),
699 _mm_mul_ps(uij3
,r
)));
701 _mm_add_ps(_mm_mul_ps(half
,uij2
),
702 _mm_mul_ps(prod
,uij3
)));
703 t3
= _mm_mul_ps(_mm_mul_ps(onefourth
,logterm
),
704 _mm_mul_ps(rinv
,rinv
));
706 _mm_mul_ps(_mm_mul_ps(diff2
,oneeighth
),
708 _mm_mul_ps(sk2_rinv
,rinv
))));
709 t1
= _mm_mul_ps(rinv
,
710 _mm_add_ps(_mm_mul_ps(dlij
,t1
),
715 t1B
= _mm_add_ps(_mm_mul_ps(half
,lij2B
),
716 _mm_mul_ps(prodB
,lij3B
));
717 t1B
= _mm_sub_ps(t1B
,
718 _mm_mul_ps(onefourth
,
719 _mm_add_ps(_mm_mul_ps(lijB
,rinvB
),
720 _mm_mul_ps(lij3B
,rB
))));
721 t2B
= _mm_mul_ps(onefourth
,
722 _mm_add_ps(_mm_mul_ps(uijB
,rinvB
),
723 _mm_mul_ps(uij3B
,rB
)));
724 t2B
= _mm_sub_ps(t2B
,
725 _mm_add_ps(_mm_mul_ps(half
,uij2B
),
726 _mm_mul_ps(prodB
,uij3B
)));
727 t3B
= _mm_mul_ps(_mm_mul_ps(onefourth
,logtermB
),
728 _mm_mul_ps(rinvB
,rinvB
));
729 t3B
= _mm_sub_ps(t3B
,
730 _mm_mul_ps(_mm_mul_ps(diff2B
,oneeighth
),
732 _mm_mul_ps(sk2_rinvB
,rinvB
))));
733 t1B
= _mm_mul_ps(rinvB
,
734 _mm_add_ps(_mm_mul_ps(dlijB
,t1B
),
735 _mm_add_ps(t2B
,t3B
)));
737 dadx1
= _mm_and_ps(t1
,obc_mask1
);
738 dadx1B
= _mm_and_ps(t1B
,obc_mask1B
);
741 /* Evaluate influence of atom ai -> aj */
742 t1
= _mm_add_ps(r
,sk_ai
);
743 t2
= _mm_sub_ps(r
,sk_ai
);
744 t3
= _mm_sub_ps(sk_ai
,r
);
745 t1B
= _mm_add_ps(rB
,sk_ai
);
746 t2B
= _mm_sub_ps(rB
,sk_ai
);
747 t3B
= _mm_sub_ps(sk_ai
,rB
);
748 obc_mask1
= _mm_cmplt_ps(raj
, t1
);
749 obc_mask2
= _mm_cmplt_ps(raj
, t2
);
750 obc_mask3
= _mm_cmplt_ps(raj
, t3
);
751 obc_mask1B
= _mm_cmplt_ps(rajB
, t1B
);
752 obc_mask2B
= _mm_cmplt_ps(rajB
, t2B
);
753 obc_mask3B
= _mm_cmplt_ps(rajB
, t3B
);
755 uij
= gmx_mm_inv_ps(t1
);
756 lij
= _mm_or_ps( _mm_and_ps(obc_mask2
,gmx_mm_inv_ps(t2
)),
757 _mm_andnot_ps(obc_mask2
,raj_inv
));
758 dlij
= _mm_and_ps(one
,obc_mask2
);
759 uij2
= _mm_mul_ps(uij
, uij
);
760 uij3
= _mm_mul_ps(uij2
,uij
);
761 lij2
= _mm_mul_ps(lij
, lij
);
762 lij3
= _mm_mul_ps(lij2
,lij
);
764 uijB
= gmx_mm_inv_ps(t1B
);
765 lijB
= _mm_or_ps( _mm_and_ps(obc_mask2B
,gmx_mm_inv_ps(t2B
)),
766 _mm_andnot_ps(obc_mask2B
,raj_invB
));
767 dlijB
= _mm_and_ps(one
,obc_mask2B
);
768 uij2B
= _mm_mul_ps(uijB
, uijB
);
769 uij3B
= _mm_mul_ps(uij2B
,uijB
);
770 lij2B
= _mm_mul_ps(lijB
, lijB
);
771 lij3B
= _mm_mul_ps(lij2B
,lijB
);
773 diff2
= _mm_sub_ps(uij2
,lij2
);
774 lij_inv
= gmx_mm_invsqrt_ps(lij2
);
775 sk2_rinv
= _mm_mul_ps(sk2_ai
,rinv
);
776 prod
= _mm_mul_ps(onefourth
,sk2_rinv
);
778 diff2B
= _mm_sub_ps(uij2B
,lij2B
);
779 lij_invB
= gmx_mm_invsqrt_ps(lij2B
);
780 sk2_rinvB
= _mm_mul_ps(sk2_ai
,rinvB
);
781 prodB
= _mm_mul_ps(onefourth
,sk2_rinvB
);
783 logterm
= gmx_mm_log_ps(_mm_mul_ps(uij
,lij_inv
));
784 logtermB
= gmx_mm_log_ps(_mm_mul_ps(uijB
,lij_invB
));
786 t1
= _mm_sub_ps(lij
,uij
);
787 t2
= _mm_mul_ps(diff2
,
788 _mm_sub_ps(_mm_mul_ps(onefourth
,r
),
790 t3
= _mm_mul_ps(half
,_mm_mul_ps(rinv
,logterm
));
791 t1
= _mm_add_ps(t1
,_mm_add_ps(t2
,t3
));
792 t4
= _mm_mul_ps(two
,_mm_sub_ps(raj_inv
,lij
));
793 t4
= _mm_and_ps(t4
,obc_mask3
);
794 t1
= _mm_mul_ps(half
,_mm_add_ps(t1
,t4
));
796 t1B
= _mm_sub_ps(lijB
,uijB
);
797 t2B
= _mm_mul_ps(diff2B
,
798 _mm_sub_ps(_mm_mul_ps(onefourth
,rB
),
800 t3B
= _mm_mul_ps(half
,_mm_mul_ps(rinvB
,logtermB
));
801 t1B
= _mm_add_ps(t1B
,_mm_add_ps(t2B
,t3B
));
802 t4B
= _mm_mul_ps(two
,_mm_sub_ps(raj_invB
,lijB
));
803 t4B
= _mm_and_ps(t4B
,obc_mask3B
);
804 t1B
= _mm_mul_ps(half
,_mm_add_ps(t1B
,t4B
));
806 GMX_MM_INCREMENT_4VALUES_PS(work
+jnrA
,work
+jnrB
,work
+jnrC
,work
+jnrD
,_mm_and_ps(t1
,obc_mask1
));
807 GMX_MM_INCREMENT_4VALUES_PS(work
+jnrE
,work
+jnrF
,work
+jnrG
,work
+jnrH
,_mm_and_ps(t1B
,obc_mask1B
));
809 t1
= _mm_add_ps(_mm_mul_ps(half
,lij2
),
810 _mm_mul_ps(prod
,lij3
));
812 _mm_mul_ps(onefourth
,
813 _mm_add_ps(_mm_mul_ps(lij
,rinv
),
814 _mm_mul_ps(lij3
,r
))));
815 t2
= _mm_mul_ps(onefourth
,
816 _mm_add_ps(_mm_mul_ps(uij
,rinv
),
817 _mm_mul_ps(uij3
,r
)));
819 _mm_add_ps(_mm_mul_ps(half
,uij2
),
820 _mm_mul_ps(prod
,uij3
)));
821 t3
= _mm_mul_ps(_mm_mul_ps(onefourth
,logterm
),
822 _mm_mul_ps(rinv
,rinv
));
824 _mm_mul_ps(_mm_mul_ps(diff2
,oneeighth
),
826 _mm_mul_ps(sk2_rinv
,rinv
))));
827 t1
= _mm_mul_ps(rinv
,
828 _mm_add_ps(_mm_mul_ps(dlij
,t1
),
832 t1B
= _mm_add_ps(_mm_mul_ps(half
,lij2B
),
833 _mm_mul_ps(prodB
,lij3B
));
834 t1B
= _mm_sub_ps(t1B
,
835 _mm_mul_ps(onefourth
,
836 _mm_add_ps(_mm_mul_ps(lijB
,rinvB
),
837 _mm_mul_ps(lij3B
,rB
))));
838 t2B
= _mm_mul_ps(onefourth
,
839 _mm_add_ps(_mm_mul_ps(uijB
,rinvB
),
840 _mm_mul_ps(uij3B
,rB
)));
841 t2B
= _mm_sub_ps(t2B
,
842 _mm_add_ps(_mm_mul_ps(half
,uij2B
),
843 _mm_mul_ps(prodB
,uij3B
)));
844 t3B
= _mm_mul_ps(_mm_mul_ps(onefourth
,logtermB
),
845 _mm_mul_ps(rinvB
,rinvB
));
846 t3B
= _mm_sub_ps(t3B
,
847 _mm_mul_ps(_mm_mul_ps(diff2B
,oneeighth
),
849 _mm_mul_ps(sk2_rinvB
,rinvB
))));
850 t1B
= _mm_mul_ps(rinvB
,
851 _mm_add_ps(_mm_mul_ps(dlijB
,t1B
),
852 _mm_add_ps(t2B
,t3B
)));
855 dadx2
= _mm_and_ps(t1
,obc_mask1
);
856 dadx2B
= _mm_and_ps(t1B
,obc_mask1B
);
858 _mm_store_ps(dadx
,dadx1
);
860 _mm_store_ps(dadx
,dadx2
);
862 _mm_store_ps(dadx
,dadx1B
);
864 _mm_store_ps(dadx
,dadx2B
);
867 } /* end normal inner loop */
869 for(;k
<nj1
-offset
;k
+=4)
881 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x
+j3A
,x
+j3B
,x
+j3C
,x
+j3D
,jx
,jy
,jz
);
882 GMX_MM_LOAD_4VALUES_PS(gb_radius
+jnrA
,gb_radius
+jnrB
,gb_radius
+jnrC
,gb_radius
+jnrD
,raj
);
883 GMX_MM_LOAD_4VALUES_PS(obc_param
+jnrA
,obc_param
+jnrB
,obc_param
+jnrC
,obc_param
+jnrD
,sk_aj
);
885 dx
= _mm_sub_ps(ix
, jx
);
886 dy
= _mm_sub_ps(iy
, jy
);
887 dz
= _mm_sub_ps(iz
, jz
);
889 rsq
= gmx_mm_calc_rsq_ps(dx
,dy
,dz
);
891 rinv
= gmx_mm_invsqrt_ps(rsq
);
892 r
= _mm_mul_ps(rsq
,rinv
);
894 /* Compute raj_inv aj1-4 */
895 raj_inv
= gmx_mm_inv_ps(raj
);
897 /* Evaluate influence of atom aj -> ai */
898 t1
= _mm_add_ps(r
,sk_aj
);
899 obc_mask1
= _mm_cmplt_ps(rai
, t1
);
901 if(_mm_movemask_ps(obc_mask1
))
903 /* If any of the elements has rai<dr+sk, this is executed */
904 t2
= _mm_sub_ps(r
,sk_aj
);
905 t3
= _mm_sub_ps(sk_aj
,r
);
907 obc_mask2
= _mm_cmplt_ps(rai
, t2
);
908 obc_mask3
= _mm_cmplt_ps(rai
, t3
);
910 uij
= gmx_mm_inv_ps(t1
);
911 lij
= _mm_or_ps( _mm_and_ps(obc_mask2
,gmx_mm_inv_ps(t2
)),
912 _mm_andnot_ps(obc_mask2
,rai_inv
));
913 dlij
= _mm_and_ps(one
,obc_mask2
);
914 uij2
= _mm_mul_ps(uij
, uij
);
915 uij3
= _mm_mul_ps(uij2
,uij
);
916 lij2
= _mm_mul_ps(lij
, lij
);
917 lij3
= _mm_mul_ps(lij2
,lij
);
918 diff2
= _mm_sub_ps(uij2
,lij2
);
919 lij_inv
= gmx_mm_invsqrt_ps(lij2
);
920 sk2_aj
= _mm_mul_ps(sk_aj
,sk_aj
);
921 sk2_rinv
= _mm_mul_ps(sk2_aj
,rinv
);
922 prod
= _mm_mul_ps(onefourth
,sk2_rinv
);
923 logterm
= gmx_mm_log_ps(_mm_mul_ps(uij
,lij_inv
));
924 t1
= _mm_sub_ps(lij
,uij
);
925 t2
= _mm_mul_ps(diff2
,
926 _mm_sub_ps(_mm_mul_ps(onefourth
,r
),
928 t3
= _mm_mul_ps(half
,_mm_mul_ps(rinv
,logterm
));
929 t1
= _mm_add_ps(t1
,_mm_add_ps(t2
,t3
));
930 t4
= _mm_mul_ps(two
,_mm_sub_ps(rai_inv
,lij
));
931 t4
= _mm_and_ps(t4
,obc_mask3
);
932 t1
= _mm_mul_ps(half
,_mm_add_ps(t1
,t4
));
933 sum_ai
= _mm_add_ps(sum_ai
,_mm_and_ps(t1
,obc_mask1
));
934 t1
= _mm_add_ps(_mm_mul_ps(half
,lij2
),
935 _mm_mul_ps(prod
,lij3
));
937 _mm_mul_ps(onefourth
,
938 _mm_add_ps(_mm_mul_ps(lij
,rinv
),
939 _mm_mul_ps(lij3
,r
))));
940 t2
= _mm_mul_ps(onefourth
,
941 _mm_add_ps(_mm_mul_ps(uij
,rinv
),
942 _mm_mul_ps(uij3
,r
)));
944 _mm_add_ps(_mm_mul_ps(half
,uij2
),
945 _mm_mul_ps(prod
,uij3
)));
946 t3
= _mm_mul_ps(_mm_mul_ps(onefourth
,logterm
),
947 _mm_mul_ps(rinv
,rinv
));
949 _mm_mul_ps(_mm_mul_ps(diff2
,oneeighth
),
951 _mm_mul_ps(sk2_rinv
,rinv
))));
952 t1
= _mm_mul_ps(rinv
,
953 _mm_add_ps(_mm_mul_ps(dlij
,t1
),
956 dadx1
= _mm_and_ps(t1
,obc_mask1
);
960 dadx1
= _mm_setzero_ps();
963 /* Evaluate influence of atom ai -> aj */
964 t1
= _mm_add_ps(r
,sk_ai
);
965 obc_mask1
= _mm_cmplt_ps(raj
, t1
);
967 if(_mm_movemask_ps(obc_mask1
))
969 t2
= _mm_sub_ps(r
,sk_ai
);
970 t3
= _mm_sub_ps(sk_ai
,r
);
971 obc_mask2
= _mm_cmplt_ps(raj
, t2
);
972 obc_mask3
= _mm_cmplt_ps(raj
, t3
);
974 uij
= gmx_mm_inv_ps(t1
);
975 lij
= _mm_or_ps( _mm_and_ps(obc_mask2
,gmx_mm_inv_ps(t2
)),
976 _mm_andnot_ps(obc_mask2
,raj_inv
));
977 dlij
= _mm_and_ps(one
,obc_mask2
);
978 uij2
= _mm_mul_ps(uij
, uij
);
979 uij3
= _mm_mul_ps(uij2
,uij
);
980 lij2
= _mm_mul_ps(lij
, lij
);
981 lij3
= _mm_mul_ps(lij2
,lij
);
982 diff2
= _mm_sub_ps(uij2
,lij2
);
983 lij_inv
= gmx_mm_invsqrt_ps(lij2
);
984 sk2_rinv
= _mm_mul_ps(sk2_ai
,rinv
);
985 prod
= _mm_mul_ps(onefourth
,sk2_rinv
);
986 logterm
= gmx_mm_log_ps(_mm_mul_ps(uij
,lij_inv
));
987 t1
= _mm_sub_ps(lij
,uij
);
988 t2
= _mm_mul_ps(diff2
,
989 _mm_sub_ps(_mm_mul_ps(onefourth
,r
),
991 t3
= _mm_mul_ps(half
,_mm_mul_ps(rinv
,logterm
));
992 t1
= _mm_add_ps(t1
,_mm_add_ps(t2
,t3
));
993 t4
= _mm_mul_ps(two
,_mm_sub_ps(raj_inv
,lij
));
994 t4
= _mm_and_ps(t4
,obc_mask3
);
995 t1
= _mm_mul_ps(half
,_mm_add_ps(t1
,t4
));
997 GMX_MM_INCREMENT_4VALUES_PS(work
+jnrA
,work
+jnrB
,work
+jnrC
,work
+jnrD
,_mm_and_ps(t1
,obc_mask1
));
999 t1
= _mm_add_ps(_mm_mul_ps(half
,lij2
),
1000 _mm_mul_ps(prod
,lij3
));
1002 _mm_mul_ps(onefourth
,
1003 _mm_add_ps(_mm_mul_ps(lij
,rinv
),
1004 _mm_mul_ps(lij3
,r
))));
1005 t2
= _mm_mul_ps(onefourth
,
1006 _mm_add_ps(_mm_mul_ps(uij
,rinv
),
1007 _mm_mul_ps(uij3
,r
)));
1009 _mm_add_ps(_mm_mul_ps(half
,uij2
),
1010 _mm_mul_ps(prod
,uij3
)));
1011 t3
= _mm_mul_ps(_mm_mul_ps(onefourth
,logterm
),
1012 _mm_mul_ps(rinv
,rinv
));
1014 _mm_mul_ps(_mm_mul_ps(diff2
,oneeighth
),
1016 _mm_mul_ps(sk2_rinv
,rinv
))));
1017 t1
= _mm_mul_ps(rinv
,
1018 _mm_add_ps(_mm_mul_ps(dlij
,t1
),
1019 _mm_add_ps(t2
,t3
)));
1020 dadx2
= _mm_and_ps(t1
,obc_mask1
);
1024 dadx2
= _mm_setzero_ps();
1027 _mm_store_ps(dadx
,dadx1
);
1029 _mm_store_ps(dadx
,dadx2
);
1031 } /* end normal inner loop */
1039 GMX_MM_LOAD_1RVEC_1POINTER_PS(x
+j3A
,jx
,jy
,jz
);
1040 GMX_MM_LOAD_1VALUE_PS(gb_radius
+jnrA
,raj
);
1041 GMX_MM_LOAD_1VALUE_PS(obc_param
+jnrA
,sk_aj
);
1050 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x
+j3A
,x
+j3B
,jx
,jy
,jz
);
1051 GMX_MM_LOAD_2VALUES_PS(gb_radius
+jnrA
,gb_radius
+jnrB
,raj
);
1052 GMX_MM_LOAD_2VALUES_PS(obc_param
+jnrA
,obc_param
+jnrB
,sk_aj
);
1057 /* offset must be 3 */
1064 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x
+j3A
,x
+j3B
,x
+j3C
,jx
,jy
,jz
);
1065 GMX_MM_LOAD_3VALUES_PS(gb_radius
+jnrA
,gb_radius
+jnrB
,gb_radius
+jnrC
,raj
);
1066 GMX_MM_LOAD_3VALUES_PS(obc_param
+jnrA
,obc_param
+jnrB
,obc_param
+jnrC
,sk_aj
);
1070 dx
= _mm_sub_ps(ix
, jx
);
1071 dy
= _mm_sub_ps(iy
, jy
);
1072 dz
= _mm_sub_ps(iz
, jz
);
1074 rsq
= gmx_mm_calc_rsq_ps(dx
,dy
,dz
);
1076 rinv
= gmx_mm_invsqrt_ps(rsq
);
1077 r
= _mm_mul_ps(rsq
,rinv
);
1079 /* Compute raj_inv aj1-4 */
1080 raj_inv
= gmx_mm_inv_ps(raj
);
1082 /* Evaluate influence of atom aj -> ai */
1083 t1
= _mm_add_ps(r
,sk_aj
);
1084 obc_mask1
= _mm_cmplt_ps(rai
, t1
);
1085 obc_mask1
= _mm_and_ps(obc_mask1
,mask
);
1087 if(_mm_movemask_ps(obc_mask1
))
1089 t2
= _mm_sub_ps(r
,sk_aj
);
1090 t3
= _mm_sub_ps(sk_aj
,r
);
1091 obc_mask2
= _mm_cmplt_ps(rai
, t2
);
1092 obc_mask3
= _mm_cmplt_ps(rai
, t3
);
1094 uij
= gmx_mm_inv_ps(t1
);
1095 lij
= _mm_or_ps( _mm_and_ps(obc_mask2
,gmx_mm_inv_ps(t2
)),
1096 _mm_andnot_ps(obc_mask2
,rai_inv
));
1097 dlij
= _mm_and_ps(one
,obc_mask2
);
1098 uij2
= _mm_mul_ps(uij
, uij
);
1099 uij3
= _mm_mul_ps(uij2
,uij
);
1100 lij2
= _mm_mul_ps(lij
, lij
);
1101 lij3
= _mm_mul_ps(lij2
,lij
);
1102 diff2
= _mm_sub_ps(uij2
,lij2
);
1103 lij_inv
= gmx_mm_invsqrt_ps(lij2
);
1104 sk2_aj
= _mm_mul_ps(sk_aj
,sk_aj
);
1105 sk2_rinv
= _mm_mul_ps(sk2_aj
,rinv
);
1106 prod
= _mm_mul_ps(onefourth
,sk2_rinv
);
1107 logterm
= gmx_mm_log_ps(_mm_mul_ps(uij
,lij_inv
));
1108 t1
= _mm_sub_ps(lij
,uij
);
1109 t2
= _mm_mul_ps(diff2
,
1110 _mm_sub_ps(_mm_mul_ps(onefourth
,r
),
1112 t3
= _mm_mul_ps(half
,_mm_mul_ps(rinv
,logterm
));
1113 t1
= _mm_add_ps(t1
,_mm_add_ps(t2
,t3
));
1114 t4
= _mm_mul_ps(two
,_mm_sub_ps(rai_inv
,lij
));
1115 t4
= _mm_and_ps(t4
,obc_mask3
);
1116 t1
= _mm_mul_ps(half
,_mm_add_ps(t1
,t4
));
1117 sum_ai
= _mm_add_ps(sum_ai
,_mm_and_ps(t1
,obc_mask1
));
1118 t1
= _mm_add_ps(_mm_mul_ps(half
,lij2
),
1119 _mm_mul_ps(prod
,lij3
));
1121 _mm_mul_ps(onefourth
,
1122 _mm_add_ps(_mm_mul_ps(lij
,rinv
),
1123 _mm_mul_ps(lij3
,r
))));
1124 t2
= _mm_mul_ps(onefourth
,
1125 _mm_add_ps(_mm_mul_ps(uij
,rinv
),
1126 _mm_mul_ps(uij3
,r
)));
1128 _mm_add_ps(_mm_mul_ps(half
,uij2
),
1129 _mm_mul_ps(prod
,uij3
)));
1130 t3
= _mm_mul_ps(_mm_mul_ps(onefourth
,logterm
),
1131 _mm_mul_ps(rinv
,rinv
));
1133 _mm_mul_ps(_mm_mul_ps(diff2
,oneeighth
),
1135 _mm_mul_ps(sk2_rinv
,rinv
))));
1136 t1
= _mm_mul_ps(rinv
,
1137 _mm_add_ps(_mm_mul_ps(dlij
,t1
),
1138 _mm_add_ps(t2
,t3
)));
1139 dadx1
= _mm_and_ps(t1
,obc_mask1
);
1143 dadx1
= _mm_setzero_ps();
1146 /* Evaluate influence of atom ai -> aj */
1147 t1
= _mm_add_ps(r
,sk_ai
);
1148 obc_mask1
= _mm_cmplt_ps(raj
, t1
);
1149 obc_mask1
= _mm_and_ps(obc_mask1
,mask
);
1151 if(_mm_movemask_ps(obc_mask1
))
1153 t2
= _mm_sub_ps(r
,sk_ai
);
1154 t3
= _mm_sub_ps(sk_ai
,r
);
1155 obc_mask2
= _mm_cmplt_ps(raj
, t2
);
1156 obc_mask3
= _mm_cmplt_ps(raj
, t3
);
1158 uij
= gmx_mm_inv_ps(t1
);
1159 lij
= _mm_or_ps(_mm_and_ps(obc_mask2
,gmx_mm_inv_ps(t2
)),
1160 _mm_andnot_ps(obc_mask2
,raj_inv
));
1161 dlij
= _mm_and_ps(one
,obc_mask2
);
1162 uij2
= _mm_mul_ps(uij
, uij
);
1163 uij3
= _mm_mul_ps(uij2
,uij
);
1164 lij2
= _mm_mul_ps(lij
, lij
);
1165 lij3
= _mm_mul_ps(lij2
,lij
);
1166 diff2
= _mm_sub_ps(uij2
,lij2
);
1167 lij_inv
= gmx_mm_invsqrt_ps(lij2
);
1168 sk2_rinv
= _mm_mul_ps(sk2_ai
,rinv
);
1169 prod
= _mm_mul_ps(onefourth
,sk2_rinv
);
1170 logterm
= gmx_mm_log_ps(_mm_mul_ps(uij
,lij_inv
));
1171 t1
= _mm_sub_ps(lij
,uij
);
1172 t2
= _mm_mul_ps(diff2
,
1173 _mm_sub_ps(_mm_mul_ps(onefourth
,r
),
1175 t3
= _mm_mul_ps(half
,_mm_mul_ps(rinv
,logterm
));
1176 t1
= _mm_add_ps(t1
,_mm_add_ps(t2
,t3
));
1177 t4
= _mm_mul_ps(two
,_mm_sub_ps(raj_inv
,lij
));
1178 t4
= _mm_and_ps(t4
,obc_mask3
);
1179 t1
= _mm_mul_ps(half
,_mm_add_ps(t1
,t4
));
1181 tmp
= _mm_and_ps(t1
,obc_mask1
);
1183 t1
= _mm_add_ps(_mm_mul_ps(half
,lij2
),
1184 _mm_mul_ps(prod
,lij3
));
1186 _mm_mul_ps(onefourth
,
1187 _mm_add_ps(_mm_mul_ps(lij
,rinv
),
1188 _mm_mul_ps(lij3
,r
))));
1189 t2
= _mm_mul_ps(onefourth
,
1190 _mm_add_ps(_mm_mul_ps(uij
,rinv
),
1191 _mm_mul_ps(uij3
,r
)));
1193 _mm_add_ps(_mm_mul_ps(half
,uij2
),
1194 _mm_mul_ps(prod
,uij3
)));
1195 t3
= _mm_mul_ps(_mm_mul_ps(onefourth
,logterm
),
1196 _mm_mul_ps(rinv
,rinv
));
1198 _mm_mul_ps(_mm_mul_ps(diff2
,oneeighth
),
1200 _mm_mul_ps(sk2_rinv
,rinv
))));
1201 t1
= _mm_mul_ps(rinv
,
1202 _mm_add_ps(_mm_mul_ps(dlij
,t1
),
1203 _mm_add_ps(t2
,t3
)));
1204 dadx2
= _mm_and_ps(t1
,obc_mask1
);
1208 dadx2
= _mm_setzero_ps();
1209 tmp
= _mm_setzero_ps();
1212 _mm_store_ps(dadx
,dadx1
);
1214 _mm_store_ps(dadx
,dadx2
);
1219 GMX_MM_INCREMENT_1VALUE_PS(work
+jnrA
,tmp
);
1223 GMX_MM_INCREMENT_2VALUES_PS(work
+jnrA
,work
+jnrB
,tmp
);
1227 /* offset must be 3 */
1228 GMX_MM_INCREMENT_3VALUES_PS(work
+jnrA
,work
+jnrB
,work
+jnrC
,tmp
);
1232 gmx_mm_update_1pot_ps(sum_ai
,work
+ii
);
1236 /* Parallel summations */
1239 gmx_sum(natoms
, work
, cr
);
1241 else if(DOMAINDECOMP(cr
))
1243 dd_atom_sum_real(cr
->dd
, work
);
1246 if(gb_algorithm
==egbHCT
)
1249 for(i
=0;i
<fr
->natoms_force
;i
++) /* PELA born->nr */
1251 if(born
->use
[i
] != 0)
1253 rr
= top
->atomtypes
.gb_radius
[md
->typeA
[i
]]-doffset
;
1254 sum
= 1.0/rr
- work
[i
];
1255 min_rad
= rr
+ doffset
;
1258 born
->bRad
[i
] = rad
> min_rad
? rad
: min_rad
;
1259 fr
->invsqrta
[i
] = gmx_invsqrt(born
->bRad
[i
]);
1263 /* Extra communication required for DD */
1264 if(DOMAINDECOMP(cr
))
1266 dd_atom_spread_real(cr
->dd
, born
->bRad
);
1267 dd_atom_spread_real(cr
->dd
, fr
->invsqrta
);
1273 for(i
=0;i
<fr
->natoms_force
;i
++) /* PELA born->nr */
1275 if(born
->use
[i
] != 0)
1277 rr
= top
->atomtypes
.gb_radius
[md
->typeA
[i
]];
1285 tsum
= tanh(born
->obc_alpha
*sum
-born
->obc_beta
*sum2
+born
->obc_gamma
*sum3
);
1286 born
->bRad
[i
] = rr_inv
- tsum
*rr_inv2
;
1287 born
->bRad
[i
] = 1.0 / born
->bRad
[i
];
1289 fr
->invsqrta
[i
]=gmx_invsqrt(born
->bRad
[i
]);
1291 tchain
= rr
* (born
->obc_alpha
-2*born
->obc_beta
*sum
+3*born
->obc_gamma
*sum2
);
1292 born
->drobc
[i
] = (1.0-tsum
*tsum
)*tchain
*rr_inv2
;
1295 /* Extra (local) communication required for DD */
1296 if(DOMAINDECOMP(cr
))
1298 dd_atom_spread_real(cr
->dd
, born
->bRad
);
1299 dd_atom_spread_real(cr
->dd
, fr
->invsqrta
);
1300 dd_atom_spread_real(cr
->dd
, born
->drobc
);
1311 float calc_gb_chainrule_sse(int natoms
, t_nblist
*nl
, float *dadx
, float *dvda
,
1312 float *x
, float *f
, float *fshift
, float *shiftvec
,
1313 int gb_algorithm
, gmx_genborn_t
*born
, t_mdatoms
*md
)
1315 int i
,k
,n
,ii
,jnr
,ii3
,is3
,nj0
,nj1
,offset
,n0
,n1
;
1316 int jnrA
,jnrB
,jnrC
,jnrD
;
1317 int j3A
,j3B
,j3C
,j3D
;
1318 int jnrE
,jnrF
,jnrG
,jnrH
;
1319 int j3E
,j3F
,j3G
,j3H
;
1322 float rbi
,shX
,shY
,shZ
;
1334 __m128 rbai
,rbaj
,rbajB
, f_gb
, f_gb_ai
,f_gbB
,f_gb_aiB
;
1335 __m128 xmm1
,xmm2
,xmm3
;
1337 const __m128 two
= {2.0f
, 2.0f
, 2.0f
, 2.0f
};
1343 /* Loop to get the proper form for the Born radius term, sse style */
1347 n1
= md
->start
+md
->homenr
+1+natoms
/2;
1349 if(gb_algorithm
==egbSTILL
)
1354 rbi
= born
->bRad
[k
];
1355 rb
[k
] = (2 * rbi
* rbi
* dvda
[k
])/ONE_4PI_EPS0
;
1358 else if(gb_algorithm
==egbHCT
)
1363 rbi
= born
->bRad
[k
];
1364 rb
[k
] = rbi
* rbi
* dvda
[k
];
1367 else if(gb_algorithm
==egbOBC
)
1372 rbi
= born
->bRad
[k
];
1373 rb
[k
] = rbi
* rbi
* born
->drobc
[k
] * dvda
[k
];
1377 jz
= _mm_setzero_ps();
1379 n
= j3A
= j3B
= j3C
= j3D
= 0;
1381 for(i
=0;i
<nl
->nri
;i
++)
1385 is3
= 3*nl
->shift
[i
];
1386 shX
= shiftvec
[is3
];
1387 shY
= shiftvec
[is3
+1];
1388 shZ
= shiftvec
[is3
+2];
1389 nj0
= nl
->jindex
[i
];
1390 nj1
= nl
->jindex
[i
+1];
1392 ix
= _mm_set1_ps(shX
+x
[ii3
+0]);
1393 iy
= _mm_set1_ps(shY
+x
[ii3
+1]);
1394 iz
= _mm_set1_ps(shZ
+x
[ii3
+2]);
1396 offset
= (nj1
-nj0
)%4;
1398 rbai
= _mm_load1_ps(rb
+ii
);
1399 fix
= _mm_setzero_ps();
1400 fiy
= _mm_setzero_ps();
1401 fiz
= _mm_setzero_ps();
1404 for(k
=nj0
;k
<nj1
-offset
;k
+=4)
1416 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x
+j3A
,x
+j3B
,x
+j3C
,x
+j3D
,jx
,jy
,jz
);
1418 dx
= _mm_sub_ps(ix
,jx
);
1419 dy
= _mm_sub_ps(iy
,jy
);
1420 dz
= _mm_sub_ps(iz
,jz
);
1422 GMX_MM_LOAD_4VALUES_PS(rb
+jnrA
,rb
+jnrB
,rb
+jnrC
,rb
+jnrD
,rbaj
);
1424 /* load chain rule terms for j1-4 */
1425 f_gb
= _mm_load_ps(dadx
);
1427 f_gb_ai
= _mm_load_ps(dadx
);
1430 /* calculate scalar force */
1431 f_gb
= _mm_mul_ps(f_gb
,rbai
);
1432 f_gb_ai
= _mm_mul_ps(f_gb_ai
,rbaj
);
1433 f_gb
= _mm_add_ps(f_gb
,f_gb_ai
);
1435 tx
= _mm_mul_ps(f_gb
,dx
);
1436 ty
= _mm_mul_ps(f_gb
,dy
);
1437 tz
= _mm_mul_ps(f_gb
,dz
);
1439 fix
= _mm_add_ps(fix
,tx
);
1440 fiy
= _mm_add_ps(fiy
,ty
);
1441 fiz
= _mm_add_ps(fiz
,tz
);
1443 GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(f
+j3A
,f
+j3B
,f
+j3C
,f
+j3D
,tx
,ty
,tz
);
1446 /*deal with odd elements */
1453 GMX_MM_LOAD_1RVEC_1POINTER_PS(x
+j3A
,jx
,jy
,jz
);
1454 GMX_MM_LOAD_1VALUE_PS(rb
+jnrA
,rbaj
);
1462 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x
+j3A
,x
+j3B
,jx
,jy
,jz
);
1463 GMX_MM_LOAD_2VALUES_PS(rb
+jnrA
,rb
+jnrB
,rbaj
);
1467 /* offset must be 3 */
1474 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x
+j3A
,x
+j3B
,x
+j3C
,jx
,jy
,jz
);
1475 GMX_MM_LOAD_3VALUES_PS(rb
+jnrA
,rb
+jnrB
,rb
+jnrC
,rbaj
);
1478 dx
= _mm_sub_ps(ix
,jx
);
1479 dy
= _mm_sub_ps(iy
,jy
);
1480 dz
= _mm_sub_ps(iz
,jz
);
1482 /* load chain rule terms for j1-4 */
1483 f_gb
= _mm_load_ps(dadx
);
1485 f_gb_ai
= _mm_load_ps(dadx
);
1488 /* calculate scalar force */
1489 f_gb
= _mm_mul_ps(f_gb
,rbai
);
1490 f_gb_ai
= _mm_mul_ps(f_gb_ai
,rbaj
);
1491 f_gb
= _mm_add_ps(f_gb
,f_gb_ai
);
1493 tx
= _mm_mul_ps(f_gb
,dx
);
1494 ty
= _mm_mul_ps(f_gb
,dy
);
1495 tz
= _mm_mul_ps(f_gb
,dz
);
1497 fix
= _mm_add_ps(fix
,tx
);
1498 fiy
= _mm_add_ps(fiy
,ty
);
1499 fiz
= _mm_add_ps(fiz
,tz
);
1503 GMX_MM_DECREMENT_1RVEC_1POINTER_PS(f
+j3A
,tx
,ty
,tz
);
1507 GMX_MM_DECREMENT_1RVEC_2POINTERS_PS(f
+j3A
,f
+j3B
,tx
,ty
,tz
);
1511 /* offset must be 3 */
1512 GMX_MM_DECREMENT_1RVEC_3POINTERS_PS(f
+j3A
,f
+j3B
,f
+j3C
,tx
,ty
,tz
);
1516 /* fix/fiy/fiz now contain four partial force terms, that all should be
1517 * added to the i particle forces and shift forces.
1519 gmx_mm_update_iforce_1atom_ps(fix
,fiy
,fiz
,f
+ii3
,fshift
+is3
);
1526 float gb_bonds_analytic(real
*x
, real
*f
, real
*charge
, real
*bRad
, real
*dvda
,
1527 t_idef
*idef
, real epsilon_r
, real gb_epsilon_solvent
, real facel
)
1531 int type1
,type2
,type3
,type4
,ai1
,ai2
,ai3
,ai4
,aj1
,aj2
,aj3
,aj4
;
1532 int ai13
,ai23
,ai33
,ai43
,aj13
,aj23
,aj33
,aj43
;
1537 __m128 ix
,iy
,iz
,jx
,jy
,jz
,dx
,dy
,dz
;
1538 __m128 rsq11
,rinv
,r
,t1
,t2
,t3
;
1539 __m128 isai
,isaj
,isaprod
,inv_isaprod
,expterm
;
1540 __m128 ci
,cj
,qq
,fac
,dva
,dva_i
,dva_j
,di
,dj
;
1541 __m128 vgb
,vgb_tot
,fgb
,fgb2
,fijC
,fscal
;
1542 __m128 tx
,ty
,tz
,fix1
,fiy1
,fiz1
;
1543 __m128 xmm1
,xmm2
,xmm3
,xmm4
,xmm5
,xmm6
,xmm7
,xmm8
;
1546 const __m128 neg
= {-1.0f
, -1.0f
, -1.0f
, -1.0f
};
1547 const __m128 qrtr
= {0.25f
, 0.25f
, 0.25f
, 0.25f
};
1548 const __m128 eigth
= {0.125f
, 0.125f
, 0.125f
, 0.125f
};
1549 const __m128 half
= {0.5f
, 0.5f
, 0.5f
, 0.5f
};
1550 const __m128 three
= {3.0f
, 3.0f
, 3.0f
, 3.0f
};
1552 t_iatom
*forceatoms
;
1554 /* Keep the compiler happy */
1555 vgb_tot
= _mm_setzero_ps();
1556 vgb
= _mm_setzero_ps();
1557 xmm1
= _mm_setzero_ps();
1558 xmm2
= _mm_setzero_ps();
1559 xmm3
= _mm_setzero_ps();
1560 xmm4
= _mm_setzero_ps();
1561 ai1
= ai2
= ai3
= ai4
= 0;
1562 aj1
= aj2
= aj3
= aj4
= 0;
1563 ai13
= ai23
= ai33
= ai43
= 0;
1564 aj13
= aj23
= aj33
= aj43
= 0;
1566 /* Scale the electrostatics by gb_epsilon_solvent */
1567 facel
= (-1.0) * facel
* ((1.0/epsilon_r
) - 1.0/gb_epsilon_solvent
);
1568 fac
= _mm_load1_ps(&facel
);
1572 for(j
=F_GB12
;j
<=F_GB14
;j
++)
1574 forceatoms
= idef
->il
[j
].iatoms
;
1576 /* Number of atoms involved in each GB interaction plus the interaction type */
1578 nri
= idef
->il
[j
].nr
/nral
;
1581 /* In the for loop, i is updated for every element in the forceatom list, so we have to stop
1582 * the loop at offset*nral before the maximum idef->il[j].nr number, instead of just offset
1584 for(i
=0;i
<idef
->il
[j
].nr
-(offset
*nral
); )
1586 /* Load everything separately for now, test with sse load and shuffles later
1587 * Also, to avoid reading in the interaction type, we just increment i to pass over
1588 * the types in the forceatoms array, this saves some memory accesses.
1591 ai1
= forceatoms
[i
++];
1592 aj1
= forceatoms
[i
++];
1595 ai2
= forceatoms
[i
++];
1596 aj2
= forceatoms
[i
++];
1599 ai3
= forceatoms
[i
++];
1600 aj3
= forceatoms
[i
++];
1603 ai4
= forceatoms
[i
++];
1604 aj4
= forceatoms
[i
++];
1616 /* Load particle ai1-4 and transpose */
1617 xmm1
= _mm_loadh_pi(xmm1
,(__m64
*) (x
+ai13
));
1618 xmm2
= _mm_loadh_pi(xmm2
,(__m64
*) (x
+ai23
));
1619 xmm3
= _mm_loadh_pi(xmm3
,(__m64
*) (x
+ai33
));
1620 xmm4
= _mm_loadh_pi(xmm4
,(__m64
*) (x
+ai43
));
1622 xmm5
= _mm_load1_ps(x
+ai13
+2);
1623 xmm6
= _mm_load1_ps(x
+ai23
+2);
1624 xmm7
= _mm_load1_ps(x
+ai33
+2);
1625 xmm8
= _mm_load1_ps(x
+ai43
+2);
1627 xmm5
= _mm_shuffle_ps(xmm5
,xmm6
,_MM_SHUFFLE(0,0,0,0));
1628 xmm6
= _mm_shuffle_ps(xmm7
,xmm8
,_MM_SHUFFLE(0,0,0,0));
1629 iz
= _mm_shuffle_ps(xmm5
,xmm6
,_MM_SHUFFLE(2,0,2,0));
1631 xmm1
= _mm_shuffle_ps(xmm1
,xmm2
,_MM_SHUFFLE(3,2,3,2));
1632 xmm2
= _mm_shuffle_ps(xmm3
,xmm4
,_MM_SHUFFLE(3,2,3,2));
1633 ix
= _mm_shuffle_ps(xmm1
,xmm2
,_MM_SHUFFLE(2,0,2,0));
1634 iy
= _mm_shuffle_ps(xmm1
,xmm2
,_MM_SHUFFLE(3,1,3,1));
1636 /* Load particle aj1-4 and transpose */
1637 xmm1
= _mm_loadh_pi(xmm1
,(__m64
*) (x
+aj13
));
1638 xmm2
= _mm_loadh_pi(xmm2
,(__m64
*) (x
+aj23
));
1639 xmm3
= _mm_loadh_pi(xmm3
,(__m64
*) (x
+aj33
));
1640 xmm4
= _mm_loadh_pi(xmm4
,(__m64
*) (x
+aj43
));
1642 xmm5
= _mm_load1_ps(x
+aj13
+2);
1643 xmm6
= _mm_load1_ps(x
+aj23
+2);
1644 xmm7
= _mm_load1_ps(x
+aj33
+2);
1645 xmm8
= _mm_load1_ps(x
+aj43
+2);
1647 xmm5
= _mm_shuffle_ps(xmm5
,xmm6
,_MM_SHUFFLE(0,0,0,0));
1648 xmm6
= _mm_shuffle_ps(xmm7
,xmm8
,_MM_SHUFFLE(0,0,0,0));
1649 jz
= _mm_shuffle_ps(xmm5
,xmm6
,_MM_SHUFFLE(2,0,2,0));
1651 xmm1
= _mm_shuffle_ps(xmm1
,xmm2
,_MM_SHUFFLE(3,2,3,2));
1652 xmm2
= _mm_shuffle_ps(xmm3
,xmm4
,_MM_SHUFFLE(3,2,3,2));
1653 jx
= _mm_shuffle_ps(xmm1
,xmm2
,_MM_SHUFFLE(2,0,2,0));
1654 jy
= _mm_shuffle_ps(xmm1
,xmm2
,_MM_SHUFFLE(3,1,3,1));
1657 dx
= _mm_sub_ps(ix
, jx
);
1658 dy
= _mm_sub_ps(iy
, jy
);
1659 dz
= _mm_sub_ps(iz
, jz
);
1661 rsq11
= _mm_add_ps( _mm_add_ps( _mm_mul_ps(dx
,dx
) , _mm_mul_ps(dy
,dy
) ) , _mm_mul_ps(dz
,dz
) );
1663 rinv
= gmx_mm_inv_ps(rsq11
);
1664 r
= _mm_mul_ps(rsq11
,rinv
);
1666 /* Load Born radii for ai's and aj's */
1667 xmm1
= _mm_load_ss(bRad
+ai1
);
1668 xmm2
= _mm_load_ss(bRad
+ai2
);
1669 xmm3
= _mm_load_ss(bRad
+ai3
);
1670 xmm4
= _mm_load_ss(bRad
+ai4
);
1672 xmm1
= _mm_shuffle_ps(xmm1
,xmm2
,_MM_SHUFFLE(0,0,0,0));
1673 xmm3
= _mm_shuffle_ps(xmm3
,xmm4
,_MM_SHUFFLE(0,0,0,0));
1674 isai
= _mm_shuffle_ps(xmm1
,xmm3
,_MM_SHUFFLE(2,0,2,0));
1676 xmm1
= _mm_load_ss(bRad
+aj1
);
1677 xmm2
= _mm_load_ss(bRad
+aj2
);
1678 xmm3
= _mm_load_ss(bRad
+aj3
);
1679 xmm4
= _mm_load_ss(bRad
+aj4
);
1681 xmm1
= _mm_shuffle_ps(xmm1
,xmm2
,_MM_SHUFFLE(0,0,0,0));
1682 xmm3
= _mm_shuffle_ps(xmm3
,xmm4
,_MM_SHUFFLE(0,0,0,0));
1683 isaj
= _mm_shuffle_ps(xmm1
,xmm3
,_MM_SHUFFLE(2,0,2,0));
1685 isaprod
= _mm_mul_ps(isai
,isaj
); // rb2 in tinker
1686 inv_isaprod
= _mm_mul_ps(isaprod
,isaprod
);
1687 inv_isaprod
= gmx_mm_inv_ps(inv_isaprod
); //1/rb2 in tinker
1689 /* Load charges for ai's and aj's */
1690 xmm1
= _mm_load_ss(charge
+ai1
);
1691 xmm2
= _mm_load_ss(charge
+ai2
);
1692 xmm3
= _mm_load_ss(charge
+ai3
);
1693 xmm4
= _mm_load_ss(charge
+ai4
);
1695 xmm1
= _mm_shuffle_ps(xmm1
,xmm2
,_MM_SHUFFLE(0,0,0,0));
1696 xmm3
= _mm_shuffle_ps(xmm3
,xmm4
,_MM_SHUFFLE(0,0,0,0));
1697 ci
= _mm_shuffle_ps(xmm1
,xmm3
,_MM_SHUFFLE(2,0,2,0));
1699 xmm1
= _mm_load_ss(charge
+aj1
);
1700 xmm2
= _mm_load_ss(charge
+aj2
);
1701 xmm3
= _mm_load_ss(charge
+aj3
);
1702 xmm4
= _mm_load_ss(charge
+aj4
);
1704 xmm1
= _mm_shuffle_ps(xmm1
,xmm2
,_MM_SHUFFLE(0,0,0,0));
1705 xmm3
= _mm_shuffle_ps(xmm3
,xmm4
,_MM_SHUFFLE(0,0,0,0));
1706 cj
= _mm_shuffle_ps(xmm1
,xmm3
,_MM_SHUFFLE(2,0,2,0));
1708 qq
= _mm_mul_ps(ci
,fac
);
1709 qq
= _mm_mul_ps(cj
,qq
);
1711 /* Calculate scalar GB force */
1712 expterm
= _mm_mul_ps(rsq11
,inv_isaprod
);
1713 expterm
= _mm_mul_ps(expterm
,qrtr
);
1714 expterm
= _mm_mul_ps(expterm
,neg
);
1715 expterm
= gmx_mm_exp_ps(expterm
);
1717 fgb2
= _mm_mul_ps(isaprod
,expterm
);
1718 fgb2
= _mm_add_ps(fgb2
,rsq11
);
1720 fgb
= gmx_mm_inv_ps(fgb2
);
1722 /* Potential energy */
1723 vgb
= _mm_mul_ps(qq
,fgb
);
1724 vgb_tot
= _mm_add_ps(vgb_tot
,vgb
);
1726 fijC
= _mm_mul_ps(qrtr
,r
);
1727 fijC
= _mm_mul_ps(fijC
,expterm
);
1728 fijC
= _mm_sub_ps(r
,fijC
);
1729 fijC
= _mm_mul_ps(fijC
,vgb
);
1730 fijC
= _mm_mul_ps(fijC
,neg
);
1731 fijC
= _mm_mul_ps(fijC
,fgb
);
1732 fijC
= _mm_mul_ps(fijC
,fgb
); /* fijC = fijC in tab code, de */
1734 /* Chain rule terms */
1735 dva
= _mm_mul_ps(rsq11
,inv_isaprod
);
1736 dva
= _mm_mul_ps(dva
,eigth
);
1737 dva
= _mm_add_ps(dva
,half
);
1738 dva
= _mm_mul_ps(dva
,expterm
);
1739 dva
= _mm_mul_ps(dva
,vgb
);
1740 dva
= _mm_mul_ps(dva
,neg
);
1741 dva
= _mm_mul_ps(dva
,fgb
);
1742 dva
= _mm_mul_ps(dva
,fgb
); /* dva * isaj = dvatmp*isai*isai */
1744 /* Calculate vectorial force */
1745 fscal
= _mm_mul_ps(fijC
,rinv
);
1746 fscal
= _mm_mul_ps(fscal
,neg
);
1748 tx
= _mm_mul_ps(fscal
,dx
);
1749 ty
= _mm_mul_ps(fscal
,dy
);
1750 tz
= _mm_mul_ps(fscal
,dz
);
1752 /* Load, update and store chain rule terms for ai and aj atoms */
1753 dva_i
= _mm_mul_ps(dva
,isaj
);
1754 dva_j
= _mm_mul_ps(dva
,isai
);
1756 xmm1
= _mm_load_ss(dvda
+ai1
);
1757 xmm2
= _mm_load_ss(dvda
+aj1
);
1758 xmm1
= _mm_add_ss(xmm1
,dva_i
);
1759 xmm2
= _mm_add_ss(xmm2
,dva_j
);
1760 _mm_store_ss(dvda
+ai1
,xmm1
);
1761 _mm_store_ss(dvda
+aj1
,xmm2
);
1763 /* Need to shuffle dva_i and dva_j */
1764 dva_i
= _mm_shuffle_ps(dva_i
,dva_i
,_MM_SHUFFLE(0,3,2,1));
1765 dva_j
= _mm_shuffle_ps(dva_j
,dva_j
,_MM_SHUFFLE(0,3,2,1));
1767 xmm1
= _mm_load_ss(dvda
+ai2
);
1768 xmm2
= _mm_load_ss(dvda
+aj2
);
1769 xmm1
= _mm_add_ss(xmm1
,dva_i
);
1770 xmm2
= _mm_add_ss(xmm2
,dva_j
);
1771 _mm_store_ss(dvda
+ai2
,xmm1
);
1772 _mm_store_ss(dvda
+aj2
,xmm2
);
1774 /* Need to shuffle dva_i and dva_j */
1775 dva_i
= _mm_shuffle_ps(dva_i
,dva_i
,_MM_SHUFFLE(0,3,2,1));
1776 dva_j
= _mm_shuffle_ps(dva_j
,dva_j
,_MM_SHUFFLE(0,3,2,1));
1778 xmm1
= _mm_load_ss(dvda
+ai3
);
1779 xmm2
= _mm_load_ss(dvda
+aj3
);
1780 xmm1
= _mm_add_ss(xmm1
,dva_i
);
1781 xmm2
= _mm_add_ss(xmm2
,dva_j
);
1782 _mm_store_ss(dvda
+ai3
,xmm1
);
1783 _mm_store_ss(dvda
+aj3
,xmm2
);
1785 /* Need to shuffle dva_i and dva_j */
1786 dva_i
= _mm_shuffle_ps(dva_i
,dva_i
,_MM_SHUFFLE(0,3,2,1));
1787 dva_j
= _mm_shuffle_ps(dva_j
,dva_j
,_MM_SHUFFLE(0,3,2,1));
1789 xmm1
= _mm_load_ss(dvda
+ai4
);
1790 xmm2
= _mm_load_ss(dvda
+aj4
);
1791 xmm1
= _mm_add_ss(xmm1
,dva_i
);
1792 xmm2
= _mm_add_ss(xmm2
,dva_j
);
1793 _mm_store_ss(dvda
+ai4
,xmm1
);
1794 _mm_store_ss(dvda
+aj4
,xmm2
);
1796 /* Load, update and store partial forces on ai and ai atoms
1797 * This needs to be done in the order ai1+aj1,ai2+aj2 etc...
1798 * since loading all four values at once will mean the force
1799 * is not updated properly
1802 xmm1
= _mm_load_ss(f
+ai13
);
1803 xmm2
= _mm_load_ss(f
+ai13
+1);
1804 xmm3
= _mm_load_ss(f
+ai13
+2);
1806 xmm4
= _mm_load_ss(f
+aj13
);
1807 xmm5
= _mm_load_ss(f
+aj13
+1);
1808 xmm6
= _mm_load_ss(f
+aj13
+2);
1810 xmm1
= _mm_add_ss(xmm1
,tx
);
1811 xmm2
= _mm_add_ss(xmm2
,ty
);
1812 xmm3
= _mm_add_ss(xmm3
,tz
);
1814 xmm4
= _mm_sub_ss(xmm4
,tx
);
1815 xmm5
= _mm_sub_ss(xmm5
,ty
);
1816 xmm6
= _mm_sub_ss(xmm6
,tz
);
1818 _mm_store_ss(f
+ai13
,xmm1
);
1819 _mm_store_ss(f
+ai13
+1,xmm2
);
1820 _mm_store_ss(f
+ai13
+2,xmm3
);
1822 _mm_store_ss(f
+aj13
,xmm4
);
1823 _mm_store_ss(f
+aj13
+1,xmm5
);
1824 _mm_store_ss(f
+aj13
+2,xmm6
);
1827 xmm1
= _mm_load_ss(f
+ai23
);
1828 xmm2
= _mm_load_ss(f
+ai23
+1);
1829 xmm3
= _mm_load_ss(f
+ai23
+2);
1831 xmm4
= _mm_load_ss(f
+aj23
);
1832 xmm5
= _mm_load_ss(f
+aj23
+1);
1833 xmm6
= _mm_load_ss(f
+aj23
+2);
1835 /* Need to shuffle tx/ty/tz */
1836 tx
= _mm_shuffle_ps(tx
,tx
,_MM_SHUFFLE(0,3,2,1));
1837 ty
= _mm_shuffle_ps(ty
,ty
,_MM_SHUFFLE(0,3,2,1));
1838 tz
= _mm_shuffle_ps(tz
,tz
,_MM_SHUFFLE(0,3,2,1));
1840 xmm1
= _mm_add_ss(xmm1
,tx
);
1841 xmm2
= _mm_add_ss(xmm2
,ty
);
1842 xmm3
= _mm_add_ss(xmm3
,tz
);
1844 xmm4
= _mm_sub_ss(xmm4
,tx
);
1845 xmm5
= _mm_sub_ss(xmm5
,ty
);
1846 xmm6
= _mm_sub_ss(xmm6
,tz
);
1848 _mm_store_ss(f
+ai23
,xmm1
);
1849 _mm_store_ss(f
+ai23
+1,xmm2
);
1850 _mm_store_ss(f
+ai23
+2,xmm3
);
1852 _mm_store_ss(f
+aj23
,xmm4
);
1853 _mm_store_ss(f
+aj23
+1,xmm5
);
1854 _mm_store_ss(f
+aj23
+2,xmm6
);
1857 xmm1
= _mm_load_ss(f
+ai33
);
1858 xmm2
= _mm_load_ss(f
+ai33
+1);
1859 xmm3
= _mm_load_ss(f
+ai33
+2);
1861 xmm4
= _mm_load_ss(f
+aj33
);
1862 xmm5
= _mm_load_ss(f
+aj33
+1);
1863 xmm6
= _mm_load_ss(f
+aj33
+2);
1865 /* Need to shuffle tx/ty/tz */
1866 tx
= _mm_shuffle_ps(tx
,tx
,_MM_SHUFFLE(0,3,2,1));
1867 ty
= _mm_shuffle_ps(ty
,ty
,_MM_SHUFFLE(0,3,2,1));
1868 tz
= _mm_shuffle_ps(tz
,tz
,_MM_SHUFFLE(0,3,2,1));
1870 xmm1
= _mm_add_ss(xmm1
,tx
);
1871 xmm2
= _mm_add_ss(xmm2
,ty
);
1872 xmm3
= _mm_add_ss(xmm3
,tz
);
1874 xmm4
= _mm_sub_ss(xmm4
,tx
);
1875 xmm5
= _mm_sub_ss(xmm5
,ty
);
1876 xmm6
= _mm_sub_ss(xmm6
,tz
);
1878 _mm_store_ss(f
+ai33
,xmm1
);
1879 _mm_store_ss(f
+ai33
+1,xmm2
);
1880 _mm_store_ss(f
+ai33
+2,xmm3
);
1882 _mm_store_ss(f
+aj33
,xmm4
);
1883 _mm_store_ss(f
+aj33
+1,xmm5
);
1884 _mm_store_ss(f
+aj33
+2,xmm6
);
1887 xmm1
= _mm_load_ss(f
+ai43
);
1888 xmm2
= _mm_load_ss(f
+ai43
+1);
1889 xmm3
= _mm_load_ss(f
+ai43
+2);
1891 xmm4
= _mm_load_ss(f
+aj43
);
1892 xmm5
= _mm_load_ss(f
+aj43
+1);
1893 xmm6
= _mm_load_ss(f
+aj43
+2);
1895 /* Need to shuffle tx/ty/tz */
1896 tx
= _mm_shuffle_ps(tx
,tx
,_MM_SHUFFLE(0,3,2,1));
1897 ty
= _mm_shuffle_ps(ty
,ty
,_MM_SHUFFLE(0,3,2,1));
1898 tz
= _mm_shuffle_ps(tz
,tz
,_MM_SHUFFLE(0,3,2,1));
1900 xmm1
= _mm_add_ss(xmm1
,tx
);
1901 xmm2
= _mm_add_ss(xmm2
,ty
);
1902 xmm3
= _mm_add_ss(xmm3
,tz
);
1904 xmm4
= _mm_sub_ss(xmm4
,tx
);
1905 xmm5
= _mm_sub_ss(xmm5
,ty
);
1906 xmm6
= _mm_sub_ss(xmm6
,tz
);
1908 _mm_store_ss(f
+ai43
,xmm1
);
1909 _mm_store_ss(f
+ai43
+1,xmm2
);
1910 _mm_store_ss(f
+ai43
+2,xmm3
);
1912 _mm_store_ss(f
+aj43
,xmm4
);
1913 _mm_store_ss(f
+aj43
+1,xmm5
);
1914 _mm_store_ss(f
+aj43
+2,xmm6
);
1922 ai1
= forceatoms
[i
++];
1923 aj1
= forceatoms
[i
++];
1928 /* Load ai and aj coordinates */
1929 xmm1
= _mm_loadl_pi(xmm1
,(__m64
*) (x
+ai13
));
1930 iz
= _mm_load1_ps(x
+ai13
+2);
1932 ix
= _mm_shuffle_ps(xmm1
, xmm1
, _MM_SHUFFLE(0,0,0,0));
1933 iy
= _mm_shuffle_ps(xmm1
, xmm1
, _MM_SHUFFLE(1,1,1,1));
1935 xmm1
= _mm_loadl_pi(xmm1
,(__m64
*) (x
+aj13
));
1936 jz
= _mm_load1_ps(x
+aj13
+2);
1938 jx
= _mm_shuffle_ps(xmm1
, xmm1
, _MM_SHUFFLE(0,0,0,0));
1939 jy
= _mm_shuffle_ps(xmm1
, xmm1
, _MM_SHUFFLE(1,1,1,1));
1941 /* Load Born radii for ai1 and aj1 */
1942 isai
= _mm_set_ps(0.0f
, 0.0f
, 0.0f
, bRad
[ai1
]);
1943 isaj
= _mm_set_ps(0.0f
, 0.0f
, 0.0f
, bRad
[aj1
]);
1945 /* Load charges for ai1 and aj1 */
1946 ci
= _mm_set_ps(0.0f
, 0.0f
, 0.0f
, charge
[ai1
]);
1947 cj
= _mm_set_ps(0.0f
, 0.0f
, 0.0f
, charge
[aj1
]);
1949 /* Load chain rule terms for ai1 and aj1 */
1950 di
= _mm_set_ps(0.0f
,0.0f
,0.0f
,dvda
[ai1
]);
1951 dj
= _mm_set_ps(0.0f
,0.0f
,0.0f
,dvda
[aj1
]);
1953 mask
= gmx_mm_castsi128_ps( _mm_set_epi32(0,0,0,0xffffffff) );
1958 ai1
= forceatoms
[i
++];
1959 aj1
= forceatoms
[i
++];
1962 ai2
= forceatoms
[i
++];
1963 aj2
= forceatoms
[i
++];
1970 /* Load ai1-2 and aj1-2 coordinates */
1971 xmm1
= _mm_loadh_pi(xmm1
,(__m64
*) (x
+ai13
));
1972 xmm2
= _mm_loadh_pi(xmm2
,(__m64
*) (x
+ai23
));
1974 xmm5
= _mm_load1_ps(x
+ai13
+2);
1975 xmm6
= _mm_load1_ps(x
+ai23
+2);
1977 xmm5
= _mm_shuffle_ps(xmm5
,xmm6
,_MM_SHUFFLE(0,0,0,0));
1978 iz
= _mm_shuffle_ps(xmm5
,xmm5
,_MM_SHUFFLE(2,0,2,0));
1980 xmm1
= _mm_shuffle_ps(xmm1
,xmm2
,_MM_SHUFFLE(3,2,3,2));
1981 ix
= _mm_shuffle_ps(xmm1
,xmm1
,_MM_SHUFFLE(2,0,2,0));
1982 iy
= _mm_shuffle_ps(xmm1
,xmm1
,_MM_SHUFFLE(3,1,3,1));
1984 xmm1
= _mm_loadh_pi(xmm1
,(__m64
*) (x
+aj13
));
1985 xmm2
= _mm_loadh_pi(xmm2
,(__m64
*) (x
+aj23
));
1987 xmm5
= _mm_load1_ps(x
+aj13
+2);
1988 xmm6
= _mm_load1_ps(x
+aj23
+2);
1990 xmm5
= _mm_shuffle_ps(xmm5
,xmm6
,_MM_SHUFFLE(0,0,0,0));
1991 jz
= _mm_shuffle_ps(xmm5
,xmm5
,_MM_SHUFFLE(2,0,2,0));
1993 xmm1
= _mm_shuffle_ps(xmm1
,xmm2
,_MM_SHUFFLE(3,2,3,2));
1994 jx
= _mm_shuffle_ps(xmm1
,xmm1
,_MM_SHUFFLE(2,0,2,0));
1995 jy
= _mm_shuffle_ps(xmm1
,xmm1
,_MM_SHUFFLE(3,1,3,1));
1997 /* Load Born radii for ai1-2 and aj1-2 */
1998 isai
= _mm_set_ps(0.0f
, 0.0f
, bRad
[ai2
], bRad
[ai1
]);
1999 isaj
= _mm_set_ps(0.0f
, 0.0f
, bRad
[aj2
], bRad
[aj1
]);
2001 /* Load charges for ai1-2 and aj1-2 */
2002 ci
= _mm_set_ps(0.0f
, 0.0f
, charge
[ai2
], charge
[ai1
]);
2003 cj
= _mm_set_ps(0.0f
, 0.0f
, charge
[aj2
], charge
[aj1
]);
2005 /* Load chain rule terms for ai1-2 and aj1-2 */
2006 di
= _mm_set_ps(0.0f
,0.0f
,dvda
[ai2
],dvda
[ai1
]);
2007 dj
= _mm_set_ps(0.0f
,0.0f
,dvda
[aj2
],dvda
[aj1
]);
2009 mask
= gmx_mm_castsi128_ps( _mm_set_epi32(0,0,0xffffffff,0xffffffff) );
2014 ai1
= forceatoms
[i
++];
2015 aj1
= forceatoms
[i
++];
2018 ai2
= forceatoms
[i
++];
2019 aj2
= forceatoms
[i
++];
2022 ai3
= forceatoms
[i
++];
2023 aj3
= forceatoms
[i
++];
2033 /* Load ai1-3 and aj1-3 coordinates */
2034 xmm1
= _mm_loadh_pi(xmm1
,(__m64
*) (x
+ai13
));
2035 xmm2
= _mm_loadh_pi(xmm2
,(__m64
*) (x
+ai23
));
2036 xmm3
= _mm_loadh_pi(xmm3
,(__m64
*) (x
+ai33
));
2038 xmm5
= _mm_load1_ps(x
+ai13
+2);
2039 xmm6
= _mm_load1_ps(x
+ai23
+2);
2040 xmm7
= _mm_load1_ps(x
+ai33
+2);
2042 xmm5
= _mm_shuffle_ps(xmm5
,xmm6
, _MM_SHUFFLE(0,0,0,0));
2043 iz
= _mm_shuffle_ps(xmm5
,xmm7
, _MM_SHUFFLE(3,1,3,1));
2045 xmm1
= _mm_shuffle_ps(xmm1
,xmm2
, _MM_SHUFFLE(3,2,3,2));
2046 xmm2
= _mm_shuffle_ps(xmm3
,xmm3
, _MM_SHUFFLE(3,2,3,2));
2048 ix
= _mm_shuffle_ps(xmm1
,xmm2
, _MM_SHUFFLE(2,0,2,0));
2049 iy
= _mm_shuffle_ps(xmm1
,xmm2
, _MM_SHUFFLE(3,1,3,1));
2051 xmm1
= _mm_loadh_pi(xmm1
,(__m64
*) (x
+aj13
));
2052 xmm2
= _mm_loadh_pi(xmm2
,(__m64
*) (x
+aj23
));
2053 xmm3
= _mm_loadh_pi(xmm3
,(__m64
*) (x
+aj33
));
2055 xmm5
= _mm_load1_ps(x
+aj13
+2);
2056 xmm6
= _mm_load1_ps(x
+aj23
+2);
2057 xmm7
= _mm_load1_ps(x
+aj33
+2);
2059 xmm5
= _mm_shuffle_ps(xmm5
,xmm6
, _MM_SHUFFLE(0,0,0,0));
2060 jz
= _mm_shuffle_ps(xmm5
,xmm7
, _MM_SHUFFLE(3,1,3,1));
2062 xmm1
= _mm_shuffle_ps(xmm1
,xmm2
, _MM_SHUFFLE(3,2,3,2));
2063 xmm2
= _mm_shuffle_ps(xmm3
,xmm3
, _MM_SHUFFLE(3,2,3,2));
2065 jx
= _mm_shuffle_ps(xmm1
,xmm2
, _MM_SHUFFLE(2,0,2,0));
2066 jy
= _mm_shuffle_ps(xmm1
,xmm2
, _MM_SHUFFLE(3,1,3,1));
2068 /* Load Born radii for ai1-3 and aj1-3 */
2069 isai
= _mm_set_ps(0.0f
, bRad
[ai3
], bRad
[ai2
], bRad
[ai1
]);
2070 isaj
= _mm_set_ps(0.0f
, bRad
[aj3
], bRad
[aj2
], bRad
[aj1
]);
2072 /* Load charges for ai1-3 and aj1-3 */
2073 ci
= _mm_set_ps(0.0f
, charge
[ai3
], charge
[ai2
], charge
[ai1
]);
2074 cj
= _mm_set_ps(0.0f
, charge
[aj3
], charge
[aj2
], charge
[aj1
]);
2076 /* Load chain rule terms for ai1-3 and aj1-3 */
2077 di
= _mm_set_ps(0.0f
,dvda
[ai3
],dvda
[ai2
],dvda
[ai1
]);
2078 dj
= _mm_set_ps(0.0f
,dvda
[aj3
],dvda
[aj2
],dvda
[aj1
]);
2080 mask
= gmx_mm_castsi128_ps( _mm_set_epi32(0,0xffffffff,0xffffffff,0xffffffff) );
2083 ix
= _mm_and_ps( mask
, ix
);
2084 iy
= _mm_and_ps( mask
, iy
);
2085 iz
= _mm_and_ps( mask
, iz
);
2087 jx
= _mm_and_ps( mask
, jx
);
2088 jy
= _mm_and_ps( mask
, jy
);
2089 jz
= _mm_and_ps( mask
, jz
);
2092 dx
= _mm_sub_ps(ix
, jx
);
2093 dy
= _mm_sub_ps(iy
, jy
);
2094 dz
= _mm_sub_ps(iz
, jz
);
2096 rsq11
= _mm_add_ps( _mm_add_ps( _mm_mul_ps(dx
,dx
) , _mm_mul_ps(dy
,dy
) ) , _mm_mul_ps(dz
,dz
) );
2098 rinv
= gmx_mm_inv_ps(rsq11
);
2099 r
= _mm_mul_ps(rsq11
,rinv
);
2101 isaprod
= _mm_mul_ps(isai
,isaj
);
2102 inv_isaprod
= _mm_mul_ps(isaprod
,isaprod
);
2103 inv_isaprod
= gmx_mm_inv_ps(inv_isaprod
);
2105 qq
= _mm_mul_ps(ci
,fac
);
2106 qq
= _mm_mul_ps(cj
,qq
);
2108 /* Calculate scalar GB force */
2109 expterm
= _mm_mul_ps(rsq11
,inv_isaprod
);
2110 expterm
= _mm_mul_ps(expterm
,qrtr
);
2111 expterm
= _mm_mul_ps(expterm
,neg
);
2112 expterm
= gmx_mm_exp_ps(expterm
);
2114 fgb2
= _mm_mul_ps(isaprod
,expterm
);
2115 fgb2
= _mm_add_ps(fgb2
,rsq11
);
2117 fgb
= gmx_mm_inv_ps(fgb2
);
        /* Potential energy */
        vgb     = _mm_mul_ps(qq, fgb);
        vgb     = _mm_and_ps(mask, vgb);
        vgb_tot = _mm_add_ps(vgb_tot, vgb);

        fijC = _mm_mul_ps(qrtr, r);
        fijC = _mm_mul_ps(fijC, expterm);
        fijC = _mm_sub_ps(r, fijC);
        fijC = _mm_mul_ps(fijC, vgb);
        fijC = _mm_mul_ps(fijC, neg);
        fijC = _mm_mul_ps(fijC, fgb);
        fijC = _mm_mul_ps(fijC, fgb); /* fscal = fijC */
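
        /* With the conventions above, fijC is dV_gb/dr:
         *   dV/dr = -(r - (r/4)*expterm) * vgb / f_GB^2
         * The vectorial force further below uses fscal = -(dV/dr)/r, so that
         * f_i += fscal*(dx,dy,dz) and f_j receives the opposite contribution.
         */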
        /* Chain rule terms */
        dva = _mm_mul_ps(rsq11, inv_isaprod);
        dva = _mm_mul_ps(dva, eigth);
        dva = _mm_add_ps(dva, half);
        dva = _mm_mul_ps(dva, expterm);
        dva = _mm_mul_ps(dva, vgb);
        dva = _mm_mul_ps(dva, neg);
        dva = _mm_mul_ps(dva, fgb);
        dva = _mm_mul_ps(dva, fgb);
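
        /* Under the same reading, dva is the derivative of the pair term with
         * respect to the product of the two Born radii:
         *   dva = -(1/2 + r^2/(8*a_i*a_j)) * expterm * vgb / f_GB^2
         * Multiplying by the partner value (isaj resp. isai) below gives the
         * per-atom contributions that are accumulated into dvda[].
         */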
        /* Calculate vectorial force */
        fscal = _mm_mul_ps(fijC, rinv);
        fscal = _mm_mul_ps(fscal, neg);

        tx = _mm_mul_ps(fscal, dx);
        ty = _mm_mul_ps(fscal, dy);
        tz = _mm_mul_ps(fscal, dz);

        /* Calculate chain rule terms */
        dva_i = _mm_mul_ps(dva, isaj);
        dva_j = _mm_mul_ps(dva, isai);
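
        /* The store sections that follow are the epilogue branches for however many
         * pair interactions remain in this pass (one, two or three): each scatters
         * the per-lane dvda increments and force components back to memory with
         * scalar _mm_load_ss/_mm_add_ss/_mm_store_ss read-modify-write operations.
         */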
        /* Load, update and store chain rule terms for ai1 and aj1 */
        xmm1 = _mm_load_ss(dvda+ai1);
        xmm2 = _mm_load_ss(dvda+aj1);
        xmm1 = _mm_add_ss(xmm1, dva_i);
        xmm2 = _mm_add_ss(xmm2, dva_j);
        _mm_store_ss(dvda+ai1, xmm1);
        _mm_store_ss(dvda+aj1, xmm2);

        /* Load, update and store partial forces on ai1 and aj1 */
        xmm1 = _mm_load_ss(f+ai13);
        xmm2 = _mm_load_ss(f+ai13+1);
        xmm3 = _mm_load_ss(f+ai13+2);

        xmm4 = _mm_load_ss(f+aj13);
        xmm5 = _mm_load_ss(f+aj13+1);
        xmm6 = _mm_load_ss(f+aj13+2);

        xmm1 = _mm_add_ss(xmm1, tx);
        xmm2 = _mm_add_ss(xmm2, ty);
        xmm3 = _mm_add_ss(xmm3, tz);

        xmm4 = _mm_sub_ss(xmm4, tx);
        xmm5 = _mm_sub_ss(xmm5, ty);
        xmm6 = _mm_sub_ss(xmm6, tz);

        _mm_store_ss(f+ai13,   xmm1);
        _mm_store_ss(f+ai13+1, xmm2);
        _mm_store_ss(f+ai13+2, xmm3);

        _mm_store_ss(f+aj13,   xmm4);
        _mm_store_ss(f+aj13+1, xmm5);
        _mm_store_ss(f+aj13+2, xmm6);
        /* Load, update and store chain rule terms for ai1-2 and aj1-2 atoms */
        dva_i = _mm_mul_ps(dva, isaj);
        dva_j = _mm_mul_ps(dva, isai);

        xmm1 = _mm_load_ss(dvda+ai1);
        xmm2 = _mm_load_ss(dvda+aj1);
        xmm1 = _mm_add_ss(xmm1, dva_i);
        xmm2 = _mm_add_ss(xmm2, dva_j);
        _mm_store_ss(dvda+ai1, xmm1);
        _mm_store_ss(dvda+aj1, xmm2);

        /* Need to shuffle dva_i and dva_j */
        dva_i = _mm_shuffle_ps(dva_i, dva_i, _MM_SHUFFLE(0,3,2,1));
        dva_j = _mm_shuffle_ps(dva_j, dva_j, _MM_SHUFFLE(0,3,2,1));

        xmm1 = _mm_load_ss(dvda+ai2);
        xmm2 = _mm_load_ss(dvda+aj2);
        xmm1 = _mm_add_ss(xmm1, dva_i);
        xmm2 = _mm_add_ss(xmm2, dva_j);
        _mm_store_ss(dvda+ai2, xmm1);
        _mm_store_ss(dvda+aj2, xmm2);

        /* Load, update and store partial forces on ai1-2 and aj1-2 */
        xmm1 = _mm_load_ss(f+ai13);
        xmm2 = _mm_load_ss(f+ai13+1);
        xmm3 = _mm_load_ss(f+ai13+2);

        xmm4 = _mm_load_ss(f+aj13);
        xmm5 = _mm_load_ss(f+aj13+1);
        xmm6 = _mm_load_ss(f+aj13+2);

        xmm1 = _mm_add_ss(xmm1, tx);
        xmm2 = _mm_add_ss(xmm2, ty);
        xmm3 = _mm_add_ss(xmm3, tz);

        xmm4 = _mm_sub_ss(xmm4, tx);
        xmm5 = _mm_sub_ss(xmm5, ty);
        xmm6 = _mm_sub_ss(xmm6, tz);

        _mm_store_ss(f+ai13,   xmm1);
        _mm_store_ss(f+ai13+1, xmm2);
        _mm_store_ss(f+ai13+2, xmm3);

        _mm_store_ss(f+aj13,   xmm4);
        _mm_store_ss(f+aj13+1, xmm5);
        _mm_store_ss(f+aj13+2, xmm6);

        xmm1 = _mm_load_ss(f+ai23);
        xmm2 = _mm_load_ss(f+ai23+1);
        xmm3 = _mm_load_ss(f+ai23+2);

        xmm4 = _mm_load_ss(f+aj23);
        xmm5 = _mm_load_ss(f+aj23+1);
        xmm6 = _mm_load_ss(f+aj23+2);

        /* Need to shuffle tx/ty/tz */
        tx = _mm_shuffle_ps(tx, tx, _MM_SHUFFLE(0,3,2,1));
        ty = _mm_shuffle_ps(ty, ty, _MM_SHUFFLE(0,3,2,1));
        tz = _mm_shuffle_ps(tz, tz, _MM_SHUFFLE(0,3,2,1));
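
        /* The rotation above moves the second pair's force components into lane 0,
         * where the scalar _ss operations below can read them.
         */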
        xmm1 = _mm_add_ss(xmm1, tx);
        xmm2 = _mm_add_ss(xmm2, ty);
        xmm3 = _mm_add_ss(xmm3, tz);

        xmm4 = _mm_sub_ss(xmm4, tx);
        xmm5 = _mm_sub_ss(xmm5, ty);
        xmm6 = _mm_sub_ss(xmm6, tz);

        _mm_store_ss(f+ai23,   xmm1);
        _mm_store_ss(f+ai23+1, xmm2);
        _mm_store_ss(f+ai23+2, xmm3);

        _mm_store_ss(f+aj23,   xmm4);
        _mm_store_ss(f+aj23+1, xmm5);
        _mm_store_ss(f+aj23+2, xmm6);
        /* Load, update and store chain rule terms for ai1-3 and aj1-3 atoms */
        dva_i = _mm_mul_ps(dva, isaj);
        dva_j = _mm_mul_ps(dva, isai);

        xmm1 = _mm_load_ss(dvda+ai1);
        xmm2 = _mm_load_ss(dvda+aj1);
        xmm1 = _mm_add_ss(xmm1, dva_i);
        xmm2 = _mm_add_ss(xmm2, dva_j);
        _mm_store_ss(dvda+ai1, xmm1);
        _mm_store_ss(dvda+aj1, xmm2);

        /* Need to shuffle dva_i and dva_j */
        dva_i = _mm_shuffle_ps(dva_i, dva_i, _MM_SHUFFLE(0,3,2,1));
        dva_j = _mm_shuffle_ps(dva_j, dva_j, _MM_SHUFFLE(0,3,2,1));

        xmm1 = _mm_load_ss(dvda+ai2);
        xmm2 = _mm_load_ss(dvda+aj2);
        xmm1 = _mm_add_ss(xmm1, dva_i);
        xmm2 = _mm_add_ss(xmm2, dva_j);
        _mm_store_ss(dvda+ai2, xmm1);
        _mm_store_ss(dvda+aj2, xmm2);

        /* Need to shuffle dva_i and dva_j */
        dva_i = _mm_shuffle_ps(dva_i, dva_i, _MM_SHUFFLE(0,3,2,1));
        dva_j = _mm_shuffle_ps(dva_j, dva_j, _MM_SHUFFLE(0,3,2,1));

        xmm1 = _mm_load_ss(dvda+ai3);
        xmm2 = _mm_load_ss(dvda+aj3);
        xmm1 = _mm_add_ss(xmm1, dva_i);
        xmm2 = _mm_add_ss(xmm2, dva_j);
        _mm_store_ss(dvda+ai3, xmm1);
        _mm_store_ss(dvda+aj3, xmm2);

        /* Load, update and store partial forces on ai1-3 and aj1-3 */
        xmm1 = _mm_load_ss(f+ai13);
        xmm2 = _mm_load_ss(f+ai13+1);
        xmm3 = _mm_load_ss(f+ai13+2);

        xmm4 = _mm_load_ss(f+aj13);
        xmm5 = _mm_load_ss(f+aj13+1);
        xmm6 = _mm_load_ss(f+aj13+2);

        xmm1 = _mm_add_ss(xmm1, tx);
        xmm2 = _mm_add_ss(xmm2, ty);
        xmm3 = _mm_add_ss(xmm3, tz);

        xmm4 = _mm_sub_ss(xmm4, tx);
        xmm5 = _mm_sub_ss(xmm5, ty);
        xmm6 = _mm_sub_ss(xmm6, tz);

        _mm_store_ss(f+ai13,   xmm1);
        _mm_store_ss(f+ai13+1, xmm2);
        _mm_store_ss(f+ai13+2, xmm3);

        _mm_store_ss(f+aj13,   xmm4);
        _mm_store_ss(f+aj13+1, xmm5);
        _mm_store_ss(f+aj13+2, xmm6);

        xmm1 = _mm_load_ss(f+ai23);
        xmm2 = _mm_load_ss(f+ai23+1);
        xmm3 = _mm_load_ss(f+ai23+2);

        xmm4 = _mm_load_ss(f+aj23);
        xmm5 = _mm_load_ss(f+aj23+1);
        xmm6 = _mm_load_ss(f+aj23+2);

        /* Need to shuffle tx/ty/tz */
        tx = _mm_shuffle_ps(tx, tx, _MM_SHUFFLE(0,3,2,1));
        ty = _mm_shuffle_ps(ty, ty, _MM_SHUFFLE(0,3,2,1));
        tz = _mm_shuffle_ps(tz, tz, _MM_SHUFFLE(0,3,2,1));

        xmm1 = _mm_add_ss(xmm1, tx);
        xmm2 = _mm_add_ss(xmm2, ty);
        xmm3 = _mm_add_ss(xmm3, tz);

        xmm4 = _mm_sub_ss(xmm4, tx);
        xmm5 = _mm_sub_ss(xmm5, ty);
        xmm6 = _mm_sub_ss(xmm6, tz);

        _mm_store_ss(f+ai23,   xmm1);
        _mm_store_ss(f+ai23+1, xmm2);
        _mm_store_ss(f+ai23+2, xmm3);

        _mm_store_ss(f+aj23,   xmm4);
        _mm_store_ss(f+aj23+1, xmm5);
        _mm_store_ss(f+aj23+2, xmm6);

        xmm1 = _mm_load_ss(f+ai33);
        xmm2 = _mm_load_ss(f+ai33+1);
        xmm3 = _mm_load_ss(f+ai33+2);

        xmm4 = _mm_load_ss(f+aj33);
        xmm5 = _mm_load_ss(f+aj33+1);
        xmm6 = _mm_load_ss(f+aj33+2);

        /* Need to shuffle tx/ty/tz */
        tx = _mm_shuffle_ps(tx, tx, _MM_SHUFFLE(0,3,2,1));
        ty = _mm_shuffle_ps(ty, ty, _MM_SHUFFLE(0,3,2,1));
        tz = _mm_shuffle_ps(tz, tz, _MM_SHUFFLE(0,3,2,1));

        xmm1 = _mm_add_ss(xmm1, tx);
        xmm2 = _mm_add_ss(xmm2, ty);
        xmm3 = _mm_add_ss(xmm3, tz);

        xmm4 = _mm_sub_ss(xmm4, tx);
        xmm5 = _mm_sub_ss(xmm5, ty);
        xmm6 = _mm_sub_ss(xmm6, tz);

        _mm_store_ss(f+ai33,   xmm1);
        _mm_store_ss(f+ai33+1, xmm2);
        _mm_store_ss(f+ai33+2, xmm3);

        _mm_store_ss(f+aj33,   xmm4);
        _mm_store_ss(f+aj33+1, xmm5);
        _mm_store_ss(f+aj33+2, xmm6);
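
    /* Reduce the four lanes of vgb_tot to a single scalar: movehl/add folds the
     * upper half onto the lower half, and the shuffle/add_ss folds lane 1 onto
     * lane 0 before the result is stored in vctot.
     */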
    /* End processing for potential energy */
    vgb     = _mm_movehl_ps(vgb, vgb_tot);
    vgb_tot = _mm_add_ps(vgb_tot, vgb);
    vgb     = _mm_shuffle_ps(vgb_tot, vgb_tot, _MM_SHUFFLE(1,1,1,1));
    vgb_tot = _mm_add_ss(vgb_tot, vgb);

    _mm_store_ss(&vctot, vgb_tot);
/* keep compiler happy */
int genborn_sse_dummy;

#endif /* SSE intrinsics available */