3 * This source code is part of
7 * GROningen MAchine for Chemical Simulations
9 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
10 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
11 * Copyright (c) 2001-2008, The GROMACS development team,
12 * check out http://www.gromacs.org for more information.
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License
16 * as published by the Free Software Foundation; either version 2
17 * of the License, or (at your option) any later version.
19 * If you want to redistribute modifications, please consider that
20 * scientific software is very special. Version control is crucial -
21 * bugs must be traceable. We will be happy to consider code for
22 * inclusion in the official distribution, but derived work must not
23 * be called official GROMACS. Details are found in the README & COPYING
24 * files - if they are missing, get the official version at www.gromacs.org.
26 * To help us fund GROMACS development, we humbly ask that you cite
27 * the papers on the package - you can find them in the top README file.
29 * For more info, check our website at http://www.gromacs.org
32 * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
52 #include "gmx_fatal.h"
53 #include "mtop_util.h"
64 /* Only compile this file if SSE intrinsics are available */
65 #if ( (defined(GMX_IA32_SSE) || defined(GMX_X86_64_SSE) || defined(GMX_SSE2)) && !defined(GMX_DOUBLE) )
67 #include <gmx_sse2_single.h>
68 #include <xmmintrin.h>
69 #include <emmintrin.h>
73 calc_gb_rad_still_sse(t_commrec
*cr
, t_forcerec
*fr
,int natoms
, gmx_localtop_t
*top
,
74 const t_atomtypes
*atype
, float *x
, t_nblist
*nl
, gmx_genborn_t
*born
, t_mdatoms
*md
)
76 int i
,k
,n
,ii
,is3
,ii3
,nj0
,nj1
,offset
;
78 int jnrA
,jnrB
,jnrC
,jnrD
,j3A
,j3B
,j3C
,j3D
;
79 int jnrE
,jnrF
,jnrG
,jnrH
,j3E
,j3F
,j3G
,j3H
;
100 __m128 rsq
,rinv
,rinv2
,rinv4
,rinv6
;
101 __m128 rsqB
,rinvB
,rinv2B
,rinv4B
,rinv6B
;
102 __m128 ratio
,gpi
,rai
,raj
,vai
,vaj
,rvdw
;
103 __m128 ratioB
,rajB
,vajB
,rvdwB
;
104 __m128 ccf
,dccf
,theta
,cosq
,term
,sinq
,res
,prod
,prod_ai
,tmp
;
105 __m128 ccfB
,dccfB
,thetaB
,cosqB
,termB
,sinqB
,resB
,prodB
;
106 __m128 mask
,icf4
,icf6
,mask_cmp
;
107 __m128 icf4B
,icf6B
,mask_cmpB
;
109 __m128 mask1
= gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
110 __m128 mask2
= gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
111 __m128 mask3
= gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
113 const __m128 half
= {0.5f
, 0.5f
, 0.5f
, 0.5f
};
114 const __m128 three
= {3.0f
, 3.0f
, 3.0f
, 3.0f
};
115 const __m128 one
= {1.0f
, 1.0f
, 1.0f
, 1.0f
};
116 const __m128 two
= {2.0f
, 2.0f
, 2.0f
, 2.0f
};
117 const __m128 zero
= {0.0f
, 0.0f
, 0.0f
, 0.0f
};
118 const __m128 four
= {4.0f
, 4.0f
, 4.0f
, 4.0f
};
120 const __m128 still_p5inv
= {STILL_P5INV
, STILL_P5INV
, STILL_P5INV
, STILL_P5INV
};
121 const __m128 still_pip5
= {STILL_PIP5
, STILL_PIP5
, STILL_PIP5
, STILL_PIP5
};
122 const __m128 still_p4
= {STILL_P4
, STILL_P4
, STILL_P4
, STILL_P4
};
124 factor
= 0.5 * ONE_4PI_EPS0
;
126 gb_radius
= born
->gb_radius
;
128 work
= born
->gpol_still_work
;
130 shiftvec
= fr
->shift_vec
[0];
133 jnrA
= jnrB
= jnrC
= jnrD
= 0;
134 jx
= _mm_setzero_ps();
135 jy
= _mm_setzero_ps();
136 jz
= _mm_setzero_ps();
141 n1
= md
->start
+md
->homenr
+natoms
/2+1;
143 for(i
=0;i
<natoms
;i
++)
148 for(i
=0;i
<nl
->nri
;i
++)
152 is3
= 3*nl
->shift
[i
];
154 shY
= shiftvec
[is3
+1];
155 shZ
= shiftvec
[is3
+2];
157 nj1
= nl
->jindex
[i
+1];
159 ix
= _mm_set1_ps(shX
+x
[ii3
+0]);
160 iy
= _mm_set1_ps(shY
+x
[ii3
+1]);
161 iz
= _mm_set1_ps(shZ
+x
[ii3
+2]);
163 offset
= (nj1
-nj0
)%4;
165 /* Polarization energy for atom ai */
166 gpi
= _mm_setzero_ps();
168 rai
= _mm_load1_ps(gb_radius
+ii
);
169 prod_ai
= _mm_set1_ps(STILL_P4
*vsolv
[ii
]);
171 for(k
=nj0
;k
<nj1
-4-offset
;k
+=8)
191 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x
+j3A
,x
+j3B
,x
+j3C
,x
+j3D
,jx
,jy
,jz
);
192 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x
+j3E
,x
+j3F
,x
+j3G
,x
+j3H
,jxB
,jyB
,jzB
);
194 GMX_MM_LOAD_4VALUES_PS(gb_radius
+jnrA
,gb_radius
+jnrB
,gb_radius
+jnrC
,gb_radius
+jnrD
,raj
);
195 GMX_MM_LOAD_4VALUES_PS(gb_radius
+jnrE
,gb_radius
+jnrF
,gb_radius
+jnrG
,gb_radius
+jnrH
,rajB
);
196 GMX_MM_LOAD_4VALUES_PS(vsolv
+jnrA
,vsolv
+jnrB
,vsolv
+jnrC
,vsolv
+jnrD
,vaj
);
197 GMX_MM_LOAD_4VALUES_PS(vsolv
+jnrE
,vsolv
+jnrF
,vsolv
+jnrG
,vsolv
+jnrH
,vajB
);
199 dx
= _mm_sub_ps(ix
,jx
);
200 dy
= _mm_sub_ps(iy
,jy
);
201 dz
= _mm_sub_ps(iz
,jz
);
202 dxB
= _mm_sub_ps(ix
,jxB
);
203 dyB
= _mm_sub_ps(iy
,jyB
);
204 dzB
= _mm_sub_ps(iz
,jzB
);
206 rsq
= gmx_mm_calc_rsq_ps(dx
,dy
,dz
);
207 rsqB
= gmx_mm_calc_rsq_ps(dxB
,dyB
,dzB
);
208 rinv
= gmx_mm_invsqrt_ps(rsq
);
209 rinvB
= gmx_mm_invsqrt_ps(rsqB
);
210 rinv2
= _mm_mul_ps(rinv
,rinv
);
211 rinv2B
= _mm_mul_ps(rinvB
,rinvB
);
212 rinv4
= _mm_mul_ps(rinv2
,rinv2
);
213 rinv4B
= _mm_mul_ps(rinv2B
,rinv2B
);
214 rinv6
= _mm_mul_ps(rinv4
,rinv2
);
215 rinv6B
= _mm_mul_ps(rinv4B
,rinv2B
);
217 rvdw
= _mm_add_ps(rai
,raj
);
218 rvdwB
= _mm_add_ps(rai
,rajB
);
219 ratio
= _mm_mul_ps(rsq
, gmx_mm_inv_ps( _mm_mul_ps(rvdw
,rvdw
)));
220 ratioB
= _mm_mul_ps(rsqB
, gmx_mm_inv_ps( _mm_mul_ps(rvdwB
,rvdwB
)));
222 mask_cmp
= _mm_cmple_ps(ratio
,still_p5inv
);
223 mask_cmpB
= _mm_cmple_ps(ratioB
,still_p5inv
);
225 /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
226 if( 0 == _mm_movemask_ps(mask_cmp
) )
228 /* if ratio>still_p5inv for ALL elements */
230 dccf
= _mm_setzero_ps();
234 ratio
= _mm_min_ps(ratio
,still_p5inv
);
235 theta
= _mm_mul_ps(ratio
,still_pip5
);
236 GMX_MM_SINCOS_PS(theta
,sinq
,cosq
);
237 term
= _mm_mul_ps(half
,_mm_sub_ps(one
,cosq
));
238 ccf
= _mm_mul_ps(term
,term
);
239 dccf
= _mm_mul_ps(_mm_mul_ps(two
,term
),
240 _mm_mul_ps(sinq
,theta
));
242 if( 0 == _mm_movemask_ps(mask_cmpB
) )
244 /* if ratio>still_p5inv for ALL elements */
246 dccfB
= _mm_setzero_ps();
250 ratioB
= _mm_min_ps(ratioB
,still_p5inv
);
251 thetaB
= _mm_mul_ps(ratioB
,still_pip5
);
252 GMX_MM_SINCOS_PS(thetaB
,sinqB
,cosqB
);
253 termB
= _mm_mul_ps(half
,_mm_sub_ps(one
,cosqB
));
254 ccfB
= _mm_mul_ps(termB
,termB
);
255 dccfB
= _mm_mul_ps(_mm_mul_ps(two
,termB
),
256 _mm_mul_ps(sinqB
,thetaB
));
259 prod
= _mm_mul_ps(still_p4
,vaj
);
260 prodB
= _mm_mul_ps(still_p4
,vajB
);
261 icf4
= _mm_mul_ps(ccf
,rinv4
);
262 icf4B
= _mm_mul_ps(ccfB
,rinv4B
);
263 icf6
= _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four
,ccf
),dccf
), rinv6
);
264 icf6B
= _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four
,ccfB
),dccfB
), rinv6B
);
266 GMX_MM_INCREMENT_4VALUES_PS(work
+jnrA
,work
+jnrB
,work
+jnrC
,work
+jnrD
,_mm_mul_ps(prod_ai
,icf4
));
267 GMX_MM_INCREMENT_4VALUES_PS(work
+jnrE
,work
+jnrF
,work
+jnrG
,work
+jnrH
,_mm_mul_ps(prod_ai
,icf4B
));
269 gpi
= _mm_add_ps(gpi
, _mm_add_ps( _mm_mul_ps(prod
,icf4
) , _mm_mul_ps(prodB
,icf4B
) ) );
271 _mm_store_ps(dadx
,_mm_mul_ps(prod
,icf6
));
273 _mm_store_ps(dadx
,_mm_mul_ps(prod_ai
,icf6
));
275 _mm_store_ps(dadx
,_mm_mul_ps(prodB
,icf6B
));
277 _mm_store_ps(dadx
,_mm_mul_ps(prod_ai
,icf6B
));
281 for(;k
<nj1
-offset
;k
+=4)
293 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x
+j3A
,x
+j3B
,x
+j3C
,x
+j3D
,jx
,jy
,jz
);
295 GMX_MM_LOAD_4VALUES_PS(gb_radius
+jnrA
,gb_radius
+jnrB
,gb_radius
+jnrC
,gb_radius
+jnrD
,raj
);
296 GMX_MM_LOAD_4VALUES_PS(vsolv
+jnrA
,vsolv
+jnrB
,vsolv
+jnrC
,vsolv
+jnrD
,vaj
);
298 dx
= _mm_sub_ps(ix
,jx
);
299 dy
= _mm_sub_ps(iy
,jy
);
300 dz
= _mm_sub_ps(iz
,jz
);
302 rsq
= gmx_mm_calc_rsq_ps(dx
,dy
,dz
);
303 rinv
= gmx_mm_invsqrt_ps(rsq
);
304 rinv2
= _mm_mul_ps(rinv
,rinv
);
305 rinv4
= _mm_mul_ps(rinv2
,rinv2
);
306 rinv6
= _mm_mul_ps(rinv4
,rinv2
);
308 rvdw
= _mm_add_ps(rai
,raj
);
309 ratio
= _mm_mul_ps(rsq
, gmx_mm_inv_ps( _mm_mul_ps(rvdw
,rvdw
)));
311 mask_cmp
= _mm_cmple_ps(ratio
,still_p5inv
);
313 /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
314 if(0 == _mm_movemask_ps(mask_cmp
))
316 /* if ratio>still_p5inv for ALL elements */
318 dccf
= _mm_setzero_ps();
322 ratio
= _mm_min_ps(ratio
,still_p5inv
);
323 theta
= _mm_mul_ps(ratio
,still_pip5
);
324 GMX_MM_SINCOS_PS(theta
,sinq
,cosq
);
325 term
= _mm_mul_ps(half
,_mm_sub_ps(one
,cosq
));
326 ccf
= _mm_mul_ps(term
,term
);
327 dccf
= _mm_mul_ps(_mm_mul_ps(two
,term
),
328 _mm_mul_ps(sinq
,theta
));
331 prod
= _mm_mul_ps(still_p4
,vaj
);
332 icf4
= _mm_mul_ps(ccf
,rinv4
);
333 icf6
= _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four
,ccf
),dccf
), rinv6
);
336 GMX_MM_INCREMENT_4VALUES_PS(work
+jnrA
,work
+jnrB
,work
+jnrC
,work
+jnrD
,_mm_mul_ps(prod_ai
,icf4
));
338 gpi
= _mm_add_ps(gpi
, _mm_mul_ps(prod
,icf4
));
340 _mm_store_ps(dadx
,_mm_mul_ps(prod
,icf6
));
342 _mm_store_ps(dadx
,_mm_mul_ps(prod_ai
,icf6
));
352 GMX_MM_LOAD_1RVEC_1POINTER_PS(x
+j3A
,jx
,jy
,jz
);
353 GMX_MM_LOAD_1VALUE_PS(gb_radius
+jnrA
,raj
);
354 GMX_MM_LOAD_1VALUE_PS(vsolv
+jnrA
,vaj
);
363 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x
+j3A
,x
+j3B
,jx
,jy
,jz
);
364 GMX_MM_LOAD_2VALUES_PS(gb_radius
+jnrA
,gb_radius
+jnrB
,raj
);
365 GMX_MM_LOAD_2VALUES_PS(vsolv
+jnrA
,vsolv
+jnrB
,vaj
);
370 /* offset must be 3 */
377 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x
+j3A
,x
+j3B
,x
+j3C
,jx
,jy
,jz
);
378 GMX_MM_LOAD_3VALUES_PS(gb_radius
+jnrA
,gb_radius
+jnrB
,gb_radius
+jnrC
,raj
);
379 GMX_MM_LOAD_3VALUES_PS(vsolv
+jnrA
,vsolv
+jnrB
,vsolv
+jnrC
,vaj
);
383 dx
= _mm_sub_ps(ix
,jx
);
384 dy
= _mm_sub_ps(iy
,jy
);
385 dz
= _mm_sub_ps(iz
,jz
);
387 rsq
= gmx_mm_calc_rsq_ps(dx
,dy
,dz
);
388 rinv
= gmx_mm_invsqrt_ps(rsq
);
389 rinv2
= _mm_mul_ps(rinv
,rinv
);
390 rinv4
= _mm_mul_ps(rinv2
,rinv2
);
391 rinv6
= _mm_mul_ps(rinv4
,rinv2
);
393 rvdw
= _mm_add_ps(rai
,raj
);
394 ratio
= _mm_mul_ps(rsq
, gmx_mm_inv_ps( _mm_mul_ps(rvdw
,rvdw
)));
396 mask_cmp
= _mm_cmple_ps(ratio
,still_p5inv
);
398 if(0 == _mm_movemask_ps(mask_cmp
))
400 /* if ratio>still_p5inv for ALL elements */
402 dccf
= _mm_setzero_ps();
406 ratio
= _mm_min_ps(ratio
,still_p5inv
);
407 theta
= _mm_mul_ps(ratio
,still_pip5
);
408 GMX_MM_SINCOS_PS(theta
,sinq
,cosq
);
409 term
= _mm_mul_ps(half
,_mm_sub_ps(one
,cosq
));
410 ccf
= _mm_mul_ps(term
,term
);
411 dccf
= _mm_mul_ps(_mm_mul_ps(two
,term
),
412 _mm_mul_ps(sinq
,theta
));
415 prod
= _mm_mul_ps(still_p4
,vaj
);
416 icf4
= _mm_mul_ps(ccf
,rinv4
);
417 icf6
= _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four
,ccf
),dccf
), rinv6
);
419 gpi
= _mm_add_ps(gpi
, _mm_mul_ps(prod
,icf4
));
421 _mm_store_ps(dadx
,_mm_mul_ps(prod
,icf6
));
423 _mm_store_ps(dadx
,_mm_mul_ps(prod_ai
,icf6
));
426 tmp
= _mm_mul_ps(prod_ai
,icf4
);
430 GMX_MM_INCREMENT_1VALUE_PS(work
+jnrA
,tmp
);
434 GMX_MM_INCREMENT_2VALUES_PS(work
+jnrA
,work
+jnrB
,tmp
);
438 /* offset must be 3 */
439 GMX_MM_INCREMENT_3VALUES_PS(work
+jnrA
,work
+jnrB
,work
+jnrC
,tmp
);
442 gmx_mm_update_1pot_ps(gpi
,work
+ii
);
445 /* Sum up the polarization energy from other nodes */
448 gmx_sum(natoms
, work
, cr
);
450 else if(DOMAINDECOMP(cr
))
452 dd_atom_sum_real(cr
->dd
, work
);
455 /* Compute the radii */
456 for(i
=0;i
<fr
->natoms_force
;i
++) /* PELA born->nr */
458 if(born
->use
[i
] != 0)
460 gpi_ai
= born
->gpol
[i
] + work
[i
]; /* add gpi to the initial pol energy gpi_ai*/
461 gpi2
= gpi_ai
* gpi_ai
;
462 born
->bRad
[i
] = factor
*gmx_invsqrt(gpi2
);
463 fr
->invsqrta
[i
] = gmx_invsqrt(born
->bRad
[i
]);
467 /* Extra (local) communication required for DD */
470 dd_atom_spread_real(cr
->dd
, born
->bRad
);
471 dd_atom_spread_real(cr
->dd
, fr
->invsqrta
);
479 calc_gb_rad_hct_obc_sse(t_commrec
*cr
, t_forcerec
* fr
, int natoms
, gmx_localtop_t
*top
,
480 const t_atomtypes
*atype
, float *x
, t_nblist
*nl
, gmx_genborn_t
*born
,t_mdatoms
*md
,int gb_algorithm
)
482 int i
,ai
,k
,n
,ii
,ii3
,is3
,nj0
,nj1
,at0
,at1
,offset
;
483 int jnrA
,jnrB
,jnrC
,jnrD
;
485 int jnrE
,jnrF
,jnrG
,jnrH
;
488 float rr
,rr_inv
,rr_inv2
,sum_tmp
,sum
,sum2
,sum3
,gbr
;
489 float sum_ai2
, sum_ai3
,tsum
,tchain
,doffset
;
498 __m128 ix
,iy
,iz
,jx
,jy
,jz
;
499 __m128 dx
,dy
,dz
,t1
,t2
,t3
,t4
;
501 __m128 rai
,rai_inv
,raj
, raj_inv
,rai_inv2
,sk
,sk2
,lij
,dlij
,duij
;
502 __m128 uij
,lij2
,uij2
,lij3
,uij3
,diff2
;
503 __m128 lij_inv
,sk2_inv
,prod
,log_term
,tmp
,tmp_sum
;
504 __m128 sum_ai
, tmp_ai
,sk_ai
,sk_aj
,sk2_ai
,sk2_aj
,sk2_rinv
;
508 __m128 obc_mask1
,obc_mask2
,obc_mask3
;
509 __m128 jxB
,jyB
,jzB
,t1B
,t2B
,t3B
,t4B
;
510 __m128 dxB
,dyB
,dzB
,rsqB
,rinvB
,rB
;
511 __m128 rajB
, raj_invB
,rai_inv2B
,sk2B
,lijB
,dlijB
,duijB
;
512 __m128 uijB
,lij2B
,uij2B
,lij3B
,uij3B
,diff2B
;
513 __m128 lij_invB
,sk2_invB
,prodB
;
514 __m128 sk_ajB
,sk2_ajB
,sk2_rinvB
;
515 __m128 dadx1B
,dadx2B
;
517 __m128 obc_mask1B
,obc_mask2B
,obc_mask3B
;
519 __m128 mask1
= gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
520 __m128 mask2
= gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
521 __m128 mask3
= gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
523 __m128 oneeighth
= _mm_set1_ps(0.125);
524 __m128 onefourth
= _mm_set1_ps(0.25);
526 const __m128 neg
= {-1.0f
, -1.0f
, -1.0f
, -1.0f
};
527 const __m128 zero
= {0.0f
, 0.0f
, 0.0f
, 0.0f
};
528 const __m128 half
= {0.5f
, 0.5f
, 0.5f
, 0.5f
};
529 const __m128 one
= {1.0f
, 1.0f
, 1.0f
, 1.0f
};
530 const __m128 two
= {2.0f
, 2.0f
, 2.0f
, 2.0f
};
531 const __m128 three
= {3.0f
, 3.0f
, 3.0f
, 3.0f
};
533 /* Set the dielectric offset */
534 doffset
= born
->gb_doffset
;
535 gb_radius
= born
->gb_radius
;
536 obc_param
= born
->param
;
537 work
= born
->gpol_hct_work
;
540 shiftvec
= fr
->shift_vec
[0];
542 jx
= _mm_setzero_ps();
543 jy
= _mm_setzero_ps();
544 jz
= _mm_setzero_ps();
546 jnrA
= jnrB
= jnrC
= jnrD
= 0;
548 for(i
=0;i
<born
->nr
;i
++)
553 for(i
=0;i
<nl
->nri
;i
++)
557 is3
= 3*nl
->shift
[i
];
559 shY
= shiftvec
[is3
+1];
560 shZ
= shiftvec
[is3
+2];
562 nj1
= nl
->jindex
[i
+1];
564 ix
= _mm_set1_ps(shX
+x
[ii3
+0]);
565 iy
= _mm_set1_ps(shY
+x
[ii3
+1]);
566 iz
= _mm_set1_ps(shZ
+x
[ii3
+2]);
568 offset
= (nj1
-nj0
)%4;
570 rai
= _mm_load1_ps(gb_radius
+ii
);
571 rai_inv
= gmx_mm_inv_ps(rai
);
573 sum_ai
= _mm_setzero_ps();
575 sk_ai
= _mm_load1_ps(born
->param
+ii
);
576 sk2_ai
= _mm_mul_ps(sk_ai
,sk_ai
);
578 for(k
=nj0
;k
<nj1
-4-offset
;k
+=8)
598 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x
+j3A
,x
+j3B
,x
+j3C
,x
+j3D
,jx
,jy
,jz
);
599 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x
+j3E
,x
+j3F
,x
+j3G
,x
+j3H
,jxB
,jyB
,jzB
);
600 GMX_MM_LOAD_4VALUES_PS(gb_radius
+jnrA
,gb_radius
+jnrB
,gb_radius
+jnrC
,gb_radius
+jnrD
,raj
);
601 GMX_MM_LOAD_4VALUES_PS(gb_radius
+jnrE
,gb_radius
+jnrF
,gb_radius
+jnrG
,gb_radius
+jnrH
,rajB
);
602 GMX_MM_LOAD_4VALUES_PS(obc_param
+jnrA
,obc_param
+jnrB
,obc_param
+jnrC
,obc_param
+jnrD
,sk_aj
);
603 GMX_MM_LOAD_4VALUES_PS(obc_param
+jnrE
,obc_param
+jnrF
,obc_param
+jnrG
,obc_param
+jnrH
,sk_ajB
);
605 dx
= _mm_sub_ps(ix
, jx
);
606 dy
= _mm_sub_ps(iy
, jy
);
607 dz
= _mm_sub_ps(iz
, jz
);
608 dxB
= _mm_sub_ps(ix
, jxB
);
609 dyB
= _mm_sub_ps(iy
, jyB
);
610 dzB
= _mm_sub_ps(iz
, jzB
);
612 rsq
= gmx_mm_calc_rsq_ps(dx
,dy
,dz
);
613 rsqB
= gmx_mm_calc_rsq_ps(dxB
,dyB
,dzB
);
615 rinv
= gmx_mm_invsqrt_ps(rsq
);
616 r
= _mm_mul_ps(rsq
,rinv
);
617 rinvB
= gmx_mm_invsqrt_ps(rsqB
);
618 rB
= _mm_mul_ps(rsqB
,rinvB
);
620 /* Compute raj_inv aj1-4 */
621 raj_inv
= gmx_mm_inv_ps(raj
);
622 raj_invB
= gmx_mm_inv_ps(rajB
);
624 /* Evaluate influence of atom aj -> ai */
625 t1
= _mm_add_ps(r
,sk_aj
);
626 t2
= _mm_sub_ps(r
,sk_aj
);
627 t3
= _mm_sub_ps(sk_aj
,r
);
628 t1B
= _mm_add_ps(rB
,sk_ajB
);
629 t2B
= _mm_sub_ps(rB
,sk_ajB
);
630 t3B
= _mm_sub_ps(sk_ajB
,rB
);
631 obc_mask1
= _mm_cmplt_ps(rai
, t1
);
632 obc_mask2
= _mm_cmplt_ps(rai
, t2
);
633 obc_mask3
= _mm_cmplt_ps(rai
, t3
);
634 obc_mask1B
= _mm_cmplt_ps(rai
, t1B
);
635 obc_mask2B
= _mm_cmplt_ps(rai
, t2B
);
636 obc_mask3B
= _mm_cmplt_ps(rai
, t3B
);
638 uij
= gmx_mm_inv_ps(t1
);
639 lij
= _mm_or_ps( _mm_and_ps(obc_mask2
,gmx_mm_inv_ps(t2
)),
640 _mm_andnot_ps(obc_mask2
,rai_inv
));
641 dlij
= _mm_and_ps(one
,obc_mask2
);
642 uij2
= _mm_mul_ps(uij
, uij
);
643 uij3
= _mm_mul_ps(uij2
,uij
);
644 lij2
= _mm_mul_ps(lij
, lij
);
645 lij3
= _mm_mul_ps(lij2
,lij
);
647 uijB
= gmx_mm_inv_ps(t1B
);
648 lijB
= _mm_or_ps( _mm_and_ps(obc_mask2B
,gmx_mm_inv_ps(t2B
)),
649 _mm_andnot_ps(obc_mask2B
,rai_inv
));
650 dlijB
= _mm_and_ps(one
,obc_mask2B
);
651 uij2B
= _mm_mul_ps(uijB
, uijB
);
652 uij3B
= _mm_mul_ps(uij2B
,uijB
);
653 lij2B
= _mm_mul_ps(lijB
, lijB
);
654 lij3B
= _mm_mul_ps(lij2B
,lijB
);
656 diff2
= _mm_sub_ps(uij2
,lij2
);
657 lij_inv
= gmx_mm_invsqrt_ps(lij2
);
658 sk2_aj
= _mm_mul_ps(sk_aj
,sk_aj
);
659 sk2_rinv
= _mm_mul_ps(sk2_aj
,rinv
);
660 prod
= _mm_mul_ps(onefourth
,sk2_rinv
);
662 diff2B
= _mm_sub_ps(uij2B
,lij2B
);
663 lij_invB
= gmx_mm_invsqrt_ps(lij2B
);
664 sk2_ajB
= _mm_mul_ps(sk_ajB
,sk_ajB
);
665 sk2_rinvB
= _mm_mul_ps(sk2_ajB
,rinvB
);
666 prodB
= _mm_mul_ps(onefourth
,sk2_rinvB
);
668 logterm
= gmx_mm_log_ps(_mm_mul_ps(uij
,lij_inv
));
669 logtermB
= gmx_mm_log_ps(_mm_mul_ps(uijB
,lij_invB
));
671 t1
= _mm_sub_ps(lij
,uij
);
672 t2
= _mm_mul_ps(diff2
,
673 _mm_sub_ps(_mm_mul_ps(onefourth
,r
),
675 t3
= _mm_mul_ps(half
,_mm_mul_ps(rinv
,logterm
));
676 t1
= _mm_add_ps(t1
,_mm_add_ps(t2
,t3
));
677 t4
= _mm_mul_ps(two
,_mm_sub_ps(rai_inv
,lij
));
678 t4
= _mm_and_ps(t4
,obc_mask3
);
679 t1
= _mm_mul_ps(half
,_mm_add_ps(t1
,t4
));
681 t1B
= _mm_sub_ps(lijB
,uijB
);
682 t2B
= _mm_mul_ps(diff2B
,
683 _mm_sub_ps(_mm_mul_ps(onefourth
,rB
),
685 t3B
= _mm_mul_ps(half
,_mm_mul_ps(rinvB
,logtermB
));
686 t1B
= _mm_add_ps(t1B
,_mm_add_ps(t2B
,t3B
));
687 t4B
= _mm_mul_ps(two
,_mm_sub_ps(rai_inv
,lijB
));
688 t4B
= _mm_and_ps(t4B
,obc_mask3B
);
689 t1B
= _mm_mul_ps(half
,_mm_add_ps(t1B
,t4B
));
691 sum_ai
= _mm_add_ps(sum_ai
, _mm_add_ps( _mm_and_ps(t1
,obc_mask1
), _mm_and_ps(t1B
,obc_mask1B
) ));
693 t1
= _mm_add_ps(_mm_mul_ps(half
,lij2
),
694 _mm_mul_ps(prod
,lij3
));
696 _mm_mul_ps(onefourth
,
697 _mm_add_ps(_mm_mul_ps(lij
,rinv
),
698 _mm_mul_ps(lij3
,r
))));
699 t2
= _mm_mul_ps(onefourth
,
700 _mm_add_ps(_mm_mul_ps(uij
,rinv
),
701 _mm_mul_ps(uij3
,r
)));
703 _mm_add_ps(_mm_mul_ps(half
,uij2
),
704 _mm_mul_ps(prod
,uij3
)));
705 t3
= _mm_mul_ps(_mm_mul_ps(onefourth
,logterm
),
706 _mm_mul_ps(rinv
,rinv
));
708 _mm_mul_ps(_mm_mul_ps(diff2
,oneeighth
),
710 _mm_mul_ps(sk2_rinv
,rinv
))));
711 t1
= _mm_mul_ps(rinv
,
712 _mm_add_ps(_mm_mul_ps(dlij
,t1
),
717 t1B
= _mm_add_ps(_mm_mul_ps(half
,lij2B
),
718 _mm_mul_ps(prodB
,lij3B
));
719 t1B
= _mm_sub_ps(t1B
,
720 _mm_mul_ps(onefourth
,
721 _mm_add_ps(_mm_mul_ps(lijB
,rinvB
),
722 _mm_mul_ps(lij3B
,rB
))));
723 t2B
= _mm_mul_ps(onefourth
,
724 _mm_add_ps(_mm_mul_ps(uijB
,rinvB
),
725 _mm_mul_ps(uij3B
,rB
)));
726 t2B
= _mm_sub_ps(t2B
,
727 _mm_add_ps(_mm_mul_ps(half
,uij2B
),
728 _mm_mul_ps(prodB
,uij3B
)));
729 t3B
= _mm_mul_ps(_mm_mul_ps(onefourth
,logtermB
),
730 _mm_mul_ps(rinvB
,rinvB
));
731 t3B
= _mm_sub_ps(t3B
,
732 _mm_mul_ps(_mm_mul_ps(diff2B
,oneeighth
),
734 _mm_mul_ps(sk2_rinvB
,rinvB
))));
735 t1B
= _mm_mul_ps(rinvB
,
736 _mm_add_ps(_mm_mul_ps(dlijB
,t1B
),
737 _mm_add_ps(t2B
,t3B
)));
739 dadx1
= _mm_and_ps(t1
,obc_mask1
);
740 dadx1B
= _mm_and_ps(t1B
,obc_mask1B
);
743 /* Evaluate influence of atom ai -> aj */
744 t1
= _mm_add_ps(r
,sk_ai
);
745 t2
= _mm_sub_ps(r
,sk_ai
);
746 t3
= _mm_sub_ps(sk_ai
,r
);
747 t1B
= _mm_add_ps(rB
,sk_ai
);
748 t2B
= _mm_sub_ps(rB
,sk_ai
);
749 t3B
= _mm_sub_ps(sk_ai
,rB
);
750 obc_mask1
= _mm_cmplt_ps(raj
, t1
);
751 obc_mask2
= _mm_cmplt_ps(raj
, t2
);
752 obc_mask3
= _mm_cmplt_ps(raj
, t3
);
753 obc_mask1B
= _mm_cmplt_ps(rajB
, t1B
);
754 obc_mask2B
= _mm_cmplt_ps(rajB
, t2B
);
755 obc_mask3B
= _mm_cmplt_ps(rajB
, t3B
);
757 uij
= gmx_mm_inv_ps(t1
);
758 lij
= _mm_or_ps( _mm_and_ps(obc_mask2
,gmx_mm_inv_ps(t2
)),
759 _mm_andnot_ps(obc_mask2
,raj_inv
));
760 dlij
= _mm_and_ps(one
,obc_mask2
);
761 uij2
= _mm_mul_ps(uij
, uij
);
762 uij3
= _mm_mul_ps(uij2
,uij
);
763 lij2
= _mm_mul_ps(lij
, lij
);
764 lij3
= _mm_mul_ps(lij2
,lij
);
766 uijB
= gmx_mm_inv_ps(t1B
);
767 lijB
= _mm_or_ps( _mm_and_ps(obc_mask2B
,gmx_mm_inv_ps(t2B
)),
768 _mm_andnot_ps(obc_mask2B
,raj_invB
));
769 dlijB
= _mm_and_ps(one
,obc_mask2B
);
770 uij2B
= _mm_mul_ps(uijB
, uijB
);
771 uij3B
= _mm_mul_ps(uij2B
,uijB
);
772 lij2B
= _mm_mul_ps(lijB
, lijB
);
773 lij3B
= _mm_mul_ps(lij2B
,lijB
);
775 diff2
= _mm_sub_ps(uij2
,lij2
);
776 lij_inv
= gmx_mm_invsqrt_ps(lij2
);
777 sk2_rinv
= _mm_mul_ps(sk2_ai
,rinv
);
778 prod
= _mm_mul_ps(onefourth
,sk2_rinv
);
780 diff2B
= _mm_sub_ps(uij2B
,lij2B
);
781 lij_invB
= gmx_mm_invsqrt_ps(lij2B
);
782 sk2_rinvB
= _mm_mul_ps(sk2_ai
,rinvB
);
783 prodB
= _mm_mul_ps(onefourth
,sk2_rinvB
);
785 logterm
= gmx_mm_log_ps(_mm_mul_ps(uij
,lij_inv
));
786 logtermB
= gmx_mm_log_ps(_mm_mul_ps(uijB
,lij_invB
));
788 t1
= _mm_sub_ps(lij
,uij
);
789 t2
= _mm_mul_ps(diff2
,
790 _mm_sub_ps(_mm_mul_ps(onefourth
,r
),
792 t3
= _mm_mul_ps(half
,_mm_mul_ps(rinv
,logterm
));
793 t1
= _mm_add_ps(t1
,_mm_add_ps(t2
,t3
));
794 t4
= _mm_mul_ps(two
,_mm_sub_ps(raj_inv
,lij
));
795 t4
= _mm_and_ps(t4
,obc_mask3
);
796 t1
= _mm_mul_ps(half
,_mm_add_ps(t1
,t4
));
798 t1B
= _mm_sub_ps(lijB
,uijB
);
799 t2B
= _mm_mul_ps(diff2B
,
800 _mm_sub_ps(_mm_mul_ps(onefourth
,rB
),
802 t3B
= _mm_mul_ps(half
,_mm_mul_ps(rinvB
,logtermB
));
803 t1B
= _mm_add_ps(t1B
,_mm_add_ps(t2B
,t3B
));
804 t4B
= _mm_mul_ps(two
,_mm_sub_ps(raj_invB
,lijB
));
805 t4B
= _mm_and_ps(t4B
,obc_mask3B
);
806 t1B
= _mm_mul_ps(half
,_mm_add_ps(t1B
,t4B
));
808 GMX_MM_INCREMENT_4VALUES_PS(work
+jnrA
,work
+jnrB
,work
+jnrC
,work
+jnrD
,_mm_and_ps(t1
,obc_mask1
));
809 GMX_MM_INCREMENT_4VALUES_PS(work
+jnrE
,work
+jnrF
,work
+jnrG
,work
+jnrH
,_mm_and_ps(t1B
,obc_mask1B
));
811 t1
= _mm_add_ps(_mm_mul_ps(half
,lij2
),
812 _mm_mul_ps(prod
,lij3
));
814 _mm_mul_ps(onefourth
,
815 _mm_add_ps(_mm_mul_ps(lij
,rinv
),
816 _mm_mul_ps(lij3
,r
))));
817 t2
= _mm_mul_ps(onefourth
,
818 _mm_add_ps(_mm_mul_ps(uij
,rinv
),
819 _mm_mul_ps(uij3
,r
)));
821 _mm_add_ps(_mm_mul_ps(half
,uij2
),
822 _mm_mul_ps(prod
,uij3
)));
823 t3
= _mm_mul_ps(_mm_mul_ps(onefourth
,logterm
),
824 _mm_mul_ps(rinv
,rinv
));
826 _mm_mul_ps(_mm_mul_ps(diff2
,oneeighth
),
828 _mm_mul_ps(sk2_rinv
,rinv
))));
829 t1
= _mm_mul_ps(rinv
,
830 _mm_add_ps(_mm_mul_ps(dlij
,t1
),
834 t1B
= _mm_add_ps(_mm_mul_ps(half
,lij2B
),
835 _mm_mul_ps(prodB
,lij3B
));
836 t1B
= _mm_sub_ps(t1B
,
837 _mm_mul_ps(onefourth
,
838 _mm_add_ps(_mm_mul_ps(lijB
,rinvB
),
839 _mm_mul_ps(lij3B
,rB
))));
840 t2B
= _mm_mul_ps(onefourth
,
841 _mm_add_ps(_mm_mul_ps(uijB
,rinvB
),
842 _mm_mul_ps(uij3B
,rB
)));
843 t2B
= _mm_sub_ps(t2B
,
844 _mm_add_ps(_mm_mul_ps(half
,uij2B
),
845 _mm_mul_ps(prodB
,uij3B
)));
846 t3B
= _mm_mul_ps(_mm_mul_ps(onefourth
,logtermB
),
847 _mm_mul_ps(rinvB
,rinvB
));
848 t3B
= _mm_sub_ps(t3B
,
849 _mm_mul_ps(_mm_mul_ps(diff2B
,oneeighth
),
851 _mm_mul_ps(sk2_rinvB
,rinvB
))));
852 t1B
= _mm_mul_ps(rinvB
,
853 _mm_add_ps(_mm_mul_ps(dlijB
,t1B
),
854 _mm_add_ps(t2B
,t3B
)));
857 dadx2
= _mm_and_ps(t1
,obc_mask1
);
858 dadx2B
= _mm_and_ps(t1B
,obc_mask1B
);
860 _mm_store_ps(dadx
,dadx1
);
862 _mm_store_ps(dadx
,dadx2
);
864 _mm_store_ps(dadx
,dadx1B
);
866 _mm_store_ps(dadx
,dadx2B
);
869 } /* end normal inner loop */
871 for(;k
<nj1
-offset
;k
+=4)
883 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x
+j3A
,x
+j3B
,x
+j3C
,x
+j3D
,jx
,jy
,jz
);
884 GMX_MM_LOAD_4VALUES_PS(gb_radius
+jnrA
,gb_radius
+jnrB
,gb_radius
+jnrC
,gb_radius
+jnrD
,raj
);
885 GMX_MM_LOAD_4VALUES_PS(obc_param
+jnrA
,obc_param
+jnrB
,obc_param
+jnrC
,obc_param
+jnrD
,sk_aj
);
887 dx
= _mm_sub_ps(ix
, jx
);
888 dy
= _mm_sub_ps(iy
, jy
);
889 dz
= _mm_sub_ps(iz
, jz
);
891 rsq
= gmx_mm_calc_rsq_ps(dx
,dy
,dz
);
893 rinv
= gmx_mm_invsqrt_ps(rsq
);
894 r
= _mm_mul_ps(rsq
,rinv
);
896 /* Compute raj_inv aj1-4 */
897 raj_inv
= gmx_mm_inv_ps(raj
);
899 /* Evaluate influence of atom aj -> ai */
900 t1
= _mm_add_ps(r
,sk_aj
);
901 obc_mask1
= _mm_cmplt_ps(rai
, t1
);
903 if(_mm_movemask_ps(obc_mask1
))
905 /* If any of the elements has rai<dr+sk, this is executed */
906 t2
= _mm_sub_ps(r
,sk_aj
);
907 t3
= _mm_sub_ps(sk_aj
,r
);
909 obc_mask2
= _mm_cmplt_ps(rai
, t2
);
910 obc_mask3
= _mm_cmplt_ps(rai
, t3
);
912 uij
= gmx_mm_inv_ps(t1
);
913 lij
= _mm_or_ps( _mm_and_ps(obc_mask2
,gmx_mm_inv_ps(t2
)),
914 _mm_andnot_ps(obc_mask2
,rai_inv
));
915 dlij
= _mm_and_ps(one
,obc_mask2
);
916 uij2
= _mm_mul_ps(uij
, uij
);
917 uij3
= _mm_mul_ps(uij2
,uij
);
918 lij2
= _mm_mul_ps(lij
, lij
);
919 lij3
= _mm_mul_ps(lij2
,lij
);
920 diff2
= _mm_sub_ps(uij2
,lij2
);
921 lij_inv
= gmx_mm_invsqrt_ps(lij2
);
922 sk2_aj
= _mm_mul_ps(sk_aj
,sk_aj
);
923 sk2_rinv
= _mm_mul_ps(sk2_aj
,rinv
);
924 prod
= _mm_mul_ps(onefourth
,sk2_rinv
);
925 logterm
= gmx_mm_log_ps(_mm_mul_ps(uij
,lij_inv
));
926 t1
= _mm_sub_ps(lij
,uij
);
927 t2
= _mm_mul_ps(diff2
,
928 _mm_sub_ps(_mm_mul_ps(onefourth
,r
),
930 t3
= _mm_mul_ps(half
,_mm_mul_ps(rinv
,logterm
));
931 t1
= _mm_add_ps(t1
,_mm_add_ps(t2
,t3
));
932 t4
= _mm_mul_ps(two
,_mm_sub_ps(rai_inv
,lij
));
933 t4
= _mm_and_ps(t4
,obc_mask3
);
934 t1
= _mm_mul_ps(half
,_mm_add_ps(t1
,t4
));
935 sum_ai
= _mm_add_ps(sum_ai
,_mm_and_ps(t1
,obc_mask1
));
936 t1
= _mm_add_ps(_mm_mul_ps(half
,lij2
),
937 _mm_mul_ps(prod
,lij3
));
939 _mm_mul_ps(onefourth
,
940 _mm_add_ps(_mm_mul_ps(lij
,rinv
),
941 _mm_mul_ps(lij3
,r
))));
942 t2
= _mm_mul_ps(onefourth
,
943 _mm_add_ps(_mm_mul_ps(uij
,rinv
),
944 _mm_mul_ps(uij3
,r
)));
946 _mm_add_ps(_mm_mul_ps(half
,uij2
),
947 _mm_mul_ps(prod
,uij3
)));
948 t3
= _mm_mul_ps(_mm_mul_ps(onefourth
,logterm
),
949 _mm_mul_ps(rinv
,rinv
));
951 _mm_mul_ps(_mm_mul_ps(diff2
,oneeighth
),
953 _mm_mul_ps(sk2_rinv
,rinv
))));
954 t1
= _mm_mul_ps(rinv
,
955 _mm_add_ps(_mm_mul_ps(dlij
,t1
),
958 dadx1
= _mm_and_ps(t1
,obc_mask1
);
962 dadx1
= _mm_setzero_ps();
965 /* Evaluate influence of atom ai -> aj */
966 t1
= _mm_add_ps(r
,sk_ai
);
967 obc_mask1
= _mm_cmplt_ps(raj
, t1
);
969 if(_mm_movemask_ps(obc_mask1
))
971 t2
= _mm_sub_ps(r
,sk_ai
);
972 t3
= _mm_sub_ps(sk_ai
,r
);
973 obc_mask2
= _mm_cmplt_ps(raj
, t2
);
974 obc_mask3
= _mm_cmplt_ps(raj
, t3
);
976 uij
= gmx_mm_inv_ps(t1
);
977 lij
= _mm_or_ps( _mm_and_ps(obc_mask2
,gmx_mm_inv_ps(t2
)),
978 _mm_andnot_ps(obc_mask2
,raj_inv
));
979 dlij
= _mm_and_ps(one
,obc_mask2
);
980 uij2
= _mm_mul_ps(uij
, uij
);
981 uij3
= _mm_mul_ps(uij2
,uij
);
982 lij2
= _mm_mul_ps(lij
, lij
);
983 lij3
= _mm_mul_ps(lij2
,lij
);
984 diff2
= _mm_sub_ps(uij2
,lij2
);
985 lij_inv
= gmx_mm_invsqrt_ps(lij2
);
986 sk2_rinv
= _mm_mul_ps(sk2_ai
,rinv
);
987 prod
= _mm_mul_ps(onefourth
,sk2_rinv
);
988 logterm
= gmx_mm_log_ps(_mm_mul_ps(uij
,lij_inv
));
989 t1
= _mm_sub_ps(lij
,uij
);
990 t2
= _mm_mul_ps(diff2
,
991 _mm_sub_ps(_mm_mul_ps(onefourth
,r
),
993 t3
= _mm_mul_ps(half
,_mm_mul_ps(rinv
,logterm
));
994 t1
= _mm_add_ps(t1
,_mm_add_ps(t2
,t3
));
995 t4
= _mm_mul_ps(two
,_mm_sub_ps(raj_inv
,lij
));
996 t4
= _mm_and_ps(t4
,obc_mask3
);
997 t1
= _mm_mul_ps(half
,_mm_add_ps(t1
,t4
));
999 GMX_MM_INCREMENT_4VALUES_PS(work
+jnrA
,work
+jnrB
,work
+jnrC
,work
+jnrD
,_mm_and_ps(t1
,obc_mask1
));
1001 t1
= _mm_add_ps(_mm_mul_ps(half
,lij2
),
1002 _mm_mul_ps(prod
,lij3
));
1004 _mm_mul_ps(onefourth
,
1005 _mm_add_ps(_mm_mul_ps(lij
,rinv
),
1006 _mm_mul_ps(lij3
,r
))));
1007 t2
= _mm_mul_ps(onefourth
,
1008 _mm_add_ps(_mm_mul_ps(uij
,rinv
),
1009 _mm_mul_ps(uij3
,r
)));
1011 _mm_add_ps(_mm_mul_ps(half
,uij2
),
1012 _mm_mul_ps(prod
,uij3
)));
1013 t3
= _mm_mul_ps(_mm_mul_ps(onefourth
,logterm
),
1014 _mm_mul_ps(rinv
,rinv
));
1016 _mm_mul_ps(_mm_mul_ps(diff2
,oneeighth
),
1018 _mm_mul_ps(sk2_rinv
,rinv
))));
1019 t1
= _mm_mul_ps(rinv
,
1020 _mm_add_ps(_mm_mul_ps(dlij
,t1
),
1021 _mm_add_ps(t2
,t3
)));
1022 dadx2
= _mm_and_ps(t1
,obc_mask1
);
1026 dadx2
= _mm_setzero_ps();
1029 _mm_store_ps(dadx
,dadx1
);
1031 _mm_store_ps(dadx
,dadx2
);
1033 } /* end normal inner loop */
1041 GMX_MM_LOAD_1RVEC_1POINTER_PS(x
+j3A
,jx
,jy
,jz
);
1042 GMX_MM_LOAD_1VALUE_PS(gb_radius
+jnrA
,raj
);
1043 GMX_MM_LOAD_1VALUE_PS(obc_param
+jnrA
,sk_aj
);
1052 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x
+j3A
,x
+j3B
,jx
,jy
,jz
);
1053 GMX_MM_LOAD_2VALUES_PS(gb_radius
+jnrA
,gb_radius
+jnrB
,raj
);
1054 GMX_MM_LOAD_2VALUES_PS(obc_param
+jnrA
,obc_param
+jnrB
,sk_aj
);
1059 /* offset must be 3 */
1066 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x
+j3A
,x
+j3B
,x
+j3C
,jx
,jy
,jz
);
1067 GMX_MM_LOAD_3VALUES_PS(gb_radius
+jnrA
,gb_radius
+jnrB
,gb_radius
+jnrC
,raj
);
1068 GMX_MM_LOAD_3VALUES_PS(obc_param
+jnrA
,obc_param
+jnrB
,obc_param
+jnrC
,sk_aj
);
1072 dx
= _mm_sub_ps(ix
, jx
);
1073 dy
= _mm_sub_ps(iy
, jy
);
1074 dz
= _mm_sub_ps(iz
, jz
);
1076 rsq
= gmx_mm_calc_rsq_ps(dx
,dy
,dz
);
1078 rinv
= gmx_mm_invsqrt_ps(rsq
);
1079 r
= _mm_mul_ps(rsq
,rinv
);
1081 /* Compute raj_inv aj1-4 */
1082 raj_inv
= gmx_mm_inv_ps(raj
);
1084 /* Evaluate influence of atom aj -> ai */
1085 t1
= _mm_add_ps(r
,sk_aj
);
1086 obc_mask1
= _mm_cmplt_ps(rai
, t1
);
1087 obc_mask1
= _mm_and_ps(obc_mask1
,mask
);
1089 if(_mm_movemask_ps(obc_mask1
))
1091 t2
= _mm_sub_ps(r
,sk_aj
);
1092 t3
= _mm_sub_ps(sk_aj
,r
);
1093 obc_mask2
= _mm_cmplt_ps(rai
, t2
);
1094 obc_mask3
= _mm_cmplt_ps(rai
, t3
);
1096 uij
= gmx_mm_inv_ps(t1
);
1097 lij
= _mm_or_ps( _mm_and_ps(obc_mask2
,gmx_mm_inv_ps(t2
)),
1098 _mm_andnot_ps(obc_mask2
,rai_inv
));
1099 dlij
= _mm_and_ps(one
,obc_mask2
);
1100 uij2
= _mm_mul_ps(uij
, uij
);
1101 uij3
= _mm_mul_ps(uij2
,uij
);
1102 lij2
= _mm_mul_ps(lij
, lij
);
1103 lij3
= _mm_mul_ps(lij2
,lij
);
1104 diff2
= _mm_sub_ps(uij2
,lij2
);
1105 lij_inv
= gmx_mm_invsqrt_ps(lij2
);
1106 sk2_aj
= _mm_mul_ps(sk_aj
,sk_aj
);
1107 sk2_rinv
= _mm_mul_ps(sk2_aj
,rinv
);
1108 prod
= _mm_mul_ps(onefourth
,sk2_rinv
);
1109 logterm
= gmx_mm_log_ps(_mm_mul_ps(uij
,lij_inv
));
1110 t1
= _mm_sub_ps(lij
,uij
);
1111 t2
= _mm_mul_ps(diff2
,
1112 _mm_sub_ps(_mm_mul_ps(onefourth
,r
),
1114 t3
= _mm_mul_ps(half
,_mm_mul_ps(rinv
,logterm
));
1115 t1
= _mm_add_ps(t1
,_mm_add_ps(t2
,t3
));
1116 t4
= _mm_mul_ps(two
,_mm_sub_ps(rai_inv
,lij
));
1117 t4
= _mm_and_ps(t4
,obc_mask3
);
1118 t1
= _mm_mul_ps(half
,_mm_add_ps(t1
,t4
));
1119 sum_ai
= _mm_add_ps(sum_ai
,_mm_and_ps(t1
,obc_mask1
));
1120 t1
= _mm_add_ps(_mm_mul_ps(half
,lij2
),
1121 _mm_mul_ps(prod
,lij3
));
1123 _mm_mul_ps(onefourth
,
1124 _mm_add_ps(_mm_mul_ps(lij
,rinv
),
1125 _mm_mul_ps(lij3
,r
))));
1126 t2
= _mm_mul_ps(onefourth
,
1127 _mm_add_ps(_mm_mul_ps(uij
,rinv
),
1128 _mm_mul_ps(uij3
,r
)));
1130 _mm_add_ps(_mm_mul_ps(half
,uij2
),
1131 _mm_mul_ps(prod
,uij3
)));
1132 t3
= _mm_mul_ps(_mm_mul_ps(onefourth
,logterm
),
1133 _mm_mul_ps(rinv
,rinv
));
1135 _mm_mul_ps(_mm_mul_ps(diff2
,oneeighth
),
1137 _mm_mul_ps(sk2_rinv
,rinv
))));
1138 t1
= _mm_mul_ps(rinv
,
1139 _mm_add_ps(_mm_mul_ps(dlij
,t1
),
1140 _mm_add_ps(t2
,t3
)));
1141 dadx1
= _mm_and_ps(t1
,obc_mask1
);
1145 dadx1
= _mm_setzero_ps();
1148 /* Evaluate influence of atom ai -> aj */
1149 t1
= _mm_add_ps(r
,sk_ai
);
1150 obc_mask1
= _mm_cmplt_ps(raj
, t1
);
1151 obc_mask1
= _mm_and_ps(obc_mask1
,mask
);
1153 if(_mm_movemask_ps(obc_mask1
))
1155 t2
= _mm_sub_ps(r
,sk_ai
);
1156 t3
= _mm_sub_ps(sk_ai
,r
);
1157 obc_mask2
= _mm_cmplt_ps(raj
, t2
);
1158 obc_mask3
= _mm_cmplt_ps(raj
, t3
);
1160 uij
= gmx_mm_inv_ps(t1
);
1161 lij
= _mm_or_ps(_mm_and_ps(obc_mask2
,gmx_mm_inv_ps(t2
)),
1162 _mm_andnot_ps(obc_mask2
,raj_inv
));
1163 dlij
= _mm_and_ps(one
,obc_mask2
);
1164 uij2
= _mm_mul_ps(uij
, uij
);
1165 uij3
= _mm_mul_ps(uij2
,uij
);
1166 lij2
= _mm_mul_ps(lij
, lij
);
1167 lij3
= _mm_mul_ps(lij2
,lij
);
1168 diff2
= _mm_sub_ps(uij2
,lij2
);
1169 lij_inv
= gmx_mm_invsqrt_ps(lij2
);
1170 sk2_rinv
= _mm_mul_ps(sk2_ai
,rinv
);
1171 prod
= _mm_mul_ps(onefourth
,sk2_rinv
);
1172 logterm
= gmx_mm_log_ps(_mm_mul_ps(uij
,lij_inv
));
1173 t1
= _mm_sub_ps(lij
,uij
);
1174 t2
= _mm_mul_ps(diff2
,
1175 _mm_sub_ps(_mm_mul_ps(onefourth
,r
),
1177 t3
= _mm_mul_ps(half
,_mm_mul_ps(rinv
,logterm
));
1178 t1
= _mm_add_ps(t1
,_mm_add_ps(t2
,t3
));
1179 t4
= _mm_mul_ps(two
,_mm_sub_ps(raj_inv
,lij
));
1180 t4
= _mm_and_ps(t4
,obc_mask3
);
1181 t1
= _mm_mul_ps(half
,_mm_add_ps(t1
,t4
));
1183 tmp
= _mm_and_ps(t1
,obc_mask1
);
1185 t1
= _mm_add_ps(_mm_mul_ps(half
,lij2
),
1186 _mm_mul_ps(prod
,lij3
));
1188 _mm_mul_ps(onefourth
,
1189 _mm_add_ps(_mm_mul_ps(lij
,rinv
),
1190 _mm_mul_ps(lij3
,r
))));
1191 t2
= _mm_mul_ps(onefourth
,
1192 _mm_add_ps(_mm_mul_ps(uij
,rinv
),
1193 _mm_mul_ps(uij3
,r
)));
1195 _mm_add_ps(_mm_mul_ps(half
,uij2
),
1196 _mm_mul_ps(prod
,uij3
)));
1197 t3
= _mm_mul_ps(_mm_mul_ps(onefourth
,logterm
),
1198 _mm_mul_ps(rinv
,rinv
));
1200 _mm_mul_ps(_mm_mul_ps(diff2
,oneeighth
),
1202 _mm_mul_ps(sk2_rinv
,rinv
))));
1203 t1
= _mm_mul_ps(rinv
,
1204 _mm_add_ps(_mm_mul_ps(dlij
,t1
),
1205 _mm_add_ps(t2
,t3
)));
1206 dadx2
= _mm_and_ps(t1
,obc_mask1
);
1210 dadx2
= _mm_setzero_ps();
1211 tmp
= _mm_setzero_ps();
1214 _mm_store_ps(dadx
,dadx1
);
1216 _mm_store_ps(dadx
,dadx2
);
1221 GMX_MM_INCREMENT_1VALUE_PS(work
+jnrA
,tmp
);
1225 GMX_MM_INCREMENT_2VALUES_PS(work
+jnrA
,work
+jnrB
,tmp
);
1229 /* offset must be 3 */
1230 GMX_MM_INCREMENT_3VALUES_PS(work
+jnrA
,work
+jnrB
,work
+jnrC
,tmp
);
1234 gmx_mm_update_1pot_ps(sum_ai
,work
+ii
);
1238 /* Parallel summations */
1241 gmx_sum(natoms
, work
, cr
);
1243 else if(DOMAINDECOMP(cr
))
1245 dd_atom_sum_real(cr
->dd
, work
);
1248 if(gb_algorithm
==egbHCT
)
1251 for(i
=0;i
<fr
->natoms_force
;i
++) /* PELA born->nr */
1253 if(born
->use
[i
] != 0)
1255 rr
= top
->atomtypes
.gb_radius
[md
->typeA
[i
]]-doffset
;
1256 sum
= 1.0/rr
- work
[i
];
1257 min_rad
= rr
+ doffset
;
1260 born
->bRad
[i
] = rad
> min_rad
? rad
: min_rad
;
1261 fr
->invsqrta
[i
] = gmx_invsqrt(born
->bRad
[i
]);
1265 /* Extra communication required for DD */
1266 if(DOMAINDECOMP(cr
))
1268 dd_atom_spread_real(cr
->dd
, born
->bRad
);
1269 dd_atom_spread_real(cr
->dd
, fr
->invsqrta
);
1275 for(i
=0;i
<fr
->natoms_force
;i
++) /* PELA born->nr */
1277 if(born
->use
[i
] != 0)
1279 rr
= top
->atomtypes
.gb_radius
[md
->typeA
[i
]];
1287 tsum
= tanh(born
->obc_alpha
*sum
-born
->obc_beta
*sum2
+born
->obc_gamma
*sum3
);
1288 born
->bRad
[i
] = rr_inv
- tsum
*rr_inv2
;
1289 born
->bRad
[i
] = 1.0 / born
->bRad
[i
];
1291 fr
->invsqrta
[i
]=gmx_invsqrt(born
->bRad
[i
]);
1293 tchain
= rr
* (born
->obc_alpha
-2*born
->obc_beta
*sum
+3*born
->obc_gamma
*sum2
);
1294 born
->drobc
[i
] = (1.0-tsum
*tsum
)*tchain
*rr_inv2
;
1297 /* Extra (local) communication required for DD */
1298 if(DOMAINDECOMP(cr
))
1300 dd_atom_spread_real(cr
->dd
, born
->bRad
);
1301 dd_atom_spread_real(cr
->dd
, fr
->invsqrta
);
1302 dd_atom_spread_real(cr
->dd
, born
->drobc
);
1313 float calc_gb_chainrule_sse(int natoms
, t_nblist
*nl
, float *dadx
, float *dvda
,
1314 float *x
, float *f
, float *fshift
, float *shiftvec
,
1315 int gb_algorithm
, gmx_genborn_t
*born
, t_mdatoms
*md
)
1317 int i
,k
,n
,ii
,jnr
,ii3
,is3
,nj0
,nj1
,offset
,n0
,n1
;
1318 int jnrA
,jnrB
,jnrC
,jnrD
;
1319 int j3A
,j3B
,j3C
,j3D
;
1320 int jnrE
,jnrF
,jnrG
,jnrH
;
1321 int j3E
,j3F
,j3G
,j3H
;
1324 float rbi
,shX
,shY
,shZ
;
1336 __m128 rbai
,rbaj
,rbajB
, f_gb
, f_gb_ai
,f_gbB
,f_gb_aiB
;
1337 __m128 xmm1
,xmm2
,xmm3
;
1339 const __m128 two
= {2.0f
, 2.0f
, 2.0f
, 2.0f
};
1345 /* Loop to get the proper form for the Born radius term, sse style */
1349 n1
= md
->start
+md
->homenr
+1+natoms
/2;
1351 if(gb_algorithm
==egbSTILL
)
1356 rbi
= born
->bRad
[k
];
1357 rb
[k
] = (2 * rbi
* rbi
* dvda
[k
])/ONE_4PI_EPS0
;
1360 else if(gb_algorithm
==egbHCT
)
1365 rbi
= born
->bRad
[k
];
1366 rb
[k
] = rbi
* rbi
* dvda
[k
];
1369 else if(gb_algorithm
==egbOBC
)
1374 rbi
= born
->bRad
[k
];
1375 rb
[k
] = rbi
* rbi
* born
->drobc
[k
] * dvda
[k
];
1379 jz
= _mm_setzero_ps();
1381 n
= j3A
= j3B
= j3C
= j3D
= 0;
1383 for(i
=0;i
<nl
->nri
;i
++)
1387 is3
= 3*nl
->shift
[i
];
1388 shX
= shiftvec
[is3
];
1389 shY
= shiftvec
[is3
+1];
1390 shZ
= shiftvec
[is3
+2];
1391 nj0
= nl
->jindex
[i
];
1392 nj1
= nl
->jindex
[i
+1];
1394 ix
= _mm_set1_ps(shX
+x
[ii3
+0]);
1395 iy
= _mm_set1_ps(shY
+x
[ii3
+1]);
1396 iz
= _mm_set1_ps(shZ
+x
[ii3
+2]);
1398 offset
= (nj1
-nj0
)%4;
1400 rbai
= _mm_load1_ps(rb
+ii
);
1401 fix
= _mm_setzero_ps();
1402 fiy
= _mm_setzero_ps();
1403 fiz
= _mm_setzero_ps();
1406 for(k
=nj0
;k
<nj1
-offset
;k
+=4)
1418 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x
+j3A
,x
+j3B
,x
+j3C
,x
+j3D
,jx
,jy
,jz
);
1420 dx
= _mm_sub_ps(ix
,jx
);
1421 dy
= _mm_sub_ps(iy
,jy
);
1422 dz
= _mm_sub_ps(iz
,jz
);
1424 GMX_MM_LOAD_4VALUES_PS(rb
+jnrA
,rb
+jnrB
,rb
+jnrC
,rb
+jnrD
,rbaj
);
1426 /* load chain rule terms for j1-4 */
1427 f_gb
= _mm_load_ps(dadx
);
1429 f_gb_ai
= _mm_load_ps(dadx
);
1432 /* calculate scalar force */
1433 f_gb
= _mm_mul_ps(f_gb
,rbai
);
1434 f_gb_ai
= _mm_mul_ps(f_gb_ai
,rbaj
);
1435 f_gb
= _mm_add_ps(f_gb
,f_gb_ai
);
1437 tx
= _mm_mul_ps(f_gb
,dx
);
1438 ty
= _mm_mul_ps(f_gb
,dy
);
1439 tz
= _mm_mul_ps(f_gb
,dz
);
1441 fix
= _mm_add_ps(fix
,tx
);
1442 fiy
= _mm_add_ps(fiy
,ty
);
1443 fiz
= _mm_add_ps(fiz
,tz
);
1445 GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(f
+j3A
,f
+j3B
,f
+j3C
,f
+j3D
,tx
,ty
,tz
);
1448 /*deal with odd elements */
1455 GMX_MM_LOAD_1RVEC_1POINTER_PS(x
+j3A
,jx
,jy
,jz
);
1456 GMX_MM_LOAD_1VALUE_PS(rb
+jnrA
,rbaj
);
1464 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x
+j3A
,x
+j3B
,jx
,jy
,jz
);
1465 GMX_MM_LOAD_2VALUES_PS(rb
+jnrA
,rb
+jnrB
,rbaj
);
1469 /* offset must be 3 */
1476 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x
+j3A
,x
+j3B
,x
+j3C
,jx
,jy
,jz
);
1477 GMX_MM_LOAD_3VALUES_PS(rb
+jnrA
,rb
+jnrB
,rb
+jnrC
,rbaj
);
1480 dx
= _mm_sub_ps(ix
,jx
);
1481 dy
= _mm_sub_ps(iy
,jy
);
1482 dz
= _mm_sub_ps(iz
,jz
);
1484 /* load chain rule terms for j1-4 */
1485 f_gb
= _mm_load_ps(dadx
);
1487 f_gb_ai
= _mm_load_ps(dadx
);
1490 /* calculate scalar force */
1491 f_gb
= _mm_mul_ps(f_gb
,rbai
);
1492 f_gb_ai
= _mm_mul_ps(f_gb_ai
,rbaj
);
1493 f_gb
= _mm_add_ps(f_gb
,f_gb_ai
);
1495 tx
= _mm_mul_ps(f_gb
,dx
);
1496 ty
= _mm_mul_ps(f_gb
,dy
);
1497 tz
= _mm_mul_ps(f_gb
,dz
);
1499 fix
= _mm_add_ps(fix
,tx
);
1500 fiy
= _mm_add_ps(fiy
,ty
);
1501 fiz
= _mm_add_ps(fiz
,tz
);
1505 GMX_MM_DECREMENT_1RVEC_1POINTER_PS(f
+j3A
,tx
,ty
,tz
);
1509 GMX_MM_DECREMENT_1RVEC_2POINTERS_PS(f
+j3A
,f
+j3B
,tx
,ty
,tz
);
1513 /* offset must be 3 */
1514 GMX_MM_DECREMENT_1RVEC_3POINTERS_PS(f
+j3A
,f
+j3B
,f
+j3C
,tx
,ty
,tz
);
1518 /* fix/fiy/fiz now contain four partial force terms, that all should be
1519 * added to the i particle forces and shift forces.
1521 gmx_mm_update_iforce_1atom_ps(fix
,fiy
,fiz
,f
+ii3
,fshift
+is3
);
1528 float gb_bonds_analytic(real
*x
, real
*f
, real
*charge
, real
*bRad
, real
*dvda
,
1529 t_idef
*idef
, real epsilon_r
, real gb_epsilon_solvent
, real facel
)
1533 int type1
,type2
,type3
,type4
,ai1
,ai2
,ai3
,ai4
,aj1
,aj2
,aj3
,aj4
;
1534 int ai13
,ai23
,ai33
,ai43
,aj13
,aj23
,aj33
,aj43
;
1539 __m128 ix
,iy
,iz
,jx
,jy
,jz
,dx
,dy
,dz
;
1540 __m128 rsq11
,rinv
,r
,t1
,t2
,t3
;
1541 __m128 isai
,isaj
,isaprod
,inv_isaprod
,expterm
;
1542 __m128 ci
,cj
,qq
,fac
,dva
,dva_i
,dva_j
,di
,dj
;
1543 __m128 vgb
,vgb_tot
,fgb
,fgb2
,fijC
,fscal
;
1544 __m128 tx
,ty
,tz
,fix1
,fiy1
,fiz1
;
1545 __m128 xmm1
,xmm2
,xmm3
,xmm4
,xmm5
,xmm6
,xmm7
,xmm8
;
1548 const __m128 neg
= {-1.0f
, -1.0f
, -1.0f
, -1.0f
};
1549 const __m128 qrtr
= {0.25f
, 0.25f
, 0.25f
, 0.25f
};
1550 const __m128 eigth
= {0.125f
, 0.125f
, 0.125f
, 0.125f
};
1551 const __m128 half
= {0.5f
, 0.5f
, 0.5f
, 0.5f
};
1552 const __m128 three
= {3.0f
, 3.0f
, 3.0f
, 3.0f
};
1554 t_iatom
*forceatoms
;
1556 /* Keep the compiler happy */
1557 vgb_tot
= _mm_setzero_ps();
1558 vgb
= _mm_setzero_ps();
1559 xmm1
= _mm_setzero_ps();
1560 xmm2
= _mm_setzero_ps();
1561 xmm3
= _mm_setzero_ps();
1562 xmm4
= _mm_setzero_ps();
1563 ai1
= ai2
= ai3
= ai4
= 0;
1564 aj1
= aj2
= aj3
= aj4
= 0;
1565 ai13
= ai23
= ai33
= ai43
= 0;
1566 aj13
= aj23
= aj33
= aj43
= 0;
1568 /* Scale the electrostatics by gb_epsilon_solvent */
1569 facel
= (-1.0) * facel
* ((1.0/epsilon_r
) - 1.0/gb_epsilon_solvent
);
1570 fac
= _mm_load1_ps(&facel
);
1574 for(j
=F_GB12
;j
<=F_GB14
;j
++)
1576 forceatoms
= idef
->il
[j
].iatoms
;
1578 /* Number of atoms involved in each GB interaction plus the interaction type */
1580 nri
= idef
->il
[j
].nr
/nral
;
1583 /* In the for loop, i is updated for every element in the forceatom list, so we have to stop
1584 * the loop at offset*nral before the maximum idef->il[j].nr number, instead of just offset
1586 for(i
=0;i
<idef
->il
[j
].nr
-(offset
*nral
); )
1588 /* Load everything separately for now, test with sse load and shuffles later
1589 * Also, to avoid reading in the interaction type, we just increment i to pass over
1590 * the types in the forceatoms array, this saves some memory accesses.
1593 ai1
= forceatoms
[i
++];
1594 aj1
= forceatoms
[i
++];
1597 ai2
= forceatoms
[i
++];
1598 aj2
= forceatoms
[i
++];
1601 ai3
= forceatoms
[i
++];
1602 aj3
= forceatoms
[i
++];
1605 ai4
= forceatoms
[i
++];
1606 aj4
= forceatoms
[i
++];
1618 /* Load particle ai1-4 and transpose */
1619 xmm1
= _mm_loadh_pi(xmm1
,(__m64
*) (x
+ai13
));
1620 xmm2
= _mm_loadh_pi(xmm2
,(__m64
*) (x
+ai23
));
1621 xmm3
= _mm_loadh_pi(xmm3
,(__m64
*) (x
+ai33
));
1622 xmm4
= _mm_loadh_pi(xmm4
,(__m64
*) (x
+ai43
));
1624 xmm5
= _mm_load1_ps(x
+ai13
+2);
1625 xmm6
= _mm_load1_ps(x
+ai23
+2);
1626 xmm7
= _mm_load1_ps(x
+ai33
+2);
1627 xmm8
= _mm_load1_ps(x
+ai43
+2);
1629 xmm5
= _mm_shuffle_ps(xmm5
,xmm6
,_MM_SHUFFLE(0,0,0,0));
1630 xmm6
= _mm_shuffle_ps(xmm7
,xmm8
,_MM_SHUFFLE(0,0,0,0));
1631 iz
= _mm_shuffle_ps(xmm5
,xmm6
,_MM_SHUFFLE(2,0,2,0));
1633 xmm1
= _mm_shuffle_ps(xmm1
,xmm2
,_MM_SHUFFLE(3,2,3,2));
1634 xmm2
= _mm_shuffle_ps(xmm3
,xmm4
,_MM_SHUFFLE(3,2,3,2));
1635 ix
= _mm_shuffle_ps(xmm1
,xmm2
,_MM_SHUFFLE(2,0,2,0));
1636 iy
= _mm_shuffle_ps(xmm1
,xmm2
,_MM_SHUFFLE(3,1,3,1));
1638 /* Load particle aj1-4 and transpose */
1639 xmm1
= _mm_loadh_pi(xmm1
,(__m64
*) (x
+aj13
));
1640 xmm2
= _mm_loadh_pi(xmm2
,(__m64
*) (x
+aj23
));
1641 xmm3
= _mm_loadh_pi(xmm3
,(__m64
*) (x
+aj33
));
1642 xmm4
= _mm_loadh_pi(xmm4
,(__m64
*) (x
+aj43
));
1644 xmm5
= _mm_load1_ps(x
+aj13
+2);
1645 xmm6
= _mm_load1_ps(x
+aj23
+2);
1646 xmm7
= _mm_load1_ps(x
+aj33
+2);
1647 xmm8
= _mm_load1_ps(x
+aj43
+2);
1649 xmm5
= _mm_shuffle_ps(xmm5
,xmm6
,_MM_SHUFFLE(0,0,0,0));
1650 xmm6
= _mm_shuffle_ps(xmm7
,xmm8
,_MM_SHUFFLE(0,0,0,0));
1651 jz
= _mm_shuffle_ps(xmm5
,xmm6
,_MM_SHUFFLE(2,0,2,0));
1653 xmm1
= _mm_shuffle_ps(xmm1
,xmm2
,_MM_SHUFFLE(3,2,3,2));
1654 xmm2
= _mm_shuffle_ps(xmm3
,xmm4
,_MM_SHUFFLE(3,2,3,2));
1655 jx
= _mm_shuffle_ps(xmm1
,xmm2
,_MM_SHUFFLE(2,0,2,0));
1656 jy
= _mm_shuffle_ps(xmm1
,xmm2
,_MM_SHUFFLE(3,1,3,1));
1659 dx
= _mm_sub_ps(ix
, jx
);
1660 dy
= _mm_sub_ps(iy
, jy
);
1661 dz
= _mm_sub_ps(iz
, jz
);
1663 rsq11
= _mm_add_ps( _mm_add_ps( _mm_mul_ps(dx
,dx
) , _mm_mul_ps(dy
,dy
) ) , _mm_mul_ps(dz
,dz
) );
1665 rinv
= gmx_mm_inv_ps(rsq11
);
1666 r
= _mm_mul_ps(rsq11
,rinv
);
1668 /* Load Born radii for ai's and aj's */
1669 xmm1
= _mm_load_ss(bRad
+ai1
);
1670 xmm2
= _mm_load_ss(bRad
+ai2
);
1671 xmm3
= _mm_load_ss(bRad
+ai3
);
1672 xmm4
= _mm_load_ss(bRad
+ai4
);
1674 xmm1
= _mm_shuffle_ps(xmm1
,xmm2
,_MM_SHUFFLE(0,0,0,0));
1675 xmm3
= _mm_shuffle_ps(xmm3
,xmm4
,_MM_SHUFFLE(0,0,0,0));
1676 isai
= _mm_shuffle_ps(xmm1
,xmm3
,_MM_SHUFFLE(2,0,2,0));
1678 xmm1
= _mm_load_ss(bRad
+aj1
);
1679 xmm2
= _mm_load_ss(bRad
+aj2
);
1680 xmm3
= _mm_load_ss(bRad
+aj3
);
1681 xmm4
= _mm_load_ss(bRad
+aj4
);
1683 xmm1
= _mm_shuffle_ps(xmm1
,xmm2
,_MM_SHUFFLE(0,0,0,0));
1684 xmm3
= _mm_shuffle_ps(xmm3
,xmm4
,_MM_SHUFFLE(0,0,0,0));
1685 isaj
= _mm_shuffle_ps(xmm1
,xmm3
,_MM_SHUFFLE(2,0,2,0));
1687 isaprod
= _mm_mul_ps(isai
,isaj
); // rb2 in tinker
1688 inv_isaprod
= _mm_mul_ps(isaprod
,isaprod
);
1689 inv_isaprod
= gmx_mm_inv_ps(inv_isaprod
); //1/rb2 in tinker
1691 /* Load charges for ai's and aj's */
1692 xmm1
= _mm_load_ss(charge
+ai1
);
1693 xmm2
= _mm_load_ss(charge
+ai2
);
1694 xmm3
= _mm_load_ss(charge
+ai3
);
1695 xmm4
= _mm_load_ss(charge
+ai4
);
1697 xmm1
= _mm_shuffle_ps(xmm1
,xmm2
,_MM_SHUFFLE(0,0,0,0));
1698 xmm3
= _mm_shuffle_ps(xmm3
,xmm4
,_MM_SHUFFLE(0,0,0,0));
1699 ci
= _mm_shuffle_ps(xmm1
,xmm3
,_MM_SHUFFLE(2,0,2,0));
1701 xmm1
= _mm_load_ss(charge
+aj1
);
1702 xmm2
= _mm_load_ss(charge
+aj2
);
1703 xmm3
= _mm_load_ss(charge
+aj3
);
1704 xmm4
= _mm_load_ss(charge
+aj4
);
1706 xmm1
= _mm_shuffle_ps(xmm1
,xmm2
,_MM_SHUFFLE(0,0,0,0));
1707 xmm3
= _mm_shuffle_ps(xmm3
,xmm4
,_MM_SHUFFLE(0,0,0,0));
1708 cj
= _mm_shuffle_ps(xmm1
,xmm3
,_MM_SHUFFLE(2,0,2,0));
1710 qq
= _mm_mul_ps(ci
,fac
);
1711 qq
= _mm_mul_ps(cj
,qq
);
1713 /* Calculate scalar GB force */
1714 expterm
= _mm_mul_ps(rsq11
,inv_isaprod
);
1715 expterm
= _mm_mul_ps(expterm
,qrtr
);
1716 expterm
= _mm_mul_ps(expterm
,neg
);
1717 expterm
= gmx_mm_exp_ps(expterm
);
1719 fgb2
= _mm_mul_ps(isaprod
,expterm
);
1720 fgb2
= _mm_add_ps(fgb2
,rsq11
);
1722 fgb
= gmx_mm_inv_ps(fgb2
);
1724 /* Potential energy */
1725 vgb
= _mm_mul_ps(qq
,fgb
);
1726 vgb_tot
= _mm_add_ps(vgb_tot
,vgb
);
1728 fijC
= _mm_mul_ps(qrtr
,r
);
1729 fijC
= _mm_mul_ps(fijC
,expterm
);
1730 fijC
= _mm_sub_ps(r
,fijC
);
1731 fijC
= _mm_mul_ps(fijC
,vgb
);
1732 fijC
= _mm_mul_ps(fijC
,neg
);
1733 fijC
= _mm_mul_ps(fijC
,fgb
);
1734 fijC
= _mm_mul_ps(fijC
,fgb
); /* fijC = fijC in tab code, de */
1736 /* Chain rule terms */
1737 dva
= _mm_mul_ps(rsq11
,inv_isaprod
);
1738 dva
= _mm_mul_ps(dva
,eigth
);
1739 dva
= _mm_add_ps(dva
,half
);
1740 dva
= _mm_mul_ps(dva
,expterm
);
1741 dva
= _mm_mul_ps(dva
,vgb
);
1742 dva
= _mm_mul_ps(dva
,neg
);
1743 dva
= _mm_mul_ps(dva
,fgb
);
1744 dva
= _mm_mul_ps(dva
,fgb
); /* dva * isaj = dvatmp*isai*isai */
1746 /* Calculate vectorial force */
1747 fscal
= _mm_mul_ps(fijC
,rinv
);
1748 fscal
= _mm_mul_ps(fscal
,neg
);
1750 tx
= _mm_mul_ps(fscal
,dx
);
1751 ty
= _mm_mul_ps(fscal
,dy
);
1752 tz
= _mm_mul_ps(fscal
,dz
);
1754 /* Load, update and store chain rule terms for ai and aj atoms */
1755 dva_i
= _mm_mul_ps(dva
,isaj
);
1756 dva_j
= _mm_mul_ps(dva
,isai
);
1758 xmm1
= _mm_load_ss(dvda
+ai1
);
1759 xmm2
= _mm_load_ss(dvda
+aj1
);
1760 xmm1
= _mm_add_ss(xmm1
,dva_i
);
1761 xmm2
= _mm_add_ss(xmm2
,dva_j
);
1762 _mm_store_ss(dvda
+ai1
,xmm1
);
1763 _mm_store_ss(dvda
+aj1
,xmm2
);
1765 /* Need to shuffle dva_i and dva_j */
1766 dva_i
= _mm_shuffle_ps(dva_i
,dva_i
,_MM_SHUFFLE(0,3,2,1));
1767 dva_j
= _mm_shuffle_ps(dva_j
,dva_j
,_MM_SHUFFLE(0,3,2,1));
1769 xmm1
= _mm_load_ss(dvda
+ai2
);
1770 xmm2
= _mm_load_ss(dvda
+aj2
);
1771 xmm1
= _mm_add_ss(xmm1
,dva_i
);
1772 xmm2
= _mm_add_ss(xmm2
,dva_j
);
1773 _mm_store_ss(dvda
+ai2
,xmm1
);
1774 _mm_store_ss(dvda
+aj2
,xmm2
);
1776 /* Need to shuffle dva_i and dva_j */
1777 dva_i
= _mm_shuffle_ps(dva_i
,dva_i
,_MM_SHUFFLE(0,3,2,1));
1778 dva_j
= _mm_shuffle_ps(dva_j
,dva_j
,_MM_SHUFFLE(0,3,2,1));
1780 xmm1
= _mm_load_ss(dvda
+ai3
);
1781 xmm2
= _mm_load_ss(dvda
+aj3
);
1782 xmm1
= _mm_add_ss(xmm1
,dva_i
);
1783 xmm2
= _mm_add_ss(xmm2
,dva_j
);
1784 _mm_store_ss(dvda
+ai3
,xmm1
);
1785 _mm_store_ss(dvda
+aj3
,xmm2
);
1787 /* Need to shuffle dva_i and dva_j */
1788 dva_i
= _mm_shuffle_ps(dva_i
,dva_i
,_MM_SHUFFLE(0,3,2,1));
1789 dva_j
= _mm_shuffle_ps(dva_j
,dva_j
,_MM_SHUFFLE(0,3,2,1));
1791 xmm1
= _mm_load_ss(dvda
+ai4
);
1792 xmm2
= _mm_load_ss(dvda
+aj4
);
1793 xmm1
= _mm_add_ss(xmm1
,dva_i
);
1794 xmm2
= _mm_add_ss(xmm2
,dva_j
);
1795 _mm_store_ss(dvda
+ai4
,xmm1
);
1796 _mm_store_ss(dvda
+aj4
,xmm2
);
1798 /* Load, update and store partial forces on ai and ai atoms
1799 * This needs to be done in the order ai1+aj1,ai2+aj2 etc...
1800 * since loading all four values at once will mean the force
1801 * is not updated properly
1804 xmm1
= _mm_load_ss(f
+ai13
);
1805 xmm2
= _mm_load_ss(f
+ai13
+1);
1806 xmm3
= _mm_load_ss(f
+ai13
+2);
1808 xmm4
= _mm_load_ss(f
+aj13
);
1809 xmm5
= _mm_load_ss(f
+aj13
+1);
1810 xmm6
= _mm_load_ss(f
+aj13
+2);
1812 xmm1
= _mm_add_ss(xmm1
,tx
);
1813 xmm2
= _mm_add_ss(xmm2
,ty
);
1814 xmm3
= _mm_add_ss(xmm3
,tz
);
1816 xmm4
= _mm_sub_ss(xmm4
,tx
);
1817 xmm5
= _mm_sub_ss(xmm5
,ty
);
1818 xmm6
= _mm_sub_ss(xmm6
,tz
);
1820 _mm_store_ss(f
+ai13
,xmm1
);
1821 _mm_store_ss(f
+ai13
+1,xmm2
);
1822 _mm_store_ss(f
+ai13
+2,xmm3
);
1824 _mm_store_ss(f
+aj13
,xmm4
);
1825 _mm_store_ss(f
+aj13
+1,xmm5
);
1826 _mm_store_ss(f
+aj13
+2,xmm6
);
1829 xmm1
= _mm_load_ss(f
+ai23
);
1830 xmm2
= _mm_load_ss(f
+ai23
+1);
1831 xmm3
= _mm_load_ss(f
+ai23
+2);
1833 xmm4
= _mm_load_ss(f
+aj23
);
1834 xmm5
= _mm_load_ss(f
+aj23
+1);
1835 xmm6
= _mm_load_ss(f
+aj23
+2);
1837 /* Need to shuffle tx/ty/tz */
1838 tx
= _mm_shuffle_ps(tx
,tx
,_MM_SHUFFLE(0,3,2,1));
1839 ty
= _mm_shuffle_ps(ty
,ty
,_MM_SHUFFLE(0,3,2,1));
1840 tz
= _mm_shuffle_ps(tz
,tz
,_MM_SHUFFLE(0,3,2,1));
1842 xmm1
= _mm_add_ss(xmm1
,tx
);
1843 xmm2
= _mm_add_ss(xmm2
,ty
);
1844 xmm3
= _mm_add_ss(xmm3
,tz
);
1846 xmm4
= _mm_sub_ss(xmm4
,tx
);
1847 xmm5
= _mm_sub_ss(xmm5
,ty
);
1848 xmm6
= _mm_sub_ss(xmm6
,tz
);
1850 _mm_store_ss(f
+ai23
,xmm1
);
1851 _mm_store_ss(f
+ai23
+1,xmm2
);
1852 _mm_store_ss(f
+ai23
+2,xmm3
);
1854 _mm_store_ss(f
+aj23
,xmm4
);
1855 _mm_store_ss(f
+aj23
+1,xmm5
);
1856 _mm_store_ss(f
+aj23
+2,xmm6
);
1859 xmm1
= _mm_load_ss(f
+ai33
);
1860 xmm2
= _mm_load_ss(f
+ai33
+1);
1861 xmm3
= _mm_load_ss(f
+ai33
+2);
1863 xmm4
= _mm_load_ss(f
+aj33
);
1864 xmm5
= _mm_load_ss(f
+aj33
+1);
1865 xmm6
= _mm_load_ss(f
+aj33
+2);
1867 /* Need to shuffle tx/ty/tz */
1868 tx
= _mm_shuffle_ps(tx
,tx
,_MM_SHUFFLE(0,3,2,1));
1869 ty
= _mm_shuffle_ps(ty
,ty
,_MM_SHUFFLE(0,3,2,1));
1870 tz
= _mm_shuffle_ps(tz
,tz
,_MM_SHUFFLE(0,3,2,1));
1872 xmm1
= _mm_add_ss(xmm1
,tx
);
1873 xmm2
= _mm_add_ss(xmm2
,ty
);
1874 xmm3
= _mm_add_ss(xmm3
,tz
);
1876 xmm4
= _mm_sub_ss(xmm4
,tx
);
1877 xmm5
= _mm_sub_ss(xmm5
,ty
);
1878 xmm6
= _mm_sub_ss(xmm6
,tz
);
1880 _mm_store_ss(f
+ai33
,xmm1
);
1881 _mm_store_ss(f
+ai33
+1,xmm2
);
1882 _mm_store_ss(f
+ai33
+2,xmm3
);
1884 _mm_store_ss(f
+aj33
,xmm4
);
1885 _mm_store_ss(f
+aj33
+1,xmm5
);
1886 _mm_store_ss(f
+aj33
+2,xmm6
);
1889 xmm1
= _mm_load_ss(f
+ai43
);
1890 xmm2
= _mm_load_ss(f
+ai43
+1);
1891 xmm3
= _mm_load_ss(f
+ai43
+2);
1893 xmm4
= _mm_load_ss(f
+aj43
);
1894 xmm5
= _mm_load_ss(f
+aj43
+1);
1895 xmm6
= _mm_load_ss(f
+aj43
+2);
1897 /* Need to shuffle tx/ty/tz */
1898 tx
= _mm_shuffle_ps(tx
,tx
,_MM_SHUFFLE(0,3,2,1));
1899 ty
= _mm_shuffle_ps(ty
,ty
,_MM_SHUFFLE(0,3,2,1));
1900 tz
= _mm_shuffle_ps(tz
,tz
,_MM_SHUFFLE(0,3,2,1));
1902 xmm1
= _mm_add_ss(xmm1
,tx
);
1903 xmm2
= _mm_add_ss(xmm2
,ty
);
1904 xmm3
= _mm_add_ss(xmm3
,tz
);
1906 xmm4
= _mm_sub_ss(xmm4
,tx
);
1907 xmm5
= _mm_sub_ss(xmm5
,ty
);
1908 xmm6
= _mm_sub_ss(xmm6
,tz
);
1910 _mm_store_ss(f
+ai43
,xmm1
);
1911 _mm_store_ss(f
+ai43
+1,xmm2
);
1912 _mm_store_ss(f
+ai43
+2,xmm3
);
1914 _mm_store_ss(f
+aj43
,xmm4
);
1915 _mm_store_ss(f
+aj43
+1,xmm5
);
1916 _mm_store_ss(f
+aj43
+2,xmm6
);
1924 ai1
= forceatoms
[i
++];
1925 aj1
= forceatoms
[i
++];
1930 /* Load ai and aj coordinates */
1931 xmm1
= _mm_loadl_pi(xmm1
,(__m64
*) (x
+ai13
));
1932 iz
= _mm_load1_ps(x
+ai13
+2);
1934 ix
= _mm_shuffle_ps(xmm1
, xmm1
, _MM_SHUFFLE(0,0,0,0));
1935 iy
= _mm_shuffle_ps(xmm1
, xmm1
, _MM_SHUFFLE(1,1,1,1));
1937 xmm1
= _mm_loadl_pi(xmm1
,(__m64
*) (x
+aj13
));
1938 jz
= _mm_load1_ps(x
+aj13
+2);
1940 jx
= _mm_shuffle_ps(xmm1
, xmm1
, _MM_SHUFFLE(0,0,0,0));
1941 jy
= _mm_shuffle_ps(xmm1
, xmm1
, _MM_SHUFFLE(1,1,1,1));
1943 /* Load Born radii for ai1 and aj1 */
1944 isai
= _mm_set_ps(0.0f
, 0.0f
, 0.0f
, bRad
[ai1
]);
1945 isaj
= _mm_set_ps(0.0f
, 0.0f
, 0.0f
, bRad
[aj1
]);
1947 /* Load charges for ai1 and aj1 */
1948 ci
= _mm_set_ps(0.0f
, 0.0f
, 0.0f
, charge
[ai1
]);
1949 cj
= _mm_set_ps(0.0f
, 0.0f
, 0.0f
, charge
[aj1
]);
1951 /* Load chain rule terms for ai1 and aj1 */
1952 di
= _mm_set_ps(0.0f
,0.0f
,0.0f
,dvda
[ai1
]);
1953 dj
= _mm_set_ps(0.0f
,0.0f
,0.0f
,dvda
[aj1
]);
1955 mask
= gmx_mm_castsi128_ps( _mm_set_epi32(0,0,0,0xffffffff) );
1960 ai1
= forceatoms
[i
++];
1961 aj1
= forceatoms
[i
++];
1964 ai2
= forceatoms
[i
++];
1965 aj2
= forceatoms
[i
++];
1972 /* Load ai1-2 and aj1-2 coordinates */
1973 xmm1
= _mm_loadh_pi(xmm1
,(__m64
*) (x
+ai13
));
1974 xmm2
= _mm_loadh_pi(xmm2
,(__m64
*) (x
+ai23
));
1976 xmm5
= _mm_load1_ps(x
+ai13
+2);
1977 xmm6
= _mm_load1_ps(x
+ai23
+2);
1979 xmm5
= _mm_shuffle_ps(xmm5
,xmm6
,_MM_SHUFFLE(0,0,0,0));
1980 iz
= _mm_shuffle_ps(xmm5
,xmm5
,_MM_SHUFFLE(2,0,2,0));
1982 xmm1
= _mm_shuffle_ps(xmm1
,xmm2
,_MM_SHUFFLE(3,2,3,2));
1983 ix
= _mm_shuffle_ps(xmm1
,xmm1
,_MM_SHUFFLE(2,0,2,0));
1984 iy
= _mm_shuffle_ps(xmm1
,xmm1
,_MM_SHUFFLE(3,1,3,1));
1986 xmm1
= _mm_loadh_pi(xmm1
,(__m64
*) (x
+aj13
));
1987 xmm2
= _mm_loadh_pi(xmm2
,(__m64
*) (x
+aj23
));
1989 xmm5
= _mm_load1_ps(x
+aj13
+2);
1990 xmm6
= _mm_load1_ps(x
+aj23
+2);
1992 xmm5
= _mm_shuffle_ps(xmm5
,xmm6
,_MM_SHUFFLE(0,0,0,0));
1993 jz
= _mm_shuffle_ps(xmm5
,xmm5
,_MM_SHUFFLE(2,0,2,0));
1995 xmm1
= _mm_shuffle_ps(xmm1
,xmm2
,_MM_SHUFFLE(3,2,3,2));
1996 jx
= _mm_shuffle_ps(xmm1
,xmm1
,_MM_SHUFFLE(2,0,2,0));
1997 jy
= _mm_shuffle_ps(xmm1
,xmm1
,_MM_SHUFFLE(3,1,3,1));
1999 /* Load Born radii for ai1-2 and aj1-2 */
2000 isai
= _mm_set_ps(0.0f
, 0.0f
, bRad
[ai2
], bRad
[ai1
]);
2001 isaj
= _mm_set_ps(0.0f
, 0.0f
, bRad
[aj2
], bRad
[aj1
]);
2003 /* Load charges for ai1-2 and aj1-2 */
2004 ci
= _mm_set_ps(0.0f
, 0.0f
, charge
[ai2
], charge
[ai1
]);
2005 cj
= _mm_set_ps(0.0f
, 0.0f
, charge
[aj2
], charge
[aj1
]);
2007 /* Load chain rule terms for ai1-2 and aj1-2 */
2008 di
= _mm_set_ps(0.0f
,0.0f
,dvda
[ai2
],dvda
[ai1
]);
2009 dj
= _mm_set_ps(0.0f
,0.0f
,dvda
[aj2
],dvda
[aj1
]);
2011 mask
= gmx_mm_castsi128_ps( _mm_set_epi32(0,0,0xffffffff,0xffffffff) );
2016 ai1
= forceatoms
[i
++];
2017 aj1
= forceatoms
[i
++];
2020 ai2
= forceatoms
[i
++];
2021 aj2
= forceatoms
[i
++];
2024 ai3
= forceatoms
[i
++];
2025 aj3
= forceatoms
[i
++];
2035 /* Load ai1-3 and aj1-3 coordinates */
2036 xmm1
= _mm_loadh_pi(xmm1
,(__m64
*) (x
+ai13
));
2037 xmm2
= _mm_loadh_pi(xmm2
,(__m64
*) (x
+ai23
));
2038 xmm3
= _mm_loadh_pi(xmm3
,(__m64
*) (x
+ai33
));
2040 xmm5
= _mm_load1_ps(x
+ai13
+2);
2041 xmm6
= _mm_load1_ps(x
+ai23
+2);
2042 xmm7
= _mm_load1_ps(x
+ai33
+2);
2044 xmm5
= _mm_shuffle_ps(xmm5
,xmm6
, _MM_SHUFFLE(0,0,0,0));
2045 iz
= _mm_shuffle_ps(xmm5
,xmm7
, _MM_SHUFFLE(3,1,3,1));
2047 xmm1
= _mm_shuffle_ps(xmm1
,xmm2
, _MM_SHUFFLE(3,2,3,2));
2048 xmm2
= _mm_shuffle_ps(xmm3
,xmm3
, _MM_SHUFFLE(3,2,3,2));
2050 ix
= _mm_shuffle_ps(xmm1
,xmm2
, _MM_SHUFFLE(2,0,2,0));
2051 iy
= _mm_shuffle_ps(xmm1
,xmm2
, _MM_SHUFFLE(3,1,3,1));
2053 xmm1
= _mm_loadh_pi(xmm1
,(__m64
*) (x
+aj13
));
2054 xmm2
= _mm_loadh_pi(xmm2
,(__m64
*) (x
+aj23
));
2055 xmm3
= _mm_loadh_pi(xmm3
,(__m64
*) (x
+aj33
));
2057 xmm5
= _mm_load1_ps(x
+aj13
+2);
2058 xmm6
= _mm_load1_ps(x
+aj23
+2);
2059 xmm7
= _mm_load1_ps(x
+aj33
+2);
2061 xmm5
= _mm_shuffle_ps(xmm5
,xmm6
, _MM_SHUFFLE(0,0,0,0));
2062 jz
= _mm_shuffle_ps(xmm5
,xmm7
, _MM_SHUFFLE(3,1,3,1));
2064 xmm1
= _mm_shuffle_ps(xmm1
,xmm2
, _MM_SHUFFLE(3,2,3,2));
2065 xmm2
= _mm_shuffle_ps(xmm3
,xmm3
, _MM_SHUFFLE(3,2,3,2));
2067 jx
= _mm_shuffle_ps(xmm1
,xmm2
, _MM_SHUFFLE(2,0,2,0));
2068 jy
= _mm_shuffle_ps(xmm1
,xmm2
, _MM_SHUFFLE(3,1,3,1));
2070 /* Load Born radii for ai1-3 and aj1-3 */
2071 isai
= _mm_set_ps(0.0f
, bRad
[ai3
], bRad
[ai2
], bRad
[ai1
]);
2072 isaj
= _mm_set_ps(0.0f
, bRad
[aj3
], bRad
[aj2
], bRad
[aj1
]);
2074 /* Load charges for ai1-3 and aj1-3 */
2075 ci
= _mm_set_ps(0.0f
, charge
[ai3
], charge
[ai2
], charge
[ai1
]);
2076 cj
= _mm_set_ps(0.0f
, charge
[aj3
], charge
[aj2
], charge
[aj1
]);
2078 /* Load chain rule terms for ai1-3 and aj1-3 */
2079 di
= _mm_set_ps(0.0f
,dvda
[ai3
],dvda
[ai2
],dvda
[ai1
]);
2080 dj
= _mm_set_ps(0.0f
,dvda
[aj3
],dvda
[aj2
],dvda
[aj1
]);
2082 mask
= gmx_mm_castsi128_ps( _mm_set_epi32(0,0xffffffff,0xffffffff,0xffffffff) );
2085 ix
= _mm_and_ps( mask
, ix
);
2086 iy
= _mm_and_ps( mask
, iy
);
2087 iz
= _mm_and_ps( mask
, iz
);
2089 jx
= _mm_and_ps( mask
, jx
);
2090 jy
= _mm_and_ps( mask
, jy
);
2091 jz
= _mm_and_ps( mask
, jz
);
2094 dx
= _mm_sub_ps(ix
, jx
);
2095 dy
= _mm_sub_ps(iy
, jy
);
2096 dz
= _mm_sub_ps(iz
, jz
);
2098 rsq11
= _mm_add_ps( _mm_add_ps( _mm_mul_ps(dx
,dx
) , _mm_mul_ps(dy
,dy
) ) , _mm_mul_ps(dz
,dz
) );
2100 rinv
= gmx_mm_inv_ps(rsq11
);
2101 r
= _mm_mul_ps(rsq11
,rinv
);
2103 isaprod
= _mm_mul_ps(isai
,isaj
);
2104 inv_isaprod
= _mm_mul_ps(isaprod
,isaprod
);
2105 inv_isaprod
= gmx_mm_inv_ps(inv_isaprod
);
2107 qq
= _mm_mul_ps(ci
,fac
);
2108 qq
= _mm_mul_ps(cj
,qq
);
/* Calculate scalar GB force */
expterm       = _mm_mul_ps(rsq11, inv_isaprod);
expterm       = _mm_mul_ps(expterm, qrtr);
expterm       = _mm_mul_ps(expterm, neg);
expterm       = gmx_mm_exp_ps(expterm);

fgb2          = _mm_mul_ps(isaprod, expterm);
fgb2          = _mm_add_ps(fgb2, rsq11);

fgb           = gmx_mm_invsqrt_ps(fgb2);
/* Potential energy */
vgb           = _mm_mul_ps(qq, fgb);
vgb           = _mm_and_ps(mask, vgb);
vgb_tot       = _mm_add_ps(vgb_tot, vgb);
fijC          = _mm_mul_ps(qrtr, r);
fijC          = _mm_mul_ps(fijC, expterm);
fijC          = _mm_sub_ps(r, fijC);
fijC          = _mm_mul_ps(fijC, vgb);
fijC          = _mm_mul_ps(fijC, neg);
fijC          = _mm_mul_ps(fijC, fgb);
fijC          = _mm_mul_ps(fijC, fgb);    /* fscal = fijC */
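/* fijC now holds dV_gb/dr = -vgb * fgb^2 * r * (1 - exp(...)/4); it is
 * converted into a vectorial force below through fscal = -fijC/r.
 */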
/* Chain rule terms */
dva           = _mm_mul_ps(rsq11, inv_isaprod);
dva           = _mm_mul_ps(dva, eigth);
dva           = _mm_add_ps(dva, half);
dva           = _mm_mul_ps(dva, expterm);
dva           = _mm_mul_ps(dva, vgb);
dva           = _mm_mul_ps(dva, neg);
dva           = _mm_mul_ps(dva, fgb);
dva           = _mm_mul_ps(dva, fgb);
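/* dva is the derivative of the pair energy with respect to the Born radius
 * product, dV_gb/d(a_i*a_j) = -vgb * fgb^2 * exp(...) * (1/2 + r^2/(8*a_i*a_j)).
 * Multiplying by the partner radius below yields dV_gb/da_i and dV_gb/da_j,
 * which are accumulated into dvda[].
 */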
/* Calculate vectorial force */
fscal         = _mm_mul_ps(fijC, rinv);
fscal         = _mm_mul_ps(fscal, neg);

tx            = _mm_mul_ps(fscal, dx);
ty            = _mm_mul_ps(fscal, dy);
tz            = _mm_mul_ps(fscal, dz);
/* Calculate chain rule terms */
dva_i         = _mm_mul_ps(dva, isaj);
dva_j         = _mm_mul_ps(dva, isai);
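/* For reference, a scalar sketch of what one SIMD lane computes per pair
 * (illustrative names only, not variables of this kernel):
 *
 *   float aprod = bRad[ai]*bRad[aj];
 *   float ex    = expf(-0.25f*rsq/aprod);
 *   float fgb1  = 1.0f/sqrtf(rsq + aprod*ex);          // 1/f_gb
 *   float vpair = qq*fgb1;                             // pair energy
 *   float dvdr  = -vpair*fgb1*fgb1*r*(1.0f - 0.25f*ex);
 *   float fs    = -dvdr/r;                             // scalar force
 *   f[3*ai]  += fs*dx;        f[3*aj]  -= fs*dx;       // likewise y and z
 *   dvda[ai] += dva*bRad[aj]; dvda[aj] += dva*bRad[ai];
 */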
/* Load, update and store chain rule terms for ai1 and aj1 */
xmm1          = _mm_load_ss(dvda+ai1);
xmm2          = _mm_load_ss(dvda+aj1);
xmm1          = _mm_add_ss(xmm1, dva_i);
xmm2          = _mm_add_ss(xmm2, dva_j);
_mm_store_ss(dvda+ai1, xmm1);
_mm_store_ss(dvda+aj1, xmm2);
/* Load, update and store partial forces on ai1 and aj1 */
xmm1          = _mm_load_ss(f+ai13);
xmm2          = _mm_load_ss(f+ai13+1);
xmm3          = _mm_load_ss(f+ai13+2);

xmm4          = _mm_load_ss(f+aj13);
xmm5          = _mm_load_ss(f+aj13+1);
xmm6          = _mm_load_ss(f+aj13+2);

xmm1          = _mm_add_ss(xmm1, tx);
xmm2          = _mm_add_ss(xmm2, ty);
xmm3          = _mm_add_ss(xmm3, tz);

xmm4          = _mm_sub_ss(xmm4, tx);
xmm5          = _mm_sub_ss(xmm5, ty);
xmm6          = _mm_sub_ss(xmm6, tz);

_mm_store_ss(f+ai13, xmm1);
_mm_store_ss(f+ai13+1, xmm2);
_mm_store_ss(f+ai13+2, xmm3);

_mm_store_ss(f+aj13, xmm4);
_mm_store_ss(f+aj13+1, xmm5);
_mm_store_ss(f+aj13+2, xmm6);
/* Load, update and store chain rule terms for ai1-2 and aj1-2 atoms */
dva_i         = _mm_mul_ps(dva, isaj);
dva_j         = _mm_mul_ps(dva, isai);

xmm1          = _mm_load_ss(dvda+ai1);
xmm2          = _mm_load_ss(dvda+aj1);
xmm1          = _mm_add_ss(xmm1, dva_i);
xmm2          = _mm_add_ss(xmm2, dva_j);
_mm_store_ss(dvda+ai1, xmm1);
_mm_store_ss(dvda+aj1, xmm2);

/* Need to shuffle dva_i and dva_j */
dva_i         = _mm_shuffle_ps(dva_i, dva_i, _MM_SHUFFLE(0,3,2,1));
dva_j         = _mm_shuffle_ps(dva_j, dva_j, _MM_SHUFFLE(0,3,2,1));

xmm1          = _mm_load_ss(dvda+ai2);
xmm2          = _mm_load_ss(dvda+aj2);
xmm1          = _mm_add_ss(xmm1, dva_i);
xmm2          = _mm_add_ss(xmm2, dva_j);
_mm_store_ss(dvda+ai2, xmm1);
_mm_store_ss(dvda+aj2, xmm2);
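/* The _MM_SHUFFLE(0,3,2,1) pattern rotates the lanes so that the value for
 * the next pair lands in lane 0, where _mm_add_ss/_mm_store_ss operate; the
 * same rotation is applied to tx/ty/tz in the force updates below.
 */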
/* Load, update and store partial forces on ai1-2 and aj1-2 */
xmm1          = _mm_load_ss(f+ai13);
xmm2          = _mm_load_ss(f+ai13+1);
xmm3          = _mm_load_ss(f+ai13+2);

xmm4          = _mm_load_ss(f+aj13);
xmm5          = _mm_load_ss(f+aj13+1);
xmm6          = _mm_load_ss(f+aj13+2);

xmm1          = _mm_add_ss(xmm1, tx);
xmm2          = _mm_add_ss(xmm2, ty);
xmm3          = _mm_add_ss(xmm3, tz);

xmm4          = _mm_sub_ss(xmm4, tx);
xmm5          = _mm_sub_ss(xmm5, ty);
xmm6          = _mm_sub_ss(xmm6, tz);

_mm_store_ss(f+ai13, xmm1);
_mm_store_ss(f+ai13+1, xmm2);
_mm_store_ss(f+ai13+2, xmm3);

_mm_store_ss(f+aj13, xmm4);
_mm_store_ss(f+aj13+1, xmm5);
_mm_store_ss(f+aj13+2, xmm6);
xmm1          = _mm_load_ss(f+ai23);
xmm2          = _mm_load_ss(f+ai23+1);
xmm3          = _mm_load_ss(f+ai23+2);

xmm4          = _mm_load_ss(f+aj23);
xmm5          = _mm_load_ss(f+aj23+1);
xmm6          = _mm_load_ss(f+aj23+2);

/* Need to shuffle tx/ty/tz */
tx            = _mm_shuffle_ps(tx, tx, _MM_SHUFFLE(0,3,2,1));
ty            = _mm_shuffle_ps(ty, ty, _MM_SHUFFLE(0,3,2,1));
tz            = _mm_shuffle_ps(tz, tz, _MM_SHUFFLE(0,3,2,1));

xmm1          = _mm_add_ss(xmm1, tx);
xmm2          = _mm_add_ss(xmm2, ty);
xmm3          = _mm_add_ss(xmm3, tz);

xmm4          = _mm_sub_ss(xmm4, tx);
xmm5          = _mm_sub_ss(xmm5, ty);
xmm6          = _mm_sub_ss(xmm6, tz);

_mm_store_ss(f+ai23, xmm1);
_mm_store_ss(f+ai23+1, xmm2);
_mm_store_ss(f+ai23+2, xmm3);

_mm_store_ss(f+aj23, xmm4);
_mm_store_ss(f+aj23+1, xmm5);
_mm_store_ss(f+aj23+2, xmm6);
/* Load, update and store chain rule terms for ai1-3 and aj1-3 atoms */
dva_i         = _mm_mul_ps(dva, isaj);
dva_j         = _mm_mul_ps(dva, isai);

xmm1          = _mm_load_ss(dvda+ai1);
xmm2          = _mm_load_ss(dvda+aj1);
xmm1          = _mm_add_ss(xmm1, dva_i);
xmm2          = _mm_add_ss(xmm2, dva_j);
_mm_store_ss(dvda+ai1, xmm1);
_mm_store_ss(dvda+aj1, xmm2);

/* Need to shuffle dva_i and dva_j */
dva_i         = _mm_shuffle_ps(dva_i, dva_i, _MM_SHUFFLE(0,3,2,1));
dva_j         = _mm_shuffle_ps(dva_j, dva_j, _MM_SHUFFLE(0,3,2,1));

xmm1          = _mm_load_ss(dvda+ai2);
xmm2          = _mm_load_ss(dvda+aj2);
xmm1          = _mm_add_ss(xmm1, dva_i);
xmm2          = _mm_add_ss(xmm2, dva_j);
_mm_store_ss(dvda+ai2, xmm1);
_mm_store_ss(dvda+aj2, xmm2);

/* Need to shuffle dva_i and dva_j */
dva_i         = _mm_shuffle_ps(dva_i, dva_i, _MM_SHUFFLE(0,3,2,1));
dva_j         = _mm_shuffle_ps(dva_j, dva_j, _MM_SHUFFLE(0,3,2,1));

xmm1          = _mm_load_ss(dvda+ai3);
xmm2          = _mm_load_ss(dvda+aj3);
xmm1          = _mm_add_ss(xmm1, dva_i);
xmm2          = _mm_add_ss(xmm2, dva_j);
_mm_store_ss(dvda+ai3, xmm1);
_mm_store_ss(dvda+aj3, xmm2);
/* Load, update and store partial forces on ai1-3 and aj1-3 */
xmm1          = _mm_load_ss(f+ai13);
xmm2          = _mm_load_ss(f+ai13+1);
xmm3          = _mm_load_ss(f+ai13+2);

xmm4          = _mm_load_ss(f+aj13);
xmm5          = _mm_load_ss(f+aj13+1);
xmm6          = _mm_load_ss(f+aj13+2);

xmm1          = _mm_add_ss(xmm1, tx);
xmm2          = _mm_add_ss(xmm2, ty);
xmm3          = _mm_add_ss(xmm3, tz);

xmm4          = _mm_sub_ss(xmm4, tx);
xmm5          = _mm_sub_ss(xmm5, ty);
xmm6          = _mm_sub_ss(xmm6, tz);

_mm_store_ss(f+ai13, xmm1);
_mm_store_ss(f+ai13+1, xmm2);
_mm_store_ss(f+ai13+2, xmm3);

_mm_store_ss(f+aj13, xmm4);
_mm_store_ss(f+aj13+1, xmm5);
_mm_store_ss(f+aj13+2, xmm6);
xmm1          = _mm_load_ss(f+ai23);
xmm2          = _mm_load_ss(f+ai23+1);
xmm3          = _mm_load_ss(f+ai23+2);

xmm4          = _mm_load_ss(f+aj23);
xmm5          = _mm_load_ss(f+aj23+1);
xmm6          = _mm_load_ss(f+aj23+2);

/* Need to shuffle tx/ty/tz */
tx            = _mm_shuffle_ps(tx, tx, _MM_SHUFFLE(0,3,2,1));
ty            = _mm_shuffle_ps(ty, ty, _MM_SHUFFLE(0,3,2,1));
tz            = _mm_shuffle_ps(tz, tz, _MM_SHUFFLE(0,3,2,1));

xmm1          = _mm_add_ss(xmm1, tx);
xmm2          = _mm_add_ss(xmm2, ty);
xmm3          = _mm_add_ss(xmm3, tz);

xmm4          = _mm_sub_ss(xmm4, tx);
xmm5          = _mm_sub_ss(xmm5, ty);
xmm6          = _mm_sub_ss(xmm6, tz);

_mm_store_ss(f+ai23, xmm1);
_mm_store_ss(f+ai23+1, xmm2);
_mm_store_ss(f+ai23+2, xmm3);

_mm_store_ss(f+aj23, xmm4);
_mm_store_ss(f+aj23+1, xmm5);
_mm_store_ss(f+aj23+2, xmm6);
xmm1          = _mm_load_ss(f+ai33);
xmm2          = _mm_load_ss(f+ai33+1);
xmm3          = _mm_load_ss(f+ai33+2);

xmm4          = _mm_load_ss(f+aj33);
xmm5          = _mm_load_ss(f+aj33+1);
xmm6          = _mm_load_ss(f+aj33+2);

/* Need to shuffle tx/ty/tz */
tx            = _mm_shuffle_ps(tx, tx, _MM_SHUFFLE(0,3,2,1));
ty            = _mm_shuffle_ps(ty, ty, _MM_SHUFFLE(0,3,2,1));
tz            = _mm_shuffle_ps(tz, tz, _MM_SHUFFLE(0,3,2,1));

xmm1          = _mm_add_ss(xmm1, tx);
xmm2          = _mm_add_ss(xmm2, ty);
xmm3          = _mm_add_ss(xmm3, tz);

xmm4          = _mm_sub_ss(xmm4, tx);
xmm5          = _mm_sub_ss(xmm5, ty);
xmm6          = _mm_sub_ss(xmm6, tz);

_mm_store_ss(f+ai33, xmm1);
_mm_store_ss(f+ai33+1, xmm2);
_mm_store_ss(f+ai33+2, xmm3);

_mm_store_ss(f+aj33, xmm4);
_mm_store_ss(f+aj33+1, xmm5);
_mm_store_ss(f+aj33+2, xmm6);
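/* Reduce vgb_tot across lanes: movehl folds the upper half onto the lower
 * half, and the final shuffle/add_ss folds lane 1 into lane 0, leaving the
 * total in the lowest element for the scalar store to vctot.
 */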
/* End processing for potential energy */
vgb           = _mm_movehl_ps(vgb, vgb_tot);
vgb_tot       = _mm_add_ps(vgb_tot, vgb);
vgb           = _mm_shuffle_ps(vgb_tot, vgb_tot, _MM_SHUFFLE(1,1,1,1));
vgb_tot       = _mm_add_ss(vgb_tot, vgb);

_mm_store_ss(&vctot, vgb_tot);
#else

/* keep compiler happy */
int genborn_sse_dummy;

#endif /* SSE intrinsics available */