2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 /* This is the innermost loop contents for the 4 x N atom simd kernel.
37 * This flavor of the kernel duplicates the data for N j-particles in
38 * 2xN wide simd registers to do operate on 2 i-particles at once.
39 * This leads to 4/2=2 sets of most instructions. Therefore we call
40 * this kernel 2x(N+N) = 2xnn
42 * This 2xnn kernel is basically the 4xn equivalent with half the registers
43 * and instructions removed.
45 * An alternative would be to load to different cluster of N j-particles
46 * into simd registers, giving a 4x(N+N) kernel. This doubles the amount
47 * of instructions, which could lead to better scheduling. But we actually
48 * observed worse scheduling for the AVX-256 4x8 normal analytical PME
49 * kernel, which has a lower pair throughput than 2x(4+4) with gcc 4.7.
50 * It could be worth trying this option, but it takes some more effort.
51 * This 2xnn kernel is basically the 4xn equivalent with
55 /* When calculating RF or Ewald interactions we calculate the electrostatic/LJ
56 * forces on excluded atom pairs here in the non-bonded loops.
57 * But when energies and/or virial is required we calculate them
58 * separately to as then it is easier to separate the energy and virial
61 #if defined CHECK_EXCLS && (defined CALC_COULOMB || defined LJ_EWALD_GEOM)
66 int cj
, aj
, ajx
, ajy
, ajz
;
69 /* Energy group indices for two atoms packed into one int */
70 int egp_jj
[UNROLLJ
/2];
74 /* Interaction (non-exclusion) mask of all 1's or 0's */
79 SimdReal jx_S
, jy_S
, jz_S
;
80 SimdReal dx_S0
, dy_S0
, dz_S0
;
81 SimdReal dx_S2
, dy_S2
, dz_S2
;
82 SimdReal tx_S0
, ty_S0
, tz_S0
;
83 SimdReal tx_S2
, ty_S2
, tz_S2
;
84 SimdReal rsq_S0
, rinv_S0
, rinvsq_S0
;
85 SimdReal rsq_S2
, rinv_S2
, rinvsq_S2
;
86 /* wco: within cut-off, mask of all 1's or 0's */
89 #ifdef VDW_CUTOFF_CHECK
96 #if (defined CALC_COULOMB && defined CALC_COUL_TAB) || defined LJ_FORCE_SWITCH || defined LJ_POT_SWITCH
98 #if (defined CALC_COULOMB && defined CALC_COUL_TAB) || !defined HALF_LJ
103 #if defined LJ_FORCE_SWITCH || defined LJ_POT_SWITCH
104 SimdReal rsw_S0
, rsw2_S0
;
106 SimdReal rsw_S2
, rsw2_S2
;
112 /* 1/r masked with the interaction mask */
120 /* The force (PME mesh force) we need to subtract from 1/r^2 */
124 #ifdef CALC_COUL_EWALD
125 SimdReal brsq_S0
, brsq_S2
;
126 SimdReal ewcorr_S0
, ewcorr_S2
;
129 /* frcoul = (1/r - fsub)*r */
133 /* For tables: r, rs=r/sp, rf=floor(rs), frac=rs-rf */
134 SimdReal rs_S0
, rf_S0
, frac_S0
;
135 SimdReal rs_S2
, rf_S2
, frac_S2
;
136 /* Table index: rs truncated to an int */
137 SimdInt32 ti_S0
, ti_S2
;
138 /* Linear force table values */
139 SimdReal ctab0_S0
, ctab1_S0
;
140 SimdReal ctab0_S2
, ctab1_S2
;
142 /* Quadratic energy table value */
143 SimdReal ctabv_S0
, dum_S0
;
144 SimdReal ctabv_S2
, dum_S2
;
147 #if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
148 /* The potential (PME mesh) we need to subtract from 1/r */
153 /* Electrostatic potential */
158 /* The force times 1/r */
164 /* LJ sigma_j/2 and sqrt(epsilon_j) */
165 SimdReal hsig_j_S
, seps_j_S
;
166 /* LJ sigma_ij and epsilon_ij */
167 SimdReal sig_S0
, eps_S0
;
169 SimdReal sig_S2
, eps_S2
;
172 SimdReal sig2_S0
, sig6_S0
;
174 SimdReal sig2_S2
, sig6_S2
;
176 #endif /* LJ_COMB_LB */
180 SimdReal c6s_j_S
, c12s_j_S
;
183 #if defined LJ_COMB_GEOM || defined LJ_COMB_LB || defined LJ_EWALD_GEOM
184 /* Index for loading LJ parameters, complicated when interleaving */
188 /* Intermediate variables for LJ calculation */
196 SimdReal sir_S0
, sir2_S0
, sir6_S0
;
198 SimdReal sir_S2
, sir2_S2
, sir6_S2
;
202 SimdReal FrLJ6_S0
, FrLJ12_S0
, frLJ_S0
;
204 SimdReal FrLJ6_S2
, FrLJ12_S2
, frLJ_S2
;
208 /* j-cluster index */
211 /* Atom indices (of the first atom in the cluster) */
213 #if defined CALC_LJ && (defined LJ_COMB_GEOM || defined LJ_COMB_LB || defined LJ_EWALD_GEOM)
221 gmx_load_simd_2xnn_interactions(l_cj
[cjind
].excl
,
222 filter_S0
, filter_S2
,
223 &interact_S0
, &interact_S2
);
224 #endif /* CHECK_EXCLS */
226 /* load j atom coordinates */
227 jx_S
= loadDuplicateHsimd(x
+ajx
);
228 jy_S
= loadDuplicateHsimd(x
+ajy
);
229 jz_S
= loadDuplicateHsimd(x
+ajz
);
231 /* Calculate distance */
232 dx_S0
= ix_S0
- jx_S
;
233 dy_S0
= iy_S0
- jy_S
;
234 dz_S0
= iz_S0
- jz_S
;
235 dx_S2
= ix_S2
- jx_S
;
236 dy_S2
= iy_S2
- jy_S
;
237 dz_S2
= iz_S2
- jz_S
;
239 /* rsq = dx*dx+dy*dy+dz*dz */
240 rsq_S0
= norm2(dx_S0
, dy_S0
, dz_S0
);
241 rsq_S2
= norm2(dx_S2
, dy_S2
, dz_S2
);
243 /* Do the cut-off check */
244 wco_S0
= (rsq_S0
< rc2_S
);
245 wco_S2
= (rsq_S2
< rc2_S
);
249 /* Only remove the (sub-)diagonal to avoid double counting */
250 #if UNROLLJ == UNROLLI
253 wco_S0
= wco_S0
&& diagonal_mask_S0
;
254 wco_S2
= wco_S2
&& diagonal_mask_S2
;
257 #if UNROLLJ == 2*UNROLLI
260 wco_S0
= wco_S0
&& diagonal_mask0_S0
;
261 wco_S2
= wco_S2
&& diagonal_mask0_S2
;
263 else if (cj
*2 + 1 == ci_sh
)
265 wco_S0
= wco_S0
&& diagonal_mask1_S0
;
266 wco_S2
= wco_S2
&& diagonal_mask1_S2
;
269 #error "only UNROLLJ == UNROLLI*(1 or 2) currently supported in 2xnn kernels"
272 #else /* EXCL_FORCES */
273 /* No exclusion forces: remove all excluded atom pairs from the list */
274 wco_S0
= wco_S0
&& interact_S0
;
275 wco_S2
= wco_S2
&& interact_S2
;
282 alignas(GMX_SIMD_ALIGNMENT
) real tmp
[GMX_SIMD_REAL_WIDTH
];
284 for (i
= 0; i
< UNROLLI
; i
+= 2)
286 store(tmp
, rc2_S
- (i
== 0 ? rsq_S0
: rsq_S2
));
287 for (j
= 0; j
< 2*UNROLLJ
; j
++)
298 // Ensure the distances do not fall below the limit where r^-12 overflows.
299 // This should never happen for normal interactions.
300 rsq_S0
= max(rsq_S0
, minRsq_S
);
301 rsq_S2
= max(rsq_S2
, minRsq_S
);
304 rinv_S0
= invsqrt(rsq_S0
);
305 rinv_S2
= invsqrt(rsq_S2
);
308 /* Load parameters for j atom */
309 jq_S
= loadDuplicateHsimd(q
+aj
);
310 qq_S0
= iq_S0
* jq_S
;
311 qq_S2
= iq_S2
* jq_S
;
315 #if !defined LJ_COMB_GEOM && !defined LJ_COMB_LB && !defined FIX_LJ_C
316 SimdReal c6_S0
, c12_S0
;
317 gatherLoadTransposeHsimd
<c_simdBestPairAlignment
>(nbfp0
, nbfp1
, type
+aj
, &c6_S0
, &c12_S0
);
319 SimdReal c6_S2
, c12_S2
;
320 gatherLoadTransposeHsimd
<c_simdBestPairAlignment
>(nbfp2
, nbfp3
, type
+aj
, &c6_S2
, &c12_S2
);
322 #endif /* not defined any LJ rule */
325 c6s_j_S
= loadDuplicateHsimd(ljc
+aj2
);
326 c12s_j_S
= loadDuplicateHsimd(ljc
+aj2
+STRIDE
);
327 SimdReal c6_S0
= c6s_S0
* c6s_j_S
;
329 SimdReal c6_S2
= c6s_S2
* c6s_j_S
;
331 SimdReal c12_S0
= c12s_S0
* c12s_j_S
;
333 SimdReal c12_S2
= c12s_S2
* c12s_j_S
;
335 #endif /* LJ_COMB_GEOM */
338 hsig_j_S
= loadDuplicateHsimd(ljc
+aj2
);
339 seps_j_S
= loadDuplicateHsimd(ljc
+aj2
+STRIDE
);
341 sig_S0
= hsig_i_S0
+ hsig_j_S
;
342 eps_S0
= seps_i_S0
* seps_j_S
;
344 sig_S2
= hsig_i_S2
+ hsig_j_S
;
345 eps_S2
= seps_i_S2
* seps_j_S
;
347 #endif /* LJ_COMB_LB */
351 /* Set rinv to zero for r beyond the cut-off */
352 rinv_S0
= selectByMask(rinv_S0
, wco_S0
);
353 rinv_S2
= selectByMask(rinv_S2
, wco_S2
);
355 rinvsq_S0
= rinv_S0
* rinv_S0
;
356 rinvsq_S2
= rinv_S2
* rinv_S2
;
359 /* Note that here we calculate force*r, not the usual force/r.
360 * This allows avoiding masking the reaction-field contribution,
361 * as frcoul is later multiplied by rinvsq which has been
362 * masked with the cut-off check.
366 /* Only add 1/r for non-excluded atom pairs */
367 rinv_ex_S0
= selectByMask(rinv_S0
, interact_S0
);
368 rinv_ex_S2
= selectByMask(rinv_S2
, interact_S2
);
370 /* No exclusion forces, we always need 1/r */
371 #define rinv_ex_S0 rinv_S0
372 #define rinv_ex_S2 rinv_S2
376 /* Electrostatic interactions */
377 frcoul_S0
= qq_S0
* fma(rsq_S0
, mrc_3_S
, rinv_ex_S0
);
378 frcoul_S2
= qq_S2
* fma(rsq_S2
, mrc_3_S
, rinv_ex_S2
);
381 vcoul_S0
= qq_S0
* (rinv_ex_S0
+ fma(rsq_S0
, hrc_3_S
, moh_rc_S
));
382 vcoul_S2
= qq_S2
* (rinv_ex_S2
+ fma(rsq_S2
, hrc_3_S
, moh_rc_S
));
386 #ifdef CALC_COUL_EWALD
387 /* We need to mask (or limit) rsq for the cut-off,
388 * as large distances can cause an overflow in gmx_pmecorrF/V.
390 brsq_S0
= beta2_S
* selectByMask(rsq_S0
, wco_S0
);
391 brsq_S2
= beta2_S
* selectByMask(rsq_S2
, wco_S2
);
392 ewcorr_S0
= beta_S
* pmeForceCorrection(brsq_S0
);
393 ewcorr_S2
= beta_S
* pmeForceCorrection(brsq_S2
);
394 frcoul_S0
= qq_S0
* fma(ewcorr_S0
, brsq_S0
, rinv_ex_S0
);
395 frcoul_S2
= qq_S2
* fma(ewcorr_S2
, brsq_S2
, rinv_ex_S2
);
398 vc_sub_S0
= beta_S
* pmePotentialCorrection(brsq_S0
);
399 vc_sub_S2
= beta_S
* pmePotentialCorrection(brsq_S2
);
402 #endif /* CALC_COUL_EWALD */
405 /* Electrostatic interactions */
406 r_S0
= rsq_S0
* rinv_S0
;
407 r_S2
= rsq_S2
* rinv_S2
;
408 /* Convert r to scaled table units */
409 rs_S0
= r_S0
* invtsp_S
;
410 rs_S2
= r_S2
* invtsp_S
;
411 /* Truncate scaled r to an int */
412 ti_S0
= cvttR2I(rs_S0
);
413 ti_S2
= cvttR2I(rs_S2
);
415 rf_S0
= trunc(rs_S0
);
416 rf_S2
= trunc(rs_S2
);
418 frac_S0
= rs_S0
- rf_S0
;
419 frac_S2
= rs_S2
- rf_S2
;
421 /* Load and interpolate table forces and possibly energies.
422 * Force and energy can be combined in one table, stride 4: FDV0
423 * or in two separate tables with stride 1: F and V
424 * Currently single precision uses FDV0, double F and V.
426 #ifndef CALC_ENERGIES
428 gatherLoadBySimdIntTranspose
<4>(tab_coul_F
, ti_S0
, &ctab0_S0
, &ctab1_S0
);
429 gatherLoadBySimdIntTranspose
<4>(tab_coul_F
, ti_S2
, &ctab0_S2
, &ctab1_S2
);
431 gatherLoadUBySimdIntTranspose
<1>(tab_coul_F
, ti_S0
, &ctab0_S0
, &ctab1_S0
);
432 gatherLoadUBySimdIntTranspose
<1>(tab_coul_F
, ti_S2
, &ctab0_S2
, &ctab1_S2
);
433 ctab1_S0
= ctab1_S0
- ctab0_S0
;
434 ctab1_S2
= ctab1_S2
- ctab0_S2
;
438 gatherLoadBySimdIntTranspose
<4>(tab_coul_F
, ti_S0
, &ctab0_S0
, &ctab1_S0
, &ctabv_S0
, &dum_S0
);
439 gatherLoadBySimdIntTranspose
<4>(tab_coul_F
, ti_S2
, &ctab0_S2
, &ctab1_S2
, &ctabv_S2
, &dum_S2
);
441 gatherLoadUBySimdIntTranspose
<1>(tab_coul_F
, ti_S0
, &ctab0_S0
, &ctab1_S0
);
442 gatherLoadUBySimdIntTranspose
<1>(tab_coul_V
, ti_S0
, &ctabv_S0
, &dum_S0
);
443 gatherLoadUBySimdIntTranspose
<1>(tab_coul_F
, ti_S2
, &ctab0_S2
, &ctab1_S2
);
444 gatherLoadUBySimdIntTranspose
<1>(tab_coul_V
, ti_S2
, &ctabv_S2
, &dum_S2
);
445 ctab1_S0
= ctab1_S0
- ctab0_S0
;
446 ctab1_S2
= ctab1_S2
- ctab0_S2
;
449 fsub_S0
= fma(frac_S0
, ctab1_S0
, ctab0_S0
);
450 fsub_S2
= fma(frac_S2
, ctab1_S2
, ctab0_S2
);
451 frcoul_S0
= qq_S0
* fnma(fsub_S0
, r_S0
, rinv_ex_S0
);
452 frcoul_S2
= qq_S2
* fnma(fsub_S2
, r_S2
, rinv_ex_S2
);
455 vc_sub_S0
= fma((mhalfsp_S
* frac_S0
), (ctab0_S0
+ fsub_S0
), ctabv_S0
);
456 vc_sub_S2
= fma((mhalfsp_S
* frac_S2
), (ctab0_S2
+ fsub_S2
), ctabv_S2
);
458 #endif /* CALC_COUL_TAB */
460 #if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
461 #ifndef NO_SHIFT_EWALD
462 /* Add Ewald potential shift to vc_sub for convenience */
464 vc_sub_S0
= vc_sub_S0
+ selectByMask(sh_ewald_S
, interact_S0
);
465 vc_sub_S2
= vc_sub_S2
+ selectByMask(sh_ewald_S
, interact_S2
);
467 vc_sub_S0
= vc_sub_S0
+ sh_ewald_S
;
468 vc_sub_S2
= vc_sub_S2
+ sh_ewald_S
;
472 vcoul_S0
= qq_S0
* (rinv_ex_S0
- vc_sub_S0
);
473 vcoul_S2
= qq_S2
* (rinv_ex_S2
- vc_sub_S2
);
478 /* Mask energy for cut-off and diagonal */
479 vcoul_S0
= selectByMask(vcoul_S0
, wco_S0
);
480 vcoul_S2
= selectByMask(vcoul_S2
, wco_S2
);
483 #endif /* CALC_COULOMB */
486 /* Lennard-Jones interaction */
488 #ifdef VDW_CUTOFF_CHECK
489 wco_vdw_S0
= (rsq_S0
< rcvdw2_S
);
491 wco_vdw_S2
= (rsq_S2
< rcvdw2_S
);
494 /* Same cut-off for Coulomb and VdW, reuse the registers */
495 #define wco_vdw_S0 wco_S0
496 #define wco_vdw_S2 wco_S2
500 rinvsix_S0
= rinvsq_S0
* rinvsq_S0
* rinvsq_S0
;
502 rinvsix_S0
= selectByMask(rinvsix_S0
, interact_S0
);
505 rinvsix_S2
= rinvsq_S2
* rinvsq_S2
* rinvsq_S2
;
507 rinvsix_S2
= selectByMask(rinvsix_S2
, interact_S2
);
511 #if defined LJ_CUT || defined LJ_POT_SWITCH
512 /* We have plain LJ or LJ-PME with simple C6/6 C12/12 coefficients */
513 FrLJ6_S0
= c6_S0
* rinvsix_S0
;
515 FrLJ6_S2
= c6_S2
* rinvsix_S2
;
517 FrLJ12_S0
= c12_S0
* rinvsix_S0
* rinvsix_S0
;
519 FrLJ12_S2
= c12_S2
* rinvsix_S2
* rinvsix_S2
;
523 #if defined LJ_FORCE_SWITCH || defined LJ_POT_SWITCH
524 /* We switch the LJ force */
525 r_S0
= rsq_S0
* rinv_S0
;
526 rsw_S0
= max(r_S0
- rswitch_S
, zero_S
);
527 rsw2_S0
= rsw_S0
* rsw_S0
;
529 r_S2
= rsq_S2
* rinv_S2
;
530 rsw_S2
= max(r_S2
- rswitch_S
, zero_S
);
531 rsw2_S2
= rsw_S2
* rsw_S2
;
535 #ifdef LJ_FORCE_SWITCH
537 #define add_fr_switch(fr, rsw, rsw2_r, c2, c3) fma(fma(c3, rsw, c2), rsw2_r, fr)
538 SimdReal rsw2_r_S0
= rsw2_S0
* r_S0
;
539 FrLJ6_S0
= c6_S0
* add_fr_switch(rinvsix_S0
, rsw_S0
, rsw2_r_S0
, p6_fc2_S
, p6_fc3_S
);
541 SimdReal rsw2_r_S2
= rsw2_S2
* r_S2
;
542 FrLJ6_S2
= c6_S2
* add_fr_switch(rinvsix_S2
, rsw_S2
, rsw2_r_S2
, p6_fc2_S
, p6_fc3_S
);
544 FrLJ12_S0
= c12_S0
* add_fr_switch(rinvsix_S0
* rinvsix_S0
, rsw_S0
, rsw2_r_S0
, p12_fc2_S
, p12_fc3_S
);
546 FrLJ12_S2
= c12_S2
* add_fr_switch(rinvsix_S2
* rinvsix_S2
, rsw_S2
, rsw2_r_S2
, p12_fc2_S
, p12_fc3_S
);
549 #endif /* LJ_FORCE_SWITCH */
551 #endif /* not LJ_COMB_LB */
554 sir_S0
= sig_S0
* rinv_S0
;
556 sir_S2
= sig_S2
* rinv_S2
;
558 sir2_S0
= sir_S0
* sir_S0
;
560 sir2_S2
= sir_S2
* sir_S2
;
562 sir6_S0
= sir2_S0
* sir2_S0
* sir2_S0
;
564 sir6_S0
= selectByMask(sir6_S0
, interact_S0
);
567 sir6_S2
= sir2_S2
* sir2_S2
* sir2_S2
;
569 sir6_S2
= selectByMask(sir6_S2
, interact_S2
);
572 #ifdef VDW_CUTOFF_CHECK
573 sir6_S0
= selectByMask(sir6_S0
, wco_vdw_S0
);
575 sir6_S2
= selectByMask(sir6_S2
, wco_vdw_S2
);
578 FrLJ6_S0
= eps_S0
* sir6_S0
;
580 FrLJ6_S2
= eps_S2
* sir6_S2
;
582 FrLJ12_S0
= FrLJ6_S0
* sir6_S0
;
584 FrLJ12_S2
= FrLJ6_S2
* sir6_S2
;
586 #if defined CALC_ENERGIES
587 /* We need C6 and C12 to calculate the LJ potential shift */
588 sig2_S0
= sig_S0
* sig_S0
;
590 sig2_S2
= sig_S2
* sig_S2
;
592 sig6_S0
= sig2_S0
* sig2_S0
* sig2_S0
;
594 sig6_S2
= sig2_S2
* sig2_S2
* sig2_S2
;
596 SimdReal c6_S0
= eps_S0
* sig6_S0
;
598 SimdReal c6_S2
= eps_S2
* sig6_S2
;
600 SimdReal c12_S0
= c6_S0
* sig6_S0
;
602 SimdReal c12_S2
= c6_S2
* sig6_S2
;
605 #endif /* LJ_COMB_LB */
607 /* Determine the total scalar LJ force*r */
608 frLJ_S0
= FrLJ12_S0
- FrLJ6_S0
;
610 frLJ_S2
= FrLJ12_S2
- FrLJ6_S2
;
613 #if (defined LJ_CUT || defined LJ_FORCE_SWITCH) && defined CALC_ENERGIES
616 /* Calculate the LJ energies, with constant potential shift */
617 SimdReal VLJ6_S0
= sixth_S
* fma(c6_S0
, p6_cpot_S
, FrLJ6_S0
);
619 SimdReal VLJ6_S2
= sixth_S
* fma(c6_S2
, p6_cpot_S
, FrLJ6_S2
);
621 SimdReal VLJ12_S0
= twelveth_S
* fma(c12_S0
, p12_cpot_S
, FrLJ12_S0
);
623 SimdReal VLJ12_S2
= twelveth_S
* fma(c12_S2
, p12_cpot_S
, FrLJ12_S2
);
627 #ifdef LJ_FORCE_SWITCH
628 #define v_fswitch_pr(rsw, rsw2, c0, c3, c4) fma(fma(c4, rsw, c3), (rsw2) * (rsw), c0)
630 SimdReal VLJ6_S0
= c6_S0
* fma(sixth_S
, rinvsix_S0
, v_fswitch_pr(rsw_S0
, rsw2_S0
, p6_6cpot_S
, p6_vc3_S
, p6_vc4_S
));
632 SimdReal VLJ6_S2
= c6_S2
* fma(sixth_S
, rinvsix_S2
, v_fswitch_pr(rsw_S2
, rsw2_S2
, p6_6cpot_S
, p6_vc3_S
, p6_vc4_S
));
634 SimdReal VLJ12_S0
= c12_S0
* fma(twelveth_S
, rinvsix_S0
* rinvsix_S0
, v_fswitch_pr(rsw_S0
, rsw2_S0
, p12_12cpot_S
, p12_vc3_S
, p12_vc4_S
));
636 SimdReal VLJ12_S2
= c12_S2
* fma(twelveth_S
, rinvsix_S2
* rinvsix_S2
, v_fswitch_pr(rsw_S2
, rsw2_S2
, p12_12cpot_S
, p12_vc3_S
, p12_vc4_S
));
639 #endif /* LJ_FORCE_SWITCH */
641 /* Add up the repulsion and dispersion */
642 SimdReal VLJ_S0
= VLJ12_S0
- VLJ6_S0
;
644 SimdReal VLJ_S2
= VLJ12_S2
- VLJ6_S2
;
647 #endif /* (LJ_CUT || LJ_FORCE_SWITCH) && CALC_ENERGIES */
650 /* We always need the potential, since it is needed for the force */
651 SimdReal VLJ_S0
= fnma(sixth_S
, FrLJ6_S0
, twelveth_S
* FrLJ12_S0
);
653 SimdReal VLJ_S2
= fnma(sixth_S
, FrLJ6_S2
, twelveth_S
* FrLJ12_S2
);
657 SimdReal sw_S0
, dsw_S0
;
659 SimdReal sw_S2
, dsw_S2
;
662 #define switch_pr(rsw, rsw2, c3, c4, c5) fma(fma(fma(c5, rsw, c4), rsw, c3), (rsw2) * (rsw), one_S)
663 #define dswitch_pr(rsw, rsw2, c2, c3, c4) fma(fma(c4, rsw, c3), rsw, c2)*(rsw2)
665 sw_S0
= switch_pr(rsw_S0
, rsw2_S0
, swV3_S
, swV4_S
, swV5_S
);
666 dsw_S0
= dswitch_pr(rsw_S0
, rsw2_S0
, swF2_S
, swF3_S
, swF4_S
);
668 sw_S2
= switch_pr(rsw_S2
, rsw2_S2
, swV3_S
, swV4_S
, swV5_S
);
669 dsw_S2
= dswitch_pr(rsw_S2
, rsw2_S2
, swF2_S
, swF3_S
, swF4_S
);
671 frLJ_S0
= fnma(dsw_S0
* VLJ_S0
, r_S0
, sw_S0
* frLJ_S0
);
673 frLJ_S2
= fnma(dsw_S2
* VLJ_S2
, r_S2
, sw_S2
* frLJ_S2
);
676 VLJ_S0
= sw_S0
* VLJ_S0
;
678 VLJ_S2
= sw_S2
* VLJ_S2
;
685 #endif /* LJ_POT_SWITCH */
687 #if defined CALC_ENERGIES && defined CHECK_EXCLS
688 /* The potential shift should be removed for excluded pairs */
689 VLJ_S0
= selectByMask(VLJ_S0
, interact_S0
);
691 VLJ_S2
= selectByMask(VLJ_S2
, interact_S2
);
698 SimdReal c6grid_S0
, rinvsix_nm_S0
, cr2_S0
, expmcr2_S0
, poly_S0
;
700 SimdReal c6grid_S2
, rinvsix_nm_S2
, cr2_S2
, expmcr2_S2
, poly_S2
;
709 /* Determine C6 for the grid using the geometric combination rule */
710 c6s_j_S
= loadDuplicateHsimd(ljc
+aj2
);
711 c6grid_S0
= c6s_S0
* c6s_j_S
;
713 c6grid_S2
= c6s_S2
* c6s_j_S
;
717 /* Recalculate rinvsix without exclusion mask (compiler might optimize) */
718 rinvsix_nm_S0
= rinvsq_S0
* rinvsq_S0
* rinvsq_S0
;
720 rinvsix_nm_S2
= rinvsq_S2
* rinvsq_S2
* rinvsq_S2
;
723 /* We didn't use a mask, so we can copy */
724 rinvsix_nm_S0
= rinvsix_S0
;
726 rinvsix_nm_S2
= rinvsix_S2
;
730 /* Mask for the cut-off to avoid overflow of cr2^2 */
731 cr2_S0
= lje_c2_S
* selectByMask(rsq_S0
, wco_vdw_S0
);
733 cr2_S2
= lje_c2_S
* selectByMask(rsq_S2
, wco_vdw_S2
);
735 // Unsafe version of our exp() should be fine, since these arguments should never
736 // be smaller than -127 for any reasonable choice of cutoff or ewald coefficients.
737 expmcr2_S0
= exp
<MathOptimization::Unsafe
>( -cr2_S0
);
739 expmcr2_S2
= exp
<MathOptimization::Unsafe
>( -cr2_S2
);
742 /* 1 + cr2 + 1/2*cr2^2 */
743 poly_S0
= fma(fma(half_S
, cr2_S0
, one_S
), cr2_S0
, one_S
);
745 poly_S2
= fma(fma(half_S
, cr2_S2
, one_S
), cr2_S2
, one_S
);
748 /* We calculate LJ F*r = (6*C6)*(r^-6 - F_mesh/6), we use:
749 * r^-6*cexp*(1 + cr2 + cr2^2/2 + cr2^3/6) = cexp*(r^-6*poly + c^6/6)
751 frLJ_S0
= fma(c6grid_S0
, fnma(expmcr2_S0
, fma(rinvsix_nm_S0
, poly_S0
, lje_c6_6_S
), rinvsix_nm_S0
), frLJ_S0
);
753 frLJ_S2
= fma(c6grid_S2
, fnma(expmcr2_S2
, fma(rinvsix_nm_S2
, poly_S2
, lje_c6_6_S
), rinvsix_nm_S2
), frLJ_S2
);
758 sh_mask_S0
= selectByMask(lje_vc_S
, interact_S0
);
760 sh_mask_S2
= selectByMask(lje_vc_S
, interact_S2
);
763 sh_mask_S0
= lje_vc_S
;
765 sh_mask_S2
= lje_vc_S
;
769 VLJ_S0
= fma(sixth_S
* c6grid_S0
, fma(rinvsix_nm_S0
, fnma(expmcr2_S0
, poly_S0
, one_S
), sh_mask_S0
), VLJ_S0
);
771 VLJ_S2
= fma(sixth_S
* c6grid_S2
, fma(rinvsix_nm_S2
, fnma(expmcr2_S2
, poly_S2
, one_S
), sh_mask_S2
), VLJ_S2
);
773 #endif /* CALC_ENERGIES */
775 #endif /* LJ_EWALD_GEOM */
777 #if defined VDW_CUTOFF_CHECK
778 /* frLJ is multiplied later by rinvsq, which is masked for the Coulomb
779 * cut-off, but if the VdW cut-off is shorter, we need to mask with that.
781 frLJ_S0
= selectByMask(frLJ_S0
, wco_vdw_S0
);
783 frLJ_S2
= selectByMask(frLJ_S2
, wco_vdw_S2
);
788 /* The potential shift should be removed for pairs beyond cut-off */
789 VLJ_S0
= selectByMask(VLJ_S0
, wco_vdw_S0
);
791 VLJ_S2
= selectByMask(VLJ_S2
, wco_vdw_S2
);
799 /* Extract the group pair index per j pair.
800 * Energy groups are stored per i-cluster, so things get
801 * complicated when the i- and j-cluster size don't match.
806 egps_j
= nbatParams
.energrp
[cj
>> 1];
807 egp_jj
[0] = ((egps_j
>> ((cj
& 1)*egps_jshift
)) & egps_jmask
)*egps_jstride
;
809 /* We assume UNROLLI <= UNROLLJ */
811 for (jdi
= 0; jdi
< UNROLLJ
/UNROLLI
; jdi
++)
814 egps_j
= nbatParams
.energrp
[cj
*(UNROLLJ
/UNROLLI
) + jdi
];
815 for (jj
= 0; jj
< (UNROLLI
/2); jj
++)
817 egp_jj
[jdi
*(UNROLLI
/2)+jj
] = ((egps_j
>> (jj
*egps_jshift
)) & egps_jmask
)*egps_jstride
;
825 #ifndef ENERGY_GROUPS
826 vctot_S
= vctot_S
+ vcoul_S0
+ vcoul_S2
;
828 add_ener_grp_halves(vcoul_S0
, vctp
[0], vctp
[1], egp_jj
);
829 add_ener_grp_halves(vcoul_S2
, vctp
[2], vctp
[3], egp_jj
);
834 #ifndef ENERGY_GROUPS
835 Vvdwtot_S
= Vvdwtot_S
+ VLJ_S0
841 add_ener_grp_halves(VLJ_S0
, vvdwtp
[0], vvdwtp
[1], egp_jj
);
843 add_ener_grp_halves(VLJ_S2
, vvdwtp
[2], vvdwtp
[3], egp_jj
);
847 #endif /* CALC_ENERGIES */
851 fscal_S0
= rinvsq_S0
* (frcoul_S0
+ frLJ_S0
);
853 fscal_S0
= rinvsq_S0
* frLJ_S0
;
856 fscal_S0
= rinvsq_S0
* frcoul_S0
;
858 #if defined CALC_LJ && !defined HALF_LJ
860 fscal_S2
= rinvsq_S2
* (frcoul_S2
+ frLJ_S2
);
862 fscal_S2
= rinvsq_S2
* frLJ_S2
;
865 /* Atom 2 and 3 don't have LJ, so only add Coulomb forces */
866 fscal_S2
= rinvsq_S2
* frcoul_S2
;
869 /* Calculate temporary vectorial force */
870 tx_S0
= fscal_S0
* dx_S0
;
871 tx_S2
= fscal_S2
* dx_S2
;
872 ty_S0
= fscal_S0
* dy_S0
;
873 ty_S2
= fscal_S2
* dy_S2
;
874 tz_S0
= fscal_S0
* dz_S0
;
875 tz_S2
= fscal_S2
* dz_S2
;
877 /* Increment i atom force */
878 fix_S0
= fix_S0
+ tx_S0
;
879 fix_S2
= fix_S2
+ tx_S2
;
880 fiy_S0
= fiy_S0
+ ty_S0
;
881 fiy_S2
= fiy_S2
+ ty_S2
;
882 fiz_S0
= fiz_S0
+ tz_S0
;
883 fiz_S2
= fiz_S2
+ tz_S2
;
885 /* Decrement j atom force */
886 decrHsimd(f
+ajx
, tx_S0
+ tx_S2
);
887 decrHsimd(f
+ajy
, ty_S0
+ ty_S2
);
888 decrHsimd(f
+ajz
, tz_S0
+ tz_S2
);