2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
5 * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
6 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
7 * and including many others, as listed in the AUTHORS file in the
8 * top-level source directory and at http://www.gromacs.org.
10 * GROMACS is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public License
12 * as published by the Free Software Foundation; either version 2.1
13 * of the License, or (at your option) any later version.
15 * GROMACS is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with GROMACS; if not, see
22 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
23 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 * If you want to redistribute modifications to GROMACS, please
26 * consider that scientific software is very special. Version
27 * control is crucial - bugs must be traceable. We will be happy to
28 * consider code for inclusion in the official distribution, but
29 * derived work must not be called official GROMACS. Details are found
30 * in the README & COPYING files - if they are missing, get the
31 * official version at http://www.gromacs.org.
33 * To help us fund GROMACS development, we humbly ask that you cite
34 * the research papers on the package. Check out http://www.gromacs.org.
37 /* Doxygen gets confused (buggy) about the block in this file in combination with
38 * the namespace prefix, and thinks store is documented here.
39 * This will solve itself with the second-generation nbnxn kernels, so for now
40 * we just tell Doxygen to stay out.
44 /* This is the innermost loop contents for the 4 x N atom simd kernel.
45 * This flavor of the kernel calculates interactions of 4 i-atoms
46 * with N j-atoms stored in N wide simd registers.
49 /* When calculating RF or Ewald interactions we calculate the electrostatic/LJ
50 * forces on excluded atom pairs here in the non-bonded loops.
51 * But when energies and/or virial is required we calculate them
52 * separately to as then it is easier to separate the energy and virial
55 # if defined CHECK_EXCLS && (defined CALC_COULOMB || defined LJ_EWALD_GEOM)
60 int cj
, ajx
, ajy
, ajz
;
64 /* Energy group indices for two atoms packed into one int */
65 int egp_jj
[UNROLLJ
/ 2];
69 /* Interaction (non-exclusion) mask of all 1's or 0's */
76 SimdReal jx_S
, jy_S
, jz_S
;
77 SimdReal dx_S0
, dy_S0
, dz_S0
;
78 SimdReal dx_S1
, dy_S1
, dz_S1
;
79 SimdReal dx_S2
, dy_S2
, dz_S2
;
80 SimdReal dx_S3
, dy_S3
, dz_S3
;
81 SimdReal tx_S0
, ty_S0
, tz_S0
;
82 SimdReal tx_S1
, ty_S1
, tz_S1
;
83 SimdReal tx_S2
, ty_S2
, tz_S2
;
84 SimdReal tx_S3
, ty_S3
, tz_S3
;
85 SimdReal rsq_S0
, rinv_S0
, rinvsq_S0
;
86 SimdReal rsq_S1
, rinv_S1
, rinvsq_S1
;
87 SimdReal rsq_S2
, rinv_S2
, rinvsq_S2
;
88 SimdReal rsq_S3
, rinv_S3
, rinvsq_S3
;
90 /* wco: within cut-off, mask of all 1's or 0's */
96 # ifdef VDW_CUTOFF_CHECK
105 # if (defined CALC_COULOMB && defined CALC_COUL_TAB) || defined LJ_FORCE_SWITCH || defined LJ_POT_SWITCH
108 # if (defined CALC_COULOMB && defined CALC_COUL_TAB) || !defined HALF_LJ
114 # if defined LJ_FORCE_SWITCH || defined LJ_POT_SWITCH
115 SimdReal rsw_S0
, rsw2_S0
;
116 SimdReal rsw_S1
, rsw2_S1
;
118 SimdReal rsw_S2
, rsw2_S2
;
119 SimdReal rsw_S3
, rsw2_S3
;
125 /* 1/r masked with the interaction mask */
136 # ifdef CALC_COUL_TAB
137 /* The force (PME mesh force) we need to subtract from 1/r^2 */
143 # ifdef CALC_COUL_EWALD
144 SimdReal brsq_S0
, brsq_S1
, brsq_S2
, brsq_S3
;
145 SimdReal ewcorr_S0
, ewcorr_S1
, ewcorr_S2
, ewcorr_S3
;
148 /* frcoul = (1/r - fsub)*r */
153 # ifdef CALC_COUL_TAB
154 /* For tables: r, rs=r/sp, rf=floor(rs), frac=rs-rf */
155 SimdReal rs_S0
, rf_S0
, frac_S0
;
156 SimdReal rs_S1
, rf_S1
, frac_S1
;
157 SimdReal rs_S2
, rf_S2
, frac_S2
;
158 SimdReal rs_S3
, rf_S3
, frac_S3
;
159 /* Table index: rs truncated to an int */
160 SimdInt32 ti_S0
, ti_S1
, ti_S2
, ti_S3
;
161 /* Linear force table values */
162 SimdReal ctab0_S0
, ctab1_S0
;
163 SimdReal ctab0_S1
, ctab1_S1
;
164 SimdReal ctab0_S2
, ctab1_S2
;
165 SimdReal ctab0_S3
, ctab1_S3
;
166 # ifdef CALC_ENERGIES
167 /* Quadratic energy table value */
168 SimdReal ctabv_S0
, dum_S0
;
169 SimdReal ctabv_S1
, dum_S1
;
170 SimdReal ctabv_S2
, dum_S2
;
171 SimdReal ctabv_S3
, dum_S3
;
174 # if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
175 /* The potential (PME mesh) we need to subtract from 1/r */
181 # ifdef CALC_ENERGIES
182 /* Electrostatic potential */
189 /* The force times 1/r */
197 /* LJ sigma_j/2 and sqrt(epsilon_j) */
198 SimdReal hsig_j_S
, seps_j_S
;
199 /* LJ sigma_ij and epsilon_ij */
200 SimdReal sig_S0
, eps_S0
;
201 SimdReal sig_S1
, eps_S1
;
203 SimdReal sig_S2
, eps_S2
;
204 SimdReal sig_S3
, eps_S3
;
206 # ifdef CALC_ENERGIES
207 SimdReal sig2_S0
, sig6_S0
;
208 SimdReal sig2_S1
, sig6_S1
;
210 SimdReal sig2_S2
, sig6_S2
;
211 SimdReal sig2_S3
, sig6_S3
;
213 # endif /* LJ_COMB_LB */
214 # endif /* CALC_LJ */
217 SimdReal c6s_j_S
, c12s_j_S
;
220 # if defined LJ_COMB_GEOM || defined LJ_COMB_LB || defined LJ_EWALD_GEOM
221 /* Index for loading LJ parameters, complicated when interleaving */
225 /* Intermediate variables for LJ calculation */
235 SimdReal sir_S0
, sir2_S0
, sir6_S0
;
236 SimdReal sir_S1
, sir2_S1
, sir6_S1
;
238 SimdReal sir_S2
, sir2_S2
, sir6_S2
;
239 SimdReal sir_S3
, sir2_S3
, sir6_S3
;
243 SimdReal FrLJ6_S0
, FrLJ12_S0
, frLJ_S0
;
244 SimdReal FrLJ6_S1
, FrLJ12_S1
, frLJ_S1
;
246 SimdReal FrLJ6_S2
, FrLJ12_S2
, frLJ_S2
;
247 SimdReal FrLJ6_S3
, FrLJ12_S3
, frLJ_S3
;
249 # endif /* CALC_LJ */
251 /* j-cluster index */
254 /* Atom indices (of the first atom in the cluster) */
256 # if defined CALC_LJ && (defined LJ_COMB_GEOM || defined LJ_COMB_LB || defined LJ_EWALD_GEOM)
257 # if UNROLLJ == STRIDE
260 aj2
= (cj
>> 1) * 2 * STRIDE
+ (cj
& 1) * UNROLLJ
;
263 # if UNROLLJ == STRIDE
266 ajx
= (cj
>> 1) * DIM
* STRIDE
+ (cj
& 1) * UNROLLJ
;
272 gmx_load_simd_4xn_interactions(static_cast<int>(l_cj
[cjind
].excl
), filter_S0
, filter_S1
,
273 filter_S2
, filter_S3
, nbat
->simdMasks
.interaction_array
.data(),
274 &interact_S0
, &interact_S1
, &interact_S2
, &interact_S3
);
275 # endif /* CHECK_EXCLS */
277 /* load j atom coordinates */
278 jx_S
= load
<SimdReal
>(x
+ ajx
);
279 jy_S
= load
<SimdReal
>(x
+ ajy
);
280 jz_S
= load
<SimdReal
>(x
+ ajz
);
282 /* Calculate distance */
283 dx_S0
= ix_S0
- jx_S
;
284 dy_S0
= iy_S0
- jy_S
;
285 dz_S0
= iz_S0
- jz_S
;
286 dx_S1
= ix_S1
- jx_S
;
287 dy_S1
= iy_S1
- jy_S
;
288 dz_S1
= iz_S1
- jz_S
;
289 dx_S2
= ix_S2
- jx_S
;
290 dy_S2
= iy_S2
- jy_S
;
291 dz_S2
= iz_S2
- jz_S
;
292 dx_S3
= ix_S3
- jx_S
;
293 dy_S3
= iy_S3
- jy_S
;
294 dz_S3
= iz_S3
- jz_S
;
296 /* rsq = dx*dx+dy*dy+dz*dz */
297 rsq_S0
= norm2(dx_S0
, dy_S0
, dz_S0
);
298 rsq_S1
= norm2(dx_S1
, dy_S1
, dz_S1
);
299 rsq_S2
= norm2(dx_S2
, dy_S2
, dz_S2
);
300 rsq_S3
= norm2(dx_S3
, dy_S3
, dz_S3
);
302 /* Do the cut-off check */
303 wco_S0
= (rsq_S0
< rc2_S
);
304 wco_S1
= (rsq_S1
< rc2_S
);
305 wco_S2
= (rsq_S2
< rc2_S
);
306 wco_S3
= (rsq_S3
< rc2_S
);
310 /* Only remove the (sub-)diagonal to avoid double counting */
311 # if UNROLLJ == UNROLLI
314 wco_S0
= wco_S0
&& diagonal_mask_S0
;
315 wco_S1
= wco_S1
&& diagonal_mask_S1
;
316 wco_S2
= wco_S2
&& diagonal_mask_S2
;
317 wco_S3
= wco_S3
&& diagonal_mask_S3
;
320 # if UNROLLJ < UNROLLI
323 wco_S0
= wco_S0
&& diagonal_mask0_S0
;
324 wco_S1
= wco_S1
&& diagonal_mask0_S1
;
325 wco_S2
= wco_S2
&& diagonal_mask0_S2
;
326 wco_S3
= wco_S3
&& diagonal_mask0_S3
;
328 if (cj
== ci_sh
* 2 + 1)
330 wco_S0
= wco_S0
&& diagonal_mask1_S0
;
331 wco_S1
= wco_S1
&& diagonal_mask1_S1
;
332 wco_S2
= wco_S2
&& diagonal_mask1_S2
;
333 wco_S3
= wco_S3
&& diagonal_mask1_S3
;
338 wco_S0
= wco_S0
&& diagonal_mask0_S0
;
339 wco_S1
= wco_S1
&& diagonal_mask0_S1
;
340 wco_S2
= wco_S2
&& diagonal_mask0_S2
;
341 wco_S3
= wco_S3
&& diagonal_mask0_S3
;
343 else if (cj
* 2 + 1 == ci_sh
)
345 wco_S0
= wco_S0
&& diagonal_mask1_S0
;
346 wco_S1
= wco_S1
&& diagonal_mask1_S1
;
347 wco_S2
= wco_S2
&& diagonal_mask1_S2
;
348 wco_S3
= wco_S3
&& diagonal_mask1_S3
;
352 # else /* EXCL_FORCES */
353 /* No exclusion forces: remove all excluded atom pairs from the list */
354 wco_S0
= wco_S0
&& interact_S0
;
355 wco_S1
= wco_S1
&& interact_S1
;
356 wco_S2
= wco_S2
&& interact_S2
;
357 wco_S3
= wco_S3
&& interact_S3
;
364 alignas(GMX_SIMD_ALIGNMENT
) real tmp
[2 * GMX_SIMD_REAL_WIDTH
];
366 for (i
= 0; i
< UNROLLI
; i
++)
368 store(tmp
, rc2_S
- (i
== 0 ? rsq_S0
: (i
== 1 ? rsq_S1
: (i
== 2 ? rsq_S2
: rsq_S3
))));
369 for (j
= 0; j
< UNROLLJ
; j
++)
380 // Ensure the distances do not fall below the limit where r^-12 overflows.
381 // This should never happen for normal interactions.
382 rsq_S0
= max(rsq_S0
, minRsq_S
);
383 rsq_S1
= max(rsq_S1
, minRsq_S
);
384 rsq_S2
= max(rsq_S2
, minRsq_S
);
385 rsq_S3
= max(rsq_S3
, minRsq_S
);
389 rinv_S0
= invsqrt(rsq_S0
);
390 rinv_S1
= invsqrt(rsq_S1
);
391 rinv_S2
= invsqrt(rsq_S2
);
392 rinv_S3
= invsqrt(rsq_S3
);
394 invsqrtPair(rsq_S0
, rsq_S1
, &rinv_S0
, &rinv_S1
);
395 invsqrtPair(rsq_S2
, rsq_S3
, &rinv_S2
, &rinv_S3
);
399 /* Load parameters for j atom */
400 jq_S
= load
<SimdReal
>(q
+ aj
);
401 qq_S0
= iq_S0
* jq_S
;
402 qq_S1
= iq_S1
* jq_S
;
403 qq_S2
= iq_S2
* jq_S
;
404 qq_S3
= iq_S3
* jq_S
;
408 # if !defined LJ_COMB_GEOM && !defined LJ_COMB_LB && !defined FIX_LJ_C
409 SimdReal c6_S0
, c6_S1
, c12_S0
, c12_S1
;
410 gatherLoadTranspose
<c_simdBestPairAlignment
>(nbfp0
, type
+ aj
, &c6_S0
, &c12_S0
);
411 gatherLoadTranspose
<c_simdBestPairAlignment
>(nbfp1
, type
+ aj
, &c6_S1
, &c12_S1
);
413 SimdReal c6_S2
, c6_S3
, c12_S2
, c12_S3
;
414 gatherLoadTranspose
<c_simdBestPairAlignment
>(nbfp2
, type
+ aj
, &c6_S2
, &c12_S2
);
415 gatherLoadTranspose
<c_simdBestPairAlignment
>(nbfp3
, type
+ aj
, &c6_S3
, &c12_S3
);
417 # endif /* not defined any LJ rule */
420 c6s_j_S
= load
<SimdReal
>(ljc
+ aj2
+ 0);
421 c12s_j_S
= load
<SimdReal
>(ljc
+ aj2
+ STRIDE
);
422 SimdReal c6_S0
= c6s_S0
* c6s_j_S
;
423 SimdReal c6_S1
= c6s_S1
* c6s_j_S
;
425 SimdReal c6_S2
= c6s_S2
* c6s_j_S
;
426 SimdReal c6_S3
= c6s_S3
* c6s_j_S
;
428 SimdReal c12_S0
= c12s_S0
* c12s_j_S
;
429 SimdReal c12_S1
= c12s_S1
* c12s_j_S
;
431 SimdReal c12_S2
= c12s_S2
* c12s_j_S
;
432 SimdReal c12_S3
= c12s_S3
* c12s_j_S
;
434 # endif /* LJ_COMB_GEOM */
437 hsig_j_S
= load
<SimdReal
>(ljc
+ aj2
+ 0);
438 seps_j_S
= load
<SimdReal
>(ljc
+ aj2
+ STRIDE
);
440 sig_S0
= hsig_i_S0
+ hsig_j_S
;
441 sig_S1
= hsig_i_S1
+ hsig_j_S
;
442 eps_S0
= seps_i_S0
* seps_j_S
;
443 eps_S1
= seps_i_S1
* seps_j_S
;
445 sig_S2
= hsig_i_S2
+ hsig_j_S
;
446 sig_S3
= hsig_i_S3
+ hsig_j_S
;
447 eps_S2
= seps_i_S2
* seps_j_S
;
448 eps_S3
= seps_i_S3
* seps_j_S
;
450 # endif /* LJ_COMB_LB */
452 # endif /* CALC_LJ */
454 /* Set rinv to zero for r beyond the cut-off */
455 rinv_S0
= selectByMask(rinv_S0
, wco_S0
);
456 rinv_S1
= selectByMask(rinv_S1
, wco_S1
);
457 rinv_S2
= selectByMask(rinv_S2
, wco_S2
);
458 rinv_S3
= selectByMask(rinv_S3
, wco_S3
);
460 rinvsq_S0
= rinv_S0
* rinv_S0
;
461 rinvsq_S1
= rinv_S1
* rinv_S1
;
462 rinvsq_S2
= rinv_S2
* rinv_S2
;
463 rinvsq_S3
= rinv_S3
* rinv_S3
;
466 /* Note that here we calculate force*r, not the usual force/r.
467 * This allows avoiding masking the reaction-field contribution,
468 * as frcoul is later multiplied by rinvsq which has been
469 * masked with the cut-off check.
473 /* Only add 1/r for non-excluded atom pairs */
474 rinv_ex_S0
= selectByMask(rinv_S0
, interact_S0
);
475 rinv_ex_S1
= selectByMask(rinv_S1
, interact_S1
);
476 rinv_ex_S2
= selectByMask(rinv_S2
, interact_S2
);
477 rinv_ex_S3
= selectByMask(rinv_S3
, interact_S3
);
479 /* No exclusion forces, we always need 1/r */
480 # define rinv_ex_S0 rinv_S0
481 # define rinv_ex_S1 rinv_S1
482 # define rinv_ex_S2 rinv_S2
483 # define rinv_ex_S3 rinv_S3
487 /* Electrostatic interactions */
488 frcoul_S0
= qq_S0
* fma(rsq_S0
, mrc_3_S
, rinv_ex_S0
);
489 frcoul_S1
= qq_S1
* fma(rsq_S1
, mrc_3_S
, rinv_ex_S1
);
490 frcoul_S2
= qq_S2
* fma(rsq_S2
, mrc_3_S
, rinv_ex_S2
);
491 frcoul_S3
= qq_S3
* fma(rsq_S3
, mrc_3_S
, rinv_ex_S3
);
493 # ifdef CALC_ENERGIES
494 vcoul_S0
= qq_S0
* (rinv_ex_S0
+ fma(rsq_S0
, hrc_3_S
, moh_rc_S
));
495 vcoul_S1
= qq_S1
* (rinv_ex_S1
+ fma(rsq_S1
, hrc_3_S
, moh_rc_S
));
496 vcoul_S2
= qq_S2
* (rinv_ex_S2
+ fma(rsq_S2
, hrc_3_S
, moh_rc_S
));
497 vcoul_S3
= qq_S3
* (rinv_ex_S3
+ fma(rsq_S3
, hrc_3_S
, moh_rc_S
));
501 # ifdef CALC_COUL_EWALD
502 /* We need to mask (or limit) rsq for the cut-off,
503 * as large distances can cause an overflow in gmx_pmecorrF/V.
505 brsq_S0
= beta2_S
* selectByMask(rsq_S0
, wco_S0
);
506 brsq_S1
= beta2_S
* selectByMask(rsq_S1
, wco_S1
);
507 brsq_S2
= beta2_S
* selectByMask(rsq_S2
, wco_S2
);
508 brsq_S3
= beta2_S
* selectByMask(rsq_S3
, wco_S3
);
509 ewcorr_S0
= beta_S
* pmeForceCorrection(brsq_S0
);
510 ewcorr_S1
= beta_S
* pmeForceCorrection(brsq_S1
);
511 ewcorr_S2
= beta_S
* pmeForceCorrection(brsq_S2
);
512 ewcorr_S3
= beta_S
* pmeForceCorrection(brsq_S3
);
513 frcoul_S0
= qq_S0
* fma(ewcorr_S0
, brsq_S0
, rinv_ex_S0
);
514 frcoul_S1
= qq_S1
* fma(ewcorr_S1
, brsq_S1
, rinv_ex_S1
);
515 frcoul_S2
= qq_S2
* fma(ewcorr_S2
, brsq_S2
, rinv_ex_S2
);
516 frcoul_S3
= qq_S3
* fma(ewcorr_S3
, brsq_S3
, rinv_ex_S3
);
518 # ifdef CALC_ENERGIES
519 vc_sub_S0
= beta_S
* pmePotentialCorrection(brsq_S0
);
520 vc_sub_S1
= beta_S
* pmePotentialCorrection(brsq_S1
);
521 vc_sub_S2
= beta_S
* pmePotentialCorrection(brsq_S2
);
522 vc_sub_S3
= beta_S
* pmePotentialCorrection(brsq_S3
);
525 # endif /* CALC_COUL_EWALD */
527 # ifdef CALC_COUL_TAB
528 /* Electrostatic interactions */
529 r_S0
= rsq_S0
* rinv_S0
;
530 r_S1
= rsq_S1
* rinv_S1
;
531 r_S2
= rsq_S2
* rinv_S2
;
532 r_S3
= rsq_S3
* rinv_S3
;
533 /* Convert r to scaled table units */
534 rs_S0
= r_S0
* invtsp_S
;
535 rs_S1
= r_S1
* invtsp_S
;
536 rs_S2
= r_S2
* invtsp_S
;
537 rs_S3
= r_S3
* invtsp_S
;
538 /* Truncate scaled r to an int */
539 ti_S0
= cvttR2I(rs_S0
);
540 ti_S1
= cvttR2I(rs_S1
);
541 ti_S2
= cvttR2I(rs_S2
);
542 ti_S3
= cvttR2I(rs_S3
);
544 rf_S0
= trunc(rs_S0
);
545 rf_S1
= trunc(rs_S1
);
546 rf_S2
= trunc(rs_S2
);
547 rf_S3
= trunc(rs_S3
);
549 frac_S0
= rs_S0
- rf_S0
;
550 frac_S1
= rs_S1
- rf_S1
;
551 frac_S2
= rs_S2
- rf_S2
;
552 frac_S3
= rs_S3
- rf_S3
;
554 /* Load and interpolate table forces and possibly energies.
555 * Force and energy can be combined in one table, stride 4: FDV0
556 * or in two separate tables with stride 1: F and V
557 * Currently single precision uses FDV0, double F and V.
559 # ifndef CALC_ENERGIES
561 gatherLoadBySimdIntTranspose
<4>(tab_coul_F
, ti_S0
, &ctab0_S0
, &ctab1_S0
);
562 gatherLoadBySimdIntTranspose
<4>(tab_coul_F
, ti_S1
, &ctab0_S1
, &ctab1_S1
);
563 gatherLoadBySimdIntTranspose
<4>(tab_coul_F
, ti_S2
, &ctab0_S2
, &ctab1_S2
);
564 gatherLoadBySimdIntTranspose
<4>(tab_coul_F
, ti_S3
, &ctab0_S3
, &ctab1_S3
);
566 gatherLoadUBySimdIntTranspose
<1>(tab_coul_F
, ti_S0
, &ctab0_S0
, &ctab1_S0
);
567 gatherLoadUBySimdIntTranspose
<1>(tab_coul_F
, ti_S1
, &ctab0_S1
, &ctab1_S1
);
568 gatherLoadUBySimdIntTranspose
<1>(tab_coul_F
, ti_S2
, &ctab0_S2
, &ctab1_S2
);
569 gatherLoadUBySimdIntTranspose
<1>(tab_coul_F
, ti_S3
, &ctab0_S3
, &ctab1_S3
);
570 ctab1_S0
= ctab1_S0
- ctab0_S0
;
571 ctab1_S1
= ctab1_S1
- ctab0_S1
;
572 ctab1_S2
= ctab1_S2
- ctab0_S2
;
573 ctab1_S3
= ctab1_S3
- ctab0_S3
;
577 gatherLoadBySimdIntTranspose
<4>(tab_coul_F
, ti_S0
, &ctab0_S0
, &ctab1_S0
, &ctabv_S0
, &dum_S0
);
578 gatherLoadBySimdIntTranspose
<4>(tab_coul_F
, ti_S1
, &ctab0_S1
, &ctab1_S1
, &ctabv_S1
, &dum_S1
);
579 gatherLoadBySimdIntTranspose
<4>(tab_coul_F
, ti_S2
, &ctab0_S2
, &ctab1_S2
, &ctabv_S2
, &dum_S2
);
580 gatherLoadBySimdIntTranspose
<4>(tab_coul_F
, ti_S3
, &ctab0_S3
, &ctab1_S3
, &ctabv_S3
, &dum_S3
);
582 gatherLoadUBySimdIntTranspose
<1>(tab_coul_F
, ti_S0
, &ctab0_S0
, &ctab1_S0
);
583 gatherLoadUBySimdIntTranspose
<1>(tab_coul_V
, ti_S0
, &ctabv_S0
, &dum_S0
);
584 gatherLoadUBySimdIntTranspose
<1>(tab_coul_F
, ti_S1
, &ctab0_S1
, &ctab1_S1
);
585 gatherLoadUBySimdIntTranspose
<1>(tab_coul_V
, ti_S1
, &ctabv_S1
, &dum_S1
);
586 gatherLoadUBySimdIntTranspose
<1>(tab_coul_F
, ti_S2
, &ctab0_S2
, &ctab1_S2
);
587 gatherLoadUBySimdIntTranspose
<1>(tab_coul_V
, ti_S2
, &ctabv_S2
, &dum_S2
);
588 gatherLoadUBySimdIntTranspose
<1>(tab_coul_F
, ti_S3
, &ctab0_S3
, &ctab1_S3
);
589 gatherLoadUBySimdIntTranspose
<1>(tab_coul_V
, ti_S3
, &ctabv_S3
, &dum_S3
);
590 ctab1_S0
= ctab1_S0
- ctab0_S0
;
591 ctab1_S1
= ctab1_S1
- ctab0_S1
;
592 ctab1_S2
= ctab1_S2
- ctab0_S2
;
593 ctab1_S3
= ctab1_S3
- ctab0_S3
;
596 fsub_S0
= fma(frac_S0
, ctab1_S0
, ctab0_S0
);
597 fsub_S1
= fma(frac_S1
, ctab1_S1
, ctab0_S1
);
598 fsub_S2
= fma(frac_S2
, ctab1_S2
, ctab0_S2
);
599 fsub_S3
= fma(frac_S3
, ctab1_S3
, ctab0_S3
);
600 frcoul_S0
= qq_S0
* fnma(fsub_S0
, r_S0
, rinv_ex_S0
);
601 frcoul_S1
= qq_S1
* fnma(fsub_S1
, r_S1
, rinv_ex_S1
);
602 frcoul_S2
= qq_S2
* fnma(fsub_S2
, r_S2
, rinv_ex_S2
);
603 frcoul_S3
= qq_S3
* fnma(fsub_S3
, r_S3
, rinv_ex_S3
);
605 # ifdef CALC_ENERGIES
606 vc_sub_S0
= fma((mhalfsp_S
* frac_S0
), (ctab0_S0
+ fsub_S0
), ctabv_S0
);
607 vc_sub_S1
= fma((mhalfsp_S
* frac_S1
), (ctab0_S1
+ fsub_S1
), ctabv_S1
);
608 vc_sub_S2
= fma((mhalfsp_S
* frac_S2
), (ctab0_S2
+ fsub_S2
), ctabv_S2
);
609 vc_sub_S3
= fma((mhalfsp_S
* frac_S3
), (ctab0_S3
+ fsub_S3
), ctabv_S3
);
611 # endif /* CALC_COUL_TAB */
613 # if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
614 # ifndef NO_SHIFT_EWALD
615 /* Add Ewald potential shift to vc_sub for convenience */
617 vc_sub_S0
= vc_sub_S0
+ selectByMask(sh_ewald_S
, interact_S0
);
618 vc_sub_S1
= vc_sub_S1
+ selectByMask(sh_ewald_S
, interact_S1
);
619 vc_sub_S2
= vc_sub_S2
+ selectByMask(sh_ewald_S
, interact_S2
);
620 vc_sub_S3
= vc_sub_S3
+ selectByMask(sh_ewald_S
, interact_S3
);
622 vc_sub_S0
= vc_sub_S0
+ sh_ewald_S
;
623 vc_sub_S1
= vc_sub_S1
+ sh_ewald_S
;
624 vc_sub_S2
= vc_sub_S2
+ sh_ewald_S
;
625 vc_sub_S3
= vc_sub_S3
+ sh_ewald_S
;
629 vcoul_S0
= qq_S0
* (rinv_ex_S0
- vc_sub_S0
);
630 vcoul_S1
= qq_S1
* (rinv_ex_S1
- vc_sub_S1
);
631 vcoul_S2
= qq_S2
* (rinv_ex_S2
- vc_sub_S2
);
632 vcoul_S3
= qq_S3
* (rinv_ex_S3
- vc_sub_S3
);
636 # ifdef CALC_ENERGIES
637 /* Mask energy for cut-off and diagonal */
638 vcoul_S0
= selectByMask(vcoul_S0
, wco_S0
);
639 vcoul_S1
= selectByMask(vcoul_S1
, wco_S1
);
640 vcoul_S2
= selectByMask(vcoul_S2
, wco_S2
);
641 vcoul_S3
= selectByMask(vcoul_S3
, wco_S3
);
644 # endif /* CALC_COULOMB */
647 /* Lennard-Jones interaction */
649 # ifdef VDW_CUTOFF_CHECK
650 wco_vdw_S0
= (rsq_S0
< rcvdw2_S
);
651 wco_vdw_S1
= (rsq_S1
< rcvdw2_S
);
653 wco_vdw_S2
= (rsq_S2
< rcvdw2_S
);
654 wco_vdw_S3
= (rsq_S3
< rcvdw2_S
);
657 /* Same cut-off for Coulomb and VdW, reuse the registers */
658 # define wco_vdw_S0 wco_S0
659 # define wco_vdw_S1 wco_S1
660 # define wco_vdw_S2 wco_S2
661 # define wco_vdw_S3 wco_S3
665 rinvsix_S0
= rinvsq_S0
* rinvsq_S0
* rinvsq_S0
;
666 rinvsix_S1
= rinvsq_S1
* rinvsq_S1
* rinvsq_S1
;
668 rinvsix_S0
= selectByMask(rinvsix_S0
, interact_S0
);
669 rinvsix_S1
= selectByMask(rinvsix_S1
, interact_S1
);
672 rinvsix_S2
= rinvsq_S2
* rinvsq_S2
* rinvsq_S2
;
673 rinvsix_S3
= rinvsq_S3
* rinvsq_S3
* rinvsq_S3
;
675 rinvsix_S2
= selectByMask(rinvsix_S2
, interact_S2
);
676 rinvsix_S3
= selectByMask(rinvsix_S3
, interact_S3
);
680 # if defined LJ_CUT || defined LJ_POT_SWITCH
681 /* We have plain LJ or LJ-PME with simple C6/6 C12/12 coefficients */
682 FrLJ6_S0
= c6_S0
* rinvsix_S0
;
683 FrLJ6_S1
= c6_S1
* rinvsix_S1
;
685 FrLJ6_S2
= c6_S2
* rinvsix_S2
;
686 FrLJ6_S3
= c6_S3
* rinvsix_S3
;
688 FrLJ12_S0
= c12_S0
* rinvsix_S0
* rinvsix_S0
;
689 FrLJ12_S1
= c12_S1
* rinvsix_S1
* rinvsix_S1
;
691 FrLJ12_S2
= c12_S2
* rinvsix_S2
* rinvsix_S2
;
692 FrLJ12_S3
= c12_S3
* rinvsix_S3
* rinvsix_S3
;
696 # if defined LJ_FORCE_SWITCH || defined LJ_POT_SWITCH
697 /* We switch the LJ force */
698 r_S0
= rsq_S0
* rinv_S0
;
699 rsw_S0
= max(r_S0
- rswitch_S
, zero_S
);
700 rsw2_S0
= rsw_S0
* rsw_S0
;
701 r_S1
= rsq_S1
* rinv_S1
;
702 rsw_S1
= max(r_S1
- rswitch_S
, zero_S
);
703 rsw2_S1
= rsw_S1
* rsw_S1
;
705 r_S2
= rsq_S2
* rinv_S2
;
706 rsw_S2
= max(r_S2
- rswitch_S
, zero_S
);
707 rsw2_S2
= rsw_S2
* rsw_S2
;
708 r_S3
= rsq_S3
* rinv_S3
;
709 rsw_S3
= max(r_S3
- rswitch_S
, zero_S
);
710 rsw2_S3
= rsw_S3
* rsw_S3
;
714 # ifdef LJ_FORCE_SWITCH
716 # define gmx_add_fr_switch(fr, rsw, rsw2_r, c2, c3) \
717 fma(fma(c3, rsw, c2), rsw2_r, (fr))
718 SimdReal rsw2_r_S0
= rsw2_S0
* r_S0
;
719 SimdReal rsw2_r_S1
= rsw2_S1
* r_S1
;
720 FrLJ6_S0
= c6_S0
* gmx_add_fr_switch(rinvsix_S0
, rsw_S0
, rsw2_r_S0
, p6_fc2_S
, p6_fc3_S
);
721 FrLJ6_S1
= c6_S1
* gmx_add_fr_switch(rinvsix_S1
, rsw_S1
, rsw2_r_S1
, p6_fc2_S
, p6_fc3_S
);
723 SimdReal rsw2_r_S2
= rsw2_S2
* r_S2
;
724 SimdReal rsw2_r_S3
= rsw2_S3
* r_S3
;
725 FrLJ6_S2
= c6_S2
* gmx_add_fr_switch(rinvsix_S2
, rsw_S2
, rsw2_r_S2
, p6_fc2_S
, p6_fc3_S
);
726 FrLJ6_S3
= c6_S3
* gmx_add_fr_switch(rinvsix_S3
, rsw_S3
, rsw2_r_S3
, p6_fc2_S
, p6_fc3_S
);
728 FrLJ12_S0
= c12_S0
* gmx_add_fr_switch((rinvsix_S0
* rinvsix_S0
), rsw_S0
, rsw2_r_S0
, p12_fc2_S
, p12_fc3_S
);
729 FrLJ12_S1
= c12_S1
* gmx_add_fr_switch((rinvsix_S1
* rinvsix_S1
), rsw_S1
, rsw2_r_S1
, p12_fc2_S
, p12_fc3_S
);
731 FrLJ12_S2
= c12_S2
* gmx_add_fr_switch((rinvsix_S2
* rinvsix_S2
), rsw_S2
, rsw2_r_S2
, p12_fc2_S
, p12_fc3_S
);
732 FrLJ12_S3
= c12_S3
* gmx_add_fr_switch((rinvsix_S3
* rinvsix_S3
), rsw_S3
, rsw2_r_S3
, p12_fc2_S
, p12_fc3_S
);
734 # undef gmx_add_fr_switch
735 # endif /* LJ_FORCE_SWITCH */
737 # endif /* not LJ_COMB_LB */
740 sir_S0
= sig_S0
* rinv_S0
;
741 sir_S1
= sig_S1
* rinv_S1
;
743 sir_S2
= sig_S2
* rinv_S2
;
744 sir_S3
= sig_S3
* rinv_S3
;
746 sir2_S0
= sir_S0
* sir_S0
;
747 sir2_S1
= sir_S1
* sir_S1
;
749 sir2_S2
= sir_S2
* sir_S2
;
750 sir2_S3
= sir_S3
* sir_S3
;
752 sir6_S0
= sir2_S0
* sir2_S0
* sir2_S0
;
753 sir6_S1
= sir2_S1
* sir2_S1
* sir2_S1
;
755 sir6_S0
= selectByMask(sir6_S0
, interact_S0
);
756 sir6_S1
= selectByMask(sir6_S1
, interact_S1
);
759 sir6_S2
= sir2_S2
* sir2_S2
* sir2_S2
;
760 sir6_S3
= sir2_S3
* sir2_S3
* sir2_S3
;
762 sir6_S2
= selectByMask(sir6_S2
, interact_S2
);
763 sir6_S3
= selectByMask(sir6_S3
, interact_S3
);
766 # ifdef VDW_CUTOFF_CHECK
767 sir6_S0
= selectByMask(sir6_S0
, wco_vdw_S0
);
768 sir6_S1
= selectByMask(sir6_S1
, wco_vdw_S1
);
770 sir6_S2
= selectByMask(sir6_S2
, wco_vdw_S2
);
771 sir6_S3
= selectByMask(sir6_S3
, wco_vdw_S3
);
774 FrLJ6_S0
= eps_S0
* sir6_S0
;
775 FrLJ6_S1
= eps_S1
* sir6_S1
;
777 FrLJ6_S2
= eps_S2
* sir6_S2
;
778 FrLJ6_S3
= eps_S3
* sir6_S3
;
780 FrLJ12_S0
= FrLJ6_S0
* sir6_S0
;
781 FrLJ12_S1
= FrLJ6_S1
* sir6_S1
;
783 FrLJ12_S2
= FrLJ6_S2
* sir6_S2
;
784 FrLJ12_S3
= FrLJ6_S3
* sir6_S3
;
786 # if defined CALC_ENERGIES
787 /* We need C6 and C12 to calculate the LJ potential shift */
788 sig2_S0
= sig_S0
* sig_S0
;
789 sig2_S1
= sig_S1
* sig_S1
;
791 sig2_S2
= sig_S2
* sig_S2
;
792 sig2_S3
= sig_S3
* sig_S3
;
794 sig6_S0
= sig2_S0
* sig2_S0
* sig2_S0
;
795 sig6_S1
= sig2_S1
* sig2_S1
* sig2_S1
;
797 sig6_S2
= sig2_S2
* sig2_S2
* sig2_S2
;
798 sig6_S3
= sig2_S3
* sig2_S3
* sig2_S3
;
800 SimdReal c6_S0
= eps_S0
* sig6_S0
;
801 SimdReal c6_S1
= eps_S1
* sig6_S1
;
803 SimdReal c6_S2
= eps_S2
* sig6_S2
;
804 SimdReal c6_S3
= eps_S3
* sig6_S3
;
806 SimdReal c12_S0
= c6_S0
* sig6_S0
;
807 SimdReal c12_S1
= c6_S1
* sig6_S1
;
809 SimdReal c12_S2
= c6_S2
* sig6_S2
;
810 SimdReal c12_S3
= c6_S3
* sig6_S3
;
813 # endif /* LJ_COMB_LB */
815 /* Determine the total scalar LJ force*r */
816 frLJ_S0
= FrLJ12_S0
- FrLJ6_S0
;
817 frLJ_S1
= FrLJ12_S1
- FrLJ6_S1
;
819 frLJ_S2
= FrLJ12_S2
- FrLJ6_S2
;
820 frLJ_S3
= FrLJ12_S3
- FrLJ6_S3
;
823 # if (defined LJ_CUT || defined LJ_FORCE_SWITCH) && defined CALC_ENERGIES
826 /* Calculate the LJ energies, with constant potential shift */
827 SimdReal VLJ6_S0
= sixth_S
* fma(c6_S0
, p6_cpot_S
, FrLJ6_S0
);
828 SimdReal VLJ6_S1
= sixth_S
* fma(c6_S1
, p6_cpot_S
, FrLJ6_S1
);
830 SimdReal VLJ6_S2
= sixth_S
* fma(c6_S2
, p6_cpot_S
, FrLJ6_S2
);
831 SimdReal VLJ6_S3
= sixth_S
* fma(c6_S3
, p6_cpot_S
, FrLJ6_S3
);
833 SimdReal VLJ12_S0
= twelveth_S
* fma(c12_S0
, p12_cpot_S
, FrLJ12_S0
);
834 SimdReal VLJ12_S1
= twelveth_S
* fma(c12_S1
, p12_cpot_S
, FrLJ12_S1
);
836 SimdReal VLJ12_S2
= twelveth_S
* fma(c12_S2
, p12_cpot_S
, FrLJ12_S2
);
837 SimdReal VLJ12_S3
= twelveth_S
* fma(c12_S3
, p12_cpot_S
, FrLJ12_S3
);
841 # ifdef LJ_FORCE_SWITCH
842 # define v_fswitch_r(rsw, rsw2, c0, c3, c4) \
843 fma(fma((c4), (rsw), (c3)), ((rsw2) * (rsw)), (c0))
846 c6_S0
* fma(sixth_S
, rinvsix_S0
, v_fswitch_r(rsw_S0
, rsw2_S0
, p6_6cpot_S
, p6_vc3_S
, p6_vc4_S
));
848 c6_S1
* fma(sixth_S
, rinvsix_S1
, v_fswitch_r(rsw_S1
, rsw2_S1
, p6_6cpot_S
, p6_vc3_S
, p6_vc4_S
));
851 c6_S2
* fma(sixth_S
, rinvsix_S2
, v_fswitch_r(rsw_S2
, rsw2_S2
, p6_6cpot_S
, p6_vc3_S
, p6_vc4_S
));
853 c6_S3
* fma(sixth_S
, rinvsix_S3
, v_fswitch_r(rsw_S3
, rsw2_S3
, p6_6cpot_S
, p6_vc3_S
, p6_vc4_S
));
855 SimdReal VLJ12_S0
= c12_S0
856 * fma(twelveth_S
, rinvsix_S0
* rinvsix_S0
,
857 v_fswitch_r(rsw_S0
, rsw2_S0
, p12_12cpot_S
, p12_vc3_S
, p12_vc4_S
));
858 SimdReal VLJ12_S1
= c12_S1
859 * fma(twelveth_S
, rinvsix_S1
* rinvsix_S1
,
860 v_fswitch_r(rsw_S1
, rsw2_S1
, p12_12cpot_S
, p12_vc3_S
, p12_vc4_S
));
862 SimdReal VLJ12_S2
= c12_S2
863 * fma(twelveth_S
, rinvsix_S2
* rinvsix_S2
,
864 v_fswitch_r(rsw_S2
, rsw2_S2
, p12_12cpot_S
, p12_vc3_S
, p12_vc4_S
));
865 SimdReal VLJ12_S3
= c12_S3
866 * fma(twelveth_S
, rinvsix_S3
* rinvsix_S3
,
867 v_fswitch_r(rsw_S3
, rsw2_S3
, p12_12cpot_S
, p12_vc3_S
, p12_vc4_S
));
870 # endif /* LJ_FORCE_SWITCH */
872 /* Add up the repulsion and dispersion */
873 SimdReal VLJ_S0
= VLJ12_S0
- VLJ6_S0
;
874 SimdReal VLJ_S1
= VLJ12_S1
- VLJ6_S1
;
876 SimdReal VLJ_S2
= VLJ12_S2
- VLJ6_S2
;
877 SimdReal VLJ_S3
= VLJ12_S3
- VLJ6_S3
;
880 # endif /* (LJ_CUT || LJ_FORCE_SWITCH) && CALC_ENERGIES */
882 # ifdef LJ_POT_SWITCH
883 /* We always need the potential, since it is needed for the force */
884 SimdReal VLJ_S0
= fnma(sixth_S
, FrLJ6_S0
, twelveth_S
* FrLJ12_S0
);
885 SimdReal VLJ_S1
= fnma(sixth_S
, FrLJ6_S1
, twelveth_S
* FrLJ12_S1
);
887 SimdReal VLJ_S2
= fnma(sixth_S
, FrLJ6_S2
, twelveth_S
* FrLJ12_S2
);
888 SimdReal VLJ_S3
= fnma(sixth_S
, FrLJ6_S3
, twelveth_S
* FrLJ12_S3
);
892 SimdReal sw_S0
, dsw_S0
;
893 SimdReal sw_S1
, dsw_S1
;
895 SimdReal sw_S2
, dsw_S2
;
896 SimdReal sw_S3
, dsw_S3
;
899 # define switch_r(rsw, rsw2, c3, c4, c5) \
900 fma(fma(fma(c5, rsw, c4), rsw, c3), ((rsw2) * (rsw)), one_S)
901 # define dswitch_r(rsw, rsw2, c2, c3, c4) (fma(fma(c4, rsw, c3), rsw, c2) * (rsw2))
903 sw_S0
= switch_r(rsw_S0
, rsw2_S0
, swV3_S
, swV4_S
, swV5_S
);
904 dsw_S0
= dswitch_r(rsw_S0
, rsw2_S0
, swF2_S
, swF3_S
, swF4_S
);
905 sw_S1
= switch_r(rsw_S1
, rsw2_S1
, swV3_S
, swV4_S
, swV5_S
);
906 dsw_S1
= dswitch_r(rsw_S1
, rsw2_S1
, swF2_S
, swF3_S
, swF4_S
);
908 sw_S2
= switch_r(rsw_S2
, rsw2_S2
, swV3_S
, swV4_S
, swV5_S
);
909 dsw_S2
= dswitch_r(rsw_S2
, rsw2_S2
, swF2_S
, swF3_S
, swF4_S
);
910 sw_S3
= switch_r(rsw_S3
, rsw2_S3
, swV3_S
, swV4_S
, swV5_S
);
911 dsw_S3
= dswitch_r(rsw_S3
, rsw2_S3
, swF2_S
, swF3_S
, swF4_S
);
913 frLJ_S0
= fnma(dsw_S0
* VLJ_S0
, r_S0
, sw_S0
* frLJ_S0
);
914 frLJ_S1
= fnma(dsw_S1
* VLJ_S1
, r_S1
, sw_S1
* frLJ_S1
);
916 frLJ_S2
= fnma(dsw_S2
* VLJ_S2
, r_S2
, sw_S2
* frLJ_S2
);
917 frLJ_S3
= fnma(dsw_S3
* VLJ_S3
, r_S3
, sw_S3
* frLJ_S3
);
919 # ifdef CALC_ENERGIES
920 VLJ_S0
= sw_S0
* VLJ_S0
;
921 VLJ_S1
= sw_S1
* VLJ_S1
;
923 VLJ_S2
= sw_S2
* VLJ_S2
;
924 VLJ_S3
= sw_S3
* VLJ_S3
;
931 # endif /* LJ_POT_SWITCH */
933 # if defined CALC_ENERGIES && defined CHECK_EXCLS
934 /* The potential shift should be removed for excluded pairs */
935 VLJ_S0
= selectByMask(VLJ_S0
, interact_S0
);
936 VLJ_S1
= selectByMask(VLJ_S1
, interact_S1
);
938 VLJ_S2
= selectByMask(VLJ_S2
, interact_S2
);
939 VLJ_S3
= selectByMask(VLJ_S3
, interact_S3
);
943 # ifdef LJ_EWALD_GEOM
946 SimdReal c6grid_S0
, rinvsix_nm_S0
, cr2_S0
, expmcr2_S0
, poly_S0
;
947 SimdReal c6grid_S1
, rinvsix_nm_S1
, cr2_S1
, expmcr2_S1
, poly_S1
;
949 SimdReal c6grid_S2
, rinvsix_nm_S2
, cr2_S2
, expmcr2_S2
, poly_S2
;
950 SimdReal c6grid_S3
, rinvsix_nm_S3
, cr2_S3
, expmcr2_S3
, poly_S3
;
952 # ifdef CALC_ENERGIES
961 /* Determine C6 for the grid using the geometric combination rule */
962 c6s_j_S
= load
<SimdReal
>(ljc
+ aj2
+ 0);
963 c6grid_S0
= c6s_S0
* c6s_j_S
;
964 c6grid_S1
= c6s_S1
* c6s_j_S
;
966 c6grid_S2
= c6s_S2
* c6s_j_S
;
967 c6grid_S3
= c6s_S3
* c6s_j_S
;
971 /* Recalculate rinvsix without exclusion mask (compiler might optimize) */
972 rinvsix_nm_S0
= rinvsq_S0
* rinvsq_S0
* rinvsq_S0
;
973 rinvsix_nm_S1
= rinvsq_S1
* rinvsq_S1
* rinvsq_S1
;
975 rinvsix_nm_S2
= rinvsq_S2
* rinvsq_S2
* rinvsq_S2
;
976 rinvsix_nm_S3
= rinvsq_S3
* rinvsq_S3
* rinvsq_S3
;
979 /* We didn't use a mask, so we can copy */
980 rinvsix_nm_S0
= rinvsix_S0
;
981 rinvsix_nm_S1
= rinvsix_S1
;
983 rinvsix_nm_S2
= rinvsix_S2
;
984 rinvsix_nm_S3
= rinvsix_S3
;
988 /* Mask for the cut-off to avoid overflow of cr2^2 */
989 cr2_S0
= lje_c2_S
* selectByMask(rsq_S0
, wco_vdw_S0
);
990 cr2_S1
= lje_c2_S
* selectByMask(rsq_S1
, wco_vdw_S1
);
992 cr2_S2
= lje_c2_S
* selectByMask(rsq_S2
, wco_vdw_S2
);
993 cr2_S3
= lje_c2_S
* selectByMask(rsq_S3
, wco_vdw_S3
);
995 // Unsafe version of our exp() should be fine, since these arguments should never
996 // be smaller than -127 for any reasonable choice of cutoff or ewald coefficients.
997 expmcr2_S0
= exp
<MathOptimization::Unsafe
>(-cr2_S0
);
998 expmcr2_S1
= exp
<MathOptimization::Unsafe
>(-cr2_S1
);
1000 expmcr2_S2
= exp
<MathOptimization::Unsafe
>(-cr2_S2
);
1001 expmcr2_S3
= exp
<MathOptimization::Unsafe
>(-cr2_S3
);
1004 /* 1 + cr2 + 1/2*cr2^2 */
1005 poly_S0
= fma(fma(half_S
, cr2_S0
, one_S
), cr2_S0
, one_S
);
1006 poly_S1
= fma(fma(half_S
, cr2_S1
, one_S
), cr2_S1
, one_S
);
1008 poly_S2
= fma(fma(half_S
, cr2_S2
, one_S
), cr2_S2
, one_S
);
1009 poly_S3
= fma(fma(half_S
, cr2_S3
, one_S
), cr2_S3
, one_S
);
1012 /* We calculate LJ F*r = (6*C6)*(r^-6 - F_mesh/6), we use:
1013 * r^-6*cexp*(1 + cr2 + cr2^2/2 + cr2^3/6) = cexp*(r^-6*poly + c^6/6)
1015 frLJ_S0
= fma(c6grid_S0
,
1016 fnma(expmcr2_S0
, fma(rinvsix_nm_S0
, poly_S0
, lje_c6_6_S
), rinvsix_nm_S0
), frLJ_S0
);
1017 frLJ_S1
= fma(c6grid_S1
,
1018 fnma(expmcr2_S1
, fma(rinvsix_nm_S1
, poly_S1
, lje_c6_6_S
), rinvsix_nm_S1
), frLJ_S1
);
1020 frLJ_S2
= fma(c6grid_S2
,
1021 fnma(expmcr2_S2
, fma(rinvsix_nm_S2
, poly_S2
, lje_c6_6_S
), rinvsix_nm_S2
), frLJ_S2
);
1022 frLJ_S3
= fma(c6grid_S3
,
1023 fnma(expmcr2_S3
, fma(rinvsix_nm_S3
, poly_S3
, lje_c6_6_S
), rinvsix_nm_S3
), frLJ_S3
);
1026 # ifdef CALC_ENERGIES
1028 sh_mask_S0
= selectByMask(lje_vc_S
, interact_S0
);
1029 sh_mask_S1
= selectByMask(lje_vc_S
, interact_S1
);
1031 sh_mask_S2
= selectByMask(lje_vc_S
, interact_S2
);
1032 sh_mask_S3
= selectByMask(lje_vc_S
, interact_S3
);
1035 sh_mask_S0
= lje_vc_S
;
1036 sh_mask_S1
= lje_vc_S
;
1038 sh_mask_S2
= lje_vc_S
;
1039 sh_mask_S3
= lje_vc_S
;
1043 VLJ_S0
= fma(sixth_S
* c6grid_S0
,
1044 fma(rinvsix_nm_S0
, fnma(expmcr2_S0
, poly_S0
, one_S
), sh_mask_S0
), VLJ_S0
);
1045 VLJ_S1
= fma(sixth_S
* c6grid_S1
,
1046 fma(rinvsix_nm_S1
, fnma(expmcr2_S1
, poly_S1
, one_S
), sh_mask_S1
), VLJ_S1
);
1048 VLJ_S2
= fma(sixth_S
* c6grid_S2
,
1049 fma(rinvsix_nm_S2
, fnma(expmcr2_S2
, poly_S2
, one_S
), sh_mask_S2
), VLJ_S2
);
1050 VLJ_S3
= fma(sixth_S
* c6grid_S3
,
1051 fma(rinvsix_nm_S3
, fnma(expmcr2_S3
, poly_S3
, one_S
), sh_mask_S3
), VLJ_S3
);
1053 # endif /* CALC_ENERGIES */
1055 # endif /* LJ_EWALD_GEOM */
1057 # if defined VDW_CUTOFF_CHECK
1058 /* frLJ is multiplied later by rinvsq, which is masked for the Coulomb
1059 * cut-off, but if the VdW cut-off is shorter, we need to mask with that.
1061 frLJ_S0
= selectByMask(frLJ_S0
, wco_vdw_S0
);
1062 frLJ_S1
= selectByMask(frLJ_S1
, wco_vdw_S1
);
1064 frLJ_S2
= selectByMask(frLJ_S2
, wco_vdw_S2
);
1065 frLJ_S3
= selectByMask(frLJ_S3
, wco_vdw_S3
);
1069 # ifdef CALC_ENERGIES
1070 /* The potential shift should be removed for pairs beyond cut-off */
1071 VLJ_S0
= selectByMask(VLJ_S0
, wco_vdw_S0
);
1072 VLJ_S1
= selectByMask(VLJ_S1
, wco_vdw_S1
);
1074 VLJ_S2
= selectByMask(VLJ_S2
, wco_vdw_S2
);
1075 VLJ_S3
= selectByMask(VLJ_S3
, wco_vdw_S3
);
1079 # endif /* CALC_LJ */
1081 # ifdef CALC_ENERGIES
1082 # ifdef ENERGY_GROUPS
1083 /* Extract the group pair index per j pair.
1084 * Energy groups are stored per i-cluster, so things get
1085 * complicated when the i- and j-cluster size don't match.
1090 egps_j
= nbatParams
.energrp
[cj
>> 1];
1091 egp_jj
[0] = ((egps_j
>> ((cj
& 1) * egps_jshift
)) & egps_jmask
) * egps_jstride
;
1093 /* We assume UNROLLI <= UNROLLJ */
1095 for (jdi
= 0; jdi
< UNROLLJ
/ UNROLLI
; jdi
++)
1098 egps_j
= nbatParams
.energrp
[cj
* (UNROLLJ
/ UNROLLI
) + jdi
];
1099 for (jj
= 0; jj
< (UNROLLI
/ 2); jj
++)
1101 egp_jj
[jdi
* (UNROLLI
/ 2) + jj
] =
1102 ((egps_j
>> (jj
* egps_jshift
)) & egps_jmask
) * egps_jstride
;
1109 # ifdef CALC_COULOMB
1110 # ifndef ENERGY_GROUPS
1111 vctot_S
= vctot_S
+ vcoul_S0
+ vcoul_S1
+ vcoul_S2
+ vcoul_S3
;
1113 add_ener_grp(vcoul_S0
, vctp
[0], egp_jj
);
1114 add_ener_grp(vcoul_S1
, vctp
[1], egp_jj
);
1115 add_ener_grp(vcoul_S2
, vctp
[2], egp_jj
);
1116 add_ener_grp(vcoul_S3
, vctp
[3], egp_jj
);
1122 # ifndef ENERGY_GROUPS
1124 Vvdwtot_S
= Vvdwtot_S
+ VLJ_S0
+ VLJ_S1
+ VLJ_S2
+ VLJ_S3
;
1126 Vvdwtot_S
= Vvdwtot_S
+ VLJ_S0
+ VLJ_S1
;
1129 add_ener_grp(VLJ_S0
, vvdwtp
[0], egp_jj
);
1130 add_ener_grp(VLJ_S1
, vvdwtp
[1], egp_jj
);
1132 add_ener_grp(VLJ_S2
, vvdwtp
[2], egp_jj
);
1133 add_ener_grp(VLJ_S3
, vvdwtp
[3], egp_jj
);
1136 # endif /* CALC_LJ */
1137 # endif /* CALC_ENERGIES */
1140 # ifdef CALC_COULOMB
1141 fscal_S0
= rinvsq_S0
* (frcoul_S0
+ frLJ_S0
);
1143 fscal_S0
= rinvsq_S0
* frLJ_S0
;
1145 # ifdef CALC_COULOMB
1146 fscal_S1
= rinvsq_S1
* (frcoul_S1
+ frLJ_S1
);
1148 fscal_S1
= rinvsq_S1
* frLJ_S1
;
1151 fscal_S0
= rinvsq_S0
* frcoul_S0
;
1152 fscal_S1
= rinvsq_S1
* frcoul_S1
;
1153 # endif /* CALC_LJ */
1154 # if defined CALC_LJ && !defined HALF_LJ
1155 # ifdef CALC_COULOMB
1156 fscal_S2
= rinvsq_S2
* (frcoul_S2
+ frLJ_S2
);
1157 fscal_S3
= rinvsq_S3
* (frcoul_S3
+ frLJ_S3
);
1159 fscal_S2
= rinvsq_S2
* frLJ_S2
;
1160 fscal_S3
= rinvsq_S3
* frLJ_S3
;
1163 /* Atom 2 and 3 don't have LJ, so only add Coulomb forces */
1164 fscal_S2
= rinvsq_S2
* frcoul_S2
;
1165 fscal_S3
= rinvsq_S3
* frcoul_S3
;
1168 /* Calculate temporary vectorial force */
1169 tx_S0
= fscal_S0
* dx_S0
;
1170 tx_S1
= fscal_S1
* dx_S1
;
1171 tx_S2
= fscal_S2
* dx_S2
;
1172 tx_S3
= fscal_S3
* dx_S3
;
1173 ty_S0
= fscal_S0
* dy_S0
;
1174 ty_S1
= fscal_S1
* dy_S1
;
1175 ty_S2
= fscal_S2
* dy_S2
;
1176 ty_S3
= fscal_S3
* dy_S3
;
1177 tz_S0
= fscal_S0
* dz_S0
;
1178 tz_S1
= fscal_S1
* dz_S1
;
1179 tz_S2
= fscal_S2
* dz_S2
;
1180 tz_S3
= fscal_S3
* dz_S3
;
1182 /* Increment i atom force */
1183 fix_S0
= fix_S0
+ tx_S0
;
1184 fix_S1
= fix_S1
+ tx_S1
;
1185 fix_S2
= fix_S2
+ tx_S2
;
1186 fix_S3
= fix_S3
+ tx_S3
;
1187 fiy_S0
= fiy_S0
+ ty_S0
;
1188 fiy_S1
= fiy_S1
+ ty_S1
;
1189 fiy_S2
= fiy_S2
+ ty_S2
;
1190 fiy_S3
= fiy_S3
+ ty_S3
;
1191 fiz_S0
= fiz_S0
+ tz_S0
;
1192 fiz_S1
= fiz_S1
+ tz_S1
;
1193 fiz_S2
= fiz_S2
+ tz_S2
;
1194 fiz_S3
= fiz_S3
+ tz_S3
;
1196 /* Decrement j atom force */
1197 store(f
+ ajx
, load
<SimdReal
>(f
+ ajx
) - (tx_S0
+ tx_S1
+ tx_S2
+ tx_S3
));
1198 store(f
+ ajy
, load
<SimdReal
>(f
+ ajy
) - (ty_S0
+ ty_S1
+ ty_S2
+ ty_S3
));
1199 store(f
+ ajz
, load
<SimdReal
>(f
+ ajz
) - (tz_S0
+ tz_S1
+ tz_S2
+ tz_S3
));