src/gromacs/nbnxm/kernels_simd_4xm/kernel_inner.h
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
5 * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
6 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
7 * and including many others, as listed in the AUTHORS file in the
8 * top-level source directory and at http://www.gromacs.org.
10 * GROMACS is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public License
12 * as published by the Free Software Foundation; either version 2.1
13 * of the License, or (at your option) any later version.
15 * GROMACS is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with GROMACS; if not, see
22 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
23 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 * If you want to redistribute modifications to GROMACS, please
26 * consider that scientific software is very special. Version
27 * control is crucial - bugs must be traceable. We will be happy to
28 * consider code for inclusion in the official distribution, but
29 * derived work must not be called official GROMACS. Details are found
30 * in the README & COPYING files - if they are missing, get the
31 * official version at http://www.gromacs.org.
33 * To help us fund GROMACS development, we humbly ask that you cite
34 * the research papers on the package. Check out http://www.gromacs.org.
35 */
37 /* Doxygen gets confused (buggy) about the block in this file in combination with
38 * the namespace prefix, and thinks store is documented here.
39 * This will solve itself with the second-generation nbnxn kernels, so for now
40 * we just tell Doxygen to stay out.
41 */
42 #ifndef DOXYGEN
44 /* This is the innermost loop contents for the 4 x N atom simd kernel.
45 * This flavor of the kernel calculates interactions of 4 i-atoms
46 * with N j-atoms stored in N wide simd registers.
47 */
49 /* When calculating RF or Ewald interactions we calculate the electrostatic/LJ
50 * forces on excluded atom pairs here in the non-bonded loops.
51 * But when energies and/or the virial are required we calculate them
52 * separately, as it is then easier to separate the energy and virial
53 * contributions.
54 */
55 # if defined CHECK_EXCLS && (defined CALC_COULOMB || defined LJ_EWALD_GEOM)
56 # define EXCL_FORCES
57 # endif
59 {
60 int cj, ajx, ajy, ajz;
61 int gmx_unused aj;
63 # ifdef ENERGY_GROUPS
64 /* Energy group indices for two atoms packed into one int */
65 int egp_jj[UNROLLJ / 2];
66 # endif
68 # ifdef CHECK_EXCLS
69 /* Interaction (non-exclusion) mask of all 1's or 0's */
70 SimdBool interact_S0;
71 SimdBool interact_S1;
72 SimdBool interact_S2;
73 SimdBool interact_S3;
74 # endif
76 SimdReal jx_S, jy_S, jz_S;
77 SimdReal dx_S0, dy_S0, dz_S0;
78 SimdReal dx_S1, dy_S1, dz_S1;
79 SimdReal dx_S2, dy_S2, dz_S2;
80 SimdReal dx_S3, dy_S3, dz_S3;
81 SimdReal tx_S0, ty_S0, tz_S0;
82 SimdReal tx_S1, ty_S1, tz_S1;
83 SimdReal tx_S2, ty_S2, tz_S2;
84 SimdReal tx_S3, ty_S3, tz_S3;
85 SimdReal rsq_S0, rinv_S0, rinvsq_S0;
86 SimdReal rsq_S1, rinv_S1, rinvsq_S1;
87 SimdReal rsq_S2, rinv_S2, rinvsq_S2;
88 SimdReal rsq_S3, rinv_S3, rinvsq_S3;
90 /* wco: within cut-off, mask of all 1's or 0's */
91 SimdBool wco_S0;
92 SimdBool wco_S1;
93 SimdBool wco_S2;
94 SimdBool wco_S3;
96 # ifdef VDW_CUTOFF_CHECK
97 SimdBool wco_vdw_S0;
98 SimdBool wco_vdw_S1;
99 # ifndef HALF_LJ
100 SimdBool wco_vdw_S2;
101 SimdBool wco_vdw_S3;
102 # endif
103 # endif
105 # if (defined CALC_COULOMB && defined CALC_COUL_TAB) || defined LJ_FORCE_SWITCH || defined LJ_POT_SWITCH
106 SimdReal r_S0;
107 SimdReal r_S1;
108 # if (defined CALC_COULOMB && defined CALC_COUL_TAB) || !defined HALF_LJ
109 SimdReal r_S2;
110 SimdReal r_S3;
111 # endif
112 # endif
114 # if defined LJ_FORCE_SWITCH || defined LJ_POT_SWITCH
115 SimdReal rsw_S0, rsw2_S0;
116 SimdReal rsw_S1, rsw2_S1;
117 # ifndef HALF_LJ
118 SimdReal rsw_S2, rsw2_S2;
119 SimdReal rsw_S3, rsw2_S3;
120 # endif
121 # endif
123 # ifdef CALC_COULOMB
124 # ifdef CHECK_EXCLS
125 /* 1/r masked with the interaction mask */
126 SimdReal rinv_ex_S0;
127 SimdReal rinv_ex_S1;
128 SimdReal rinv_ex_S2;
129 SimdReal rinv_ex_S3;
130 # endif
131 SimdReal jq_S;
132 SimdReal qq_S0;
133 SimdReal qq_S1;
134 SimdReal qq_S2;
135 SimdReal qq_S3;
136 # ifdef CALC_COUL_TAB
137 /* The force (PME mesh force) we need to subtract from 1/r^2 */
138 SimdReal fsub_S0;
139 SimdReal fsub_S1;
140 SimdReal fsub_S2;
141 SimdReal fsub_S3;
142 # endif
143 # ifdef CALC_COUL_EWALD
144 SimdReal brsq_S0, brsq_S1, brsq_S2, brsq_S3;
145 SimdReal ewcorr_S0, ewcorr_S1, ewcorr_S2, ewcorr_S3;
146 # endif
148 /* frcoul = (1/r - fsub)*r */
149 SimdReal frcoul_S0;
150 SimdReal frcoul_S1;
151 SimdReal frcoul_S2;
152 SimdReal frcoul_S3;
153 # ifdef CALC_COUL_TAB
154 /* For tables: r, rs=r/sp, rf=floor(rs), frac=rs-rf */
155 SimdReal rs_S0, rf_S0, frac_S0;
156 SimdReal rs_S1, rf_S1, frac_S1;
157 SimdReal rs_S2, rf_S2, frac_S2;
158 SimdReal rs_S3, rf_S3, frac_S3;
159 /* Table index: rs truncated to an int */
160 SimdInt32 ti_S0, ti_S1, ti_S2, ti_S3;
161 /* Linear force table values */
162 SimdReal ctab0_S0, ctab1_S0;
163 SimdReal ctab0_S1, ctab1_S1;
164 SimdReal ctab0_S2, ctab1_S2;
165 SimdReal ctab0_S3, ctab1_S3;
166 # ifdef CALC_ENERGIES
167 /* Quadratic energy table value */
168 SimdReal ctabv_S0, dum_S0;
169 SimdReal ctabv_S1, dum_S1;
170 SimdReal ctabv_S2, dum_S2;
171 SimdReal ctabv_S3, dum_S3;
172 # endif
173 # endif
174 # if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
175 /* The potential (PME mesh) we need to subtract from 1/r */
176 SimdReal vc_sub_S0;
177 SimdReal vc_sub_S1;
178 SimdReal vc_sub_S2;
179 SimdReal vc_sub_S3;
180 # endif
181 # ifdef CALC_ENERGIES
182 /* Electrostatic potential */
183 SimdReal vcoul_S0;
184 SimdReal vcoul_S1;
185 SimdReal vcoul_S2;
186 SimdReal vcoul_S3;
187 # endif
188 # endif
189 /* The force times 1/r */
190 SimdReal fscal_S0;
191 SimdReal fscal_S1;
192 SimdReal fscal_S2;
193 SimdReal fscal_S3;
195 # ifdef CALC_LJ
196 # ifdef LJ_COMB_LB
197 /* LJ sigma_j/2 and sqrt(epsilon_j) */
198 SimdReal hsig_j_S, seps_j_S;
199 /* LJ sigma_ij and epsilon_ij */
200 SimdReal sig_S0, eps_S0;
201 SimdReal sig_S1, eps_S1;
202 # ifndef HALF_LJ
203 SimdReal sig_S2, eps_S2;
204 SimdReal sig_S3, eps_S3;
205 # endif
206 # ifdef CALC_ENERGIES
207 SimdReal sig2_S0, sig6_S0;
208 SimdReal sig2_S1, sig6_S1;
209 # ifndef HALF_LJ
210 SimdReal sig2_S2, sig6_S2;
211 SimdReal sig2_S3, sig6_S3;
212 # endif
213 # endif /* CALC_ENERGIES */
214 # endif /* LJ_COMB_LB */
216 # ifdef LJ_COMB_GEOM
217 SimdReal c6s_j_S, c12s_j_S;
218 # endif
220 # if defined LJ_COMB_GEOM || defined LJ_COMB_LB || defined LJ_EWALD_GEOM
221 /* Index for loading LJ parameters, complicated when interleaving */
222 int aj2;
223 # endif
225 /* Intermediate variables for LJ calculation */
226 # ifndef LJ_COMB_LB
227 SimdReal rinvsix_S0;
228 SimdReal rinvsix_S1;
229 # ifndef HALF_LJ
230 SimdReal rinvsix_S2;
231 SimdReal rinvsix_S3;
232 # endif
233 # endif
234 # ifdef LJ_COMB_LB
235 SimdReal sir_S0, sir2_S0, sir6_S0;
236 SimdReal sir_S1, sir2_S1, sir6_S1;
237 # ifndef HALF_LJ
238 SimdReal sir_S2, sir2_S2, sir6_S2;
239 SimdReal sir_S3, sir2_S3, sir6_S3;
240 # endif
241 # endif
243 SimdReal FrLJ6_S0, FrLJ12_S0, frLJ_S0;
244 SimdReal FrLJ6_S1, FrLJ12_S1, frLJ_S1;
245 # ifndef HALF_LJ
246 SimdReal FrLJ6_S2, FrLJ12_S2, frLJ_S2;
247 SimdReal FrLJ6_S3, FrLJ12_S3, frLJ_S3;
248 # endif
249 # endif /* CALC_LJ */
251 /* j-cluster index */
252 cj = l_cj[cjind].cj;
254 /* Atom indices (of the first atom in the cluster) */
255 aj = cj * UNROLLJ;
256 # if defined CALC_LJ && (defined LJ_COMB_GEOM || defined LJ_COMB_LB || defined LJ_EWALD_GEOM)
257 # if UNROLLJ == STRIDE
258 aj2 = aj * 2;
259 # else
260 aj2 = (cj >> 1) * 2 * STRIDE + (cj & 1) * UNROLLJ;
261 # endif
262 # endif
263 # if UNROLLJ == STRIDE
264 ajx = aj * DIM;
265 # else
266 ajx = (cj >> 1) * DIM * STRIDE + (cj & 1) * UNROLLJ;
267 # endif
268 ajy = ajx + STRIDE;
269 ajz = ajy + STRIDE;
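/* The offsets above imply a packed coordinate layout: each group of STRIDE
 * atoms stores STRIDE x-values, then STRIDE y-values, then STRIDE z-values
 * contiguously (assuming the usual packed layout used by these 4xN kernels).
 * For STRIDE == 4, the j-cluster starting at atom aj occupies
 *   x[ajx + 0..3] = x0 x1 x2 x3
 *   x[ajy + 0..3] = y0 y1 y2 y3
 *   x[ajz + 0..3] = z0 z1 z2 z3
 * so each load<SimdReal>() below fetches one Cartesian component of a whole
 * j-cluster in a single aligned load.
 */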
271 # ifdef CHECK_EXCLS
272 gmx_load_simd_4xn_interactions(static_cast<int>(l_cj[cjind].excl), filter_S0, filter_S1,
273 filter_S2, filter_S3, nbat->simdMasks.interaction_array.data(),
274 &interact_S0, &interact_S1, &interact_S2, &interact_S3);
275 # endif /* CHECK_EXCLS */
277 /* load j atom coordinates */
278 jx_S = load<SimdReal>(x + ajx);
279 jy_S = load<SimdReal>(x + ajy);
280 jz_S = load<SimdReal>(x + ajz);
282 /* Calculate distance */
283 dx_S0 = ix_S0 - jx_S;
284 dy_S0 = iy_S0 - jy_S;
285 dz_S0 = iz_S0 - jz_S;
286 dx_S1 = ix_S1 - jx_S;
287 dy_S1 = iy_S1 - jy_S;
288 dz_S1 = iz_S1 - jz_S;
289 dx_S2 = ix_S2 - jx_S;
290 dy_S2 = iy_S2 - jy_S;
291 dz_S2 = iz_S2 - jz_S;
292 dx_S3 = ix_S3 - jx_S;
293 dy_S3 = iy_S3 - jy_S;
294 dz_S3 = iz_S3 - jz_S;
296 /* rsq = dx*dx+dy*dy+dz*dz */
297 rsq_S0 = norm2(dx_S0, dy_S0, dz_S0);
298 rsq_S1 = norm2(dx_S1, dy_S1, dz_S1);
299 rsq_S2 = norm2(dx_S2, dy_S2, dz_S2);
300 rsq_S3 = norm2(dx_S3, dy_S3, dz_S3);
302 /* Do the cut-off check */
303 wco_S0 = (rsq_S0 < rc2_S);
304 wco_S1 = (rsq_S1 < rc2_S);
305 wco_S2 = (rsq_S2 < rc2_S);
306 wco_S3 = (rsq_S3 < rc2_S);
308 # ifdef CHECK_EXCLS
309 # ifdef EXCL_FORCES
310 /* Only remove the (sub-)diagonal to avoid double counting */
311 # if UNROLLJ == UNROLLI
312 if (cj == ci_sh)
313 {
314 wco_S0 = wco_S0 && diagonal_mask_S0;
315 wco_S1 = wco_S1 && diagonal_mask_S1;
316 wco_S2 = wco_S2 && diagonal_mask_S2;
317 wco_S3 = wco_S3 && diagonal_mask_S3;
318 }
319 # else
320 # if UNROLLJ < UNROLLI
321 if (cj == ci_sh * 2)
322 {
323 wco_S0 = wco_S0 && diagonal_mask0_S0;
324 wco_S1 = wco_S1 && diagonal_mask0_S1;
325 wco_S2 = wco_S2 && diagonal_mask0_S2;
326 wco_S3 = wco_S3 && diagonal_mask0_S3;
327 }
328 if (cj == ci_sh * 2 + 1)
329 {
330 wco_S0 = wco_S0 && diagonal_mask1_S0;
331 wco_S1 = wco_S1 && diagonal_mask1_S1;
332 wco_S2 = wco_S2 && diagonal_mask1_S2;
333 wco_S3 = wco_S3 && diagonal_mask1_S3;
334 }
335 # else
336 if (cj * 2 == ci_sh)
337 {
338 wco_S0 = wco_S0 && diagonal_mask0_S0;
339 wco_S1 = wco_S1 && diagonal_mask0_S1;
340 wco_S2 = wco_S2 && diagonal_mask0_S2;
341 wco_S3 = wco_S3 && diagonal_mask0_S3;
342 }
343 else if (cj * 2 + 1 == ci_sh)
344 {
345 wco_S0 = wco_S0 && diagonal_mask1_S0;
346 wco_S1 = wco_S1 && diagonal_mask1_S1;
347 wco_S2 = wco_S2 && diagonal_mask1_S2;
348 wco_S3 = wco_S3 && diagonal_mask1_S3;
349 }
350 # endif
351 # endif
352 # else /* EXCL_FORCES */
353 /* No exclusion forces: remove all excluded atom pairs from the list */
354 wco_S0 = wco_S0 && interact_S0;
355 wco_S1 = wco_S1 && interact_S1;
356 wco_S2 = wco_S2 && interact_S2;
357 wco_S3 = wco_S3 && interact_S3;
358 # endif
359 # endif
361 # ifdef COUNT_PAIRS
362 {
363 int i, j;
364 alignas(GMX_SIMD_ALIGNMENT) real tmp[2 * GMX_SIMD_REAL_WIDTH];
366 for (i = 0; i < UNROLLI; i++)
367 {
368 store(tmp, rc2_S - (i == 0 ? rsq_S0 : (i == 1 ? rsq_S1 : (i == 2 ? rsq_S2 : rsq_S3))));
369 for (j = 0; j < UNROLLJ; j++)
370 {
371 if (tmp[j] >= 0)
372 {
373 npair++;
374 }
375 }
376 }
377 }
378 # endif
380 // Ensure the distances do not fall below the limit where r^-12 overflows.
381 // This should never happen for normal interactions.
382 rsq_S0 = max(rsq_S0, minRsq_S);
383 rsq_S1 = max(rsq_S1, minRsq_S);
384 rsq_S2 = max(rsq_S2, minRsq_S);
385 rsq_S3 = max(rsq_S3, minRsq_S);
387 /* Calculate 1/r */
388 # if !GMX_DOUBLE
389 rinv_S0 = invsqrt(rsq_S0);
390 rinv_S1 = invsqrt(rsq_S1);
391 rinv_S2 = invsqrt(rsq_S2);
392 rinv_S3 = invsqrt(rsq_S3);
393 # else
394 invsqrtPair(rsq_S0, rsq_S1, &rinv_S0, &rinv_S1);
395 invsqrtPair(rsq_S2, rsq_S3, &rinv_S2, &rinv_S3);
396 # endif
398 # ifdef CALC_COULOMB
399 /* Load parameters for j atom */
400 jq_S = load<SimdReal>(q + aj);
401 qq_S0 = iq_S0 * jq_S;
402 qq_S1 = iq_S1 * jq_S;
403 qq_S2 = iq_S2 * jq_S;
404 qq_S3 = iq_S3 * jq_S;
405 # endif
407 # ifdef CALC_LJ
408 # if !defined LJ_COMB_GEOM && !defined LJ_COMB_LB && !defined FIX_LJ_C
409 SimdReal c6_S0, c6_S1, c12_S0, c12_S1;
410 gatherLoadTranspose<c_simdBestPairAlignment>(nbfp0, type + aj, &c6_S0, &c12_S0);
411 gatherLoadTranspose<c_simdBestPairAlignment>(nbfp1, type + aj, &c6_S1, &c12_S1);
412 # ifndef HALF_LJ
413 SimdReal c6_S2, c6_S3, c12_S2, c12_S3;
414 gatherLoadTranspose<c_simdBestPairAlignment>(nbfp2, type + aj, &c6_S2, &c12_S2);
415 gatherLoadTranspose<c_simdBestPairAlignment>(nbfp3, type + aj, &c6_S3, &c12_S3);
416 # endif
417 # endif /* no LJ combination rule defined */
419 # ifdef LJ_COMB_GEOM
420 c6s_j_S = load<SimdReal>(ljc + aj2 + 0);
421 c12s_j_S = load<SimdReal>(ljc + aj2 + STRIDE);
422 SimdReal c6_S0 = c6s_S0 * c6s_j_S;
423 SimdReal c6_S1 = c6s_S1 * c6s_j_S;
424 # ifndef HALF_LJ
425 SimdReal c6_S2 = c6s_S2 * c6s_j_S;
426 SimdReal c6_S3 = c6s_S3 * c6s_j_S;
427 # endif
428 SimdReal c12_S0 = c12s_S0 * c12s_j_S;
429 SimdReal c12_S1 = c12s_S1 * c12s_j_S;
430 # ifndef HALF_LJ
431 SimdReal c12_S2 = c12s_S2 * c12s_j_S;
432 SimdReal c12_S3 = c12s_S3 * c12s_j_S;
433 # endif
434 # endif /* LJ_COMB_GEOM */
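/* In the LJ_COMB_GEOM branch above, the per-atom entries in ljc presumably
 * hold sqrt(C6) and sqrt(C12), so the geometric combination rule reduces to
 * one multiply per pair and per coefficient:
 *   C6_ij  = sqrt(C6_i)  * sqrt(C6_j)   ->  c6_S0  = c6s_S0  * c6s_j_S
 *   C12_ij = sqrt(C12_i) * sqrt(C12_j)  ->  c12_S0 = c12s_S0 * c12s_j_S
 * and likewise for the other i-rows.
 */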
436 # ifdef LJ_COMB_LB
437 hsig_j_S = load<SimdReal>(ljc + aj2 + 0);
438 seps_j_S = load<SimdReal>(ljc + aj2 + STRIDE);
440 sig_S0 = hsig_i_S0 + hsig_j_S;
441 sig_S1 = hsig_i_S1 + hsig_j_S;
442 eps_S0 = seps_i_S0 * seps_j_S;
443 eps_S1 = seps_i_S1 * seps_j_S;
444 # ifndef HALF_LJ
445 sig_S2 = hsig_i_S2 + hsig_j_S;
446 sig_S3 = hsig_i_S3 + hsig_j_S;
447 eps_S2 = seps_i_S2 * seps_j_S;
448 eps_S3 = seps_i_S3 * seps_j_S;
449 # endif
450 # endif /* LJ_COMB_LB */
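/* For the Lorentz-Berthelot branch above, the per-atom data are stored as
 * sigma/2 and sqrt(epsilon) (see the hsig_j_S/seps_j_S declarations), so the
 * combination rules become one add and one multiply per pair:
 *   sigma_ij   = sigma_i/2 + sigma_j/2      ->  sig_S0 = hsig_i_S0 + hsig_j_S
 *   epsilon_ij = sqrt(eps_i) * sqrt(eps_j)  ->  eps_S0 = seps_i_S0 * seps_j_S
 */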
452 # endif /* CALC_LJ */
454 /* Set rinv to zero for r beyond the cut-off */
455 rinv_S0 = selectByMask(rinv_S0, wco_S0);
456 rinv_S1 = selectByMask(rinv_S1, wco_S1);
457 rinv_S2 = selectByMask(rinv_S2, wco_S2);
458 rinv_S3 = selectByMask(rinv_S3, wco_S3);
460 rinvsq_S0 = rinv_S0 * rinv_S0;
461 rinvsq_S1 = rinv_S1 * rinv_S1;
462 rinvsq_S2 = rinv_S2 * rinv_S2;
463 rinvsq_S3 = rinv_S3 * rinv_S3;
465 # ifdef CALC_COULOMB
466 /* Note that here we calculate force*r, not the usual force/r.
467 * This allows avoiding masking the reaction-field contribution,
468 * as frcoul is later multiplied by rinvsq which has been
469 * masked with the cut-off check.
470 */
472 # ifdef EXCL_FORCES
473 /* Only add 1/r for non-excluded atom pairs */
474 rinv_ex_S0 = selectByMask(rinv_S0, interact_S0);
475 rinv_ex_S1 = selectByMask(rinv_S1, interact_S1);
476 rinv_ex_S2 = selectByMask(rinv_S2, interact_S2);
477 rinv_ex_S3 = selectByMask(rinv_S3, interact_S3);
478 # else
479 /* No exclusion forces, we always need 1/r */
480 # define rinv_ex_S0 rinv_S0
481 # define rinv_ex_S1 rinv_S1
482 # define rinv_ex_S2 rinv_S2
483 # define rinv_ex_S3 rinv_S3
484 # endif
486 # ifdef CALC_COUL_RF
487 /* Electrostatic interactions */
488 frcoul_S0 = qq_S0 * fma(rsq_S0, mrc_3_S, rinv_ex_S0);
489 frcoul_S1 = qq_S1 * fma(rsq_S1, mrc_3_S, rinv_ex_S1);
490 frcoul_S2 = qq_S2 * fma(rsq_S2, mrc_3_S, rinv_ex_S2);
491 frcoul_S3 = qq_S3 * fma(rsq_S3, mrc_3_S, rinv_ex_S3);
493 # ifdef CALC_ENERGIES
494 vcoul_S0 = qq_S0 * (rinv_ex_S0 + fma(rsq_S0, hrc_3_S, moh_rc_S));
495 vcoul_S1 = qq_S1 * (rinv_ex_S1 + fma(rsq_S1, hrc_3_S, moh_rc_S));
496 vcoul_S2 = qq_S2 * (rinv_ex_S2 + fma(rsq_S2, hrc_3_S, moh_rc_S));
497 vcoul_S3 = qq_S3 * (rinv_ex_S3 + fma(rsq_S3, hrc_3_S, moh_rc_S));
498 # endif
499 # endif
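/* Scalar reference for the reaction-field branch above, assuming the outer
 * kernel sets mrc_3_S = -2*k_rf, hrc_3_S = k_rf and moh_rc_S = -c_rf:
 *   F*r = qq*(1/r - 2*k_rf*r^2)       ->  frcoul = qq*(rinv_ex + mrc_3*rsq)
 *   V   = qq*(1/r + k_rf*r^2 - c_rf)  ->  vcoul  = qq*(rinv_ex + hrc_3*rsq + moh_rc)
 * Multiplying F*r by rinvsq further down turns it into the usual F/r.
 */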
501 # ifdef CALC_COUL_EWALD
502 /* We need to mask (or limit) rsq for the cut-off,
503 * as large distances can cause an overflow in gmx_pmecorrF/V.
504 */
505 brsq_S0 = beta2_S * selectByMask(rsq_S0, wco_S0);
506 brsq_S1 = beta2_S * selectByMask(rsq_S1, wco_S1);
507 brsq_S2 = beta2_S * selectByMask(rsq_S2, wco_S2);
508 brsq_S3 = beta2_S * selectByMask(rsq_S3, wco_S3);
509 ewcorr_S0 = beta_S * pmeForceCorrection(brsq_S0);
510 ewcorr_S1 = beta_S * pmeForceCorrection(brsq_S1);
511 ewcorr_S2 = beta_S * pmeForceCorrection(brsq_S2);
512 ewcorr_S3 = beta_S * pmeForceCorrection(brsq_S3);
513 frcoul_S0 = qq_S0 * fma(ewcorr_S0, brsq_S0, rinv_ex_S0);
514 frcoul_S1 = qq_S1 * fma(ewcorr_S1, brsq_S1, rinv_ex_S1);
515 frcoul_S2 = qq_S2 * fma(ewcorr_S2, brsq_S2, rinv_ex_S2);
516 frcoul_S3 = qq_S3 * fma(ewcorr_S3, brsq_S3, rinv_ex_S3);
518 # ifdef CALC_ENERGIES
519 vc_sub_S0 = beta_S * pmePotentialCorrection(brsq_S0);
520 vc_sub_S1 = beta_S * pmePotentialCorrection(brsq_S1);
521 vc_sub_S2 = beta_S * pmePotentialCorrection(brsq_S2);
522 vc_sub_S3 = beta_S * pmePotentialCorrection(brsq_S3);
523 # endif
525 # endif /* CALC_COUL_EWALD */
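/* Scalar picture of the Ewald branch above: the real-space Ewald force obeys
 *   F*r = qq*(erfc(beta*r)/r + 2*beta/sqrt(pi)*exp(-beta^2*r^2)).
 * The code starts from the plain Coulomb term qq/r (so exclusions can be
 * handled through rinv_ex) and adds the analytic mesh correction
 * qq*ewcorr*(beta*r)^2, which is the difference between the two expressions.
 * For an excluded pair rinv_ex is zero, so only minus the mesh contribution
 * remains, which is exactly what has to be subtracted for such pairs.
 * pmeForceCorrection() supplies this correction as a function of (beta*r)^2,
 * avoiding both tables and an expensive erfc() evaluation.
 */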
527 # ifdef CALC_COUL_TAB
528 /* Electrostatic interactions */
529 r_S0 = rsq_S0 * rinv_S0;
530 r_S1 = rsq_S1 * rinv_S1;
531 r_S2 = rsq_S2 * rinv_S2;
532 r_S3 = rsq_S3 * rinv_S3;
533 /* Convert r to scaled table units */
534 rs_S0 = r_S0 * invtsp_S;
535 rs_S1 = r_S1 * invtsp_S;
536 rs_S2 = r_S2 * invtsp_S;
537 rs_S3 = r_S3 * invtsp_S;
538 /* Truncate scaled r to an int */
539 ti_S0 = cvttR2I(rs_S0);
540 ti_S1 = cvttR2I(rs_S1);
541 ti_S2 = cvttR2I(rs_S2);
542 ti_S3 = cvttR2I(rs_S3);
544 rf_S0 = trunc(rs_S0);
545 rf_S1 = trunc(rs_S1);
546 rf_S2 = trunc(rs_S2);
547 rf_S3 = trunc(rs_S3);
549 frac_S0 = rs_S0 - rf_S0;
550 frac_S1 = rs_S1 - rf_S1;
551 frac_S2 = rs_S2 - rf_S2;
552 frac_S3 = rs_S3 - rf_S3;
554 /* Load and interpolate table forces and possibly energies.
555 * Force and energy can be combined in one table, stride 4: FDV0
556 * or in two separate tables with stride 1: F and V
557 * Currently single precision uses FDV0, double F and V.
558 */
559 # ifndef CALC_ENERGIES
560 # ifdef TAB_FDV0
561 gatherLoadBySimdIntTranspose<4>(tab_coul_F, ti_S0, &ctab0_S0, &ctab1_S0);
562 gatherLoadBySimdIntTranspose<4>(tab_coul_F, ti_S1, &ctab0_S1, &ctab1_S1);
563 gatherLoadBySimdIntTranspose<4>(tab_coul_F, ti_S2, &ctab0_S2, &ctab1_S2);
564 gatherLoadBySimdIntTranspose<4>(tab_coul_F, ti_S3, &ctab0_S3, &ctab1_S3);
565 # else
566 gatherLoadUBySimdIntTranspose<1>(tab_coul_F, ti_S0, &ctab0_S0, &ctab1_S0);
567 gatherLoadUBySimdIntTranspose<1>(tab_coul_F, ti_S1, &ctab0_S1, &ctab1_S1);
568 gatherLoadUBySimdIntTranspose<1>(tab_coul_F, ti_S2, &ctab0_S2, &ctab1_S2);
569 gatherLoadUBySimdIntTranspose<1>(tab_coul_F, ti_S3, &ctab0_S3, &ctab1_S3);
570 ctab1_S0 = ctab1_S0 - ctab0_S0;
571 ctab1_S1 = ctab1_S1 - ctab0_S1;
572 ctab1_S2 = ctab1_S2 - ctab0_S2;
573 ctab1_S3 = ctab1_S3 - ctab0_S3;
574 # endif
575 # else
576 # ifdef TAB_FDV0
577 gatherLoadBySimdIntTranspose<4>(tab_coul_F, ti_S0, &ctab0_S0, &ctab1_S0, &ctabv_S0, &dum_S0);
578 gatherLoadBySimdIntTranspose<4>(tab_coul_F, ti_S1, &ctab0_S1, &ctab1_S1, &ctabv_S1, &dum_S1);
579 gatherLoadBySimdIntTranspose<4>(tab_coul_F, ti_S2, &ctab0_S2, &ctab1_S2, &ctabv_S2, &dum_S2);
580 gatherLoadBySimdIntTranspose<4>(tab_coul_F, ti_S3, &ctab0_S3, &ctab1_S3, &ctabv_S3, &dum_S3);
581 # else
582 gatherLoadUBySimdIntTranspose<1>(tab_coul_F, ti_S0, &ctab0_S0, &ctab1_S0);
583 gatherLoadUBySimdIntTranspose<1>(tab_coul_V, ti_S0, &ctabv_S0, &dum_S0);
584 gatherLoadUBySimdIntTranspose<1>(tab_coul_F, ti_S1, &ctab0_S1, &ctab1_S1);
585 gatherLoadUBySimdIntTranspose<1>(tab_coul_V, ti_S1, &ctabv_S1, &dum_S1);
586 gatherLoadUBySimdIntTranspose<1>(tab_coul_F, ti_S2, &ctab0_S2, &ctab1_S2);
587 gatherLoadUBySimdIntTranspose<1>(tab_coul_V, ti_S2, &ctabv_S2, &dum_S2);
588 gatherLoadUBySimdIntTranspose<1>(tab_coul_F, ti_S3, &ctab0_S3, &ctab1_S3);
589 gatherLoadUBySimdIntTranspose<1>(tab_coul_V, ti_S3, &ctabv_S3, &dum_S3);
590 ctab1_S0 = ctab1_S0 - ctab0_S0;
591 ctab1_S1 = ctab1_S1 - ctab0_S1;
592 ctab1_S2 = ctab1_S2 - ctab0_S2;
593 ctab1_S3 = ctab1_S3 - ctab0_S3;
594 # endif
595 # endif
596 fsub_S0 = fma(frac_S0, ctab1_S0, ctab0_S0);
597 fsub_S1 = fma(frac_S1, ctab1_S1, ctab0_S1);
598 fsub_S2 = fma(frac_S2, ctab1_S2, ctab0_S2);
599 fsub_S3 = fma(frac_S3, ctab1_S3, ctab0_S3);
600 frcoul_S0 = qq_S0 * fnma(fsub_S0, r_S0, rinv_ex_S0);
601 frcoul_S1 = qq_S1 * fnma(fsub_S1, r_S1, rinv_ex_S1);
602 frcoul_S2 = qq_S2 * fnma(fsub_S2, r_S2, rinv_ex_S2);
603 frcoul_S3 = qq_S3 * fnma(fsub_S3, r_S3, rinv_ex_S3);
605 # ifdef CALC_ENERGIES
606 vc_sub_S0 = fma((mhalfsp_S * frac_S0), (ctab0_S0 + fsub_S0), ctabv_S0);
607 vc_sub_S1 = fma((mhalfsp_S * frac_S1), (ctab0_S1 + fsub_S1), ctabv_S1);
608 vc_sub_S2 = fma((mhalfsp_S * frac_S2), (ctab0_S2 + fsub_S2), ctabv_S2);
609 vc_sub_S3 = fma((mhalfsp_S * frac_S3), (ctab0_S3 + fsub_S3), ctabv_S3);
610 # endif
611 # endif /* CALC_COUL_TAB */
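/* Scalar sketch of the tabulated branch above, assuming invtsp_S is the
 * inverse table spacing and mhalfsp_S is minus half the table spacing:
 *   rs     = r/spacing;  n = (int)rs;  frac = rs - n;
 *   fsub   = F[n] + frac*(F[n+1] - F[n]);            // linear force interpolation
 *   frcoul = qq*(1/r - fsub*r);
 *   vc_sub = V[n] - 0.5*spacing*frac*(F[n] + fsub);  // integrate F over the fraction
 * In the FDV0 layout the force, the force difference and the potential are
 * interleaved with stride 4, so a single gather fetches all of them; with
 * separate F and V tables the difference F[n+1]-F[n] is formed explicitly
 * (the ctab1 updates above).
 */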
613 # if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
614 # ifndef NO_SHIFT_EWALD
615 /* Add Ewald potential shift to vc_sub for convenience */
616 # ifdef CHECK_EXCLS
617 vc_sub_S0 = vc_sub_S0 + selectByMask(sh_ewald_S, interact_S0);
618 vc_sub_S1 = vc_sub_S1 + selectByMask(sh_ewald_S, interact_S1);
619 vc_sub_S2 = vc_sub_S2 + selectByMask(sh_ewald_S, interact_S2);
620 vc_sub_S3 = vc_sub_S3 + selectByMask(sh_ewald_S, interact_S3);
621 # else
622 vc_sub_S0 = vc_sub_S0 + sh_ewald_S;
623 vc_sub_S1 = vc_sub_S1 + sh_ewald_S;
624 vc_sub_S2 = vc_sub_S2 + sh_ewald_S;
625 vc_sub_S3 = vc_sub_S3 + sh_ewald_S;
626 # endif
627 # endif
629 vcoul_S0 = qq_S0 * (rinv_ex_S0 - vc_sub_S0);
630 vcoul_S1 = qq_S1 * (rinv_ex_S1 - vc_sub_S1);
631 vcoul_S2 = qq_S2 * (rinv_ex_S2 - vc_sub_S2);
632 vcoul_S3 = qq_S3 * (rinv_ex_S3 - vc_sub_S3);
634 # endif
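/* The resulting Coulomb energy is the plain 1/r term minus the mesh part,
 *   V = qq*(1/r - V_mesh(r) - shift),
 * where the Ewald potential shift is added to vc_sub only for interacting
 * pairs, so that excluded pairs keep just the mesh subtraction.
 */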
636 # ifdef CALC_ENERGIES
637 /* Mask energy for cut-off and diagonal */
638 vcoul_S0 = selectByMask(vcoul_S0, wco_S0);
639 vcoul_S1 = selectByMask(vcoul_S1, wco_S1);
640 vcoul_S2 = selectByMask(vcoul_S2, wco_S2);
641 vcoul_S3 = selectByMask(vcoul_S3, wco_S3);
642 # endif
644 # endif /* CALC_COULOMB */
646 # ifdef CALC_LJ
647 /* Lennard-Jones interaction */
649 # ifdef VDW_CUTOFF_CHECK
650 wco_vdw_S0 = (rsq_S0 < rcvdw2_S);
651 wco_vdw_S1 = (rsq_S1 < rcvdw2_S);
652 # ifndef HALF_LJ
653 wco_vdw_S2 = (rsq_S2 < rcvdw2_S);
654 wco_vdw_S3 = (rsq_S3 < rcvdw2_S);
655 # endif
656 # else
657 /* Same cut-off for Coulomb and VdW, reuse the registers */
658 # define wco_vdw_S0 wco_S0
659 # define wco_vdw_S1 wco_S1
660 # define wco_vdw_S2 wco_S2
661 # define wco_vdw_S3 wco_S3
662 # endif
664 # ifndef LJ_COMB_LB
665 rinvsix_S0 = rinvsq_S0 * rinvsq_S0 * rinvsq_S0;
666 rinvsix_S1 = rinvsq_S1 * rinvsq_S1 * rinvsq_S1;
667 # ifdef EXCL_FORCES
668 rinvsix_S0 = selectByMask(rinvsix_S0, interact_S0);
669 rinvsix_S1 = selectByMask(rinvsix_S1, interact_S1);
670 # endif
671 # ifndef HALF_LJ
672 rinvsix_S2 = rinvsq_S2 * rinvsq_S2 * rinvsq_S2;
673 rinvsix_S3 = rinvsq_S3 * rinvsq_S3 * rinvsq_S3;
674 # ifdef EXCL_FORCES
675 rinvsix_S2 = selectByMask(rinvsix_S2, interact_S2);
676 rinvsix_S3 = selectByMask(rinvsix_S3, interact_S3);
677 # endif
678 # endif
680 # if defined LJ_CUT || defined LJ_POT_SWITCH
681 /* We have plain LJ or LJ-PME with simple C6/6 C12/12 coefficients */
682 FrLJ6_S0 = c6_S0 * rinvsix_S0;
683 FrLJ6_S1 = c6_S1 * rinvsix_S1;
684 # ifndef HALF_LJ
685 FrLJ6_S2 = c6_S2 * rinvsix_S2;
686 FrLJ6_S3 = c6_S3 * rinvsix_S3;
687 # endif
688 FrLJ12_S0 = c12_S0 * rinvsix_S0 * rinvsix_S0;
689 FrLJ12_S1 = c12_S1 * rinvsix_S1 * rinvsix_S1;
690 # ifndef HALF_LJ
691 FrLJ12_S2 = c12_S2 * rinvsix_S2 * rinvsix_S2;
692 FrLJ12_S3 = c12_S3 * rinvsix_S3 * rinvsix_S3;
693 # endif
694 # endif
696 # if defined LJ_FORCE_SWITCH || defined LJ_POT_SWITCH
697 /* We switch the LJ force */
698 r_S0 = rsq_S0 * rinv_S0;
699 rsw_S0 = max(r_S0 - rswitch_S, zero_S);
700 rsw2_S0 = rsw_S0 * rsw_S0;
701 r_S1 = rsq_S1 * rinv_S1;
702 rsw_S1 = max(r_S1 - rswitch_S, zero_S);
703 rsw2_S1 = rsw_S1 * rsw_S1;
704 # ifndef HALF_LJ
705 r_S2 = rsq_S2 * rinv_S2;
706 rsw_S2 = max(r_S2 - rswitch_S, zero_S);
707 rsw2_S2 = rsw_S2 * rsw_S2;
708 r_S3 = rsq_S3 * rinv_S3;
709 rsw_S3 = max(r_S3 - rswitch_S, zero_S);
710 rsw2_S3 = rsw_S3 * rsw_S3;
711 # endif
712 # endif
714 # ifdef LJ_FORCE_SWITCH
716 # define gmx_add_fr_switch(fr, rsw, rsw2_r, c2, c3) \
717 fma(fma(c3, rsw, c2), rsw2_r, (fr))
718 SimdReal rsw2_r_S0 = rsw2_S0 * r_S0;
719 SimdReal rsw2_r_S1 = rsw2_S1 * r_S1;
720 FrLJ6_S0 = c6_S0 * gmx_add_fr_switch(rinvsix_S0, rsw_S0, rsw2_r_S0, p6_fc2_S, p6_fc3_S);
721 FrLJ6_S1 = c6_S1 * gmx_add_fr_switch(rinvsix_S1, rsw_S1, rsw2_r_S1, p6_fc2_S, p6_fc3_S);
722 # ifndef HALF_LJ
723 SimdReal rsw2_r_S2 = rsw2_S2 * r_S2;
724 SimdReal rsw2_r_S3 = rsw2_S3 * r_S3;
725 FrLJ6_S2 = c6_S2 * gmx_add_fr_switch(rinvsix_S2, rsw_S2, rsw2_r_S2, p6_fc2_S, p6_fc3_S);
726 FrLJ6_S3 = c6_S3 * gmx_add_fr_switch(rinvsix_S3, rsw_S3, rsw2_r_S3, p6_fc2_S, p6_fc3_S);
727 # endif
728 FrLJ12_S0 = c12_S0 * gmx_add_fr_switch((rinvsix_S0 * rinvsix_S0), rsw_S0, rsw2_r_S0, p12_fc2_S, p12_fc3_S);
729 FrLJ12_S1 = c12_S1 * gmx_add_fr_switch((rinvsix_S1 * rinvsix_S1), rsw_S1, rsw2_r_S1, p12_fc2_S, p12_fc3_S);
730 # ifndef HALF_LJ
731 FrLJ12_S2 = c12_S2 * gmx_add_fr_switch((rinvsix_S2 * rinvsix_S2), rsw_S2, rsw2_r_S2, p12_fc2_S, p12_fc3_S);
732 FrLJ12_S3 = c12_S3 * gmx_add_fr_switch((rinvsix_S3 * rinvsix_S3), rsw_S3, rsw2_r_S3, p12_fc2_S, p12_fc3_S);
733 # endif
734 # undef gmx_add_fr_switch
735 # endif /* LJ_FORCE_SWITCH */
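/* What gmx_add_fr_switch() computes above, per term and in F*r form with
 * rsw = max(r - r_switch, 0):
 *   F*r = coeff*(r^-n + (c2 + c3*rsw)*rsw^2*r)
 * i.e. beyond the switch radius a low-order polynomial in rsw is added to the
 * plain r^-6 and r^-12 forces so that the force goes smoothly to zero at the
 * cut-off; the p6_fc* and p12_fc* constants are prepared accordingly in the
 * outer kernel.
 */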
737 # endif /* not LJ_COMB_LB */
739 # ifdef LJ_COMB_LB
740 sir_S0 = sig_S0 * rinv_S0;
741 sir_S1 = sig_S1 * rinv_S1;
742 # ifndef HALF_LJ
743 sir_S2 = sig_S2 * rinv_S2;
744 sir_S3 = sig_S3 * rinv_S3;
745 # endif
746 sir2_S0 = sir_S0 * sir_S0;
747 sir2_S1 = sir_S1 * sir_S1;
748 # ifndef HALF_LJ
749 sir2_S2 = sir_S2 * sir_S2;
750 sir2_S3 = sir_S3 * sir_S3;
751 # endif
752 sir6_S0 = sir2_S0 * sir2_S0 * sir2_S0;
753 sir6_S1 = sir2_S1 * sir2_S1 * sir2_S1;
754 # ifdef EXCL_FORCES
755 sir6_S0 = selectByMask(sir6_S0, interact_S0);
756 sir6_S1 = selectByMask(sir6_S1, interact_S1);
757 # endif
758 # ifndef HALF_LJ
759 sir6_S2 = sir2_S2 * sir2_S2 * sir2_S2;
760 sir6_S3 = sir2_S3 * sir2_S3 * sir2_S3;
761 # ifdef EXCL_FORCES
762 sir6_S2 = selectByMask(sir6_S2, interact_S2);
763 sir6_S3 = selectByMask(sir6_S3, interact_S3);
764 # endif
765 # endif
766 # ifdef VDW_CUTOFF_CHECK
767 sir6_S0 = selectByMask(sir6_S0, wco_vdw_S0);
768 sir6_S1 = selectByMask(sir6_S1, wco_vdw_S1);
769 # ifndef HALF_LJ
770 sir6_S2 = selectByMask(sir6_S2, wco_vdw_S2);
771 sir6_S3 = selectByMask(sir6_S3, wco_vdw_S3);
772 # endif
773 # endif
774 FrLJ6_S0 = eps_S0 * sir6_S0;
775 FrLJ6_S1 = eps_S1 * sir6_S1;
776 # ifndef HALF_LJ
777 FrLJ6_S2 = eps_S2 * sir6_S2;
778 FrLJ6_S3 = eps_S3 * sir6_S3;
779 # endif
780 FrLJ12_S0 = FrLJ6_S0 * sir6_S0;
781 FrLJ12_S1 = FrLJ6_S1 * sir6_S1;
782 # ifndef HALF_LJ
783 FrLJ12_S2 = FrLJ6_S2 * sir6_S2;
784 FrLJ12_S3 = FrLJ6_S3 * sir6_S3;
785 # endif
786 # if defined CALC_ENERGIES
787 /* We need C6 and C12 to calculate the LJ potential shift */
788 sig2_S0 = sig_S0 * sig_S0;
789 sig2_S1 = sig_S1 * sig_S1;
790 # ifndef HALF_LJ
791 sig2_S2 = sig_S2 * sig_S2;
792 sig2_S3 = sig_S3 * sig_S3;
793 # endif
794 sig6_S0 = sig2_S0 * sig2_S0 * sig2_S0;
795 sig6_S1 = sig2_S1 * sig2_S1 * sig2_S1;
796 # ifndef HALF_LJ
797 sig6_S2 = sig2_S2 * sig2_S2 * sig2_S2;
798 sig6_S3 = sig2_S3 * sig2_S3 * sig2_S3;
799 # endif
800 SimdReal c6_S0 = eps_S0 * sig6_S0;
801 SimdReal c6_S1 = eps_S1 * sig6_S1;
802 # ifndef HALF_LJ
803 SimdReal c6_S2 = eps_S2 * sig6_S2;
804 SimdReal c6_S3 = eps_S3 * sig6_S3;
805 # endif
806 SimdReal c12_S0 = c6_S0 * sig6_S0;
807 SimdReal c12_S1 = c6_S1 * sig6_S1;
808 # ifndef HALF_LJ
809 SimdReal c12_S2 = c6_S2 * sig6_S2;
810 SimdReal c12_S3 = c6_S3 * sig6_S3;
811 # endif
812 # endif
813 # endif /* LJ_COMB_LB */
815 /* Determine the total scalar LJ force*r */
816 frLJ_S0 = FrLJ12_S0 - FrLJ6_S0;
817 frLJ_S1 = FrLJ12_S1 - FrLJ6_S1;
818 # ifndef HALF_LJ
819 frLJ_S2 = FrLJ12_S2 - FrLJ6_S2;
820 frLJ_S3 = FrLJ12_S3 - FrLJ6_S3;
821 # endif
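/* Scalar check of the LJ force assembled above: for the C6/C12 parameter
 * paths the stored coefficients appear to be premultiplied as c6 = 6*C6 and
 * c12 = 12*C12 (the sixth_S/twelveth_S factors in the energy code below rely
 * on this), so that for plain LJ
 *   frLJ = c12*r^-12 - c6*r^-6 = 12*C12/r^12 - 6*C6/r^6
 * and frLJ*rinvsq, formed when fscal is computed, is the standard
 * Lennard-Jones F/r.
 */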
823 # if (defined LJ_CUT || defined LJ_FORCE_SWITCH) && defined CALC_ENERGIES
825 # ifdef LJ_CUT
826 /* Calculate the LJ energies, with constant potential shift */
827 SimdReal VLJ6_S0 = sixth_S * fma(c6_S0, p6_cpot_S, FrLJ6_S0);
828 SimdReal VLJ6_S1 = sixth_S * fma(c6_S1, p6_cpot_S, FrLJ6_S1);
829 # ifndef HALF_LJ
830 SimdReal VLJ6_S2 = sixth_S * fma(c6_S2, p6_cpot_S, FrLJ6_S2);
831 SimdReal VLJ6_S3 = sixth_S * fma(c6_S3, p6_cpot_S, FrLJ6_S3);
832 # endif
833 SimdReal VLJ12_S0 = twelveth_S * fma(c12_S0, p12_cpot_S, FrLJ12_S0);
834 SimdReal VLJ12_S1 = twelveth_S * fma(c12_S1, p12_cpot_S, FrLJ12_S1);
835 # ifndef HALF_LJ
836 SimdReal VLJ12_S2 = twelveth_S * fma(c12_S2, p12_cpot_S, FrLJ12_S2);
837 SimdReal VLJ12_S3 = twelveth_S * fma(c12_S3, p12_cpot_S, FrLJ12_S3);
838 # endif
839 # endif /* LJ_CUT */
841 # ifdef LJ_FORCE_SWITCH
842 # define v_fswitch_r(rsw, rsw2, c0, c3, c4) \
843 fma(fma((c4), (rsw), (c3)), ((rsw2) * (rsw)), (c0))
845 SimdReal VLJ6_S0 =
846 c6_S0 * fma(sixth_S, rinvsix_S0, v_fswitch_r(rsw_S0, rsw2_S0, p6_6cpot_S, p6_vc3_S, p6_vc4_S));
847 SimdReal VLJ6_S1 =
848 c6_S1 * fma(sixth_S, rinvsix_S1, v_fswitch_r(rsw_S1, rsw2_S1, p6_6cpot_S, p6_vc3_S, p6_vc4_S));
849 # ifndef HALF_LJ
850 SimdReal VLJ6_S2 =
851 c6_S2 * fma(sixth_S, rinvsix_S2, v_fswitch_r(rsw_S2, rsw2_S2, p6_6cpot_S, p6_vc3_S, p6_vc4_S));
852 SimdReal VLJ6_S3 =
853 c6_S3 * fma(sixth_S, rinvsix_S3, v_fswitch_r(rsw_S3, rsw2_S3, p6_6cpot_S, p6_vc3_S, p6_vc4_S));
854 # endif
855 SimdReal VLJ12_S0 = c12_S0
856 * fma(twelveth_S, rinvsix_S0 * rinvsix_S0,
857 v_fswitch_r(rsw_S0, rsw2_S0, p12_12cpot_S, p12_vc3_S, p12_vc4_S));
858 SimdReal VLJ12_S1 = c12_S1
859 * fma(twelveth_S, rinvsix_S1 * rinvsix_S1,
860 v_fswitch_r(rsw_S1, rsw2_S1, p12_12cpot_S, p12_vc3_S, p12_vc4_S));
861 # ifndef HALF_LJ
862 SimdReal VLJ12_S2 = c12_S2
863 * fma(twelveth_S, rinvsix_S2 * rinvsix_S2,
864 v_fswitch_r(rsw_S2, rsw2_S2, p12_12cpot_S, p12_vc3_S, p12_vc4_S));
865 SimdReal VLJ12_S3 = c12_S3
866 * fma(twelveth_S, rinvsix_S3 * rinvsix_S3,
867 v_fswitch_r(rsw_S3, rsw2_S3, p12_12cpot_S, p12_vc3_S, p12_vc4_S));
868 # endif
869 # undef v_fswitch_r
870 # endif /* LJ_FORCE_SWITCH */
872 /* Add up the repulsion and dispersion */
873 SimdReal VLJ_S0 = VLJ12_S0 - VLJ6_S0;
874 SimdReal VLJ_S1 = VLJ12_S1 - VLJ6_S1;
875 # ifndef HALF_LJ
876 SimdReal VLJ_S2 = VLJ12_S2 - VLJ6_S2;
877 SimdReal VLJ_S3 = VLJ12_S3 - VLJ6_S3;
878 # endif
880 # endif /* (LJ_CUT || LJ_FORCE_SWITCH) && CALC_ENERGIES */
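/* For plain LJ_CUT the energy above reduces, per pair, to
 *   VLJ = (1/12)*(FrLJ12 + c12*p12_cpot) - (1/6)*(FrLJ6 + c6*p6_cpot)
 *       = C12/r^12 - C6/r^6 + constant shift,
 * where the p*_cpot constants presumably hold the values that make the
 * potential vanish at the cut-off; the shift is masked out again further down
 * for excluded pairs and for pairs beyond the cut-off.
 */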
882 # ifdef LJ_POT_SWITCH
883 /* We always need the potential, since it is needed for the force */
884 SimdReal VLJ_S0 = fnma(sixth_S, FrLJ6_S0, twelveth_S * FrLJ12_S0);
885 SimdReal VLJ_S1 = fnma(sixth_S, FrLJ6_S1, twelveth_S * FrLJ12_S1);
886 # ifndef HALF_LJ
887 SimdReal VLJ_S2 = fnma(sixth_S, FrLJ6_S2, twelveth_S * FrLJ12_S2);
888 SimdReal VLJ_S3 = fnma(sixth_S, FrLJ6_S3, twelveth_S * FrLJ12_S3);
889 # endif
891 {
892 SimdReal sw_S0, dsw_S0;
893 SimdReal sw_S1, dsw_S1;
894 # ifndef HALF_LJ
895 SimdReal sw_S2, dsw_S2;
896 SimdReal sw_S3, dsw_S3;
897 # endif
899 # define switch_r(rsw, rsw2, c3, c4, c5) \
900 fma(fma(fma(c5, rsw, c4), rsw, c3), ((rsw2) * (rsw)), one_S)
901 # define dswitch_r(rsw, rsw2, c2, c3, c4) (fma(fma(c4, rsw, c3), rsw, c2) * (rsw2))
903 sw_S0 = switch_r(rsw_S0, rsw2_S0, swV3_S, swV4_S, swV5_S);
904 dsw_S0 = dswitch_r(rsw_S0, rsw2_S0, swF2_S, swF3_S, swF4_S);
905 sw_S1 = switch_r(rsw_S1, rsw2_S1, swV3_S, swV4_S, swV5_S);
906 dsw_S1 = dswitch_r(rsw_S1, rsw2_S1, swF2_S, swF3_S, swF4_S);
907 # ifndef HALF_LJ
908 sw_S2 = switch_r(rsw_S2, rsw2_S2, swV3_S, swV4_S, swV5_S);
909 dsw_S2 = dswitch_r(rsw_S2, rsw2_S2, swF2_S, swF3_S, swF4_S);
910 sw_S3 = switch_r(rsw_S3, rsw2_S3, swV3_S, swV4_S, swV5_S);
911 dsw_S3 = dswitch_r(rsw_S3, rsw2_S3, swF2_S, swF3_S, swF4_S);
912 # endif
913 frLJ_S0 = fnma(dsw_S0 * VLJ_S0, r_S0, sw_S0 * frLJ_S0);
914 frLJ_S1 = fnma(dsw_S1 * VLJ_S1, r_S1, sw_S1 * frLJ_S1);
915 # ifndef HALF_LJ
916 frLJ_S2 = fnma(dsw_S2 * VLJ_S2, r_S2, sw_S2 * frLJ_S2);
917 frLJ_S3 = fnma(dsw_S3 * VLJ_S3, r_S3, sw_S3 * frLJ_S3);
918 # endif
919 # ifdef CALC_ENERGIES
920 VLJ_S0 = sw_S0 * VLJ_S0;
921 VLJ_S1 = sw_S1 * VLJ_S1;
922 # ifndef HALF_LJ
923 VLJ_S2 = sw_S2 * VLJ_S2;
924 VLJ_S3 = sw_S3 * VLJ_S3;
925 # endif
926 # endif
928 # undef switch_r
929 # undef dswitch_r
930 }
931 # endif /* LJ_POT_SWITCH */
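/* Scalar form of the potential switch applied above, with rsw = r - r_switch
 * and assuming swF2..swF4 are the derivative coefficients matching swV3..swV5:
 *   S(r)   = 1 + swV3*rsw^3 + swV4*rsw^4 + swV5*rsw^5   (switch_r)
 *   S'(r)  = (swF2 + swF3*rsw + swF4*rsw^2)*rsw^2       (dswitch_r)
 *   V_sw   = S*V
 *   F_sw*r = S*(F*r) - S'*V*r
 * The last line is -d(S*V)/dr multiplied by r, which is exactly the fnma()
 * update of frLJ above.
 */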
933 # if defined CALC_ENERGIES && defined CHECK_EXCLS
934 /* The potential shift should be removed for excluded pairs */
935 VLJ_S0 = selectByMask(VLJ_S0, interact_S0);
936 VLJ_S1 = selectByMask(VLJ_S1, interact_S1);
937 # ifndef HALF_LJ
938 VLJ_S2 = selectByMask(VLJ_S2, interact_S2);
939 VLJ_S3 = selectByMask(VLJ_S3, interact_S3);
940 # endif
941 # endif
943 # ifdef LJ_EWALD_GEOM
944 {
945 SimdReal c6s_j_S;
946 SimdReal c6grid_S0, rinvsix_nm_S0, cr2_S0, expmcr2_S0, poly_S0;
947 SimdReal c6grid_S1, rinvsix_nm_S1, cr2_S1, expmcr2_S1, poly_S1;
948 # ifndef HALF_LJ
949 SimdReal c6grid_S2, rinvsix_nm_S2, cr2_S2, expmcr2_S2, poly_S2;
950 SimdReal c6grid_S3, rinvsix_nm_S3, cr2_S3, expmcr2_S3, poly_S3;
951 # endif
952 # ifdef CALC_ENERGIES
953 SimdReal sh_mask_S0;
954 SimdReal sh_mask_S1;
955 # ifndef HALF_LJ
956 SimdReal sh_mask_S2;
957 SimdReal sh_mask_S3;
958 # endif
959 # endif
961 /* Determine C6 for the grid using the geometric combination rule */
962 c6s_j_S = load<SimdReal>(ljc + aj2 + 0);
963 c6grid_S0 = c6s_S0 * c6s_j_S;
964 c6grid_S1 = c6s_S1 * c6s_j_S;
965 # ifndef HALF_LJ
966 c6grid_S2 = c6s_S2 * c6s_j_S;
967 c6grid_S3 = c6s_S3 * c6s_j_S;
968 # endif
970 # ifdef CHECK_EXCLS
971 /* Recalculate rinvsix without exclusion mask (compiler might optimize) */
972 rinvsix_nm_S0 = rinvsq_S0 * rinvsq_S0 * rinvsq_S0;
973 rinvsix_nm_S1 = rinvsq_S1 * rinvsq_S1 * rinvsq_S1;
974 # ifndef HALF_LJ
975 rinvsix_nm_S2 = rinvsq_S2 * rinvsq_S2 * rinvsq_S2;
976 rinvsix_nm_S3 = rinvsq_S3 * rinvsq_S3 * rinvsq_S3;
977 # endif
978 # else
979 /* We didn't use a mask, so we can copy */
980 rinvsix_nm_S0 = rinvsix_S0;
981 rinvsix_nm_S1 = rinvsix_S1;
982 # ifndef HALF_LJ
983 rinvsix_nm_S2 = rinvsix_S2;
984 rinvsix_nm_S3 = rinvsix_S3;
985 # endif
986 # endif
988 /* Mask for the cut-off to avoid overflow of cr2^2 */
989 cr2_S0 = lje_c2_S * selectByMask(rsq_S0, wco_vdw_S0);
990 cr2_S1 = lje_c2_S * selectByMask(rsq_S1, wco_vdw_S1);
991 # ifndef HALF_LJ
992 cr2_S2 = lje_c2_S * selectByMask(rsq_S2, wco_vdw_S2);
993 cr2_S3 = lje_c2_S * selectByMask(rsq_S3, wco_vdw_S3);
994 # endif
995 // Unsafe version of our exp() should be fine, since these arguments should never
996 // be smaller than -127 for any reasonable choice of cutoff or ewald coefficients.
997 expmcr2_S0 = exp<MathOptimization::Unsafe>(-cr2_S0);
998 expmcr2_S1 = exp<MathOptimization::Unsafe>(-cr2_S1);
999 # ifndef HALF_LJ
1000 expmcr2_S2 = exp<MathOptimization::Unsafe>(-cr2_S2);
1001 expmcr2_S3 = exp<MathOptimization::Unsafe>(-cr2_S3);
1002 # endif
1004 /* 1 + cr2 + 1/2*cr2^2 */
1005 poly_S0 = fma(fma(half_S, cr2_S0, one_S), cr2_S0, one_S);
1006 poly_S1 = fma(fma(half_S, cr2_S1, one_S), cr2_S1, one_S);
1007 # ifndef HALF_LJ
1008 poly_S2 = fma(fma(half_S, cr2_S2, one_S), cr2_S2, one_S);
1009 poly_S3 = fma(fma(half_S, cr2_S3, one_S), cr2_S3, one_S);
1010 # endif
1012 /* We calculate LJ F*r = (6*C6)*(r^-6 - F_mesh/6), we use:
1013 * r^-6*cexp*(1 + cr2 + cr2^2/2 + cr2^3/6) = cexp*(r^-6*poly + c^6/6)
1014 */
1015 frLJ_S0 = fma(c6grid_S0,
1016 fnma(expmcr2_S0, fma(rinvsix_nm_S0, poly_S0, lje_c6_6_S), rinvsix_nm_S0), frLJ_S0);
1017 frLJ_S1 = fma(c6grid_S1,
1018 fnma(expmcr2_S1, fma(rinvsix_nm_S1, poly_S1, lje_c6_6_S), rinvsix_nm_S1), frLJ_S1);
1019 # ifndef HALF_LJ
1020 frLJ_S2 = fma(c6grid_S2,
1021 fnma(expmcr2_S2, fma(rinvsix_nm_S2, poly_S2, lje_c6_6_S), rinvsix_nm_S2), frLJ_S2);
1022 frLJ_S3 = fma(c6grid_S3,
1023 fnma(expmcr2_S3, fma(rinvsix_nm_S3, poly_S3, lje_c6_6_S), rinvsix_nm_S3), frLJ_S3);
1024 # endif
1026 # ifdef CALC_ENERGIES
1027 # ifdef CHECK_EXCLS
1028 sh_mask_S0 = selectByMask(lje_vc_S, interact_S0);
1029 sh_mask_S1 = selectByMask(lje_vc_S, interact_S1);
1030 # ifndef HALF_LJ
1031 sh_mask_S2 = selectByMask(lje_vc_S, interact_S2);
1032 sh_mask_S3 = selectByMask(lje_vc_S, interact_S3);
1033 # endif
1034 # else
1035 sh_mask_S0 = lje_vc_S;
1036 sh_mask_S1 = lje_vc_S;
1037 # ifndef HALF_LJ
1038 sh_mask_S2 = lje_vc_S;
1039 sh_mask_S3 = lje_vc_S;
1040 # endif
1041 # endif
1043 VLJ_S0 = fma(sixth_S * c6grid_S0,
1044 fma(rinvsix_nm_S0, fnma(expmcr2_S0, poly_S0, one_S), sh_mask_S0), VLJ_S0);
1045 VLJ_S1 = fma(sixth_S * c6grid_S1,
1046 fma(rinvsix_nm_S1, fnma(expmcr2_S1, poly_S1, one_S), sh_mask_S1), VLJ_S1);
1047 # ifndef HALF_LJ
1048 VLJ_S2 = fma(sixth_S * c6grid_S2,
1049 fma(rinvsix_nm_S2, fnma(expmcr2_S2, poly_S2, one_S), sh_mask_S2), VLJ_S2);
1050 VLJ_S3 = fma(sixth_S * c6grid_S3,
1051 fma(rinvsix_nm_S3, fnma(expmcr2_S3, poly_S3, one_S), sh_mask_S3), VLJ_S3);
1052 # endif
1053 # endif /* CALC_ENERGIES */
1054 }
1055 # endif /* LJ_EWALD_GEOM */
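/* Scalar summary of the LJ-PME (geometric combination) block above, writing
 * cr2 = c^2*r^2 for the LJ Ewald coefficient c (lje_c2_S presumably holds c^2
 * and lje_c6_6_S holds c^6/6), poly = 1 + cr2 + cr2^2/2 and cexp = exp(-cr2):
 *   F*r += c6grid*(r^-6 - cexp*(r^-6*poly + c^6/6))
 *   V   += (c6grid/6)*(r^-6*(1 - cexp*poly) + shift)
 * i.e. the full r^-6 dispersion minus the part that the reciprocal-space
 * (mesh) sum already handles, with c6grid the geometric-rule C6 used on the
 * grid.
 */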
1057 # if defined VDW_CUTOFF_CHECK
1058 /* frLJ is multiplied later by rinvsq, which is masked for the Coulomb
1059 * cut-off, but if the VdW cut-off is shorter, we need to mask with that.
1060 */
1061 frLJ_S0 = selectByMask(frLJ_S0, wco_vdw_S0);
1062 frLJ_S1 = selectByMask(frLJ_S1, wco_vdw_S1);
1063 # ifndef HALF_LJ
1064 frLJ_S2 = selectByMask(frLJ_S2, wco_vdw_S2);
1065 frLJ_S3 = selectByMask(frLJ_S3, wco_vdw_S3);
1066 # endif
1067 # endif
1069 # ifdef CALC_ENERGIES
1070 /* The potential shift should be removed for pairs beyond cut-off */
1071 VLJ_S0 = selectByMask(VLJ_S0, wco_vdw_S0);
1072 VLJ_S1 = selectByMask(VLJ_S1, wco_vdw_S1);
1073 # ifndef HALF_LJ
1074 VLJ_S2 = selectByMask(VLJ_S2, wco_vdw_S2);
1075 VLJ_S3 = selectByMask(VLJ_S3, wco_vdw_S3);
1076 # endif
1077 # endif
1079 # endif /* CALC_LJ */
1081 # ifdef CALC_ENERGIES
1082 # ifdef ENERGY_GROUPS
1083 /* Extract the group pair index per j pair.
1084 * Energy groups are stored per i-cluster, so things get
1085 * complicated when the i- and j-cluster sizes don't match.
1086 */
1087 {
1088 int egps_j;
1089 # if UNROLLJ == 2
1090 egps_j = nbatParams.energrp[cj >> 1];
1091 egp_jj[0] = ((egps_j >> ((cj & 1) * egps_jshift)) & egps_jmask) * egps_jstride;
1092 # else
1093 /* We assume UNROLLI <= UNROLLJ */
1094 int jdi;
1095 for (jdi = 0; jdi < UNROLLJ / UNROLLI; jdi++)
1096 {
1097 int jj;
1098 egps_j = nbatParams.energrp[cj * (UNROLLJ / UNROLLI) + jdi];
1099 for (jj = 0; jj < (UNROLLI / 2); jj++)
1100 {
1101 egp_jj[jdi * (UNROLLI / 2) + jj] =
1102 ((egps_j >> (jj * egps_jshift)) & egps_jmask) * egps_jstride;
1103 }
1104 }
1105 # endif
1106 }
1107 # endif
1109 # ifdef CALC_COULOMB
1110 # ifndef ENERGY_GROUPS
1111 vctot_S = vctot_S + vcoul_S0 + vcoul_S1 + vcoul_S2 + vcoul_S3;
1112 # else
1113 add_ener_grp(vcoul_S0, vctp[0], egp_jj);
1114 add_ener_grp(vcoul_S1, vctp[1], egp_jj);
1115 add_ener_grp(vcoul_S2, vctp[2], egp_jj);
1116 add_ener_grp(vcoul_S3, vctp[3], egp_jj);
1117 # endif
1118 # endif
1120 # ifdef CALC_LJ
1122 # ifndef ENERGY_GROUPS
1123 # ifndef HALF_LJ
1124 Vvdwtot_S = Vvdwtot_S + VLJ_S0 + VLJ_S1 + VLJ_S2 + VLJ_S3;
1125 # else
1126 Vvdwtot_S = Vvdwtot_S + VLJ_S0 + VLJ_S1;
1127 # endif
1128 # else
1129 add_ener_grp(VLJ_S0, vvdwtp[0], egp_jj);
1130 add_ener_grp(VLJ_S1, vvdwtp[1], egp_jj);
1131 # ifndef HALF_LJ
1132 add_ener_grp(VLJ_S2, vvdwtp[2], egp_jj);
1133 add_ener_grp(VLJ_S3, vvdwtp[3], egp_jj);
1134 # endif
1135 # endif
1136 # endif /* CALC_LJ */
1137 # endif /* CALC_ENERGIES */
1139 # ifdef CALC_LJ
1140 # ifdef CALC_COULOMB
1141 fscal_S0 = rinvsq_S0 * (frcoul_S0 + frLJ_S0);
1142 # else
1143 fscal_S0 = rinvsq_S0 * frLJ_S0;
1144 # endif
1145 # ifdef CALC_COULOMB
1146 fscal_S1 = rinvsq_S1 * (frcoul_S1 + frLJ_S1);
1147 # else
1148 fscal_S1 = rinvsq_S1 * frLJ_S1;
1149 # endif
1150 # else
1151 fscal_S0 = rinvsq_S0 * frcoul_S0;
1152 fscal_S1 = rinvsq_S1 * frcoul_S1;
1153 # endif /* CALC_LJ */
1154 # if defined CALC_LJ && !defined HALF_LJ
1155 # ifdef CALC_COULOMB
1156 fscal_S2 = rinvsq_S2 * (frcoul_S2 + frLJ_S2);
1157 fscal_S3 = rinvsq_S3 * (frcoul_S3 + frLJ_S3);
1158 # else
1159 fscal_S2 = rinvsq_S2 * frLJ_S2;
1160 fscal_S3 = rinvsq_S3 * frLJ_S3;
1161 # endif
1162 # else
1163 /* Atoms 2 and 3 don't have LJ, so only add Coulomb forces */
1164 fscal_S2 = rinvsq_S2 * frcoul_S2;
1165 fscal_S3 = rinvsq_S3 * frcoul_S3;
1166 # endif
1168 /* Calculate temporary vectorial force */
1169 tx_S0 = fscal_S0 * dx_S0;
1170 tx_S1 = fscal_S1 * dx_S1;
1171 tx_S2 = fscal_S2 * dx_S2;
1172 tx_S3 = fscal_S3 * dx_S3;
1173 ty_S0 = fscal_S0 * dy_S0;
1174 ty_S1 = fscal_S1 * dy_S1;
1175 ty_S2 = fscal_S2 * dy_S2;
1176 ty_S3 = fscal_S3 * dy_S3;
1177 tz_S0 = fscal_S0 * dz_S0;
1178 tz_S1 = fscal_S1 * dz_S1;
1179 tz_S2 = fscal_S2 * dz_S2;
1180 tz_S3 = fscal_S3 * dz_S3;
1182 /* Increment i atom force */
1183 fix_S0 = fix_S0 + tx_S0;
1184 fix_S1 = fix_S1 + tx_S1;
1185 fix_S2 = fix_S2 + tx_S2;
1186 fix_S3 = fix_S3 + tx_S3;
1187 fiy_S0 = fiy_S0 + ty_S0;
1188 fiy_S1 = fiy_S1 + ty_S1;
1189 fiy_S2 = fiy_S2 + ty_S2;
1190 fiy_S3 = fiy_S3 + ty_S3;
1191 fiz_S0 = fiz_S0 + tz_S0;
1192 fiz_S1 = fiz_S1 + tz_S1;
1193 fiz_S2 = fiz_S2 + tz_S2;
1194 fiz_S3 = fiz_S3 + tz_S3;
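/* Newton's third law: the same pair forces are reused below with opposite
 * sign for the j-atoms, e.g. for the x-components
 *   f[ajx + k] -= tx_S0[k] + tx_S1[k] + tx_S2[k] + tx_S3[k],  k = 0..UNROLLJ-1,
 * so every pair interaction is evaluated only once.
 */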
1196 /* Decrement j atom force */
1197 store(f + ajx, load<SimdReal>(f + ajx) - (tx_S0 + tx_S1 + tx_S2 + tx_S3));
1198 store(f + ajy, load<SimdReal>(f + ajy) - (ty_S0 + ty_S1 + ty_S2 + ty_S3));
1199 store(f + ajz, load<SimdReal>(f + ajz) - (tz_S0 + tz_S1 + tz_S2 + tz_S3));
1200 }
1202 # undef rinv_ex_S0
1203 # undef rinv_ex_S1
1204 # undef rinv_ex_S2
1205 # undef rinv_ex_S3
1207 # undef wco_vdw_S0
1208 # undef wco_vdw_S1
1209 # undef wco_vdw_S2
1210 # undef wco_vdw_S3
1212 # undef EXCL_FORCES
1214 #endif // !DOXYGEN