/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
 * Copyright (c) 2013-2019,2020, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "gromacs/awh/awh.h"
#include "gromacs/domdec/dlbtiming.h"
#include "gromacs/domdec/domdec.h"
#include "gromacs/domdec/domdec_struct.h"
#include "gromacs/domdec/gpuhaloexchange.h"
#include "gromacs/domdec/partition.h"
#include "gromacs/essentialdynamics/edsam.h"
#include "gromacs/ewald/pme.h"
#include "gromacs/ewald/pme_pp.h"
#include "gromacs/ewald/pme_pp_comm_gpu.h"
#include "gromacs/gmxlib/network.h"
#include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
#include "gromacs/gmxlib/nonbonded/nb_kernel.h"
#include "gromacs/gmxlib/nonbonded/nonbonded.h"
#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/imd/imd.h"
#include "gromacs/listed_forces/disre.h"
#include "gromacs/listed_forces/gpubonded.h"
#include "gromacs/listed_forces/listed_forces.h"
#include "gromacs/listed_forces/orires.h"
#include "gromacs/math/arrayrefwithpadding.h"
#include "gromacs/math/functions.h"
#include "gromacs/math/units.h"
#include "gromacs/math/vec.h"
#include "gromacs/math/vecdump.h"
#include "gromacs/mdlib/calcmu.h"
#include "gromacs/mdlib/calcvir.h"
#include "gromacs/mdlib/constr.h"
#include "gromacs/mdlib/dispersioncorrection.h"
#include "gromacs/mdlib/enerdata_utils.h"
#include "gromacs/mdlib/force.h"
#include "gromacs/mdlib/force_flags.h"
#include "gromacs/mdlib/forcerec.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
#include "gromacs/mdlib/update.h"
#include "gromacs/mdlib/vsite.h"
#include "gromacs/mdlib/wholemoleculetransform.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/mdtypes/enerdata.h"
#include "gromacs/mdtypes/forceoutput.h"
#include "gromacs/mdtypes/forcerec.h"
#include "gromacs/mdtypes/iforceprovider.h"
#include "gromacs/mdtypes/inputrec.h"
#include "gromacs/mdtypes/md_enums.h"
#include "gromacs/mdtypes/mdatom.h"
#include "gromacs/mdtypes/simulation_workload.h"
#include "gromacs/mdtypes/state.h"
#include "gromacs/mdtypes/state_propagator_data_gpu.h"
#include "gromacs/nbnxm/gpu_data_mgmt.h"
#include "gromacs/nbnxm/nbnxm.h"
#include "gromacs/nbnxm/nbnxm_gpu.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/pbcutil/pbc.h"
#include "gromacs/pulling/pull.h"
#include "gromacs/pulling/pull_rotation.h"
#include "gromacs/timing/cyclecounter.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/timing/wallcycle.h"
#include "gromacs/timing/wallcyclereporting.h"
#include "gromacs/timing/walltime_accounting.h"
#include "gromacs/topology/topology.h"
#include "gromacs/utility/arrayref.h"
#include "gromacs/utility/basedefinitions.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/fixedcapacityvector.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/gmxmpi.h"
#include "gromacs/utility/logger.h"
#include "gromacs/utility/smalloc.h"
#include "gromacs/utility/strconvert.h"
#include "gromacs/utility/sysinfo.h"
using gmx::AtomLocality;
using gmx::DomainLifetimeWorkload;
using gmx::ForceOutputs;
using gmx::ForceWithShiftForces;
using gmx::InteractionLocality;
using gmx::SimulationWorkload;
using gmx::StepWorkload;
// TODO: this environment variable allows us to verify before release
// that on less common architectures the total cost of polling is not larger than
// a blocking wait (so polling does not introduce overhead when the static
// PME-first ordering would suffice).
static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);
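// Usage sketch (assumption, not taken from this file): exporting the variable before
// launching mdrun, e.g.
//     GMX_DISABLE_ALTERNATING_GPU_WAIT=1 gmx mdrun -deffnm md
// falls back to the static PME-first wait order, which is useful when measuring whether
// the polling in alternatePmeNbGpuWaitReduce() adds overhead on a given platform.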
static void sum_forces(ArrayRef<RVec> f, ArrayRef<const RVec> forceToAdd)
{
    GMX_ASSERT(f.size() >= forceToAdd.size(), "Accumulation buffer should be sufficiently large");
    const int end = forceToAdd.size();

    int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
#pragma omp parallel for num_threads(nt) schedule(static)
    for (int i = 0; i < end; i++)
    {
        rvec_inc(f[i], forceToAdd[i]);
    }
}
static void calc_virial(int                              start,
                        int                              homenr,
                        const rvec                       x[],
                        const gmx::ForceWithShiftForces& forceWithShiftForces,
                        tensor                           vir_part,
                        const matrix                     box,
                        t_nrnb*                          nrnb,
                        const t_forcerec*                fr,
                        PbcType                          pbcType)
{
    /* The short-range virial from surrounding boxes */
    const rvec* fshift = as_rvec_array(forceWithShiftForces.shiftForces().data());
    calc_vir(SHIFTS, fr->shift_vec, fshift, vir_part, pbcType == PbcType::Screw, box);
    inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);

    /* Calculate partial virial, for local atoms only, based on short range.
     * Total virial is computed in global_stat, called from do_md
     */
    const rvec* f = as_rvec_array(forceWithShiftForces.force().data());
    f_calc_vir(start, start + homenr, x, f, vir_part, box);
    inc_nrnb(nrnb, eNR_VIRIAL, homenr);

    if (debug)
    {
        pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
    }
}
static void pull_potential_wrapper(const t_commrec*               cr,
                                   const t_inputrec*              ir,
                                   const matrix                   box,
                                   gmx::ArrayRef<const gmx::RVec> x,
                                   gmx::ForceWithVirial*          force,
                                   const t_mdatoms*               mdatoms,
                                   gmx_enerdata_t*                enerd,
                                   pull_t*                        pull_work,
                                   const real*                    lambda,
                                   double                         t,
                                   gmx_wallcycle_t                wcycle)
{
    t_pbc pbc;
    real  dvdl = 0;

    /* Calculate the center of mass forces, this requires communication,
     * which is why pull_potential is called close to other communication.
     */
    wallcycle_start(wcycle, ewcPULLPOT);
    set_pbc(&pbc, ir->pbcType, box);
    enerd->term[F_COM_PULL] +=
            pull_potential(pull_work, mdatoms->massT, &pbc, cr, t, lambda[efptRESTRAINT],
                           as_rvec_array(x.data()), force, &dvdl);
    enerd->dvdl_lin[efptRESTRAINT] += dvdl;
    wallcycle_stop(wcycle, ewcPULLPOT);
}
static void pme_receive_force_ener(t_forcerec*           fr,
                                   const t_commrec*      cr,
                                   gmx::ForceWithVirial* forceWithVirial,
                                   gmx_enerdata_t*       enerd,
                                   bool                  useGpuPmePpComms,
                                   bool                  receivePmeForceToGpu,
                                   gmx_wallcycle_t       wcycle)
{
    real  e_q, e_lj, dvdl_q, dvdl_lj;
    float cycles_ppdpme, cycles_seppme;

    cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
    dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);

    /* In case of node-splitting, the PP nodes receive the long-range
     * forces, virial and energy from the PME nodes here.
     */
    wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
    gmx_pme_receive_f(fr->pmePpCommGpu.get(), cr, forceWithVirial, &e_q, &e_lj, &dvdl_q, &dvdl_lj,
                      useGpuPmePpComms, receivePmeForceToGpu, &cycles_seppme);
    enerd->term[F_COUL_RECIP] += e_q;
    enerd->term[F_LJ_RECIP] += e_lj;
    enerd->dvdl_lin[efptCOUL] += dvdl_q;
    enerd->dvdl_lin[efptVDW] += dvdl_lj;

    dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);

    wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
}
static void print_large_forces(FILE*                fp,
                               const t_mdatoms*     md,
                               const t_commrec*     cr,
                               int64_t              step,
                               real                 forceTolerance,
                               ArrayRef<const RVec> x,
                               ArrayRef<const RVec> f)
{
    real       force2Tolerance = gmx::square(forceTolerance);
    gmx::index numNonFinite    = 0;
    for (int i = 0; i < md->homenr; i++)
    {
        real force2    = norm2(f[i]);
        bool nonFinite = !std::isfinite(force2);
        if (force2 >= force2Tolerance || nonFinite)
        {
            fprintf(fp, "step %" PRId64 " atom %6d x %8.3f %8.3f %8.3f force %12.5e\n", step,
                    ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(force2));
        }
        if (nonFinite)
        {
            numNonFinite++;
        }
    }
    if (numNonFinite > 0)
    {
        /* Note that with MPI this fatal call on one rank might interrupt
         * the printing on other ranks. But we can only avoid that with
         * an expensive MPI barrier that we would need at each step.
         */
        gmx_fatal(FARGS, "At step %" PRId64 " detected non-finite forces on %td atoms", step, numNonFinite);
    }
}
//! When necessary, spreads forces on vsites and computes the virial for \p forceOutputs->forceWithShiftForces()
static void postProcessForceWithShiftForces(t_nrnb*                   nrnb,
                                            gmx_wallcycle_t           wcycle,
                                            const matrix              box,
                                            ArrayRef<const RVec>      x,
                                            ForceOutputs*             forceOutputs,
                                            tensor                    vir_force,
                                            const t_mdatoms&          mdatoms,
                                            const t_forcerec&         fr,
                                            gmx::VirtualSitesHandler* vsite,
                                            const StepWorkload&       stepWork)
{
    ForceWithShiftForces& forceWithShiftForces = forceOutputs->forceWithShiftForces();

    /* If we have NoVirSum forces, but we do not calculate the virial,
     * we later sum the forceWithShiftForces buffer together with
     * the noVirSum buffer and spread the combined vsite forces at once.
     */
    if (vsite && (!forceOutputs->haveForceWithVirial() || stepWork.computeVirial))
    {
        using VirialHandling = gmx::VirtualSitesHandler::VirialHandling;

        auto                 f      = forceWithShiftForces.force();
        auto                 fshift = forceWithShiftForces.shiftForces();
        const VirialHandling virialHandling =
                (stepWork.computeVirial ? VirialHandling::Pbc : VirialHandling::None);
        vsite->spreadForces(x, f, virialHandling, fshift, nullptr, nrnb, box, wcycle);
        forceWithShiftForces.haveSpreadVsiteForces() = true;
    }

    if (stepWork.computeVirial)
    {
        /* Calculation of the virial must be done after vsites! */
        calc_virial(0, mdatoms.homenr, as_rvec_array(x.data()), forceWithShiftForces, vir_force,
                    box, nrnb, &fr, fr.pbcType);
    }
}
//! Spread, compute virial for and sum forces, when necessary
static void postProcessForces(const t_commrec*          cr,
                              int64_t                   step,
                              t_nrnb*                   nrnb,
                              gmx_wallcycle_t           wcycle,
                              const matrix              box,
                              ArrayRef<const RVec>      x,
                              ForceOutputs*             forceOutputs,
                              tensor                    vir_force,
                              const t_mdatoms*          mdatoms,
                              const t_forcerec*         fr,
                              gmx::VirtualSitesHandler* vsite,
                              const StepWorkload&       stepWork)
{
    // Extract the final output force buffer, which is also the buffer for forces with shift forces
    ArrayRef<RVec> f = forceOutputs->forceWithShiftForces().force();

    if (forceOutputs->haveForceWithVirial())
    {
        auto& forceWithVirial = forceOutputs->forceWithVirial();

        if (vsite)
        {
            /* Spread the mesh force on virtual sites to the other particles...
             * This is parallelized. MPI communication is performed
             * if the constructing atoms aren't local.
             */
            GMX_ASSERT(!stepWork.computeVirial || f.data() != forceWithVirial.force_.data(),
                       "We need separate force buffers for shift and virial forces when "
                       "computing the virial");
            GMX_ASSERT(!stepWork.computeVirial
                               || forceOutputs->forceWithShiftForces().haveSpreadVsiteForces(),
                       "We should spread the force with shift forces separately when computing "
                       "the virial");
            const gmx::VirtualSitesHandler::VirialHandling virialHandling =
                    (stepWork.computeVirial ? gmx::VirtualSitesHandler::VirialHandling::NonLinear
                                            : gmx::VirtualSitesHandler::VirialHandling::None);
            matrix virial = { { 0 } };
            vsite->spreadForces(x, forceWithVirial.force_, virialHandling, {}, virial, nrnb, box, wcycle);
            forceWithVirial.addVirialContribution(virial);
        }

        if (stepWork.computeVirial)
        {
            /* Now add the forces, this is local */
            sum_forces(f, forceWithVirial.force_);

            /* Add the direct virial contributions */
            GMX_ASSERT(forceWithVirial.computeVirial_,
                       "forceWithVirial should request virial computation when we request the virial");
            m_add(vir_force, forceWithVirial.getVirial(), vir_force);

            if (debug)
            {
                pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
            }
        }
    }
    else
    {
        GMX_ASSERT(vsite == nullptr || forceOutputs->forceWithShiftForces().haveSpreadVsiteForces(),
                   "We should have spread the vsite forces (earlier)");
    }

    if (fr->print_force >= 0)
    {
        print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
    }
}
static void do_nb_verlet(t_forcerec*                fr,
                         const interaction_const_t* ic,
                         gmx_enerdata_t*            enerd,
                         const StepWorkload&        stepWork,
                         const InteractionLocality  ilocality,
                         const int                  clearF,
                         const int64_t              step,
                         t_nrnb*                    nrnb,
                         gmx_wallcycle_t            wcycle)
{
    if (!stepWork.computeNonbondedForces)
    {
        /* skip non-bonded calculation */
        return;
    }

    nonbonded_verlet_t* nbv = fr->nbv.get();

    /* GPU kernel launch overhead is already timed separately */
    if (fr->cutoff_scheme != ecutsVERLET)
    {
        gmx_incons("Invalid cut-off scheme passed!");
    }

    /* When dynamic pair-list pruning is requested, we need to prune
     * at nstlistPrune steps.
     */
    if (nbv->isDynamicPruningStepCpu(step))
    {
        /* Prune the pair-list beyond fr->ic->rlistPrune using
         * the current coordinates of the atoms.
         */
        wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
        nbv->dispatchPruneKernelCpu(ilocality, fr->shift_vec);
        wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
    }

    nbv->dispatchNonbondedKernel(ilocality, *ic, stepWork, clearF, *fr, enerd, nrnb);
}
static inline void clearRVecs(ArrayRef<RVec> v, const bool useOpenmpThreading)
{
    int nth = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, v.ssize());

    /* Note that we would like to avoid this conditional by putting it
     * into the omp pragma instead, but then we still take the full
     * omp parallel for overhead (at least with gcc5).
     */
    if (!useOpenmpThreading || nth == 1)
    {
        for (RVec& elem : v)
        {
            clear_rvec(elem);
        }
    }
    else
    {
#pragma omp parallel for num_threads(nth) schedule(static)
        for (gmx::index i = 0; i < v.ssize(); i++)
        {
            clear_rvec(v[i]);
        }
    }
}
/*! \brief Return an estimate of the average kinetic energy or 0 when unreliable
 *
 * \param groupOptions  Group options, containing T-coupling options
 */
static real averageKineticEnergyEstimate(const t_grpopts& groupOptions)
{
    real nrdfCoupled   = 0;
    real nrdfUncoupled = 0;
    real kineticEnergy = 0;
    for (int g = 0; g < groupOptions.ngtc; g++)
    {
        if (groupOptions.tau_t[g] >= 0)
        {
            nrdfCoupled += groupOptions.nrdf[g];
            kineticEnergy += groupOptions.nrdf[g] * 0.5 * groupOptions.ref_t[g] * BOLTZ;
        }
        else
        {
            nrdfUncoupled += groupOptions.nrdf[g];
        }
    }

    /* This conditional with > also catches nrdf=0 */
    if (nrdfCoupled > nrdfUncoupled)
    {
        return kineticEnergy * (nrdfCoupled + nrdfUncoupled) / nrdfCoupled;
    }
    else
    {
        return 0;
    }
}
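/* Illustrative numbers for the estimate above (assumed values, not taken from this file):
 * a single T-coupled group with ref_t = 300 K and nrdf = 3e5 gives approximately
 * 0.5 * 3e5 * 0.0083145 kJ/(mol K) * 300 K ~= 3.7e5 kJ/mol, i.e. the equipartition
 * value (1/2) * Ndf * kB * T that the potential energy is later compared against.
 */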
/*! \brief This routine checks that the potential energy is finite.
 *
 * Always checks that the potential energy is finite. If step equals
 * inputrec.init_step also checks that the magnitude of the potential energy
 * is reasonable. Terminates with a fatal error when a check fails.
 * Note that passing this check does not guarantee finite forces,
 * since those use slightly different arithmetic. But in most cases
 * there is just a narrow coordinate range where forces are not finite
 * and energies are finite.
 *
 * \param[in] step      The step number, used for checking and printing
 * \param[in] enerd     The energy data; the non-bonded group energies need to be added to
 *                      enerd.term[F_EPOT] before calling this routine
 * \param[in] inputrec  The input record
 */
static void checkPotentialEnergyValidity(int64_t step, const gmx_enerdata_t& enerd, const t_inputrec& inputrec)
{
    /* Threshold valid for comparing absolute potential energy against
     * the kinetic energy. Normally one should not consider absolute
     * potential energy values, but with a factor of one million
     * we should never get false positives.
     */
    constexpr real c_thresholdFactor = 1e6;

    bool energyIsNotFinite    = !std::isfinite(enerd.term[F_EPOT]);
    real averageKineticEnergy = 0;
    /* We only check for large potential energy at the initial step,
     * because that is by far the most likely step for this to occur
     * and because computing the average kinetic energy is not free.
     * Note: nstcalcenergy >> 1 often does not allow to catch large energies
     * before they become NaN.
     */
    if (step == inputrec.init_step && EI_DYNAMICS(inputrec.eI))
    {
        averageKineticEnergy = averageKineticEnergyEstimate(inputrec.opts);
    }

    if (energyIsNotFinite
        || (averageKineticEnergy > 0 && enerd.term[F_EPOT] > c_thresholdFactor * averageKineticEnergy))
    {
        gmx_fatal(FARGS,
                  "Step %" PRId64
                  ": The total potential energy is %g, which is %s. The LJ and electrostatic "
                  "contributions to the energy are %g and %g, respectively. A %s potential energy "
                  "can be caused by overlapping interactions in bonded interactions or very large%s "
                  "coordinate values. Usually this is caused by a badly- or non-equilibrated initial "
                  "configuration, incorrect interactions or parameters in the topology.",
                  step, enerd.term[F_EPOT], energyIsNotFinite ? "not finite" : "extremely high",
                  enerd.term[F_LJ], enerd.term[F_COUL_SR],
                  energyIsNotFinite ? "non-finite" : "very high", energyIsNotFinite ? " or NaN" : "");
    }
}
/*! \brief Return true if there are special forces computed this step.
 *
 * The conditionals exactly correspond to those in computeSpecialForces().
 */
static bool haveSpecialForces(const t_inputrec&          inputrec,
                              const gmx::ForceProviders& forceProviders,
                              const pull_t*              pull_work,
                              const bool                 computeForces,
                              const gmx_edsam*           ed)
{
    return ((computeForces && forceProviders.hasForceProvider()) || // forceProviders
            (inputrec.bPull && pull_have_potential(pull_work)) ||   // pull
            inputrec.bRot ||                                        // enforced rotation
            (ed != nullptr) ||                                      // flooding
            (inputrec.bIMD && computeForces));                      // IMD
}
/*! \brief Compute forces and/or energies for special algorithms
 *
 * The intention is to collect all calls to algorithms that compute
 * forces on local atoms only and that do not contribute to the local
 * virial sum (but add their virial contribution separately).
 * Eventually these should likely all become ForceProviders.
 * Within this function the intention is to have algorithms that do
 * global communication at the end, so global barriers within the MD loop
 * are as close together as possible.
 *
 * \param[in]     fplog             The log file
 * \param[in]     cr                The communication record
 * \param[in]     inputrec          The input record
 * \param[in]     awh               The Awh module (nullptr if none in use).
 * \param[in]     enforcedRotation  Enforced rotation module.
 * \param[in]     imdSession        The IMD session
 * \param[in]     pull_work         The pull work structure.
 * \param[in]     step              The current MD step
 * \param[in]     t                 The current time
 * \param[in,out] wcycle            Wallcycle accounting struct
 * \param[in,out] forceProviders    Pointer to a list of force providers
 * \param[in]     box               The unit cell
 * \param[in]     x                 The coordinates
 * \param[in]     mdatoms           Per atom properties
 * \param[in]     lambda            Array of free-energy lambda values
 * \param[in]     stepWork          Step schedule flags
 * \param[in,out] forceWithVirial   Force and virial buffers
 * \param[in,out] enerd             Energy buffer
 * \param[in,out] ed                Essential dynamics pointer
 * \param[in]     didNeighborSearch Tells if we did neighbor searching this step, used for ED sampling
 *
 * \todo Remove didNeighborSearch, which is used incorrectly.
 * \todo Convert all other algorithms called here to ForceProviders.
 */
static void computeSpecialForces(FILE*                          fplog,
                                 const t_commrec*               cr,
                                 const t_inputrec*              inputrec,
                                 gmx::Awh*                      awh,
                                 gmx_enfrot*                    enforcedRotation,
                                 gmx::ImdSession*               imdSession,
                                 pull_t*                        pull_work,
                                 int64_t                        step,
                                 double                         t,
                                 gmx_wallcycle_t                wcycle,
                                 gmx::ForceProviders*           forceProviders,
                                 const matrix                   box,
                                 gmx::ArrayRef<const gmx::RVec> x,
                                 const t_mdatoms*               mdatoms,
                                 gmx::ArrayRef<const real>      lambda,
                                 const StepWorkload&            stepWork,
                                 gmx::ForceWithVirial*          forceWithVirial,
                                 gmx_enerdata_t*                enerd,
                                 gmx_edsam*                     ed,
                                 bool                           didNeighborSearch)
{
    /* NOTE: Currently all ForceProviders only provide forces.
     *       When they also provide energies, remove this conditional.
     */
    if (stepWork.computeForces)
    {
        gmx::ForceProviderInput  forceProviderInput(x, *mdatoms, t, box, *cr);
        gmx::ForceProviderOutput forceProviderOutput(forceWithVirial, enerd);

        /* Collect forces from modules */
        forceProviders->calculateForces(forceProviderInput, &forceProviderOutput);
    }

    if (inputrec->bPull && pull_have_potential(pull_work))
    {
        pull_potential_wrapper(cr, inputrec, box, x, forceWithVirial, mdatoms, enerd, pull_work,
                               lambda.data(), t, wcycle);
    }

    if (awh)
    {
        enerd->term[F_COM_PULL] += awh->applyBiasForcesAndUpdateBias(
                inputrec->pbcType, mdatoms->massT, box, forceWithVirial, t, step, wcycle, fplog);
    }

    rvec* f = as_rvec_array(forceWithVirial->force_.data());

    /* Add the forces from enforced rotation potentials (if any) */
    if (inputrec->bRot)
    {
        wallcycle_start(wcycle, ewcROTadd);
        enerd->term[F_COM_PULL] += add_rot_forces(enforcedRotation, f, cr, step, t);
        wallcycle_stop(wcycle, ewcROTadd);
    }

    if (ed)
    {
        /* Note that since init_edsam() is called after the initialization
         * of forcerec, edsam doesn't request the noVirSum force buffer.
         * Thus if no other algorithm (e.g. PME) requires it, the forces
         * here will contribute to the virial.
         */
        do_flood(cr, inputrec, as_rvec_array(x.data()), f, ed, box, step, didNeighborSearch);
    }

    /* Add forces from interactive molecular dynamics (IMD), if any */
    if (inputrec->bIMD && stepWork.computeForces)
    {
        imdSession->applyForces(f);
    }
}
/*! \brief Launch the prepare_step and spread stages of PME GPU.
 *
 * \param[in]  pmedata        The PME structure
 * \param[in]  box            The box matrix
 * \param[in]  stepWork       Step schedule flags
 * \param[in]  xReadyOnDevice Event synchronizer indicating that the coordinates are ready in the device memory.
 * \param[in]  lambdaQ        The Coulomb lambda of the current state.
 * \param[in]  wcycle         The wallcycle structure
 */
static inline void launchPmeGpuSpread(gmx_pme_t*            pmedata,
                                      const matrix          box,
                                      const StepWorkload&   stepWork,
                                      GpuEventSynchronizer* xReadyOnDevice,
                                      const real            lambdaQ,
                                      gmx_wallcycle_t       wcycle)
{
    pme_gpu_prepare_computation(pmedata, box, wcycle, stepWork);
    pme_gpu_launch_spread(pmedata, xReadyOnDevice, wcycle, lambdaQ);
}

/*! \brief Launch the FFT and gather stages of PME GPU
 *
 * This function only implements setting the output forces (no accumulation).
 *
 * \param[in] pmedata  The PME structure
 * \param[in] lambdaQ  The Coulomb lambda of the current system state.
 * \param[in] wcycle   The wallcycle structure
 * \param[in] stepWork Step schedule flags
 */
static void launchPmeGpuFftAndGather(gmx_pme_t*               pmedata,
                                     const real               lambdaQ,
                                     gmx_wallcycle_t          wcycle,
                                     const gmx::StepWorkload& stepWork)
{
    pme_gpu_launch_complex_transforms(pmedata, wcycle, stepWork);
    pme_gpu_launch_gather(pmedata, wcycle, lambdaQ);
}
/*! \brief
 * Polling wait for either of the PME or nonbonded GPU tasks.
 *
 * Instead of a static order in waiting for GPU tasks, this function
 * polls checking which of the two tasks completes first, and does the
 * associated force buffer reduction overlapped with the other task.
 * By doing that, unlike static scheduling order, it can always overlap
 * one of the reductions, regardless of the GPU task completion order.
 *
 * \param[in]     nbv          Nonbonded verlet structure
 * \param[in,out] pmedata      PME module data
 * \param[in,out] forceOutputs Output buffer for the forces and virial
 * \param[in,out] enerd        Energy data structure results are reduced into
 * \param[in]     lambdaQ      The Coulomb lambda of the current system state.
 * \param[in]     stepWork     Step schedule flags
 * \param[in]     wcycle       The wallcycle structure
 */
static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t* nbv,
                                        gmx_pme_t*          pmedata,
                                        gmx::ForceOutputs*  forceOutputs,
                                        gmx_enerdata_t*     enerd,
                                        const real          lambdaQ,
                                        const StepWorkload& stepWork,
                                        gmx_wallcycle_t     wcycle)
{
    bool isPmeGpuDone = false;
    bool isNbGpuDone  = false;

    gmx::ForceWithShiftForces& forceWithShiftForces = forceOutputs->forceWithShiftForces();
    gmx::ForceWithVirial&      forceWithVirial      = forceOutputs->forceWithVirial();

    gmx::ArrayRef<const gmx::RVec> pmeGpuForces;

    while (!isPmeGpuDone || !isNbGpuDone)
    {
        if (!isPmeGpuDone)
        {
            GpuTaskCompletion completionType =
                    (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
            isPmeGpuDone = pme_gpu_try_finish_task(pmedata, stepWork, wcycle, &forceWithVirial,
                                                   enerd, lambdaQ, completionType);
        }

        if (!isNbGpuDone)
        {
            GpuTaskCompletion completionType =
                    (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
            isNbGpuDone = Nbnxm::gpu_try_finish_task(
                    nbv->gpu_nbv, stepWork, AtomLocality::Local, enerd->grpp.ener[egLJSR].data(),
                    enerd->grpp.ener[egCOULSR].data(), forceWithShiftForces.shiftForces(),
                    completionType, wcycle);

            if (isNbGpuDone)
            {
                nbv->atomdata_add_nbat_f_to_f(AtomLocality::Local, forceWithShiftForces.force());
            }
        }
    }
}
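/* The loop above is an instance of a generic two-task polling pattern; a minimal
 * sketch with hypothetical tryFinish()/Wait/Check names (not GROMACS API):
 *
 *   bool doneA = false, doneB = false;
 *   while (!doneA || !doneB)
 *   {
 *       if (!doneA) { doneA = tryFinish(taskA, doneB ? Wait : Check); }
 *       if (!doneB) { doneB = tryFinish(taskB, doneA ? Wait : Check); }
 *   }
 *
 * As soon as one task finishes, the other is waited on with a blocking call, so the
 * loop terminates while still overlapping one force reduction with the other GPU task.
 */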
/*! \brief Set up the different force buffers; also does clearing.
 *
 * \param[in]  forceHelperBuffers Helper force buffers
 * \param[in]  pull_work          The pull work object.
 * \param[in]  inputrec           input record
 * \param[in]  force              force array
 * \param[in]  stepWork           Step schedule flags
 * \param[out] wcycle             wallcycle recording structure
 *
 * \returns Cleared force output structure
 */
static ForceOutputs setupForceOutputs(ForceHelperBuffers*                 forceHelperBuffers,
                                      pull_t*                             pull_work,
                                      const t_inputrec&                   inputrec,
                                      gmx::ArrayRefWithPadding<gmx::RVec> force,
                                      const StepWorkload&                 stepWork,
                                      gmx_wallcycle_t                     wcycle)
{
    wallcycle_sub_start(wcycle, ewcsCLEAR_FORCE_BUFFER);

    /* NOTE: We assume fr->shiftForces is all zeros here */
    gmx::ForceWithShiftForces forceWithShiftForces(force, stepWork.computeVirial,
                                                   forceHelperBuffers->shiftForces());

    if (stepWork.computeForces)
    {
        /* Clear the short- and long-range forces */
        clearRVecs(forceWithShiftForces.force(), true);

        /* Clear the shift forces */
        clearRVecs(forceWithShiftForces.shiftForces(), false);
    }

    /* If we need to compute the virial, we might need a separate
     * force buffer for algorithms for which the virial is calculated
     * directly, such as PME. Otherwise, forceWithVirial uses the
     * same force (f in legacy calls) buffer as other algorithms.
     */
    const bool useSeparateForceWithVirialBuffer =
            (stepWork.computeForces
             && (stepWork.computeVirial && forceHelperBuffers->haveDirectVirialContributions()));
    /* forceWithVirial uses the local atom range only */
    gmx::ForceWithVirial forceWithVirial(
            useSeparateForceWithVirialBuffer ? forceHelperBuffers->forceBufferForDirectVirialContributions()
                                             : force.unpaddedArrayRef(),
            stepWork.computeVirial);

    if (useSeparateForceWithVirialBuffer)
    {
        /* TODO: update comment
         * We only compute forces on local atoms. Note that vsites can
         * spread to non-local atoms, but that part of the buffer is
         * cleared separately in the vsite spreading code.
         */
        clearRVecs(forceWithVirial.force_, true);
    }

    if (inputrec.bPull && pull_have_constraint(pull_work))
    {
        clear_pull_forces(pull_work);
    }

    wallcycle_sub_stop(wcycle, ewcsCLEAR_FORCE_BUFFER);

    return ForceOutputs(forceWithShiftForces, forceHelperBuffers->haveDirectVirialContributions(),
                        forceWithVirial);
}
/*! \brief Set up flags that have the lifetime of the domain indicating what type of work is there to compute.
 */
static DomainLifetimeWorkload setupDomainLifetimeWorkload(const t_inputrec&         inputrec,
                                                          const t_forcerec&         fr,
                                                          const pull_t*             pull_work,
                                                          const gmx_edsam*          ed,
                                                          const t_mdatoms&          mdatoms,
                                                          const SimulationWorkload& simulationWork,
                                                          const StepWorkload&       stepWork)
{
    DomainLifetimeWorkload domainWork;
    // Note that haveSpecialForces is constant over the whole run
    domainWork.haveSpecialForces =
            haveSpecialForces(inputrec, *fr.forceProviders, pull_work, stepWork.computeForces, ed);
    domainWork.haveCpuBondedWork     = fr.listedForces->haveCpuBondeds();
    domainWork.haveGpuBondedWork     = ((fr.gpuBonded != nullptr) && fr.gpuBonded->haveInteractions());
    domainWork.haveRestraintsWork    = fr.listedForces->haveRestraints();
    domainWork.haveCpuListedForceWork = fr.listedForces->haveCpuListedForces();
    // Note that haveFreeEnergyWork is constant over the whole run
    domainWork.haveFreeEnergyWork = (fr.efep != efepNO && mdatoms.nPerturbed != 0);
    // We assume we have local force work if there are CPU
    // force tasks including PME or nonbondeds.
    domainWork.haveCpuLocalForceWork =
            domainWork.haveSpecialForces || domainWork.haveCpuListedForceWork
            || domainWork.haveFreeEnergyWork || simulationWork.useCpuNonbonded || simulationWork.useCpuPme
            || simulationWork.haveEwaldSurfaceContribution || inputrec.nwall > 0;

    return domainWork;
}
/*! \brief Set up force flag struct from the force bitmask.
 *
 * \param[in] legacyFlags    Force bitmask flags used to construct the new flags
 * \param[in] isNonbondedOn  Global override, if false forces to turn off all nonbonded calculation.
 * \param[in] simulationWork Simulation workload description.
 * \param[in] rankHasPmeDuty If this rank computes PME.
 *
 * \returns New StepWorkload description.
 */
static StepWorkload setupStepWorkload(const int                 legacyFlags,
                                      const bool                isNonbondedOn,
                                      const SimulationWorkload& simulationWork,
                                      const bool                rankHasPmeDuty)
{
    StepWorkload flags;
    flags.stateChanged           = ((legacyFlags & GMX_FORCE_STATECHANGED) != 0);
    flags.haveDynamicBox         = ((legacyFlags & GMX_FORCE_DYNAMICBOX) != 0);
    flags.doNeighborSearch       = ((legacyFlags & GMX_FORCE_NS) != 0);
    flags.computeVirial          = ((legacyFlags & GMX_FORCE_VIRIAL) != 0);
    flags.computeEnergy          = ((legacyFlags & GMX_FORCE_ENERGY) != 0);
    flags.computeForces          = ((legacyFlags & GMX_FORCE_FORCES) != 0);
    flags.computeListedForces    = ((legacyFlags & GMX_FORCE_LISTED) != 0);
    flags.computeNonbondedForces = ((legacyFlags & GMX_FORCE_NONBONDED) != 0) && isNonbondedOn;
    flags.computeDhdl            = ((legacyFlags & GMX_FORCE_DHDL) != 0);

    if (simulationWork.useGpuBufferOps)
    {
        GMX_ASSERT(simulationWork.useGpuNonbonded,
                   "Can only offload buffer ops if nonbonded computation is also offloaded");
    }
    flags.useGpuXBufferOps = simulationWork.useGpuBufferOps;
    // on virial steps the CPU reduction path is taken
    flags.useGpuFBufferOps    = simulationWork.useGpuBufferOps && !flags.computeVirial;
    flags.useGpuPmeFReduction = flags.useGpuFBufferOps
                                && (simulationWork.useGpuPme
                                    && (rankHasPmeDuty || simulationWork.useGpuPmePpCommunication));

    return flags;
}
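/* Example of the translation above (illustrative flag combination, not taken from a caller):
 * a force call that needs energies and forces but no virial might pass
 *     legacyFlags = GMX_FORCE_STATECHANGED | GMX_FORCE_FORCES | GMX_FORCE_ENERGY;
 * which yields computeForces == true, computeEnergy == true and computeVirial == false,
 * so useGpuFBufferOps remains enabled whenever buffer ops are offloaded.
 */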
/* \brief Launch end-of-step GPU tasks: buffer clearing and rolling pruning.
 *
 * TODO: eliminate \p useGpuPmeOnThisRank when this is
 * incorporated in DomainLifetimeWorkload.
 */
static void launchGpuEndOfStepTasks(nonbonded_verlet_t*               nbv,
                                    gmx::GpuBonded*                   gpuBonded,
                                    gmx_pme_t*                        pmedata,
                                    gmx_enerdata_t*                   enerd,
                                    const gmx::MdrunScheduleWorkload& runScheduleWork,
                                    bool                              useGpuPmeOnThisRank,
                                    int64_t                           step,
                                    gmx_wallcycle_t                   wcycle)
{
    if (runScheduleWork.simulationWork.useGpuNonbonded)
    {
        /* Launch pruning before buffer clearing because the API overhead of the
         * clear kernel launches can leave the GPU idle while it could be running
         * the prune kernel.
         */
        if (nbv->isDynamicPruningStepGpu(step))
        {
            nbv->dispatchPruneKernelGpu(step);
        }

        /* now clear the GPU outputs while we finish the step on the CPU */
        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
        wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
        Nbnxm::gpu_clear_outputs(nbv->gpu_nbv, runScheduleWork.stepWork.computeVirial);
        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
    }

    if (useGpuPmeOnThisRank)
    {
        pme_gpu_reinit_computation(pmedata, wcycle);
    }

    if (runScheduleWork.domainWork.haveGpuBondedWork && runScheduleWork.stepWork.computeEnergy)
    {
        // in principle this should be included in the DD balancing region,
        // but generally it is infrequent so we'll omit it for the sake of
        // simpler code
        gpuBonded->waitAccumulateEnergyTerms(enerd);

        gpuBonded->clearEnergies();
    }
}
//! \brief Data structure to hold dipole-related data and staging arrays
struct DipoleData
{
    //! Dipole staging for fast summing over MPI
    gmx::DVec muStaging[2] = { { 0.0, 0.0, 0.0 } };
    //! Dipole staging for states A and B (index 0 and 1 resp.)
    gmx::RVec muStateAB[2] = { { 0.0_real, 0.0_real, 0.0_real } };
};
static void reduceAndUpdateMuTot(DipoleData*                   dipoleData,
                                 const t_commrec*              cr,
                                 const bool                    haveFreeEnergy,
                                 gmx::ArrayRef<const real>     lambda,
                                 rvec                          muTotal,
                                 const DDBalanceRegionHandler& ddBalanceRegionHandler)
{
    if (PAR(cr))
    {
        gmx_sumd(2 * DIM, dipoleData->muStaging[0], cr);
        ddBalanceRegionHandler.reopenRegionCpu();
    }
    for (int i = 0; i < 2; i++)
    {
        for (int j = 0; j < DIM; j++)
        {
            dipoleData->muStateAB[i][j] = dipoleData->muStaging[i][j];
        }
    }

    if (!haveFreeEnergy)
    {
        copy_rvec(dipoleData->muStateAB[0], muTotal);
    }
    else
    {
        for (int j = 0; j < DIM; j++)
        {
            muTotal[j] = (1.0 - lambda[efptCOUL]) * dipoleData->muStateAB[0][j]
                         + lambda[efptCOUL] * dipoleData->muStateAB[1][j];
        }
    }
}
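/* Example of the free-energy interpolation above (illustrative numbers): with
 * lambda[efptCOUL] = 0.25, muStateAB[0] = (1, 0, 0) and muStateAB[1] = (0, 0, 2),
 * the reported dipole is 0.75 * (1, 0, 0) + 0.25 * (0, 0, 2) = (0.75, 0, 0.5),
 * i.e. a linear mix of the end-state dipoles.
 */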
void do_force(FILE*                               fplog,
              const t_commrec*                    cr,
              const gmx_multisim_t*               ms,
              const t_inputrec*                   inputrec,
              gmx::Awh*                           awh,
              gmx_enfrot*                         enforcedRotation,
              gmx::ImdSession*                    imdSession,
              pull_t*                             pull_work,
              int64_t                             step,
              t_nrnb*                             nrnb,
              gmx_wallcycle_t                     wcycle,
              const gmx_localtop_t*               top,
              const matrix                        box,
              gmx::ArrayRefWithPadding<gmx::RVec> x,
              history_t*                          hist,
              gmx::ArrayRefWithPadding<gmx::RVec> force,
              tensor                              vir_force,
              const t_mdatoms*                    mdatoms,
              gmx_enerdata_t*                     enerd,
              gmx::ArrayRef<const real>           lambda,
              t_forcerec*                         fr,
              gmx::MdrunScheduleWorkload*         runScheduleWork,
              gmx::VirtualSitesHandler*           vsite,
              rvec                                mu_tot,
              double                              t,
              gmx_edsam*                          ed,
              int                                 legacyFlags,
              const DDBalanceRegionHandler&       ddBalanceRegionHandler)
{
    GMX_ASSERT(force.unpaddedArrayRef().ssize() >= fr->natoms_force_constr,
               "The size of the force buffer should be at least the number of atoms to compute "
               "forces for");

    nonbonded_verlet_t*          nbv      = fr->nbv.get();
    interaction_const_t*         ic       = fr->ic;
    gmx::StatePropagatorDataGpu* stateGpu = fr->stateGpu;

    const SimulationWorkload& simulationWork = runScheduleWork->simulationWork;

    runScheduleWork->stepWork = setupStepWorkload(legacyFlags, fr->bNonbonded, simulationWork,
                                                  thisRankHasDuty(cr, DUTY_PME));
    const StepWorkload& stepWork = runScheduleWork->stepWork;

    const bool useGpuPmeOnThisRank = simulationWork.useGpuPme && thisRankHasDuty(cr, DUTY_PME);

    /* At a search step we need to start the first balancing region
     * somewhere early inside the step after communication during domain
     * decomposition (and not during the previous step as usual).
     */
    if (stepWork.doNeighborSearch)
    {
        ddBalanceRegionHandler.openBeforeForceComputationCpu(DdAllowBalanceRegionReopen::yes);
    }

    clear_mat(vir_force);

    if (fr->pbcType != PbcType::No)
    {
        /* Compute shift vectors every step,
         * because of pressure coupling or box deformation!
         */
        if (stepWork.haveDynamicBox && stepWork.stateChanged)
        {
            calc_shifts(box, fr->shift_vec);
        }

        const bool fillGrid = (stepWork.doNeighborSearch && stepWork.stateChanged);
        const bool calcCGCM = (fillGrid && !DOMAINDECOMP(cr));
        if (calcCGCM)
        {
            put_atoms_in_box_omp(fr->pbcType, box, x.unpaddedArrayRef().subArray(0, mdatoms->homenr),
                                 gmx_omp_nthreads_get(emntDefault));
            inc_nrnb(nrnb, eNR_SHIFTX, mdatoms->homenr);
        }
    }
1072 nbnxn_atomdata_copy_shiftvec(stepWork
.haveDynamicBox
, fr
->shift_vec
, nbv
->nbat
.get());
1074 const bool pmeSendCoordinatesFromGpu
=
1075 GMX_MPI
&& simulationWork
.useGpuPmePpCommunication
&& !(stepWork
.doNeighborSearch
);
1076 const bool reinitGpuPmePpComms
=
1077 GMX_MPI
&& simulationWork
.useGpuPmePpCommunication
&& (stepWork
.doNeighborSearch
);
1079 const auto localXReadyOnDevice
= (useGpuPmeOnThisRank
|| simulationWork
.useGpuBufferOps
)
1080 ? stateGpu
->getCoordinatesReadyOnDeviceEvent(
1081 AtomLocality::Local
, simulationWork
, stepWork
)
1084 // If coordinates are to be sent to PME task from CPU memory, perform that send here.
1085 // Otherwise the send will occur after H2D coordinate transfer.
1086 if (GMX_MPI
&& !thisRankHasDuty(cr
, DUTY_PME
) && !pmeSendCoordinatesFromGpu
)
1088 /* Send particle coordinates to the pme nodes */
1089 if (!stepWork
.doNeighborSearch
&& simulationWork
.useGpuUpdate
)
1091 GMX_RELEASE_ASSERT(false,
1092 "GPU update and separate PME ranks are only supported with GPU "
1093 "direct communication!");
1094 // TODO: when this code-path becomes supported add:
1095 // stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1098 gmx_pme_send_coordinates(fr
, cr
, box
, as_rvec_array(x
.unpaddedArrayRef().data()), lambda
[efptCOUL
],
1099 lambda
[efptVDW
], (stepWork
.computeVirial
|| stepWork
.computeEnergy
),
1100 step
, simulationWork
.useGpuPmePpCommunication
, reinitGpuPmePpComms
,
1101 pmeSendCoordinatesFromGpu
, localXReadyOnDevice
, wcycle
);
1104 // Coordinates on the device are needed if PME or BufferOps are offloaded.
1105 // The local coordinates can be copied right away.
1106 // NOTE: Consider moving this copy to right after they are updated and constrained,
1107 // if the later is not offloaded.
1108 if (useGpuPmeOnThisRank
|| stepWork
.useGpuXBufferOps
)
1110 if (stepWork
.doNeighborSearch
)
1112 // TODO refactor this to do_md, after partitioning.
1113 stateGpu
->reinit(mdatoms
->homenr
,
1114 cr
->dd
!= nullptr ? dd_numAtomsZones(*cr
->dd
) : mdatoms
->homenr
);
1115 if (useGpuPmeOnThisRank
)
1117 // TODO: This should be moved into PME setup function ( pme_gpu_prepare_computation(...) )
1118 pme_gpu_set_device_x(fr
->pmedata
, stateGpu
->getCoordinates());
1121 // We need to copy coordinates when:
1122 // 1. Update is not offloaded
1123 // 2. The buffers were reinitialized on search step
1124 if (!simulationWork
.useGpuUpdate
|| stepWork
.doNeighborSearch
)
1126 GMX_ASSERT(stateGpu
!= nullptr, "stateGpu should not be null");
1127 stateGpu
->copyCoordinatesToGpu(x
.unpaddedArrayRef(), AtomLocality::Local
);
1131 // TODO Update this comment when introducing SimulationWorkload
1133 // The conditions for gpuHaloExchange e.g. using GPU buffer
1134 // operations were checked before construction, so here we can
1135 // just use it and assert upon any conditions.
1136 const bool ddUsesGpuDirectCommunication
=
1137 ((cr
->dd
!= nullptr) && (!cr
->dd
->gpuHaloExchange
.empty()));
1138 GMX_ASSERT(!ddUsesGpuDirectCommunication
|| stepWork
.useGpuXBufferOps
,
1139 "Must use coordinate buffer ops with GPU halo exchange");
1140 const bool useGpuForcesHaloExchange
= ddUsesGpuDirectCommunication
&& stepWork
.useGpuFBufferOps
;
1142 // Copy coordinate from the GPU if update is on the GPU and there
1143 // are forces to be computed on the CPU, or for the computation of
1144 // virial, or if host-side data will be transferred from this task
1145 // to a remote task for halo exchange or PME-PP communication. At
1146 // search steps the current coordinates are already on the host,
1147 // hence copy is not needed.
1148 const bool haveHostPmePpComms
=
1149 !thisRankHasDuty(cr
, DUTY_PME
) && !simulationWork
.useGpuPmePpCommunication
;
1150 const bool haveHostHaloExchangeComms
= havePPDomainDecomposition(cr
) && !ddUsesGpuDirectCommunication
;
1152 bool gmx_used_in_debug haveCopiedXFromGpu
= false;
1153 if (simulationWork
.useGpuUpdate
&& !stepWork
.doNeighborSearch
1154 && (runScheduleWork
->domainWork
.haveCpuLocalForceWork
|| stepWork
.computeVirial
1155 || haveHostPmePpComms
|| haveHostHaloExchangeComms
))
1157 GMX_ASSERT(stateGpu
!= nullptr, "stateGpu should not be null");
1158 stateGpu
->copyCoordinatesFromGpu(x
.unpaddedArrayRef(), AtomLocality::Local
);
1159 haveCopiedXFromGpu
= true;
1162 // If coordinates are to be sent to PME task from GPU memory, perform that send here.
1163 // Otherwise the send will occur before the H2D coordinate transfer.
1164 if (!thisRankHasDuty(cr
, DUTY_PME
) && pmeSendCoordinatesFromGpu
)
1166 /* Send particle coordinates to the pme nodes */
1167 gmx_pme_send_coordinates(fr
, cr
, box
, as_rvec_array(x
.unpaddedArrayRef().data()), lambda
[efptCOUL
],
1168 lambda
[efptVDW
], (stepWork
.computeVirial
|| stepWork
.computeEnergy
),
1169 step
, simulationWork
.useGpuPmePpCommunication
, reinitGpuPmePpComms
,
1170 pmeSendCoordinatesFromGpu
, localXReadyOnDevice
, wcycle
);
1173 if (useGpuPmeOnThisRank
)
1175 launchPmeGpuSpread(fr
->pmedata
, box
, stepWork
, localXReadyOnDevice
, lambda
[efptCOUL
], wcycle
);
1178 /* do gridding for pair search */
1179 if (stepWork
.doNeighborSearch
)
1181 if (fr
->wholeMoleculeTransform
&& stepWork
.stateChanged
)
1183 fr
->wholeMoleculeTransform
->updateForAtomPbcJumps(x
.unpaddedArrayRef(), box
);
1187 // - vzero is constant, do we need to pass it?
1188 // - box_diag should be passed directly to nbnxn_put_on_grid
1194 box_diag
[XX
] = box
[XX
][XX
];
1195 box_diag
[YY
] = box
[YY
][YY
];
1196 box_diag
[ZZ
] = box
[ZZ
][ZZ
];
1198 wallcycle_start(wcycle
, ewcNS
);
1199 if (!DOMAINDECOMP(cr
))
1201 wallcycle_sub_start(wcycle
, ewcsNBS_GRID_LOCAL
);
1202 nbnxn_put_on_grid(nbv
, box
, 0, vzero
, box_diag
, nullptr, { 0, mdatoms
->homenr
}, -1,
1203 fr
->cginfo
, x
.unpaddedArrayRef(), 0, nullptr);
1204 wallcycle_sub_stop(wcycle
, ewcsNBS_GRID_LOCAL
);
1208 wallcycle_sub_start(wcycle
, ewcsNBS_GRID_NONLOCAL
);
1209 nbnxn_put_on_grid_nonlocal(nbv
, domdec_zones(cr
->dd
), fr
->cginfo
, x
.unpaddedArrayRef());
1210 wallcycle_sub_stop(wcycle
, ewcsNBS_GRID_NONLOCAL
);
1213 nbv
->setAtomProperties(gmx::constArrayRefFromArray(mdatoms
->typeA
, mdatoms
->nr
),
1214 gmx::constArrayRefFromArray(mdatoms
->chargeA
, mdatoms
->nr
), fr
->cginfo
);
1216 wallcycle_stop(wcycle
, ewcNS
);
1218 /* initialize the GPU nbnxm atom data and bonded data structures */
1219 if (simulationWork
.useGpuNonbonded
)
1221 wallcycle_start_nocount(wcycle
, ewcLAUNCH_GPU
);
1223 wallcycle_sub_start_nocount(wcycle
, ewcsLAUNCH_GPU_NONBONDED
);
1224 Nbnxm::gpu_init_atomdata(nbv
->gpu_nbv
, nbv
->nbat
.get());
1225 wallcycle_sub_stop(wcycle
, ewcsLAUNCH_GPU_NONBONDED
);
1229 /* Now we put all atoms on the grid, we can assign bonded
1230 * interactions to the GPU, where the grid order is
1231 * needed. Also the xq, f and fshift device buffers have
1232 * been reallocated if needed, so the bonded code can
1233 * learn about them. */
1234 // TODO the xq, f, and fshift buffers are now shared
1235 // resources, so they should be maintained by a
1236 // higher-level object than the nb module.
1237 fr
->gpuBonded
->updateInteractionListsAndDeviceBuffers(
1238 nbv
->getGridIndices(), top
->idef
, Nbnxm::gpu_get_xq(nbv
->gpu_nbv
),
1239 Nbnxm::gpu_get_f(nbv
->gpu_nbv
), Nbnxm::gpu_get_fshift(nbv
->gpu_nbv
));
1241 wallcycle_stop(wcycle
, ewcLAUNCH_GPU
);
1244 // Need to run after the GPU-offload bonded interaction lists
1245 // are set up to be able to determine whether there is bonded work.
1246 runScheduleWork
->domainWork
= setupDomainLifetimeWorkload(
1247 *inputrec
, *fr
, pull_work
, ed
, *mdatoms
, simulationWork
, stepWork
);
1249 wallcycle_start_nocount(wcycle
, ewcNS
);
1250 wallcycle_sub_start(wcycle
, ewcsNBS_SEARCH_LOCAL
);
1251 /* Note that with a GPU the launch overhead of the list transfer is not timed separately */
1252 nbv
->constructPairlist(InteractionLocality::Local
, top
->excls
, step
, nrnb
);
1254 nbv
->setupGpuShortRangeWork(fr
->gpuBonded
, InteractionLocality::Local
);
1256 wallcycle_sub_stop(wcycle
, ewcsNBS_SEARCH_LOCAL
);
1257 wallcycle_stop(wcycle
, ewcNS
);
1259 if (stepWork
.useGpuXBufferOps
)
1261 nbv
->atomdata_init_copy_x_to_nbat_x_gpu();
1263 // For force buffer ops, we use the below conditon rather than
1264 // useGpuFBufferOps to ensure that init is performed even if this
1265 // NS step is also a virial step (on which f buf ops are deactivated).
1266 if (GMX_GPU_CUDA
&& simulationWork
.useGpuBufferOps
&& simulationWork
.useGpuNonbonded
)
1268 GMX_ASSERT(stateGpu
, "stateGpu should be valid when buffer ops are offloaded");
1269 nbv
->atomdata_init_add_nbat_f_to_f_gpu(stateGpu
->fReducedOnDevice());
1272 else if (!EI_TPI(inputrec
->eI
))
1274 if (stepWork
.useGpuXBufferOps
)
1276 GMX_ASSERT(stateGpu
, "stateGpu should be valid when buffer ops are offloaded");
1277 nbv
->convertCoordinatesGpu(AtomLocality::Local
, false, stateGpu
->getCoordinates(),
1278 localXReadyOnDevice
);
1282 if (simulationWork
.useGpuUpdate
)
1284 GMX_ASSERT(stateGpu
, "need a valid stateGpu object");
1285 GMX_ASSERT(haveCopiedXFromGpu
,
1286 "a wait should only be triggered if copy has been scheduled");
1287 stateGpu
->waitCoordinatesReadyOnHost(AtomLocality::Local
);
1289 nbv
->convertCoordinates(AtomLocality::Local
, false, x
.unpaddedArrayRef());
1293 const gmx::DomainLifetimeWorkload
& domainWork
= runScheduleWork
->domainWork
;
1295 if (simulationWork
.useGpuNonbonded
)
1297 ddBalanceRegionHandler
.openBeforeForceComputationGpu();
1299 wallcycle_start(wcycle
, ewcLAUNCH_GPU
);
1301 wallcycle_sub_start(wcycle
, ewcsLAUNCH_GPU_NONBONDED
);
1302 Nbnxm::gpu_upload_shiftvec(nbv
->gpu_nbv
, nbv
->nbat
.get());
1303 if (stepWork
.doNeighborSearch
|| !stepWork
.useGpuXBufferOps
)
1305 Nbnxm::gpu_copy_xq_to_gpu(nbv
->gpu_nbv
, nbv
->nbat
.get(), AtomLocality::Local
);
1307 wallcycle_sub_stop(wcycle
, ewcsLAUNCH_GPU_NONBONDED
);
1308 // with X buffer ops offloaded to the GPU on all but the search steps
1310 // bonded work not split into separate local and non-local, so with DD
1311 // we can only launch the kernel after non-local coordinates have been received.
1312 if (domainWork
.haveGpuBondedWork
&& !havePPDomainDecomposition(cr
))
1314 wallcycle_sub_start(wcycle
, ewcsLAUNCH_GPU_BONDED
);
1315 fr
->gpuBonded
->setPbcAndlaunchKernel(fr
->pbcType
, box
, fr
->bMolPBC
, stepWork
);
1316 wallcycle_sub_stop(wcycle
, ewcsLAUNCH_GPU_BONDED
);
1319 /* launch local nonbonded work on GPU */
1320 wallcycle_sub_start_nocount(wcycle
, ewcsLAUNCH_GPU_NONBONDED
);
1321 do_nb_verlet(fr
, ic
, enerd
, stepWork
, InteractionLocality::Local
, enbvClearFNo
, step
, nrnb
, wcycle
);
1322 wallcycle_sub_stop(wcycle
, ewcsLAUNCH_GPU_NONBONDED
);
1323 wallcycle_stop(wcycle
, ewcLAUNCH_GPU
);
1326 if (useGpuPmeOnThisRank
)
1328 // In PME GPU and mixed mode we launch FFT / gather after the
1329 // X copy/transform to allow overlap as well as after the GPU NB
1330 // launch to avoid FFT launch overhead hijacking the CPU and delaying
1331 // the nonbonded kernel.
1332 launchPmeGpuFftAndGather(fr
->pmedata
, lambda
[efptCOUL
], wcycle
, stepWork
);
1335 /* Communicate coordinates and sum dipole if necessary +
1336 do non-local pair search */
1337 if (havePPDomainDecomposition(cr
))
1339 if (stepWork
.doNeighborSearch
)
1341 // TODO: fuse this branch with the above large stepWork.doNeighborSearch block
1342 wallcycle_start_nocount(wcycle
, ewcNS
);
1343 wallcycle_sub_start(wcycle
, ewcsNBS_SEARCH_NONLOCAL
);
1344 /* Note that with a GPU the launch overhead of the list transfer is not timed separately */
1345 nbv
->constructPairlist(InteractionLocality::NonLocal
, top
->excls
, step
, nrnb
);
1347 nbv
->setupGpuShortRangeWork(fr
->gpuBonded
, InteractionLocality::NonLocal
);
1348 wallcycle_sub_stop(wcycle
, ewcsNBS_SEARCH_NONLOCAL
);
1349 wallcycle_stop(wcycle
, ewcNS
);
1350 // TODO refactor this GPU halo exchange re-initialisation
1351 // to location in do_md where GPU halo exchange is
1352 // constructed at partitioning, after above stateGpu
1353 // re-initialization has similarly been refactored
1354 if (ddUsesGpuDirectCommunication
)
1356 reinitGpuHaloExchange(*cr
, stateGpu
->getCoordinates(), stateGpu
->getForces());
1361 if (ddUsesGpuDirectCommunication
)
1363 // The following must be called after local setCoordinates (which records an event
1364 // when the coordinate data has been copied to the device).
1365 communicateGpuHaloCoordinates(*cr
, box
, localXReadyOnDevice
);
1367 if (domainWork
.haveCpuBondedWork
|| domainWork
.haveFreeEnergyWork
)
1369 // non-local part of coordinate buffer must be copied back to host for CPU work
1370 stateGpu
->copyCoordinatesFromGpu(x
.unpaddedArrayRef(), AtomLocality::NonLocal
);
1375 // Note: GPU update + DD without direct communication is not supported,
1376 // a waitCoordinatesReadyOnHost() should be issued if it will be.
1377 GMX_ASSERT(!simulationWork
.useGpuUpdate
,
1378 "GPU update is not supported with CPU halo exchange");
1379 dd_move_x(cr
->dd
, box
, x
.unpaddedArrayRef(), wcycle
);
1382 if (stepWork
.useGpuXBufferOps
)
1384 if (!useGpuPmeOnThisRank
&& !ddUsesGpuDirectCommunication
)
1386 stateGpu
->copyCoordinatesToGpu(x
.unpaddedArrayRef(), AtomLocality::NonLocal
);
1388 nbv
->convertCoordinatesGpu(AtomLocality::NonLocal
, false, stateGpu
->getCoordinates(),
1389 stateGpu
->getCoordinatesReadyOnDeviceEvent(
1390 AtomLocality::NonLocal
, simulationWork
, stepWork
));
1394 nbv
->convertCoordinates(AtomLocality::NonLocal
, false, x
.unpaddedArrayRef());
1398 if (simulationWork
.useGpuNonbonded
)
1400 wallcycle_start(wcycle
, ewcLAUNCH_GPU
);
1402 if (stepWork
.doNeighborSearch
|| !stepWork
.useGpuXBufferOps
)
1404 wallcycle_sub_start(wcycle
, ewcsLAUNCH_GPU_NONBONDED
);
1405 Nbnxm::gpu_copy_xq_to_gpu(nbv
->gpu_nbv
, nbv
->nbat
.get(), AtomLocality::NonLocal
);
1406 wallcycle_sub_stop(wcycle
, ewcsLAUNCH_GPU_NONBONDED
);
1409 if (domainWork
.haveGpuBondedWork
)
1411 wallcycle_sub_start(wcycle
, ewcsLAUNCH_GPU_BONDED
);
1412 fr
->gpuBonded
->setPbcAndlaunchKernel(fr
->pbcType
, box
, fr
->bMolPBC
, stepWork
);
1413 wallcycle_sub_stop(wcycle
, ewcsLAUNCH_GPU_BONDED
);
1416 /* launch non-local nonbonded tasks on GPU */
1417 wallcycle_sub_start(wcycle
, ewcsLAUNCH_GPU_NONBONDED
);
1418 do_nb_verlet(fr
, ic
, enerd
, stepWork
, InteractionLocality::NonLocal
, enbvClearFNo
, step
,
1420 wallcycle_sub_stop(wcycle
, ewcsLAUNCH_GPU_NONBONDED
);
1422 wallcycle_stop(wcycle
, ewcLAUNCH_GPU
);
1426 if (simulationWork
.useGpuNonbonded
)
1428 /* launch D2H copy-back F */
1429 wallcycle_start_nocount(wcycle
, ewcLAUNCH_GPU
);
1430 wallcycle_sub_start_nocount(wcycle
, ewcsLAUNCH_GPU_NONBONDED
);
1432 if (havePPDomainDecomposition(cr
))
1434 Nbnxm::gpu_launch_cpyback(nbv
->gpu_nbv
, nbv
->nbat
.get(), stepWork
, AtomLocality::NonLocal
);
1436 Nbnxm::gpu_launch_cpyback(nbv
->gpu_nbv
, nbv
->nbat
.get(), stepWork
, AtomLocality::Local
);
1437 wallcycle_sub_stop(wcycle
, ewcsLAUNCH_GPU_NONBONDED
);
1439 if (domainWork
.haveGpuBondedWork
&& stepWork
.computeEnergy
)
1441 fr
->gpuBonded
->launchEnergyTransfer();
1443 wallcycle_stop(wcycle
, ewcLAUNCH_GPU
);
1446 gmx::ArrayRef
<const gmx::RVec
> xWholeMolecules
;
1447 if (fr
->wholeMoleculeTransform
)
1449 xWholeMolecules
= fr
->wholeMoleculeTransform
->wholeMoleculeCoordinates(x
.unpaddedArrayRef(), box
);
1452 DipoleData dipoleData
;
1454 if (simulationWork
.computeMuTot
)
1456 const int start
= 0;
1458 /* Calculate total (local) dipole moment in a temporary common array.
1459 * This makes it possible to sum them over nodes faster.
1461 gmx::ArrayRef
<const gmx::RVec
> xRef
=
1462 (xWholeMolecules
.empty() ? x
.unpaddedArrayRef() : xWholeMolecules
);
1463 calc_mu(start
, mdatoms
->homenr
, xRef
, mdatoms
->chargeA
, mdatoms
->chargeB
,
1464 mdatoms
->nChargePerturbed
, dipoleData
.muStaging
[0], dipoleData
.muStaging
[1]);
1466 reduceAndUpdateMuTot(&dipoleData
, cr
, (fr
->efep
!= efepNO
), lambda
, muTotal
, ddBalanceRegionHandler
);
1469 /* Reset energies */
1470 reset_enerdata(enerd
);
1472 if (DOMAINDECOMP(cr
) && !thisRankHasDuty(cr
, DUTY_PME
))
1474 wallcycle_start(wcycle
, ewcPPDURINGPME
);
1475 dd_force_flop_start(cr
->dd
, nrnb
);
1478 // For the rest of the CPU tasks that depend on GPU-update produced coordinates,
1479 // this wait ensures that the D2H transfer is complete.
1480 if ((simulationWork
.useGpuUpdate
)
1481 && (runScheduleWork
->domainWork
.haveCpuLocalForceWork
|| stepWork
.computeVirial
))
1483 stateGpu
->waitCoordinatesReadyOnHost(AtomLocality::Local
);
1488 wallcycle_start(wcycle
, ewcROT
);
1489 do_rotation(cr
, enforcedRotation
, box
, as_rvec_array(x
.unpaddedArrayRef().data()), t
, step
,
1490 stepWork
.doNeighborSearch
);
1491 wallcycle_stop(wcycle
, ewcROT
);
1494 /* Start the force cycle counter.
1495 * Note that a different counter is used for dynamic load balancing.
1497 wallcycle_start(wcycle
, ewcFORCE
);
1499 // Set up and clear force outputs.
1500 // We use std::move to keep the compiler happy, it has no effect.
1501 ForceOutputs forceOut
= setupForceOutputs(fr
->forceHelperBuffers
.get(), pull_work
, *inputrec
,
1502 std::move(force
), stepWork
, wcycle
);
1504 /* We calculate the non-bonded forces, when done on the CPU, here.
1505 * We do this before calling do_force_lowlevel, because in that
1506 * function, the listed forces are calculated before PME, which
1507 * does communication. With this order, non-bonded and listed
1508 * force calculation imbalance can be balanced out by the domain
1509 * decomposition load balancing.
1512 const bool useOrEmulateGpuNb
= simulationWork
.useGpuNonbonded
|| fr
->nbv
->emulateGpu();
1514 if (!useOrEmulateGpuNb
)
1516 do_nb_verlet(fr
, ic
, enerd
, stepWork
, InteractionLocality::Local
, enbvClearFYes
, step
, nrnb
, wcycle
);
1519 if (fr
->efep
!= efepNO
)
1521 /* Calculate the local and non-local free energy interactions here.
1522 * Happens here on the CPU both with and without GPU.
1524 nbv
->dispatchFreeEnergyKernel(InteractionLocality::Local
, fr
,
1525 as_rvec_array(x
.unpaddedArrayRef().data()),
1526 &forceOut
.forceWithShiftForces(), *mdatoms
, inputrec
->fepvals
,
1527 lambda
, enerd
, stepWork
, nrnb
);
1529 if (havePPDomainDecomposition(cr
))
1531 nbv
->dispatchFreeEnergyKernel(InteractionLocality::NonLocal
, fr
,
1532 as_rvec_array(x
.unpaddedArrayRef().data()),
1533 &forceOut
.forceWithShiftForces(), *mdatoms
,
1534 inputrec
->fepvals
, lambda
, enerd
, stepWork
, nrnb
);
1538 if (!useOrEmulateGpuNb
)
1540 if (havePPDomainDecomposition(cr
))
1542 do_nb_verlet(fr
, ic
, enerd
, stepWork
, InteractionLocality::NonLocal
, enbvClearFNo
, step
,
1546 if (stepWork
.computeForces
)
1548 /* Add all the non-bonded force to the normal force array.
1549 * This can be split into a local and a non-local part when overlapping
1550 * communication with calculation with domain decomposition.
1552 wallcycle_stop(wcycle
, ewcFORCE
);
1553 nbv
->atomdata_add_nbat_f_to_f(AtomLocality::All
, forceOut
.forceWithShiftForces().force());
1554 wallcycle_start_nocount(wcycle
, ewcFORCE
);
1557 /* If there are multiple fshift output buffers we need to reduce them */
1558 if (stepWork
.computeVirial
)
1560 /* This is not in a subcounter because it takes a
1561 negligible and constant-sized amount of time */
1562 nbnxn_atomdata_add_nbat_fshift_to_fshift(*nbv
->nbat
,
1563 forceOut
.forceWithShiftForces().shiftForces());
1567 // TODO Force flags should include haveFreeEnergyWork for this domain
1568 if (ddUsesGpuDirectCommunication
&& (domainWork
.haveCpuBondedWork
|| domainWork
.haveFreeEnergyWork
))
1570 /* Wait for non-local coordinate data to be copied from device */
1571 stateGpu
->waitCoordinatesReadyOnHost(AtomLocality::NonLocal
);
1573 /* Compute the bonded and non-bonded energies and optionally forces */
1574 do_force_lowlevel(fr
, inputrec
, cr
, ms
, nrnb
, wcycle
, mdatoms
, x
, xWholeMolecules
, hist
,
1575 &forceOut
, enerd
, box
, lambda
.data(), as_rvec_array(dipoleData
.muStateAB
),
1576 stepWork
, ddBalanceRegionHandler
);
1578 wallcycle_stop(wcycle
, ewcFORCE
);
1580 computeSpecialForces(fplog
, cr
, inputrec
, awh
, enforcedRotation
, imdSession
, pull_work
, step
, t
,
1581 wcycle
, fr
->forceProviders
, box
, x
.unpaddedArrayRef(), mdatoms
, lambda
,
1582 stepWork
, &forceOut
.forceWithVirial(), enerd
, ed
, stepWork
.doNeighborSearch
);
    // Will store the amount of cycles spent waiting for the GPU that
    // will be later used in the DLB accounting.
    float cycles_wait_gpu = 0;
    if (useOrEmulateGpuNb)
    {
        auto& forceWithShiftForces = forceOut.forceWithShiftForces();

        /* wait for non-local forces (or calculate in emulation mode) */
        if (havePPDomainDecomposition(cr))
        {
            if (simulationWork.useGpuNonbonded)
            {
                cycles_wait_gpu += Nbnxm::gpu_wait_finish_task(
                        nbv->gpu_nbv, stepWork, AtomLocality::NonLocal, enerd->grpp.ener[egLJSR].data(),
                        enerd->grpp.ener[egCOULSR].data(), forceWithShiftForces.shiftForces(), wcycle);
            }
            else
            {
                wallcycle_start_nocount(wcycle, ewcFORCE);
                do_nb_verlet(fr, ic, enerd, stepWork, InteractionLocality::NonLocal, enbvClearFYes,
                             step, nrnb, wcycle);
                wallcycle_stop(wcycle, ewcFORCE);
            }
            if (stepWork.useGpuFBufferOps)
            {
                gmx::FixedCapacityVector<GpuEventSynchronizer*, 1> dependencyList;

                // TODO: move this into DomainLifetimeWorkload, including the second part of the
                // condition. The bonded and free energy CPU tasks can have non-local force
                // contributions which are a dependency for the GPU force reduction.
                bool haveNonLocalForceContribInCpuBuffer =
                        domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork;

                if (haveNonLocalForceContribInCpuBuffer)
                {
                    stateGpu->copyForcesToGpu(forceOut.forceWithShiftForces().force(),
                                              AtomLocality::NonLocal);
                    dependencyList.push_back(stateGpu->getForcesReadyOnDeviceEvent(
                            AtomLocality::NonLocal, stepWork.useGpuFBufferOps));
                }
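                // Reduce the non-local nbnxm forces on the GPU; the flags passed below
                // control whether PME forces are folded in (not for the non-local part) and
                // whether the CPU buffer copied above is accumulated.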
                nbv->atomdata_add_nbat_f_to_f_gpu(AtomLocality::NonLocal, stateGpu->getForces(),
                                                  pme_gpu_get_device_f(fr->pmedata), dependencyList,
                                                  false, haveNonLocalForceContribInCpuBuffer);
                if (!useGpuForcesHaloExchange)
                {
                    // copy from GPU input for dd_move_f()
                    stateGpu->copyForcesFromGpu(forceOut.forceWithShiftForces().force(),
                                                AtomLocality::NonLocal);
                }
            }
            else
            {
                nbv->atomdata_add_nbat_f_to_f(AtomLocality::NonLocal, forceWithShiftForces.force());
            }

            if (fr->nbv->emulateGpu() && stepWork.computeVirial)
            {
                nbnxn_atomdata_add_nbat_fshift_to_fshift(*nbv->nbat, forceWithShiftForces.shiftForces());
            }
        }
    }
    if (havePPDomainDecomposition(cr))
    {
        /* We are done with the CPU compute.
         * We will now communicate the non-local forces.
         * If we use a GPU this will overlap with GPU work, so in that case
         * we do not close the DD force balancing region here.
         */
        ddBalanceRegionHandler.closeAfterForceComputationCpu();

        if (stepWork.computeForces)
        {
            if (useGpuForcesHaloExchange)
            {
                if (domainWork.haveCpuLocalForceWork)
                {
                    stateGpu->copyForcesToGpu(forceOut.forceWithShiftForces().force(), AtomLocality::Local);
                }
                communicateGpuHaloForces(*cr, domainWork.haveCpuLocalForceWork);
            }
            else
            {
                if (stepWork.useGpuFBufferOps)
                {
                    stateGpu->waitForcesReadyOnHost(AtomLocality::NonLocal);
                }
                dd_move_f(cr->dd, &forceOut.forceWithShiftForces(), wcycle);
            }
        }
    }
    // With both nonbonded and PME offloaded to a GPU on the same rank, we use
    // an alternating wait/reduction scheme.
    bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPmeOnThisRank && simulationWork.useGpuNonbonded
                             && !DOMAINDECOMP(cr) && !stepWork.useGpuFBufferOps);
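    // Note: as the condition above shows, the alternating scheme is only used without domain
    // decomposition and without GPU force buffer ops, i.e. when each GPU output is reduced
    // on the CPU of this rank as soon as it becomes available.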
    if (alternateGpuWait)
    {
        alternatePmeNbGpuWaitReduce(fr->nbv.get(), fr->pmedata, &forceOut, enerd, lambda[efptCOUL],
                                    stepWork, wcycle);
    }
    if (!alternateGpuWait && useGpuPmeOnThisRank)
    {
        pme_gpu_wait_and_reduce(fr->pmedata, stepWork, wcycle, &forceOut.forceWithVirial(), enerd,
                                lambda[efptCOUL]);
    }
    /* Wait for local GPU NB outputs on the non-alternating wait path */
    if (!alternateGpuWait && simulationWork.useGpuNonbonded)
    {
        /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
         * is between 0.5 and 1.5 Mcycles. So 2 MCycles is an overestimate,
         * but even with a step of 0.1 ms the difference is less than 1%
         * of the step time.
         */
        const float gpuWaitApiOverheadMargin = 2e6F; /* cycles */
        const float waitCycles               = Nbnxm::gpu_wait_finish_task(
                nbv->gpu_nbv, stepWork, AtomLocality::Local, enerd->grpp.ener[egLJSR].data(),
                enerd->grpp.ener[egCOULSR].data(), forceOut.forceWithShiftForces().shiftForces(), wcycle);

        if (ddBalanceRegionHandler.useBalancingRegion())
        {
            DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
            if (stepWork.computeForces && waitCycles <= gpuWaitApiOverheadMargin)
            {
                /* We measured few cycles, it could be that the kernel
                 * and transfer finished earlier and there was no actual
                 * wait time, only API call overhead.
                 * Then the actual time could be anywhere between 0 and
                 * cycles_wait_est. We will use half of cycles_wait_est.
                 */
                waitedForGpu = DdBalanceRegionWaitedForGpu::no;
            }
            ddBalanceRegionHandler.closeAfterForceComputationGpu(cycles_wait_gpu, waitedForGpu);
        }
    }
    if (fr->nbv->emulateGpu())
    {
        // NOTE: emulation kernel is not included in the balancing region,
        // but emulation mode does not target performance anyway
        wallcycle_start_nocount(wcycle, ewcFORCE);
        do_nb_verlet(fr, ic, enerd, stepWork, InteractionLocality::Local,
                     DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes, step, nrnb, wcycle);
        wallcycle_stop(wcycle, ewcFORCE);
    }
    // If on GPU PME-PP comms or GPU update path, receive forces from PME before GPU buffer ops
    // TODO refactor this and unify with below default-path call to the same function
    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME)
        && (simulationWork.useGpuPmePpCommunication || simulationWork.useGpuUpdate))
    {
        /* In case of node-splitting, the PP nodes receive the long-range
         * forces, virial and energy from the PME nodes here.
         */
        pme_receive_force_ener(fr, cr, &forceOut.forceWithVirial(), enerd,
                               simulationWork.useGpuPmePpCommunication,
                               stepWork.useGpuPmeFReduction, wcycle);
    }
    /* Do the nonbonded GPU (or emulation) force buffer reduction
     * on the non-alternating path. */
    if (useOrEmulateGpuNb && !alternateGpuWait)
    {
        // TODO simplify the below conditionals. Pass buffer and sync pointers at init stage
        // rather than here. Unify getter fns for sameGPU/otherGPU cases.
        void* pmeForcePtr =
                stepWork.useGpuPmeFReduction
                        ? (thisRankHasDuty(cr, DUTY_PME)
                                   ? pme_gpu_get_device_f(fr->pmedata) // PME force buffer on same GPU
                                   : fr->pmePpCommGpu->getGpuForceStagingPtr()) // buffer received from other GPU
                        : nullptr; // PME reduction not active on GPU
        GpuEventSynchronizer* const pmeSynchronizer =
                stepWork.useGpuPmeFReduction
                        ? (thisRankHasDuty(cr, DUTY_PME)
                                   ? pme_gpu_get_f_ready_synchronizer(fr->pmedata) // PME force buffer on same GPU
                                   : static_cast<GpuEventSynchronizer*>(
                                             fr->pmePpCommGpu->getForcesReadySynchronizer())) // buffer received from other GPU
                        : nullptr; // PME reduction not active on GPU

        gmx::FixedCapacityVector<GpuEventSynchronizer*, 3> dependencyList;

        if (stepWork.useGpuPmeFReduction)
        {
            dependencyList.push_back(pmeSynchronizer);
        }

        gmx::ArrayRef<gmx::RVec> forceWithShift = forceOut.forceWithShiftForces().force();
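        // Note: forceWithShift aliases the CPU-side force buffer; depending on the branches
        // below it is either uploaded to the device for the GPU reduction or filled directly
        // by the CPU-side reduction.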
        if (stepWork.useGpuFBufferOps)
        {
            // Flag to specify whether the CPU force buffer has contributions to
            // local atoms. This depends on whether there are CPU-based force tasks
            // or, when DD is active, whether the halo exchange has resulted in contributions
            // from the non-local part.
            const bool haveLocalForceContribInCpuBuffer =
                    (domainWork.haveCpuLocalForceWork || havePPDomainDecomposition(cr));

            // TODO: move these steps as early as possible:
            // - CPU f H2D should be as soon as all CPU-side forces are done
            // - wait for force reduction does not need to block host (at least not here, it's
            //   sufficient to wait before the next CPU task that consumes the forces: vsite spread or update)
            // - copy is not performed if GPU force halo exchange is active, because it would overwrite
            //   the result of the halo exchange. In that case the copy is instead performed above,
            //   before the exchange. These should be unified.
            if (haveLocalForceContribInCpuBuffer && !useGpuForcesHaloExchange)
            {
                // Note: AtomLocality::All is used for the non-DD case because, as copyForcesToGpu()
                // then uses a separate stream, it allows overlap of CPU force H2D with GPU force
                // tasks on all streams, including those in the local stream which would otherwise
                // be implicit dependencies for the transfer and would not overlap.
                auto locality = havePPDomainDecomposition(cr) ? AtomLocality::Local : AtomLocality::All;

                stateGpu->copyForcesToGpu(forceWithShift, locality);
                dependencyList.push_back(
                        stateGpu->getForcesReadyOnDeviceEvent(locality, stepWork.useGpuFBufferOps));
            }

            if (useGpuForcesHaloExchange)
            {
                dependencyList.push_back(cr->dd->gpuHaloExchange[0]->getForcesReadyOnDeviceEvent());
            }
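            // Reduce the local nbnxm forces into the total force buffer on the GPU, adding
            // the PME forces and the CPU-side contribution once the ready events collected
            // in dependencyList have been satisfied.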
            nbv->atomdata_add_nbat_f_to_f_gpu(AtomLocality::Local, stateGpu->getForces(), pmeForcePtr,
                                              dependencyList, stepWork.useGpuPmeFReduction,
                                              haveLocalForceContribInCpuBuffer);
            // Copy forces to host if they are needed for update or if virtual sites are enabled.
            // If there are vsites, we need to copy forces every step to spread vsite forces on host.
            // TODO: When the output flags will be included in step workload, this copy can be combined
            //   with the copy call done in sim_utils(...) for the output.
            // NOTE: If there are virtual sites, the forces are modified on host after this D2H copy.
            //   Hence, they should not be copied in do_md(...) for the output.
            if (!simulationWork.useGpuUpdate || vsite)
            {
                stateGpu->copyForcesFromGpu(forceWithShift, AtomLocality::Local);
                stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
            }
        }
        else
        {
            nbv->atomdata_add_nbat_f_to_f(AtomLocality::Local, forceWithShift);
        }
    }
    launchGpuEndOfStepTasks(nbv, fr->gpuBonded, fr->pmedata, enerd, *runScheduleWork,
                            useGpuPmeOnThisRank, step, wcycle);

    if (DOMAINDECOMP(cr))
    {
        dd_force_flop_stop(cr->dd, nrnb);
    }
    if (stepWork.computeForces)
    {
        postProcessForceWithShiftForces(nrnb, wcycle, box, x.unpaddedArrayRef(), &forceOut,
                                        vir_force, *mdatoms, *fr, vsite, stepWork);
    }
    // VdW dispersion correction, only computed on master rank to avoid double counting
    if ((stepWork.computeEnergy || stepWork.computeVirial) && fr->dispersionCorrection && MASTER(cr))
    {
        // Calculate long range corrections to pressure and energy
        const DispersionCorrection::Correction correction =
                fr->dispersionCorrection->calculate(box, lambda[efptVDW]);

        if (stepWork.computeEnergy)
        {
            enerd->term[F_DISPCORR] = correction.energy;
            enerd->term[F_DVDL_VDW] += correction.dvdl;
            enerd->dvdl_lin[efptVDW] += correction.dvdl;
        }
        if (stepWork.computeVirial)
        {
            correction.correctVirial(vir_force);
            enerd->term[F_PDISPCORR] = correction.pressure;
        }
    }
    // TODO refactor this and unify with above GPU PME-PP / GPU update path call to the same function
    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) && !simulationWork.useGpuPmePpCommunication
        && !simulationWork.useGpuUpdate)
    {
        /* In case of node-splitting, the PP nodes receive the long-range
         * forces, virial and energy from the PME nodes here.
         */
        pme_receive_force_ener(fr, cr, &forceOut.forceWithVirial(), enerd,
                               simulationWork.useGpuPmePpCommunication, false, wcycle);
    }
    if (stepWork.computeForces)
    {
        postProcessForces(cr, step, nrnb, wcycle, box, x.unpaddedArrayRef(), &forceOut, vir_force,
                          mdatoms, fr, vsite, stepWork);
    }
    if (stepWork.computeEnergy)
    {
        /* Compute the final potential energy terms */
        accumulatePotentialEnergies(enerd, lambda, inputrec->fepvals);

        if (!EI_TPI(inputrec->eI))
        {
            checkPotentialEnergyValidity(step, *enerd, *inputrec);
        }
    }
    /* In case we don't have constraints and are using GPUs, the next balancing
     * region starts here.
     * Some "special" work at the end of do_force_cuts?, such as vsite spread,
     * virial calculation and COM pulling, is thus not included in
     * the balance timing, which is ok as most tasks do communication.
     */
    ddBalanceRegionHandler.openBeforeForceComputationCpu(DdAllowBalanceRegionReopen::no);