/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
 * Copyright (c) 2013,2014,2015,2016,2017 The GROMACS development team.
 * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 * \brief Implements the VirtualSitesHandler class and vsite standalone functions
 *
 * \author Berk Hess <hess@kth.se>
 * \ingroup module_mdlib
 */
#include "gromacs/domdec/domdec.h"
#include "gromacs/domdec/domdec_struct.h"
#include "gromacs/gmxlib/network.h"
#include "gromacs/gmxlib/nrnb.h"
#include "gromacs/math/functions.h"
#include "gromacs/math/vec.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/mdtypes/mdatom.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/pbcutil/pbc.h"
#include "gromacs/timing/wallcycle.h"
#include "gromacs/topology/ifunc.h"
#include "gromacs/topology/mtop_util.h"
#include "gromacs/topology/topology.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/gmxomp.h"
/* The strategy used here for assigning virtual sites to (thread-)tasks
 * is as follows:
 *
 * We divide the atom range that vsites operate on (natoms_local with DD,
 * 0 - last atom involved in vsites without DD) equally over all threads.
 *
 * Vsites in the local range constructed from atoms in the local range
 * and/or other vsites that are fully local are assigned to a simple,
 * independent task.
 *
 * Vsites that are not assigned after using the above criterion get assigned
 * to a so called "interdependent" thread task when none of the constructing
 * atoms is a vsite. These tasks are called interdependent, because one task
 * accesses atoms assigned to a different task/thread.
 * Note that this option is turned off with large (local) atom counts
 * to avoid high memory usage.
 *
 * Any remaining vsites are assigned to a separate master thread task.
 */
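/* A minimal sketch of the equal atom-range division described above; the
 * variable names here are illustrative only, not part of the GROMACS API:
 *
 * \code
 * // Divide [0, vsiteAtomRange) into numThreads contiguous blocks,
 * // rounding up, so the last thread may get a shorter range.
 * const int atomsPerThread = (vsiteAtomRange + numThreads - 1) / numThreads;
 * const int rangeStart     = thread * atomsPerThread;
 * const int rangeEnd       = std::min(rangeStart + atomsPerThread, vsiteAtomRange);
 * \endcode
 */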
//! VirialHandling is often used outside VirtualSitesHandler class members
using VirialHandling = VirtualSitesHandler::VirialHandling;
/*! \brief Information on PBC and domain decomposition for virtual sites */
class DomainInfo
{
public:
    //! Constructs without PBC and DD
    DomainInfo() = default;

    //! Constructs with PBC and DD, if !=nullptr
    DomainInfo(PbcType pbcType, bool haveInterUpdateGroupVirtualSites, gmx_domdec_t* domdec) :
        pbcType_(pbcType),
        useMolPbc_(pbcType != PbcType::No && haveInterUpdateGroupVirtualSites),
        domdec_(domdec)
    {
    }

    //! Returns whether we are using domain decomposition with more than 1 DD rank
    bool useDomdec() const { return (domdec_ != nullptr); }

    //! The pbc type
    const PbcType pbcType_ = PbcType::No;
    //! Whether molecules are broken over PBC
    const bool useMolPbc_ = false;
    //! Pointer to the domain decomposition struct, nullptr without PP DD
    const gmx_domdec_t* domdec_ = nullptr;
};
/*! \brief List of atom indices belonging to a task */
struct AtomIndex
{
    //! List of atom indices
    std::vector<int> atom;
};
/*! \brief Data structure for thread tasks that use constructing atoms outside their own atom range */
struct InterdependentTask
{
    //! The interaction lists, only vsite entries are used
    InteractionLists ilist;
    //! Thread/task-local force buffer
    std::vector<RVec> force;
    //! The atom indices of the vsites of our task
    std::vector<int> vsite;
    //! Flags whether elements in force are spread to or not
    std::vector<bool> use;
    //! The number of entries set to true in use
    int nuse = 0;
    //! Array of atoms indices, size nthreads, covering all nuse set elements in use
    std::vector<AtomIndex> atomIndex;
    //! List of tasks (force blocks) this task spreads forces to
    std::vector<int> spreadTask;
    //! List of tasks that write to this task's force block range
    std::vector<int> reduceTask;
};
/*! \brief Vsite thread task data structure */
struct VsiteThread
{
    //! Start of atom range of this task
    int rangeStart;
    //! End of atom range of this task
    int rangeEnd;
    //! The interaction lists, only vsite entries are used
    std::array<InteractionList, F_NRE> ilist;
    //! Local fshift accumulation buffer
    std::array<RVec, SHIFTS> fshift;
    //! Local virial dx*df accumulation buffer
    matrix dxdf;
    //! Tells if interdependent task idTask should be used (in addition to the rest of this task), this bool has the same value on all threads
    bool useInterdependentTask;
    //! Data for vsites that involve constructing atoms in the atom range of other threads/tasks
    InterdependentTask idTask;

    /*! \brief Constructor */
    VsiteThread() : rangeStart(-1), rangeEnd(-1)
    {
        for (auto& elem : fshift)
        {
            elem = { 0.0_real, 0.0_real, 0.0_real };
        }
        clear_mat(dxdf);
        useInterdependentTask = false;
    }
};
/*! \brief Information on how the virtual site work is divided over thread tasks */
class ThreadingInfo
{
public:
    //! Constructor, retrieves the number of threads to use from gmx_omp_nthreads.h
    ThreadingInfo();

    //! Returns the number of threads to use for vsite operations
    int numThreads() const { return numThreads_; }

    //! Returns the thread data for the given thread
    const VsiteThread& threadData(int threadIndex) const { return *tData_[threadIndex]; }

    //! Returns the thread data for the given thread
    VsiteThread& threadData(int threadIndex) { return *tData_[threadIndex]; }

    //! Returns the thread data for vsites that depend on non-local vsites
    const VsiteThread& threadDataNonLocalDependent() const { return *tData_[numThreads_]; }

    //! Returns the thread data for vsites that depend on non-local vsites
    VsiteThread& threadDataNonLocalDependent() { return *tData_[numThreads_]; }

    //! Sets the vsites and distributes the vsite work over threads, should be called after DD partitioning
    void setVirtualSites(ArrayRef<const InteractionList> ilist,
                         ArrayRef<const t_iparams>       iparams,
                         const t_mdatoms&                mdatoms,
                         bool                            useDomdec);

private:
    //! Number of threads used for vsite operations
    const int numThreads_;
    //! Thread-local vsites and work structs
    std::vector<std::unique_ptr<VsiteThread>> tData_;
    //! Work array for dividing vsites over threads
    std::vector<int> taskIndex_;
};
/*! \brief Impl class for VirtualSitesHandler */
class VirtualSitesHandler::Impl
{
public:
    //! Constructor, domdec should be nullptr without DD
    Impl(const gmx_mtop_t& mtop, gmx_domdec_t* domdec, PbcType pbcType);

    //! Returns the number of virtual sites acting over multiple update groups
    int numInterUpdategroupVirtualSites() const { return numInterUpdategroupVirtualSites_; }

    //! Sets the vsites and distributes the vsite work over threads, should be called after DD partitioning
    void setVirtualSites(ArrayRef<const InteractionList> ilist, const t_mdatoms& mdatoms);

    /*! \brief Creates the positions of the vsite atoms for the local system
     *
     * \param[in,out] x   The coordinates
     * \param[in]     dt  The time step
     * \param[in,out] v   When != nullptr, velocities for vsites are set as displacement/dt
     * \param[in]     box The box
     */
    void construct(ArrayRef<RVec> x, real dt, ArrayRef<RVec> v, const matrix box) const;

    /*! \brief Spreads the force operating on the vsite atoms onto the surrounding atoms.
     *
     * vsite should point to a valid object.
     * The virialHandling parameter determines how virial contributions are handled.
     * If this is set to Linear, shift forces are accumulated into fshift.
     * If this is set to NonLinear, non-linear contributions are added to virial.
     * This non-linear correction is required when the virial is not calculated
     * afterwards from the particle positions and forces, but in a different way,
     * as for instance for the PME mesh contribution.
     */
    void spreadForces(ArrayRef<const RVec> x,
                      ArrayRef<RVec>       f,
                      VirialHandling       virialHandling,
                      ArrayRef<RVec>       fshift,
                      matrix               virial,
                      t_nrnb*              nrnb,
                      const matrix         box,
                      gmx_wallcycle*       wcycle);

private:
    //! The number of vsites that cross update groups, when =0 no PBC treatment is needed
    const int numInterUpdategroupVirtualSites_;
    //! PBC and DD information
    const DomainInfo domainInfo_;
    //! The interaction parameters
    const ArrayRef<const t_iparams> iparams_;
    //! The interaction lists
    ArrayRef<const InteractionList> ilists_;
    //! Information for handling vsite threading
    ThreadingInfo threadingInfo_;
};
VirtualSitesHandler::~VirtualSitesHandler() = default;

int VirtualSitesHandler::numInterUpdategroupVirtualSites() const
{
    return impl_->numInterUpdategroupVirtualSites();
}
/*! \brief Returns the sum of the vsite ilist sizes over all vsite types
 *
 * \param[in] ilist The interaction list
 */
static int vsiteIlistNrCount(ArrayRef<const InteractionList> ilist)
{
    int nr = 0;
    for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
    {
        nr += ilist[ftype].size();
    }

    return nr;
}
//! Computes the distance between xi and xj, pbc is used when pbc!=nullptr
static int pbc_rvec_sub(const t_pbc* pbc, const rvec xi, const rvec xj, rvec dx)
{
    if (pbc)
    {
        return pbc_dx_aiuc(pbc, xi, xj, dx);
    }
    else
    {
        rvec_sub(xi, xj, dx);
        return CENTRAL;
    }
}
//! Returns 1/norm(x)
static inline real inverseNorm(const rvec x)
{
    return gmx::invsqrt(iprod(x, x));
}
/* Vsite construction routines */

static void constr_vsite1(const rvec xi, rvec x)
{
    copy_rvec(xi, x);
}
static void constr_vsite2(const rvec xi, const rvec xj, rvec x, real a, const t_pbc* pbc)
{
    const real b = 1 - a;
    /* 1 flop */

    if (pbc)
    {
        rvec dx;
        pbc_dx_aiuc(pbc, xj, xi, dx);
        x[XX] = xi[XX] + a * dx[XX];
        x[YY] = xi[YY] + a * dx[YY];
        x[ZZ] = xi[ZZ] + a * dx[ZZ];
    }
    else
    {
        x[XX] = b * xi[XX] + a * xj[XX];
        x[YY] = b * xi[YY] + a * xj[YY];
        x[ZZ] = b * xi[ZZ] + a * xj[ZZ];
        /* 9 Flops */
    }

    /* TOTAL: 10 flops */
}
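/* For reference, the linear two-atom construction above computes
 * x = (1 - a) * xi + a * xj, i.e. the site lies on the i-j line at fraction a.
 * A minimal standalone check (illustrative values, not force-field parameters):
 *
 * \code
 * rvec xi = { 0.0, 0.0, 0.0 };
 * rvec xj = { 1.0, 0.0, 0.0 };
 * rvec x;
 * constr_vsite2(xi, xj, x, 0.25, nullptr); // x becomes (0.25, 0, 0)
 * \endcode
 */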
static void constr_vsite2FD(const rvec xi, const rvec xj, rvec x, real a, const t_pbc* pbc)
{
    rvec xij;
    pbc_rvec_sub(pbc, xj, xi, xij);
    /* 3 flops */

    const real b = a * inverseNorm(xij);
    /* 6 + 10 flops */

    x[XX] = xi[XX] + b * xij[XX];
    x[YY] = xi[YY] + b * xij[YY];
    x[ZZ] = xi[ZZ] + b * xij[ZZ];
    /* 6 Flops */

    /* TOTAL: 25 flops */
}
static void constr_vsite3(const rvec xi, const rvec xj, const rvec xk, rvec x, real a, real b, const t_pbc* pbc)
{
    const real c = 1 - a - b;
    /* 2 flops */

    if (pbc)
    {
        rvec dxj, dxk;

        pbc_dx_aiuc(pbc, xj, xi, dxj);
        pbc_dx_aiuc(pbc, xk, xi, dxk);
        x[XX] = xi[XX] + a * dxj[XX] + b * dxk[XX];
        x[YY] = xi[YY] + a * dxj[YY] + b * dxk[YY];
        x[ZZ] = xi[ZZ] + a * dxj[ZZ] + b * dxk[ZZ];
    }
    else
    {
        x[XX] = c * xi[XX] + a * xj[XX] + b * xk[XX];
        x[YY] = c * xi[YY] + a * xj[YY] + b * xk[YY];
        x[ZZ] = c * xi[ZZ] + a * xj[ZZ] + b * xk[ZZ];
        /* 15 Flops */
    }

    /* TOTAL: 17 flops */
}
static void constr_vsite3FD(const rvec xi, const rvec xj, const rvec xk, rvec x, real a, real b, const t_pbc* pbc)
{
    rvec xij, xjk, temp;
    real c;

    pbc_rvec_sub(pbc, xj, xi, xij);
    pbc_rvec_sub(pbc, xk, xj, xjk);
    /* 6 flops */

    /* temp goes from i to a point on the line jk */
    temp[XX] = xij[XX] + a * xjk[XX];
    temp[YY] = xij[YY] + a * xjk[YY];
    temp[ZZ] = xij[ZZ] + a * xjk[ZZ];
    /* 6 flops */

    c = b * inverseNorm(temp);
    /* 6 + 10 flops */

    x[XX] = xi[XX] + c * temp[XX];
    x[YY] = xi[YY] + c * temp[YY];
    x[ZZ] = xi[ZZ] + c * temp[ZZ];
    /* 6 Flops */

    /* TOTAL: 34 flops */
}
static void constr_vsite3FAD(const rvec xi, const rvec xj, const rvec xk, rvec x, real a, real b, const t_pbc* pbc)
{
    rvec xij, xjk, xp;
    real a1, b1, c1, invdij;

    pbc_rvec_sub(pbc, xj, xi, xij);
    pbc_rvec_sub(pbc, xk, xj, xjk);
    /* 6 flops */

    invdij = inverseNorm(xij);
    c1     = invdij * invdij * iprod(xij, xjk);
    xp[XX] = xjk[XX] - c1 * xij[XX];
    xp[YY] = xjk[YY] - c1 * xij[YY];
    xp[ZZ] = xjk[ZZ] - c1 * xij[ZZ];
    a1     = a * invdij;
    b1     = b * inverseNorm(xp);
    /* 45 flops */

    x[XX] = xi[XX] + a1 * xij[XX] + b1 * xp[XX];
    x[YY] = xi[YY] + a1 * xij[YY] + b1 * xp[YY];
    x[ZZ] = xi[ZZ] + a1 * xij[ZZ] + b1 * xp[ZZ];
    /* 12 Flops */

    /* TOTAL: 63 flops */
}
static void
constr_vsite3OUT(const rvec xi, const rvec xj, const rvec xk, rvec x, real a, real b, real c, const t_pbc* pbc)
{
    rvec xij, xik, temp;

    pbc_rvec_sub(pbc, xj, xi, xij);
    pbc_rvec_sub(pbc, xk, xi, xik);
    cprod(xij, xik, temp);
    /* 15 Flops */

    x[XX] = xi[XX] + a * xij[XX] + b * xik[XX] + c * temp[XX];
    x[YY] = xi[YY] + a * xij[YY] + b * xik[YY] + c * temp[YY];
    x[ZZ] = xi[ZZ] + a * xij[ZZ] + b * xik[ZZ] + c * temp[ZZ];
    /* 18 Flops */

    /* TOTAL: 33 flops */
}
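/* For reference, the out-of-plane construction above computes
 * x = xi + a*(xj - xi) + b*(xk - xi) + c*((xj - xi) x (xk - xi)), so the c term
 * moves the site out of the i,j,k plane along the plane normal. An illustrative
 * check with hand-picked values (not force-field parameters):
 *
 * \code
 * rvec xi = { 0, 0, 0 }, xj = { 1, 0, 0 }, xk = { 0, 1, 0 };
 * rvec x;
 * constr_vsite3OUT(xi, xj, xk, x, 0.5, 0.5, 1.0, nullptr);
 * // x == (0.5, 0.5, 1.0): the in-plane combination plus one unit along z
 * \endcode
 */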
static void constr_vsite4FD(const rvec   xi,
                            const rvec   xj,
                            const rvec   xk,
                            const rvec   xl,
                            rvec         x,
                            real         a,
                            real         b,
                            real         c,
                            const t_pbc* pbc)
{
    rvec xij, xjk, xjl, temp;
    real d;

    pbc_rvec_sub(pbc, xj, xi, xij);
    pbc_rvec_sub(pbc, xk, xj, xjk);
    pbc_rvec_sub(pbc, xl, xj, xjl);
    /* 9 flops */

    /* temp goes from i to a point on the plane jkl */
    temp[XX] = xij[XX] + a * xjk[XX] + b * xjl[XX];
    temp[YY] = xij[YY] + a * xjk[YY] + b * xjl[YY];
    temp[ZZ] = xij[ZZ] + a * xjk[ZZ] + b * xjl[ZZ];
    /* 12 flops */

    d = c * inverseNorm(temp);
    /* 6 + 10 flops */

    x[XX] = xi[XX] + d * temp[XX];
    x[YY] = xi[YY] + d * temp[YY];
    x[ZZ] = xi[ZZ] + d * temp[ZZ];
    /* 6 Flops */

    /* TOTAL: 43 flops */
}
static void constr_vsite4FDN(const rvec   xi,
                             const rvec   xj,
                             const rvec   xk,
                             const rvec   xl,
                             rvec         x,
                             real         a,
                             real         b,
                             real         c,
                             const t_pbc* pbc)
{
    rvec xij, xik, xil, ra, rb, rja, rjb, rm;
    real d;

    pbc_rvec_sub(pbc, xj, xi, xij);
    pbc_rvec_sub(pbc, xk, xi, xik);
    pbc_rvec_sub(pbc, xl, xi, xil);
    /* 9 flops */

    ra[XX] = a * xik[XX];
    ra[YY] = a * xik[YY];
    ra[ZZ] = a * xik[ZZ];

    rb[XX] = b * xil[XX];
    rb[YY] = b * xil[YY];
    rb[ZZ] = b * xil[ZZ];

    /* 6 flops */

    rvec_sub(ra, xij, rja);
    rvec_sub(rb, xij, rjb);
    /* 6 flops */

    cprod(rja, rjb, rm);
    /* 9 flops */

    d = c * inverseNorm(rm);
    /* 5 + 10 + 1 flops */

    x[XX] = xi[XX] + d * rm[XX];
    x[YY] = xi[YY] + d * rm[YY];
    x[ZZ] = xi[ZZ] + d * rm[ZZ];
    /* 6 Flops */

    /* TOTAL: 47 flops */
}
static int constr_vsiten(const t_iatom* ia, ArrayRef<const t_iparams> ip, ArrayRef<RVec> x, const t_pbc* pbc)
{
    rvec x1, dx;
    dvec dsum;
    real a;

    int n3 = 3 * ip[ia[0]].vsiten.n;
    int av = ia[1];
    int ai = ia[2];
    copy_rvec(x[ai], x1);
    clear_dvec(dsum);
    for (int i = 3; i < n3; i += 3)
    {
        ai = ia[i + 2];
        a  = ip[ia[i]].vsiten.a;
        if (pbc)
        {
            pbc_dx_aiuc(pbc, x[ai], x1, dx);
        }
        else
        {
            rvec_sub(x[ai], x1, dx);
        }
        dsum[XX] += a * dx[XX];
        dsum[YY] += a * dx[YY];
        dsum[ZZ] += a * dx[ZZ];
        /* 9 Flops */
    }

    x[av][XX] = x1[XX] + dsum[XX];
    x[av][YY] = x1[YY] + dsum[YY];
    x[av][ZZ] = x1[ZZ] + dsum[ZZ];

    return n3;
}
//! PBC modes for vsite construction and spreading
enum class PbcMode
{
    all, //!< Apply normal, simple PBC for all vsites
    none //!< No PBC treatment needed
};

/*! \brief Returns the PBC mode based on the system PBC and vsite properties
 *
 * \param[in] pbcPtr A pointer to a PBC struct or nullptr when no PBC treatment is required
 */
static PbcMode getPbcMode(const t_pbc* pbcPtr)
{
    if (pbcPtr == nullptr)
    {
        return PbcMode::none;
    }
    else
    {
        return PbcMode::all;
    }
}
/*! \brief Executes the vsite construction task for a single thread
 *
 * \param[in,out] x        Coordinates to construct vsites for
 * \param[in]     dt       Time step, needed when v is not empty
 * \param[in,out] v        When not empty, velocities are generated for virtual sites
 * \param[in]     ip       Interaction parameters for all interactions, only vsite parameters are used
 * \param[in]     ilist    The interaction lists, only vsites are used
 * \param[in]     pbc_null PBC struct, used for PBC distance calculations when !=nullptr
 */
static void construct_vsites_thread(ArrayRef<RVec>                  x,
                                    const real                      dt,
                                    ArrayRef<RVec>                  v,
                                    ArrayRef<const t_iparams>       ip,
                                    ArrayRef<const InteractionList> ilist,
                                    const t_pbc*                    pbc_null)
{
    real inv_dt;
    if (!v.empty())
    {
        inv_dt = 1.0 / dt;
    }
    else
    {
        inv_dt = 1.0;
    }

    const PbcMode pbcMode = getPbcMode(pbc_null);
    /* We need another pbc pointer, as with charge groups we switch per vsite */
    const t_pbc* pbc_null2 = pbc_null;

    for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
    {
        if (ilist[ftype].empty())
        {
            continue;
        }

        const int nra = interaction_function[ftype].nratoms;
        int       inc = 1 + nra;
        const int nr  = ilist[ftype].size();

        const t_iatom* ia = ilist[ftype].iatoms.data();

        for (int i = 0; i < nr;)
        {
            const int tp = ia[0];
            /* The vsite and constructing atoms */
            const int avsite = ia[1];
            const int ai     = ia[2];
            /* Constants for constructing vsites */
            const real a1 = ip[tp].vsite.a;
            real       b1, c1;
            int        aj, ak, al;
            /* Copy the old position */
            rvec xv;
            copy_rvec(x[avsite], xv);

            /* Construct the vsite depending on type */
            switch (ftype)
            {
                case F_VSITE1: constr_vsite1(x[ai], x[avsite]); break;
                case F_VSITE2:
                    aj = ia[3];
                    constr_vsite2(x[ai], x[aj], x[avsite], a1, pbc_null2);
                    break;
                case F_VSITE2FD:
                    aj = ia[3];
                    constr_vsite2FD(x[ai], x[aj], x[avsite], a1, pbc_null2);
                    break;
                case F_VSITE3:
                    aj = ia[3];
                    ak = ia[4];
                    b1 = ip[tp].vsite.b;
                    constr_vsite3(x[ai], x[aj], x[ak], x[avsite], a1, b1, pbc_null2);
                    break;
                case F_VSITE3FD:
                    aj = ia[3];
                    ak = ia[4];
                    b1 = ip[tp].vsite.b;
                    constr_vsite3FD(x[ai], x[aj], x[ak], x[avsite], a1, b1, pbc_null2);
                    break;
                case F_VSITE3FAD:
                    aj = ia[3];
                    ak = ia[4];
                    b1 = ip[tp].vsite.b;
                    constr_vsite3FAD(x[ai], x[aj], x[ak], x[avsite], a1, b1, pbc_null2);
                    break;
                case F_VSITE3OUT:
                    aj = ia[3];
                    ak = ia[4];
                    b1 = ip[tp].vsite.b;
                    c1 = ip[tp].vsite.c;
                    constr_vsite3OUT(x[ai], x[aj], x[ak], x[avsite], a1, b1, c1, pbc_null2);
                    break;
                case F_VSITE4FD:
                    aj = ia[3];
                    ak = ia[4];
                    al = ia[5];
                    b1 = ip[tp].vsite.b;
                    c1 = ip[tp].vsite.c;
                    constr_vsite4FD(x[ai], x[aj], x[ak], x[al], x[avsite], a1, b1, c1, pbc_null2);
                    break;
                case F_VSITE4FDN:
                    aj = ia[3];
                    ak = ia[4];
                    al = ia[5];
                    b1 = ip[tp].vsite.b;
                    c1 = ip[tp].vsite.c;
                    constr_vsite4FDN(x[ai], x[aj], x[ak], x[al], x[avsite], a1, b1, c1, pbc_null2);
                    break;
                case F_VSITEN: inc = constr_vsiten(ia, ip, x, pbc_null2); break;
                default:
                    gmx_fatal(FARGS, "No such vsite type %d in %s, line %d", ftype, __FILE__, __LINE__);
            }

            if (pbcMode == PbcMode::all)
            {
                /* Keep the vsite in the same periodic image as before */
                rvec dx;
                int  ishift = pbc_dx_aiuc(pbc_null, x[avsite], xv, dx);
                if (ishift != CENTRAL)
                {
                    rvec_add(xv, dx, x[avsite]);
                }
            }
            if (!v.empty())
            {
                /* Calculate velocity of vsite... */
                rvec vv;
                rvec_sub(x[avsite], xv, vv);
                svmul(inv_dt, vv, v[avsite]);
            }

            /* Increment loop variables */
            i += inc;
            ia += inc;
        }
    }
}
/*! \brief Dispatch the vsite construction tasks for all threads
 *
 * \param[in]     threadingInfo Used to divide work over threads when != nullptr
 * \param[in,out] x             Coordinates to construct vsites for
 * \param[in]     dt            Time step, needed when v is not empty
 * \param[in,out] v             When not empty, velocities are generated for virtual sites
 * \param[in]     ip            Interaction parameters for all interactions, only vsite parameters are used
 * \param[in]     ilist         The interaction lists, only vsites are used
 * \param[in]     domainInfo    Information about PBC and DD
 * \param[in]     box           Used for PBC when PBC is set in domainInfo
 */
static void construct_vsites(const ThreadingInfo*            threadingInfo,
                             ArrayRef<RVec>                  x,
                             const real                      dt,
                             ArrayRef<RVec>                  v,
                             ArrayRef<const t_iparams>       ip,
                             ArrayRef<const InteractionList> ilist,
                             const DomainInfo&               domainInfo,
                             const matrix                    box)
{
    const bool useDomdec = domainInfo.useDomdec();

    t_pbc pbc, *pbc_null;

    /* We only need to do pbc when we have inter update-group vsites.
     * Note that with domain decomposition we do not need to apply PBC here
     * when we have at least 3 domains along each dimension. Currently we
     * do not optimize this case.
     */
    if (domainInfo.pbcType_ != PbcType::No && domainInfo.useMolPbc_)
    {
        /* This is wasting some CPU time as we now do this multiple times
         * per MD step.
         */
        ivec null_ivec;
        clear_ivec(null_ivec);
        pbc_null = set_pbc_dd(&pbc, domainInfo.pbcType_,
                              useDomdec ? domainInfo.domdec_->numCells : null_ivec, FALSE, box);
    }
    else
    {
        pbc_null = nullptr;
    }

    if (useDomdec)
    {
        dd_move_x_vsites(*domainInfo.domdec_, box, as_rvec_array(x.data()));
    }

    if (threadingInfo == nullptr || threadingInfo->numThreads() == 1)
    {
        construct_vsites_thread(x, dt, v, ip, ilist, pbc_null);
    }
    else
    {
#pragma omp parallel num_threads(threadingInfo->numThreads())
        {
            try
            {
                const int          th    = gmx_omp_get_thread_num();
                const VsiteThread& tData = threadingInfo->threadData(th);

                GMX_ASSERT(tData.rangeStart >= 0,
                           "The thread data should be initialized before calling construct_vsites");

                construct_vsites_thread(x, dt, v, ip, tData.ilist, pbc_null);
                if (tData.useInterdependentTask)
                {
                    /* Here we don't need a barrier (unlike the spreading),
                     * since both tasks only construct vsites from particles,
                     * or local vsites, not from non-local vsites.
                     */
                    construct_vsites_thread(x, dt, v, ip, tData.idTask.ilist, pbc_null);
                }
            }
            GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
        }
        /* Now we can construct the vsites that might depend on other vsites */
        construct_vsites_thread(x, dt, v, ip, threadingInfo->threadDataNonLocalDependent().ilist, pbc_null);
    }
}
void VirtualSitesHandler::Impl::construct(ArrayRef<RVec> x, real dt, ArrayRef<RVec> v, const matrix box) const
{
    construct_vsites(&threadingInfo_, x, dt, v, iparams_, ilists_, domainInfo_, box);
}
void VirtualSitesHandler::construct(ArrayRef<RVec> x, real dt, ArrayRef<RVec> v, const matrix box) const
{
    impl_->construct(x, dt, v, box);
}
void constructVirtualSites(ArrayRef<RVec> x, ArrayRef<const t_iparams> ip, ArrayRef<const InteractionList> ilist)
{
    const DomainInfo domainInfo;
    construct_vsites(nullptr, x, 0, {}, ip, ilist, domainInfo, nullptr);
}
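/* A minimal usage sketch for the standalone function above, mirroring the
 * per-molecule call in constructVirtualSitesGlobal() below; 'molt' and
 * 'ffparams' are assumed to be a molecule type and the topology parameters:
 *
 * \code
 * std::vector<gmx::RVec> coords(molt.atoms.nr); // filled with atom positions
 * constructVirtualSites(coords, ffparams.iparams, molt.ilist);
 * \endcode
 */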
/* Force spreading routines */

static void spread_vsite1(const t_iatom ia[], ArrayRef<RVec> f)
{
    const int av = ia[1];
    const int ai = ia[2];

    f[ai] += f[av];
}
template<VirialHandling virialHandling>
static void spread_vsite2(const t_iatom        ia[],
                          real                 a,
                          ArrayRef<const RVec> x,
                          ArrayRef<RVec>       f,
                          ArrayRef<RVec>       fshift,
                          const t_pbc*         pbc)
{
    rvec fi, fj, dx;

    const int av = ia[1];
    const int ai = ia[2];
    const int aj = ia[3];

    svmul(1 - a, f[av], fi);
    svmul(a, f[av], fj);
    /* 7 flop */

    rvec_inc(f[ai], fi);
    rvec_inc(f[aj], fj);
    /* 6 Flops */

    if (virialHandling == VirialHandling::Pbc)
    {
        int siv, sij;
        if (pbc)
        {
            siv = pbc_dx_aiuc(pbc, x[ai], x[av], dx);
            sij = pbc_dx_aiuc(pbc, x[ai], x[aj], dx);
        }
        else
        {
            siv = CENTRAL;
            sij = CENTRAL;
        }

        if (siv != CENTRAL || sij != CENTRAL)
        {
            rvec_inc(fshift[siv], f[av]);
            rvec_dec(fshift[CENTRAL], fi);
            rvec_dec(fshift[sij], fj);
        }
    }

    /* TOTAL: 13 flops */
}
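/* A minimal sketch of the chain rule applied above: for the linear construction
 * x_v = (1-a)*x_i + a*x_j, the derivatives dx_v/dx_i = (1-a)*I and
 * dx_v/dx_j = a*I distribute the vsite force fv without changing its sum
 * (illustrative only):
 *
 * \code
 * rvec fi, fj;
 * svmul(1 - a, fv, fi); // share of the vsite force going to atom i
 * svmul(a, fv, fj);     // share going to atom j; fi + fj == fv
 * \endcode
 */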
void constructVirtualSitesGlobal(const gmx_mtop_t& mtop, gmx::ArrayRef<gmx::RVec> x)
{
    GMX_ASSERT(x.ssize() >= mtop.natoms, "x should contain the whole system");
    GMX_ASSERT(!mtop.moleculeBlockIndices.empty(),
               "molblock indices are needed in constructVsitesGlobal");

    for (size_t mb = 0; mb < mtop.molblock.size(); mb++)
    {
        const gmx_molblock_t& molb = mtop.molblock[mb];
        const gmx_moltype_t&  molt = mtop.moltype[molb.type];
        if (vsiteIlistNrCount(molt.ilist) > 0)
        {
            int atomOffset = mtop.moleculeBlockIndices[mb].globalAtomStart;
            for (int mol = 0; mol < molb.nmol; mol++)
            {
                constructVirtualSites(x.subArray(atomOffset, molt.atoms.nr),
                                      mtop.ffparams.iparams, molt.ilist);
                atomOffset += molt.atoms.nr;
            }
        }
    }
}
template<VirialHandling virialHandling>
static void spread_vsite2FD(const t_iatom        ia[],
                            real                 a,
                            ArrayRef<const RVec> x,
                            ArrayRef<RVec>       f,
                            ArrayRef<RVec>       fshift,
                            matrix               dxdf,
                            const t_pbc*         pbc)
{
    const int av = ia[1];
    const int ai = ia[2];
    const int aj = ia[3];
    rvec      fv;
    copy_rvec(f[av], fv);

    rvec xij;
    int  sji = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
    /* 6 flops */

    const real invDistance = inverseNorm(xij);
    const real b           = a * invDistance;
    /* 4 + ?10? flops */

    const real fproj = iprod(xij, fv) * invDistance * invDistance;

    rvec fj;
    fj[XX] = b * (fv[XX] - fproj * xij[XX]);
    fj[YY] = b * (fv[YY] - fproj * xij[YY]);
    fj[ZZ] = b * (fv[ZZ] - fproj * xij[ZZ]);
    /* 9 flops */

    /* b is already calculated in constr_vsite2FD
       storing b somewhere will save flops. */

    f[ai][XX] += fv[XX] - fj[XX];
    f[ai][YY] += fv[YY] - fj[YY];
    f[ai][ZZ] += fv[ZZ] - fj[ZZ];
    f[aj][XX] += fj[XX];
    f[aj][YY] += fj[YY];
    f[aj][ZZ] += fj[ZZ];
    /* 9 Flops */

    if (virialHandling == VirialHandling::Pbc)
    {
        int svi;
        if (pbc)
        {
            rvec xvi;
            svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
        }
        else
        {
            svi = CENTRAL;
        }

        if (svi != CENTRAL || sji != CENTRAL)
        {
            rvec_dec(fshift[svi], fv);
            fshift[CENTRAL][XX] += fv[XX] - fj[XX];
            fshift[CENTRAL][YY] += fv[YY] - fj[YY];
            fshift[CENTRAL][ZZ] += fv[ZZ] - fj[ZZ];
            fshift[sji][XX] += fj[XX];
            fshift[sji][YY] += fj[YY];
            fshift[sji][ZZ] += fj[ZZ];
        }
    }

    if (virialHandling == VirialHandling::NonLinear)
    {
        /* Under this condition, the virial for the current forces is not
         * calculated from the redistributed forces. This means that
         * the effect of non-linear virtual site constructions on the virial
         * needs to be added separately. This contribution can be calculated
         * in many ways, but the simplest and cheapest way is to use
         * the first constructing atom ai as a reference position in space:
         * subtract (xv-xi)*fv and add (xj-xi)*fj.
         */
        rvec xiv;

        pbc_rvec_sub(pbc, x[av], x[ai], xiv);

        for (int i = 0; i < DIM; i++)
        {
            for (int j = 0; j < DIM; j++)
            {
                /* As xix is a linear combination of j and k, use that here */
                dxdf[i][j] += -xiv[i] * fv[j] + xij[i] * fj[j];
            }
        }
    }

    /* TOTAL: 38 flops */
}
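/* A sketch of how the dxdf buffer accumulated above is folded into the virial;
 * this mirrors the loop in VirtualSitesHandler::Impl::spreadForces() further
 * below, where each thread's dxdf contributes as -0.5 * dxdf:
 *
 * \code
 * for (int i = 0; i < DIM; i++)
 * {
 *     for (int j = 0; j < DIM; j++)
 *     {
 *         virial[i][j] += -0.5 * dxdf[i][j];
 *     }
 * }
 * \endcode
 */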
template<VirialHandling virialHandling>
static void spread_vsite3(const t_iatom        ia[],
                          real                 a,
                          real                 b,
                          ArrayRef<const RVec> x,
                          ArrayRef<RVec>       f,
                          ArrayRef<RVec>       fshift,
                          const t_pbc*         pbc)
{
    rvec fi, fj, fk, dx;

    const int av = ia[1];
    const int ai = ia[2];
    const int aj = ia[3];
    const int ak = ia[4];

    svmul(1 - a - b, f[av], fi);
    svmul(a, f[av], fj);
    svmul(b, f[av], fk);
    /* 11 flops */

    rvec_inc(f[ai], fi);
    rvec_inc(f[aj], fj);
    rvec_inc(f[ak], fk);
    /* 9 Flops */

    if (virialHandling == VirialHandling::Pbc)
    {
        int siv, sij, sik;
        if (pbc)
        {
            siv = pbc_dx_aiuc(pbc, x[ai], x[av], dx);
            sij = pbc_dx_aiuc(pbc, x[ai], x[aj], dx);
            sik = pbc_dx_aiuc(pbc, x[ai], x[ak], dx);
        }
        else
        {
            siv = CENTRAL;
            sij = CENTRAL;
            sik = CENTRAL;
        }

        if (siv != CENTRAL || sij != CENTRAL || sik != CENTRAL)
        {
            rvec_inc(fshift[siv], f[av]);
            rvec_dec(fshift[CENTRAL], fi);
            rvec_dec(fshift[sij], fj);
            rvec_dec(fshift[sik], fk);
        }
    }

    /* TOTAL: 20 flops */
}
template<VirialHandling virialHandling>
static void spread_vsite3FD(const t_iatom        ia[],
                            real                 a,
                            real                 b,
                            ArrayRef<const RVec> x,
                            ArrayRef<RVec>       f,
                            ArrayRef<RVec>       fshift,
                            matrix               dxdf,
                            const t_pbc*         pbc)
{
    real    fproj;
    rvec    xvi, xij, xjk, xix, fv, temp;
    t_iatom av, ai, aj, ak;
    int     sji, skj;

    av = ia[1];
    ai = ia[2];
    aj = ia[3];
    ak = ia[4];
    copy_rvec(f[av], fv);

    sji = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
    skj = pbc_rvec_sub(pbc, x[ak], x[aj], xjk);
    /* 6 flops */

    /* xix goes from i to point x on the line jk */
    xix[XX] = xij[XX] + a * xjk[XX];
    xix[YY] = xij[YY] + a * xjk[YY];
    xix[ZZ] = xij[ZZ] + a * xjk[ZZ];
    /* 6 flops */

    const real invDistance = inverseNorm(xix);
    const real c           = b * invDistance;
    /* 4 + ?10? flops */

    fproj = iprod(xix, fv) * invDistance * invDistance; /* = (xix . f)/(xix . xix) */

    temp[XX] = c * (fv[XX] - fproj * xix[XX]);
    temp[YY] = c * (fv[YY] - fproj * xix[YY]);
    temp[ZZ] = c * (fv[ZZ] - fproj * xix[ZZ]);
    /* 16 flops */

    /* c is already calculated in constr_vsite3FD
       storing c somewhere will save 26 flops! */

    const real a1 = 1 - a;
    f[ai][XX] += fv[XX] - temp[XX];
    f[ai][YY] += fv[YY] - temp[YY];
    f[ai][ZZ] += fv[ZZ] - temp[ZZ];
    f[aj][XX] += a1 * temp[XX];
    f[aj][YY] += a1 * temp[YY];
    f[aj][ZZ] += a1 * temp[ZZ];
    f[ak][XX] += a * temp[XX];
    f[ak][YY] += a * temp[YY];
    f[ak][ZZ] += a * temp[ZZ];
    /* 19 Flops */

    if (virialHandling == VirialHandling::Pbc)
    {
        int svi;
        if (pbc)
        {
            svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
        }
        else
        {
            svi = CENTRAL;
        }

        if (svi != CENTRAL || sji != CENTRAL || skj != CENTRAL)
        {
            rvec_dec(fshift[svi], fv);
            fshift[CENTRAL][XX] += fv[XX] - (1 + a) * temp[XX];
            fshift[CENTRAL][YY] += fv[YY] - (1 + a) * temp[YY];
            fshift[CENTRAL][ZZ] += fv[ZZ] - (1 + a) * temp[ZZ];
            fshift[sji][XX] += temp[XX];
            fshift[sji][YY] += temp[YY];
            fshift[sji][ZZ] += temp[ZZ];
            fshift[skj][XX] += a * temp[XX];
            fshift[skj][YY] += a * temp[YY];
            fshift[skj][ZZ] += a * temp[ZZ];
        }
    }

    if (virialHandling == VirialHandling::NonLinear)
    {
        /* Under this condition, the virial for the current forces is not
         * calculated from the redistributed forces. This means that
         * the effect of non-linear virtual site constructions on the virial
         * needs to be added separately. This contribution can be calculated
         * in many ways, but the simplest and cheapest way is to use
         * the first constructing atom ai as a reference position in space:
         * subtract (xv-xi)*fv and add (xj-xi)*fj + (xk-xi)*fk.
         */
        rvec xiv;

        pbc_rvec_sub(pbc, x[av], x[ai], xiv);

        for (int i = 0; i < DIM; i++)
        {
            for (int j = 0; j < DIM; j++)
            {
                /* As xix is a linear combination of j and k, use that here */
                dxdf[i][j] += -xiv[i] * fv[j] + xix[i] * temp[j];
            }
        }
    }

    /* TOTAL: 61 flops */
}
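/* For reference: fproj above equals (xix . fv)/(xix . xix), so (fv - fproj*xix)
 * is the component of the vsite force perpendicular to the construction vector
 * xix. Only that component is scaled and spread to j and k, because stretching
 * xix does not move a fixed-distance (FD) vsite.
 */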
template<VirialHandling virialHandling>
static void spread_vsite3FAD(const t_iatom        ia[],
                             real                 a,
                             real                 b,
                             ArrayRef<const RVec> x,
                             ArrayRef<RVec>       f,
                             ArrayRef<RVec>       fshift,
                             matrix               dxdf,
                             const t_pbc*         pbc)
{
    rvec    xvi, xij, xjk, xperp, Fpij, Fppp, fv, f1, f2, f3;
    real    a1, b1, c1, c2, invdij, invdij2, invdp, fproj;
    t_iatom av, ai, aj, ak;
    int     sji, skj;

    av = ia[1];
    ai = ia[2];
    aj = ia[3];
    ak = ia[4];
    copy_rvec(f[ia[1]], fv);

    sji = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
    skj = pbc_rvec_sub(pbc, x[ak], x[aj], xjk);
    /* 6 flops */

    invdij    = inverseNorm(xij);
    invdij2   = invdij * invdij;
    c1        = iprod(xij, xjk) * invdij2;
    xperp[XX] = xjk[XX] - c1 * xij[XX];
    xperp[YY] = xjk[YY] - c1 * xij[YY];
    xperp[ZZ] = xjk[ZZ] - c1 * xij[ZZ];
    /* xperp in plane ijk, perp. to ij */
    invdp = inverseNorm(xperp);
    a1    = a * invdij;
    b1    = b * invdp;
    /* 45 flops */

    /* a1, b1 and c1 are already calculated in constr_vsite3FAD
       storing them somewhere will save 45 flops! */

    fproj = iprod(xij, fv) * invdij2;
    svmul(fproj, xij, Fpij);                              /* proj. f on xij */
    svmul(iprod(xperp, fv) * invdp * invdp, xperp, Fppp); /* proj. f on xperp */
    svmul(b1 * fproj, xperp, f3);
    /* 23 flops */

    rvec_sub(fv, Fpij, f1); /* f1 = f - Fpij */
    rvec_sub(f1, Fppp, f2); /* f2 = f - Fpij - Fppp */
    for (int d = 0; d < DIM; d++)
    {
        f1[d] *= a1;
        f2[d] *= b1;
    }
    /* 12 flops */

    c2 = 1 + c1;
    f[ai][XX] += fv[XX] - f1[XX] + c1 * f2[XX] + f3[XX];
    f[ai][YY] += fv[YY] - f1[YY] + c1 * f2[YY] + f3[YY];
    f[ai][ZZ] += fv[ZZ] - f1[ZZ] + c1 * f2[ZZ] + f3[ZZ];
    f[aj][XX] += f1[XX] - c2 * f2[XX] - f3[XX];
    f[aj][YY] += f1[YY] - c2 * f2[YY] - f3[YY];
    f[aj][ZZ] += f1[ZZ] - c2 * f2[ZZ] - f3[ZZ];
    f[ak][XX] += f2[XX];
    f[ak][YY] += f2[YY];
    f[ak][ZZ] += f2[ZZ];
    /* 30 Flops */

    if (virialHandling == VirialHandling::Pbc)
    {
        int svi;

        if (pbc)
        {
            svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
        }
        else
        {
            svi = CENTRAL;
        }

        if (svi != CENTRAL || sji != CENTRAL || skj != CENTRAL)
        {
            rvec_dec(fshift[svi], fv);
            fshift[CENTRAL][XX] += fv[XX] - f1[XX] - (1 - c1) * f2[XX] + f3[XX];
            fshift[CENTRAL][YY] += fv[YY] - f1[YY] - (1 - c1) * f2[YY] + f3[YY];
            fshift[CENTRAL][ZZ] += fv[ZZ] - f1[ZZ] - (1 - c1) * f2[ZZ] + f3[ZZ];
            fshift[sji][XX] += f1[XX] - c1 * f2[XX] - f3[XX];
            fshift[sji][YY] += f1[YY] - c1 * f2[YY] - f3[YY];
            fshift[sji][ZZ] += f1[ZZ] - c1 * f2[ZZ] - f3[ZZ];
            fshift[skj][XX] += f2[XX];
            fshift[skj][YY] += f2[YY];
            fshift[skj][ZZ] += f2[ZZ];
        }
    }

    if (virialHandling == VirialHandling::NonLinear)
    {
        rvec xiv;

        pbc_rvec_sub(pbc, x[av], x[ai], xiv);

        for (int i = 0; i < DIM; i++)
        {
            for (int j = 0; j < DIM; j++)
            {
                /* Note that xik=xij+xjk, so we have to add xij*f2 */
                dxdf[i][j] += -xiv[i] * fv[j] + xij[i] * (f1[j] + (1 - c2) * f2[j] - f3[j])
                              + xjk[i] * f2[j];
            }
        }
    }

    /* TOTAL: 113 flops */
}
template<VirialHandling virialHandling>
static void spread_vsite3OUT(const t_iatom        ia[],
                             real                 a,
                             real                 b,
                             real                 c,
                             ArrayRef<const RVec> x,
                             ArrayRef<RVec>       f,
                             ArrayRef<RVec>       fshift,
                             matrix               dxdf,
                             const t_pbc*         pbc)
{
    rvec xvi, xij, xik, fv, fj, fk;

    const int av = ia[1];
    const int ai = ia[2];
    const int aj = ia[3];
    const int ak = ia[4];

    int sji = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
    int ski = pbc_rvec_sub(pbc, x[ak], x[ai], xik);
    /* 6 Flops */

    copy_rvec(f[av], fv);

    const real cfx = c * fv[XX];
    const real cfy = c * fv[YY];
    const real cfz = c * fv[ZZ];
    /* 3 Flops */

    fj[XX] = a * fv[XX] - xik[ZZ] * cfy + xik[YY] * cfz;
    fj[YY] = xik[ZZ] * cfx + a * fv[YY] - xik[XX] * cfz;
    fj[ZZ] = -xik[YY] * cfx + xik[XX] * cfy + a * fv[ZZ];

    fk[XX] = b * fv[XX] + xij[ZZ] * cfy - xij[YY] * cfz;
    fk[YY] = -xij[ZZ] * cfx + b * fv[YY] + xij[XX] * cfz;
    fk[ZZ] = xij[YY] * cfx - xij[XX] * cfy + b * fv[ZZ];
    /* 30 Flops */

    f[ai][XX] += fv[XX] - fj[XX] - fk[XX];
    f[ai][YY] += fv[YY] - fj[YY] - fk[YY];
    f[ai][ZZ] += fv[ZZ] - fj[ZZ] - fk[ZZ];
    rvec_inc(f[aj], fj);
    rvec_inc(f[ak], fk);
    /* 15 Flops */

    if (virialHandling == VirialHandling::Pbc)
    {
        int svi;
        if (pbc)
        {
            svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
        }
        else
        {
            svi = CENTRAL;
        }

        if (svi != CENTRAL || sji != CENTRAL || ski != CENTRAL)
        {
            rvec_dec(fshift[svi], fv);
            fshift[CENTRAL][XX] += fv[XX] - fj[XX] - fk[XX];
            fshift[CENTRAL][YY] += fv[YY] - fj[YY] - fk[YY];
            fshift[CENTRAL][ZZ] += fv[ZZ] - fj[ZZ] - fk[ZZ];
            rvec_inc(fshift[sji], fj);
            rvec_inc(fshift[ski], fk);
        }
    }

    if (virialHandling == VirialHandling::NonLinear)
    {
        rvec xiv;

        pbc_rvec_sub(pbc, x[av], x[ai], xiv);

        for (int i = 0; i < DIM; i++)
        {
            for (int j = 0; j < DIM; j++)
            {
                dxdf[i][j] += -xiv[i] * fv[j] + xij[i] * fj[j] + xik[i] * fk[j];
            }
        }
    }

    /* TOTAL: 54 flops */
}
template<VirialHandling virialHandling>
static void spread_vsite4FD(const t_iatom        ia[],
                            real                 a,
                            real                 b,
                            real                 c,
                            ArrayRef<const RVec> x,
                            ArrayRef<RVec>       f,
                            ArrayRef<RVec>       fshift,
                            matrix               dxdf,
                            const t_pbc*         pbc)
{
    real fproj, a1;
    rvec xvi, xij, xjk, xjl, xix, fv, temp;
    int  av, ai, aj, ak, al;
    int  sji, skj, slj, m;

    av = ia[1];
    ai = ia[2];
    aj = ia[3];
    ak = ia[4];
    al = ia[5];

    sji = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
    skj = pbc_rvec_sub(pbc, x[ak], x[aj], xjk);
    slj = pbc_rvec_sub(pbc, x[al], x[aj], xjl);
    /* 9 flops */

    /* xix goes from i to point x on the plane jkl */
    for (m = 0; m < DIM; m++)
    {
        xix[m] = xij[m] + a * xjk[m] + b * xjl[m];
    }
    /* 12 flops */

    const real invDistance = inverseNorm(xix);
    const real d           = c * invDistance;
    /* 4 + ?10? flops */

    copy_rvec(f[av], fv);

    fproj = iprod(xix, fv) * invDistance * invDistance; /* = (xix . f)/(xix . xix) */

    for (m = 0; m < DIM; m++)
    {
        temp[m] = d * (fv[m] - fproj * xix[m]);
    }
    /* 16 flops */

    /* c is already calculated in constr_vsite4FD
       storing c somewhere will save 35 flops! */

    a1 = 1 - a - b;
    for (m = 0; m < DIM; m++)
    {
        f[ai][m] += fv[m] - temp[m];
        f[aj][m] += a1 * temp[m];
        f[ak][m] += a * temp[m];
        f[al][m] += b * temp[m];
    }
    /* 26 Flops */

    if (virialHandling == VirialHandling::Pbc)
    {
        int svi;
        if (pbc)
        {
            svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
        }
        else
        {
            svi = CENTRAL;
        }

        if (svi != CENTRAL || sji != CENTRAL || skj != CENTRAL || slj != CENTRAL)
        {
            rvec_dec(fshift[svi], fv);
            for (m = 0; m < DIM; m++)
            {
                fshift[CENTRAL][m] += fv[m] - (1 + a + b) * temp[m];
                fshift[sji][m] += temp[m];
                fshift[skj][m] += a * temp[m];
                fshift[slj][m] += b * temp[m];
            }
        }
    }

    if (virialHandling == VirialHandling::NonLinear)
    {
        rvec xiv;
        int  i, j;

        pbc_rvec_sub(pbc, x[av], x[ai], xiv);

        for (i = 0; i < DIM; i++)
        {
            for (j = 0; j < DIM; j++)
            {
                dxdf[i][j] += -xiv[i] * fv[j] + xix[i] * temp[j];
            }
        }
    }

    /* TOTAL: 77 flops */
}
template<VirialHandling virialHandling>
static void spread_vsite4FDN(const t_iatom        ia[],
                             real                 a,
                             real                 b,
                             real                 c,
                             ArrayRef<const RVec> x,
                             ArrayRef<RVec>       f,
                             ArrayRef<RVec>       fshift,
                             matrix               dxdf,
                             const t_pbc*         pbc)
{
    rvec xvi, xij, xik, xil, ra, rb, rja, rjb, rab, rm, rt;
    rvec fv, fj, fk, fl;
    real invrm, denom;
    real cfx, cfy, cfz;
    int  av, ai, aj, ak, al;
    int  sij, sik, sil;

    /* DEBUG: check atom indices */
    av = ia[1];
    ai = ia[2];
    aj = ia[3];
    ak = ia[4];
    al = ia[5];

    copy_rvec(f[av], fv);

    sij = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
    sik = pbc_rvec_sub(pbc, x[ak], x[ai], xik);
    sil = pbc_rvec_sub(pbc, x[al], x[ai], xil);
    /* 9 flops */

    ra[XX] = a * xik[XX];
    ra[YY] = a * xik[YY];
    ra[ZZ] = a * xik[ZZ];

    rb[XX] = b * xil[XX];
    rb[YY] = b * xil[YY];
    rb[ZZ] = b * xil[ZZ];

    /* 6 flops */

    rvec_sub(ra, xij, rja);
    rvec_sub(rb, xij, rjb);
    rvec_sub(rb, ra, rab);
    /* 9 flops */

    cprod(rja, rjb, rm);
    /* 9 flops */

    invrm = inverseNorm(rm);
    denom = invrm * invrm;
    /* 5 + 5 + 2 flops */

    cfx = c * invrm * fv[XX];
    cfy = c * invrm * fv[YY];
    cfz = c * invrm * fv[ZZ];
    /* 6 Flops */

    cprod(rm, rab, rt);
    /* 9 flops */

    rt[XX] *= denom;
    rt[YY] *= denom;
    rt[ZZ] *= denom;
    /* 3 flops */

    fj[XX] = (-rm[XX] * rt[XX]) * cfx + (rab[ZZ] - rm[YY] * rt[XX]) * cfy
             + (-rab[YY] - rm[ZZ] * rt[XX]) * cfz;
    fj[YY] = (-rab[ZZ] - rm[XX] * rt[YY]) * cfx + (-rm[YY] * rt[YY]) * cfy
             + (rab[XX] - rm[ZZ] * rt[YY]) * cfz;
    fj[ZZ] = (rab[YY] - rm[XX] * rt[ZZ]) * cfx + (-rab[XX] - rm[YY] * rt[ZZ]) * cfy
             + (-rm[ZZ] * rt[ZZ]) * cfz;
    /* 30 flops */

    cprod(rjb, rm, rt);
    /* 9 flops */

    rt[XX] *= denom * a;
    rt[YY] *= denom * a;
    rt[ZZ] *= denom * a;
    /* 3 flops */

    fk[XX] = (-rm[XX] * rt[XX]) * cfx + (-a * rjb[ZZ] - rm[YY] * rt[XX]) * cfy
             + (a * rjb[YY] - rm[ZZ] * rt[XX]) * cfz;
    fk[YY] = (a * rjb[ZZ] - rm[XX] * rt[YY]) * cfx + (-rm[YY] * rt[YY]) * cfy
             + (-a * rjb[XX] - rm[ZZ] * rt[YY]) * cfz;
    fk[ZZ] = (-a * rjb[YY] - rm[XX] * rt[ZZ]) * cfx + (a * rjb[XX] - rm[YY] * rt[ZZ]) * cfy
             + (-rm[ZZ] * rt[ZZ]) * cfz;
    /* 36 flops */

    cprod(rm, rja, rt);
    /* 9 flops */

    rt[XX] *= denom * b;
    rt[YY] *= denom * b;
    rt[ZZ] *= denom * b;
    /* 3 flops */

    fl[XX] = (-rm[XX] * rt[XX]) * cfx + (b * rja[ZZ] - rm[YY] * rt[XX]) * cfy
             + (-b * rja[YY] - rm[ZZ] * rt[XX]) * cfz;
    fl[YY] = (-b * rja[ZZ] - rm[XX] * rt[YY]) * cfx + (-rm[YY] * rt[YY]) * cfy
             + (b * rja[XX] - rm[ZZ] * rt[YY]) * cfz;
    fl[ZZ] = (b * rja[YY] - rm[XX] * rt[ZZ]) * cfx + (-b * rja[XX] - rm[YY] * rt[ZZ]) * cfy
             + (-rm[ZZ] * rt[ZZ]) * cfz;
    /* 36 flops */

    f[ai][XX] += fv[XX] - fj[XX] - fk[XX] - fl[XX];
    f[ai][YY] += fv[YY] - fj[YY] - fk[YY] - fl[YY];
    f[ai][ZZ] += fv[ZZ] - fj[ZZ] - fk[ZZ] - fl[ZZ];
    rvec_inc(f[aj], fj);
    rvec_inc(f[ak], fk);
    rvec_inc(f[al], fl);
    /* 21 flops */

    if (virialHandling == VirialHandling::Pbc)
    {
        int svi;
        if (pbc)
        {
            svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
        }
        else
        {
            svi = CENTRAL;
        }

        if (svi != CENTRAL || sij != CENTRAL || sik != CENTRAL || sil != CENTRAL)
        {
            rvec_dec(fshift[svi], fv);
            fshift[CENTRAL][XX] += fv[XX] - fj[XX] - fk[XX] - fl[XX];
            fshift[CENTRAL][YY] += fv[YY] - fj[YY] - fk[YY] - fl[YY];
            fshift[CENTRAL][ZZ] += fv[ZZ] - fj[ZZ] - fk[ZZ] - fl[ZZ];
            rvec_inc(fshift[sij], fj);
            rvec_inc(fshift[sik], fk);
            rvec_inc(fshift[sil], fl);
        }
    }

    if (virialHandling == VirialHandling::NonLinear)
    {
        rvec xiv;
        int  i, j;

        pbc_rvec_sub(pbc, x[av], x[ai], xiv);

        for (i = 0; i < DIM; i++)
        {
            for (j = 0; j < DIM; j++)
            {
                dxdf[i][j] += -xiv[i] * fv[j] + xij[i] * fj[j] + xik[i] * fk[j] + xil[i] * fl[j];
            }
        }
    }

    /* Total: 207 flops (Yuck!) */
}
template<VirialHandling virialHandling>
static int spread_vsiten(const t_iatom             ia[],
                         ArrayRef<const t_iparams> ip,
                         ArrayRef<const RVec>      x,
                         ArrayRef<RVec>            f,
                         ArrayRef<RVec>            fshift,
                         const t_pbc*              pbc)
{
    rvec xv, dx, fi;
    int  n3, av, i, ai;
    real a;
    int  siv;

    n3 = 3 * ip[ia[0]].vsiten.n;
    av = ia[1];
    copy_rvec(x[av], xv);

    for (i = 0; i < n3; i += 3)
    {
        ai = ia[i + 2];
        if (pbc)
        {
            siv = pbc_dx_aiuc(pbc, x[ai], xv, dx);
        }
        else
        {
            siv = CENTRAL;
        }
        a = ip[ia[i]].vsiten.a;
        svmul(a, f[av], fi);
        rvec_inc(f[ai], fi);

        if (virialHandling == VirialHandling::Pbc && siv != CENTRAL)
        {
            rvec_inc(fshift[siv], fi);
            rvec_dec(fshift[CENTRAL], fi);
        }
        /* 6 Flops */
    }

    return n3;
}
//! Returns the number of virtual sites in the interaction list, for VSITEN the number of atoms
static int vsite_count(ArrayRef<const InteractionList> ilist, int ftype)
{
    if (ftype == F_VSITEN)
    {
        return ilist[ftype].size() / 3;
    }
    else
    {
        return ilist[ftype].size() / (1 + interaction_function[ftype].nratoms);
    }
}
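/* For reference: a regular vsite ilist entry stores one type index, the vsite
 * atom and its constructing atoms, so e.g. F_VSITE3 with nratoms == 4 occupies
 * 5 ints per vsite, giving size()/(1 + 4) vsites; F_VSITEN instead stores 3 ints
 * per constructing atom, hence the size()/3 above.
 */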
//! Executes the force spreading task for a single thread
template<VirialHandling virialHandling>
static void spreadForceForThread(ArrayRef<const RVec>            x,
                                 ArrayRef<RVec>                  f,
                                 ArrayRef<RVec>                  fshift,
                                 matrix                          dxdf,
                                 ArrayRef<const t_iparams>       ip,
                                 ArrayRef<const InteractionList> ilist,
                                 const t_pbc*                    pbc_null)
{
    const PbcMode pbcMode = getPbcMode(pbc_null);
    /* We need another pbc pointer, as with charge groups we switch per vsite */
    const t_pbc*             pbc_null2 = pbc_null;
    gmx::ArrayRef<const int> vsite_pbc;

    /* this loop goes backwards to be able to build *
     * higher type vsites from lower types          */
    for (int ftype = c_ftypeVsiteEnd - 1; ftype >= c_ftypeVsiteStart; ftype--)
    {
        if (ilist[ftype].empty())
        {
            continue;
        }

        const int nra = interaction_function[ftype].nratoms;
        int       inc = 1 + nra;
        const int nr  = ilist[ftype].size();

        const t_iatom* ia = ilist[ftype].iatoms.data();

        if (pbcMode == PbcMode::all)
        {
            pbc_null2 = pbc_null;
        }

        for (int i = 0; i < nr;)
        {
            const int tp = ia[0];

            /* Constants for constructing */
            real a1, b1, c1;
            a1 = ip[tp].vsite.a;
            /* Construct the vsite depending on type */
            switch (ftype)
            {
                case F_VSITE1: spread_vsite1(ia, f); break;
                case F_VSITE2:
                    spread_vsite2<virialHandling>(ia, a1, x, f, fshift, pbc_null2);
                    break;
                case F_VSITE2FD:
                    spread_vsite2FD<virialHandling>(ia, a1, x, f, fshift, dxdf, pbc_null2);
                    break;
                case F_VSITE3:
                    b1 = ip[tp].vsite.b;
                    spread_vsite3<virialHandling>(ia, a1, b1, x, f, fshift, pbc_null2);
                    break;
                case F_VSITE3FD:
                    b1 = ip[tp].vsite.b;
                    spread_vsite3FD<virialHandling>(ia, a1, b1, x, f, fshift, dxdf, pbc_null2);
                    break;
                case F_VSITE3FAD:
                    b1 = ip[tp].vsite.b;
                    spread_vsite3FAD<virialHandling>(ia, a1, b1, x, f, fshift, dxdf, pbc_null2);
                    break;
                case F_VSITE3OUT:
                    b1 = ip[tp].vsite.b;
                    c1 = ip[tp].vsite.c;
                    spread_vsite3OUT<virialHandling>(ia, a1, b1, c1, x, f, fshift, dxdf, pbc_null2);
                    break;
                case F_VSITE4FD:
                    b1 = ip[tp].vsite.b;
                    c1 = ip[tp].vsite.c;
                    spread_vsite4FD<virialHandling>(ia, a1, b1, c1, x, f, fshift, dxdf, pbc_null2);
                    break;
                case F_VSITE4FDN:
                    b1 = ip[tp].vsite.b;
                    c1 = ip[tp].vsite.c;
                    spread_vsite4FDN<virialHandling>(ia, a1, b1, c1, x, f, fshift, dxdf, pbc_null2);
                    break;
                case F_VSITEN:
                    inc = spread_vsiten<virialHandling>(ia, ip, x, f, fshift, pbc_null2);
                    break;
                default:
                    gmx_fatal(FARGS, "No such vsite type %d in %s, line %d", ftype, __FILE__, __LINE__);
            }
            clear_rvec(f[ia[1]]);

            /* Increment loop variables */
            i += inc;
            ia += inc;
        }
    }
}
//! Wrapper function for calling the templated thread-local spread function
static void spreadForceWrapper(ArrayRef<const RVec>            x,
                               ArrayRef<RVec>                  f,
                               const VirialHandling            virialHandling,
                               ArrayRef<RVec>                  fshift,
                               matrix                          dxdf,
                               const bool                      clearDxdf,
                               ArrayRef<const t_iparams>       ip,
                               ArrayRef<const InteractionList> ilist,
                               const t_pbc*                    pbc_null)
{
    if (virialHandling == VirialHandling::NonLinear && clearDxdf)
    {
        clear_mat(dxdf);
    }

    switch (virialHandling)
    {
        case VirialHandling::None:
            spreadForceForThread<VirialHandling::None>(x, f, fshift, dxdf, ip, ilist, pbc_null);
            break;
        case VirialHandling::Pbc:
            spreadForceForThread<VirialHandling::Pbc>(x, f, fshift, dxdf, ip, ilist, pbc_null);
            break;
        case VirialHandling::NonLinear:
            spreadForceForThread<VirialHandling::NonLinear>(x, f, fshift, dxdf, ip, ilist, pbc_null);
            break;
    }
}
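/* Design note: the switch above turns the run-time virialHandling value into a
 * compile-time template argument, so the virial branches inside the spread_*
 * kernels are resolved at compile time. A minimal sketch of the same pattern
 * (illustrative names only):
 *
 * \code
 * template<bool doVirial>
 * static void kernel()
 * {
 *     if (doVirial) // resolved at compile time, dead branch removed
 *     {
 *     }
 * }
 * static void dispatch(const bool doVirial)
 * {
 *     doVirial ? kernel<true>() : kernel<false>();
 * }
 * \endcode
 */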
//! Clears the task force buffer elements that are written by task idTask
static void clearTaskForceBufferUsedElements(InterdependentTask* idTask)
{
    int ntask = idTask->spreadTask.size();
    for (int ti = 0; ti < ntask; ti++)
    {
        const AtomIndex* atomList = &idTask->atomIndex[idTask->spreadTask[ti]];
        int              natom    = atomList->atom.size();
        RVec*            force    = idTask->force.data();
        for (int i = 0; i < natom; i++)
        {
            clear_rvec(force[atomList->atom[i]]);
        }
    }
}
void VirtualSitesHandler::Impl::spreadForces(ArrayRef<const RVec> x,
                                             ArrayRef<RVec>       f,
                                             const VirialHandling virialHandling,
                                             ArrayRef<RVec>       fshift,
                                             matrix               virial,
                                             t_nrnb*              nrnb,
                                             const matrix         box,
                                             gmx_wallcycle*       wcycle)
{
    wallcycle_start(wcycle, ewcVSITESPREAD);

    const bool useDomdec = domainInfo_.useDomdec();

    t_pbc pbc, *pbc_null;

    if (domainInfo_.useMolPbc_)
    {
        /* This is wasting some CPU time as we now do this multiple times
         * per MD step.
         */
        pbc_null = set_pbc_dd(&pbc, domainInfo_.pbcType_,
                              useDomdec ? domainInfo_.domdec_->numCells : nullptr, FALSE, box);
    }
    else
    {
        pbc_null = nullptr;
    }

    if (useDomdec)
    {
        dd_clear_f_vsites(*domainInfo_.domdec_, f);
    }

    const int numThreads = threadingInfo_.numThreads();

    if (numThreads == 1)
    {
        matrix dxdf;
        spreadForceWrapper(x, f, virialHandling, fshift, dxdf, true, iparams_, ilists_, pbc_null);

        if (virialHandling == VirialHandling::NonLinear)
        {
            for (int i = 0; i < DIM; i++)
            {
                for (int j = 0; j < DIM; j++)
                {
                    virial[i][j] += -0.5 * dxdf[i][j];
                }
            }
        }
    }
    else
    {
        /* First spread the vsites that might depend on non-local vsites */
        auto& nlDependentVSites = threadingInfo_.threadDataNonLocalDependent();
        spreadForceWrapper(x, f, virialHandling, fshift, nlDependentVSites.dxdf, true, iparams_,
                           nlDependentVSites.ilist, pbc_null);

#pragma omp parallel num_threads(numThreads)
        {
            try
            {
                int          thread = gmx_omp_get_thread_num();
                VsiteThread& tData  = threadingInfo_.threadData(thread);

                ArrayRef<RVec> fshift_t;
                if (virialHandling == VirialHandling::Pbc)
                {
                    if (thread == 0)
                    {
                        fshift_t = fshift;
                    }
                    else
                    {
                        fshift_t = tData.fshift;

                        for (int i = 0; i < SHIFTS; i++)
                        {
                            clear_rvec(fshift_t[i]);
                        }
                    }
                }

                if (tData.useInterdependentTask)
                {
                    /* Spread the vsites that spread outside our local range.
                     * This is done using a thread-local force buffer force.
                     * First we need to copy the input vsite forces to force.
                     */
                    InterdependentTask* idTask = &tData.idTask;

                    /* Clear the buffer elements set by our task during
                     * the last call to spread_vsite_f.
                     */
                    clearTaskForceBufferUsedElements(idTask);

                    int nvsite = idTask->vsite.size();
                    for (int i = 0; i < nvsite; i++)
                    {
                        copy_rvec(f[idTask->vsite[i]], idTask->force[idTask->vsite[i]]);
                    }
                    spreadForceWrapper(x, idTask->force, virialHandling, fshift_t, tData.dxdf, true,
                                       iparams_, tData.idTask.ilist, pbc_null);

                    /* We need a barrier before reducing forces below
                     * that have been produced by a different thread above.
                     */
#pragma omp barrier

                    /* Loop over all thread tasks and reduce forces they
                     * produced on atoms that fall in our range.
                     * Note that atomic reduction would be a simpler solution,
                     * but that might not have good support on all platforms.
                     */
                    int ntask = idTask->reduceTask.size();
                    for (int ti = 0; ti < ntask; ti++)
                    {
                        const InterdependentTask& idt_foreign =
                                threadingInfo_.threadData(idTask->reduceTask[ti]).idTask;
                        const AtomIndex& atomList  = idt_foreign.atomIndex[thread];
                        const RVec*      f_foreign = idt_foreign.force.data();

                        for (int ind : atomList.atom)
                        {
                            rvec_inc(f[ind], f_foreign[ind]);
                            /* Clearing of f_foreign is done at the next step */
                        }
                    }

                    /* Clear the vsite forces, both in f and force */
                    for (int i = 0; i < nvsite; i++)
                    {
                        int ind = tData.idTask.vsite[i];
                        clear_rvec(f[ind]);
                        clear_rvec(tData.idTask.force[ind]);
                    }
                }

                /* Spread the vsites that spread locally only */
                spreadForceWrapper(x, f, virialHandling, fshift_t, tData.dxdf, false, iparams_,
                                   tData.ilist, pbc_null);
            }
            GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
        }

        if (virialHandling == VirialHandling::Pbc)
        {
            for (int th = 1; th < numThreads; th++)
            {
                for (int i = 0; i < SHIFTS; i++)
                {
                    rvec_inc(fshift[i], threadingInfo_.threadData(th).fshift[i]);
                }
            }
        }

        if (virialHandling == VirialHandling::NonLinear)
        {
            for (int th = 0; th < numThreads + 1; th++)
            {
                /* MSVC doesn't like matrix references, so we use a pointer */
                const matrix& dxdf = threadingInfo_.threadData(th).dxdf;

                for (int i = 0; i < DIM; i++)
                {
                    for (int j = 0; j < DIM; j++)
                    {
                        virial[i][j] += -0.5 * dxdf[i][j];
                    }
                }
            }
        }
    }

    if (useDomdec)
    {
        dd_move_f_vsites(*domainInfo_.domdec_, f, fshift);
    }

    inc_nrnb(nrnb, eNR_VSITE1, vsite_count(ilists_, F_VSITE1));
    inc_nrnb(nrnb, eNR_VSITE2, vsite_count(ilists_, F_VSITE2));
    inc_nrnb(nrnb, eNR_VSITE2FD, vsite_count(ilists_, F_VSITE2FD));
    inc_nrnb(nrnb, eNR_VSITE3, vsite_count(ilists_, F_VSITE3));
    inc_nrnb(nrnb, eNR_VSITE3FD, vsite_count(ilists_, F_VSITE3FD));
    inc_nrnb(nrnb, eNR_VSITE3FAD, vsite_count(ilists_, F_VSITE3FAD));
    inc_nrnb(nrnb, eNR_VSITE3OUT, vsite_count(ilists_, F_VSITE3OUT));
    inc_nrnb(nrnb, eNR_VSITE4FD, vsite_count(ilists_, F_VSITE4FD));
    inc_nrnb(nrnb, eNR_VSITE4FDN, vsite_count(ilists_, F_VSITE4FDN));
    inc_nrnb(nrnb, eNR_VSITEN, vsite_count(ilists_, F_VSITEN));

    wallcycle_stop(wcycle, ewcVSITESPREAD);
}
/*! \brief Returns an array with group indices for each atom
 *
 * \param[in] grouping The partitioning of the atom range into atom groups
 */
static std::vector<int> makeAtomToGroupMapping(const gmx::RangePartitioning& grouping)
{
    std::vector<int> atomToGroup(grouping.fullRange().end(), 0);

    for (int group = 0; group < grouping.numBlocks(); group++)
    {
        auto block = grouping.block(group);
        std::fill(atomToGroup.begin() + block.begin(), atomToGroup.begin() + block.end(), group);
    }

    return atomToGroup;
}
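/* A minimal usage sketch, assuming a gmx::RangePartitioning 'grouping' with two
 * blocks {0,1,2} and {3,4}:
 *
 * \code
 * std::vector<int> atomToGroup = makeAtomToGroupMapping(grouping);
 * // atomToGroup == { 0, 0, 0, 1, 1 }
 * \endcode
 */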
int countNonlinearVsites(const gmx_mtop_t& mtop)
{
    int numNonlinearVsites = 0;
    for (const gmx_molblock_t& molb : mtop.molblock)
    {
        const gmx_moltype_t& molt = mtop.moltype[molb.type];

        for (const auto& ilist : extractILists(molt.ilist, IF_VSITE))
        {
            if (ilist.functionType != F_VSITE2 && ilist.functionType != F_VSITE3
                && ilist.functionType != F_VSITEN)
            {
                numNonlinearVsites += molb.nmol * ilist.iatoms.size() / (1 + NRAL(ilist.functionType));
            }
        }
    }

    return numNonlinearVsites;
}
void VirtualSitesHandler::spreadForces(ArrayRef<const RVec> x,
                                       ArrayRef<RVec>       f,
                                       const VirialHandling virialHandling,
                                       ArrayRef<RVec>       fshift,
                                       matrix               virial,
                                       t_nrnb*              nrnb,
                                       const matrix         box,
                                       gmx_wallcycle*       wcycle)
{
    impl_->spreadForces(x, f, virialHandling, fshift, virial, nrnb, box, wcycle);
}
int countInterUpdategroupVsites(const gmx_mtop_t&                           mtop,
                                gmx::ArrayRef<const gmx::RangePartitioning> updateGroupingPerMoleculetype)
{
    int n_intercg_vsite = 0;
    for (const gmx_molblock_t& molb : mtop.molblock)
    {
        const gmx_moltype_t& molt = mtop.moltype[molb.type];

        std::vector<int> atomToGroup;
        if (!updateGroupingPerMoleculetype.empty())
        {
            atomToGroup = makeAtomToGroupMapping(updateGroupingPerMoleculetype[molb.type]);
        }
        for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
        {
            const int              nral = NRAL(ftype);
            const InteractionList& il   = molt.ilist[ftype];
            for (int i = 0; i < il.size(); i += 1 + nral)
            {
                bool isInterGroup = atomToGroup.empty();
                if (!isInterGroup)
                {
                    const int group = atomToGroup[il.iatoms[1 + i]];
                    for (int a = 1; a < nral; a++)
                    {
                        if (atomToGroup[il.iatoms[1 + i + a]] != group)
                        {
                            isInterGroup = true;
                            break;
                        }
                    }
                }
                if (isInterGroup)
                {
                    n_intercg_vsite += molb.nmol;
                }
            }
        }
    }

    return n_intercg_vsite;
}
std::unique_ptr<VirtualSitesHandler> makeVirtualSitesHandler(const gmx_mtop_t& mtop,
                                                             const t_commrec*  cr,
                                                             PbcType           pbcType)
{
    GMX_RELEASE_ASSERT(cr != nullptr, "We need a valid commrec");

    std::unique_ptr<VirtualSitesHandler> vsite;

    /* check if there are vsites */
    int nvsite = 0;
    for (int ftype = 0; ftype < F_NRE; ftype++)
    {
        if (interaction_function[ftype].flags & IF_VSITE)
        {
            GMX_ASSERT(ftype >= c_ftypeVsiteStart && ftype < c_ftypeVsiteEnd,
                       "c_ftypeVsiteStart and/or c_ftypeVsiteEnd do not have correct values");

            nvsite += gmx_mtop_ftype_count(&mtop, ftype);
        }
        else
        {
            GMX_ASSERT(ftype < c_ftypeVsiteStart || ftype >= c_ftypeVsiteEnd,
                       "c_ftypeVsiteStart and/or c_ftypeVsiteEnd do not have correct values");
        }
    }

    if (nvsite == 0)
    {
        return nullptr;
    }

    return std::make_unique<VirtualSitesHandler>(mtop, cr->dd, pbcType);
}
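/* A minimal usage sketch for the factory above, as it would appear in MD setup
 * code; 'mtop', 'cr', 'pbcType', 'x', 'dt', 'v' and 'box' are assumed to be in
 * scope:
 *
 * \code
 * std::unique_ptr<VirtualSitesHandler> vsite = makeVirtualSitesHandler(mtop, cr, pbcType);
 * if (vsite != nullptr) // nullptr is returned when the topology has no vsites
 * {
 *     vsite->construct(x, dt, v, box);
 * }
 * \endcode
 */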
ThreadingInfo::ThreadingInfo() : numThreads_(gmx_omp_nthreads_get(emntVSITE))
{
    if (numThreads_ > 1)
    {
        /* We need one extra thread data structure for the overlap vsites */
        tData_.resize(numThreads_ + 1);
#pragma omp parallel for num_threads(numThreads_) schedule(static)
        for (int thread = 0; thread < numThreads_; thread++)
        {
            try
            {
                tData_[thread] = std::make_unique<VsiteThread>();

                InterdependentTask& idTask = tData_[thread]->idTask;
                idTask.nuse                = 0;
                idTask.atomIndex.resize(numThreads_);
            }
            GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
        }
        if (numThreads_ > 1)
        {
            tData_[numThreads_] = std::make_unique<VsiteThread>();
        }
    }
}
//! Returns the number of inter update-group vsites
static int getNumInterUpdategroupVsites(const gmx_mtop_t& mtop, const gmx_domdec_t* domdec)
{
    gmx::ArrayRef<const gmx::RangePartitioning> updateGroupingPerMoleculetype;
    if (domdec)
    {
        updateGroupingPerMoleculetype = getUpdateGroupingPerMoleculetype(*domdec);
    }

    return countInterUpdategroupVsites(mtop, updateGroupingPerMoleculetype);
}
VirtualSitesHandler::Impl::Impl(const gmx_mtop_t& mtop, gmx_domdec_t* domdec, const PbcType pbcType) :
    numInterUpdategroupVirtualSites_(getNumInterUpdategroupVsites(mtop, domdec)),
    domainInfo_({ pbcType, pbcType != PbcType::No && numInterUpdategroupVirtualSites_ > 0, domdec }),
    iparams_(mtop.ffparams.iparams)
{
}

VirtualSitesHandler::VirtualSitesHandler(const gmx_mtop_t& mtop, gmx_domdec_t* domdec, const PbcType pbcType) :
    impl_(new Impl(mtop, domdec, pbcType))
{
}
//! Flags atom \p atom, which is home in another task, if it has not already been added before
static inline void flagAtom(InterdependentTask* idTask, const int atom, const int numThreads, const int numAtomsPerThread)
{
    if (!idTask->use[atom])
    {
        idTask->use[atom] = true;
        idTask->nuse++;
        int thread = atom / numAtomsPerThread;
        /* Assign all non-local atom force writes to thread 0 */
        if (thread >= numThreads)
        {
            thread = 0;
        }
        idTask->atomIndex[thread].atom.push_back(atom);
    }
}
2252 /*! \brief Here we try to assign all vsites that are in our local range.
2254 * Our task local atom range is tData->rangeStart - tData->rangeEnd.
2255 * Vsites that depend only on local atoms, as indicated by taskIndex[]==thread,
2256 * are assigned to task tData->ilist. Vsites that depend on non-local atoms
2257 * but not on other vsites are assigned to task tData->id_task.ilist.
2258 * taskIndex[] is set for all vsites in our range, either to our local tasks
2259 * or to the single last task as taskIndex[]=2*nthreads.
2261 static void assignVsitesToThread(VsiteThread
* tData
,
2265 gmx::ArrayRef
<int> taskIndex
,
2266 ArrayRef
<const InteractionList
> ilist
,
2267 ArrayRef
<const t_iparams
> ip
,
2268 const unsigned short* ptype
)
{
    for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
    {
        tData->ilist[ftype].clear();
        tData->idTask.ilist[ftype].clear();

        int        nral1 = 1 + NRAL(ftype);
        int        inc   = nral1;
        const int* iat   = ilist[ftype].iatoms.data();
        for (int i = 0; i < ilist[ftype].size();)
        {
            if (ftype == F_VSITEN)
            {
                /* The 3 below is from 1+NRAL(ftype)=3 */
                inc = ip[iat[i]].vsiten.n * 3;
            }

            if (iat[1 + i] < tData->rangeStart || iat[1 + i] >= tData->rangeEnd)
            {
                /* This vsite belongs to a different thread */
                i += inc;
                continue;
            }

            /* We would like to assign this vsite to task thread,
             * but it might depend on atoms outside the atom range of thread
             * or on another vsite not assigned to task thread.
             */
            int task = thread;
            if (ftype != F_VSITEN)
            {
                for (int j = i + 2; j < i + nral1; j++)
                {
                    /* Do a range check to avoid a harmless race on taskIndex */
                    if (iat[j] < tData->rangeStart || iat[j] >= tData->rangeEnd || taskIndex[iat[j]] != thread)
                    {
                        if (!tData->useInterdependentTask || ptype[iat[j]] == eptVSite)
                        {
                            /* At least one constructing atom is a vsite
                             * that is not assigned to the same thread.
                             * Put this vsite into a separate task.
                             */
                            task = 2 * nthread;
                            break;
                        }

                        /* There are constructing atoms outside our range,
                         * put this vsite into a second task to be executed
                         * on the same thread. During construction no barrier
                         * is needed between the two tasks on the same thread.
                         * During spreading we need to run this task with
                         * an additional thread-local intermediate force buffer
                         * (or atomic reduction) and a barrier between the two
                         * tasks.
                         */
                        task = nthread + thread;
                    }
                }
            }
            else
            {
                for (int j = i + 2; j < i + inc; j += 3)
                {
                    /* Do a range check to avoid a harmless race on taskIndex */
                    if (iat[j] < tData->rangeStart || iat[j] >= tData->rangeEnd || taskIndex[iat[j]] != thread)
                    {
                        GMX_ASSERT(ptype[iat[j]] != eptVSite,
                                   "A vsite to be assigned in assignVsitesToThread has a vsite as "
                                   "a constructing atom that does not belong to our task, such "
                                   "vsites should be assigned to the single 'master' task");

                        task = nthread + thread;
                    }
                }
            }

            /* Update this vsite's thread index entry */
            taskIndex[iat[1 + i]] = task;

            if (task == thread || task == nthread + thread)
            {
                /* Copy this vsite to the thread data struct of thread */
                InteractionList* il_task;
                if (task == thread)
                {
                    il_task = &tData->ilist[ftype];
                }
                else
                {
                    il_task = &tData->idTask.ilist[ftype];
                }
                /* Copy the vsite data to the thread-task local array */
                il_task->push_back(iat[i], nral1 - 1, iat + i + 1);
                if (task == nthread + thread)
                {
                    /* This vsite writes outside our own task force block.
                     * Put it into the interdependent task list and flag
                     * the atoms involved for reduction.
                     */
                    tData->idTask.vsite.push_back(iat[i + 1]);
                    if (ftype != F_VSITEN)
                    {
                        for (int j = i + 2; j < i + nral1; j++)
                        {
                            flagAtom(&tData->idTask, iat[j], nthread, natperthread);
                        }
                    }
                    else
                    {
                        for (int j = i + 2; j < i + inc; j += 3)
                        {
                            flagAtom(&tData->idTask, iat[j], nthread, natperthread);
                        }
                    }
                }
            }

            i += inc;
        }
    }
}
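
/* Summary sketch of the task values assignVsitesToThread can produce for a
 * vsite in the range of thread t with nthread threads (illustrative only):
 *
 *   task == t            -> independent: appended to tData->ilist, no
 *                           synchronization needed during construction
 *   task == nthread + t  -> interdependent: appended to tData->idTask.ilist,
 *                           spreading needs a thread-local force buffer
 *                           (or atomic reduction) plus a barrier
 *   task == 2 * nthread  -> depends on a vsite owned by another thread:
 *                           deferred to the serial catch-all task filled by
 *                           assignVsitesToSingleTask below
 */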

/*! \brief Assign all vsites with taskIndex[]==task to task tData */
static void assignVsitesToSingleTask(VsiteThread*                    tData,
                                     int                             task,
                                     gmx::ArrayRef<const int>        taskIndex,
                                     ArrayRef<const InteractionList> ilist,
                                     ArrayRef<const t_iparams>       ip)
{
    for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
    {
        tData->ilist[ftype].clear();
        tData->idTask.ilist[ftype].clear();

        int              nral1   = 1 + NRAL(ftype);
        int              inc     = nral1;
        const int*       iat     = ilist[ftype].iatoms.data();
        InteractionList* il_task = &tData->ilist[ftype];

        for (int i = 0; i < ilist[ftype].size();)
        {
            if (ftype == F_VSITEN)
            {
                /* The 3 below is from 1+NRAL(ftype)=3 */
                inc = ip[iat[i]].vsiten.n * 3;
            }
            /* Check if the vsite is assigned to our task */
            if (taskIndex[iat[1 + i]] == task)
            {
                /* Copy the vsite data to the thread-task local array */
                il_task->push_back(iat[i], inc - 1, iat + i + 1);
            }

            i += inc;
        }
    }
}
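
/* Illustrative note on the push_back above: for a regular vsite ftype,
 * inc == 1 + NRAL(ftype), so one parameter index plus NRAL(ftype) atom
 * indices are copied; for F_VSITEN, inc == 3*n and a single call transfers
 * the whole block of n (parameter, vsite, atom) triplets at once.
 */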

void ThreadingInfo::setVirtualSites(ArrayRef<const InteractionList> ilists,
                                    ArrayRef<const t_iparams>       iparams,
                                    const t_mdatoms&                mdatoms,
                                    const bool                      useDomdec)
{
    if (numThreads_ <= 1)
    {
        /* Nothing to do */
        return;
    }

    /* The current way of distributing the vsites over threads is primitive.
     * We divide the atom range 0 - natoms_in_vsite uniformly over threads,
     * without taking into account how the vsites are distributed.
     * Without domain decomposition we at least tighten the upper bound
     * of the range (useful for common systems such as a vsite-protein
     * in 3-site water).
     * With domain decomposition, as long as the vsites are distributed
     * uniformly in each domain along the major dimension, usually x,
     * it will also perform well.
     */
    int vsite_atom_range;
    int natperthread;
    if (!useDomdec)
    {
        vsite_atom_range = -1;
        for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
        {
            if (ftype != F_VSITEN)
            {
                int                 nral1 = 1 + NRAL(ftype);
                ArrayRef<const int> iat   = ilists[ftype].iatoms;
                for (int i = 0; i < ilists[ftype].size(); i += nral1)
                {
                    for (int j = i + 1; j < i + nral1; j++)
                    {
                        vsite_atom_range = std::max(vsite_atom_range, iat[j]);
                    }
                }
            }
            else
            {
                int vs_ind_end;

                ArrayRef<const int> iat = ilists[ftype].iatoms;

                int i = 0;
                while (i < ilists[ftype].size())
                {
                    /* The 3 below is from 1+NRAL(ftype)=3 */
                    vs_ind_end = i + iparams[iat[i]].vsiten.n * 3;

                    vsite_atom_range = std::max(vsite_atom_range, iat[i + 1]);
                    while (i < vs_ind_end)
                    {
                        vsite_atom_range = std::max(vsite_atom_range, iat[i + 2]);
                        i += 3;
                    }
                }
            }
        }
        vsite_atom_range++;
        natperthread = (vsite_atom_range + numThreads_ - 1) / numThreads_;
    }
    else
    {
        /* Any local or non-local atom could be involved in virtual sites.
         * But since we usually have very few non-local virtual sites
         * (only non-local vsites that depend on local vsites),
         * we distribute the local atom range equally over the threads.
         * When assigning vsites to threads, we should take care that the last
         * thread also covers the non-local range.
         */
        vsite_atom_range = mdatoms.nr;
        natperthread     = (mdatoms.homenr + numThreads_ - 1) / numThreads_;
    }

    if (debug)
    {
        fprintf(debug, "virtual site thread dist: natoms %d, range %d, natperthread %d\n",
                mdatoms.nr, vsite_atom_range, natperthread);
    }
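
    /* Example of the ceiling division above (illustrative numbers): with
     * vsite_atom_range = 1003 and numThreads_ = 4, natperthread becomes
     * (1003 + 3)/4 = 251, giving thread-local ranges [0,251), [251,502),
     * [502,753) and [753,...), where the last thread runs to the end.
     */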

    /* To simplify the vsite assignment, we make an index which tells us
     * to which task particles, both non-vsites and vsites, are assigned.
     */
    taskIndex_.resize(mdatoms.nr);

    /* Initialize the task index array. Here we assign the non-vsite
     * particles to task=thread, so we easily figure out if vsites
     * depend on local and/or non-local particles in assignVsitesToThread.
     */
    {
        int thread = 0;
        for (int i = 0; i < mdatoms.nr; i++)
        {
            if (mdatoms.ptype[i] == eptVSite)
            {
                /* vsites are not assigned to a task yet */
                taskIndex_[i] = -1;
            }
            else
            {
                /* assign non-vsite particles to task thread */
                taskIndex_[i] = thread;
            }
            if (i == (thread + 1) * natperthread && thread < numThreads_)
            {
                thread++;
            }
        }
    }
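
    /* Illustrative example (hypothetical numbers): with numThreads_ = 2,
     * natperthread = 3 and six particles {A0, A1, V2, A3, A4, V5}, where
     * V* are vsites, the loop above yields taskIndex_ = {0, 0, -1, 1, 1, -1}:
     * real atoms get the thread owning their range, vsites stay at -1 until
     * assignVsitesToThread or the final single task claims them.
     */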

#pragma omp parallel num_threads(numThreads_)
    {
        try
        {
            int          thread = gmx_omp_get_thread_num();
            VsiteThread& tData  = *tData_[thread];

            /* Clear the buffer use flags that were set before */
            if (tData.useInterdependentTask)
            {
                InterdependentTask& idTask = tData.idTask;

                /* To avoid an extra OpenMP barrier in spread_vsite_f,
                 * we clear the force buffer at the next step,
                 * so we need to do it here as well.
                 */
                clearTaskForceBufferUsedElements(&idTask);

                idTask.vsite.resize(0);
                for (int t = 0; t < numThreads_; t++)
                {
                    AtomIndex& atomIndex = idTask.atomIndex[t];
                    int        natom     = atomIndex.atom.size();
                    for (int i = 0; i < natom; i++)
                    {
                        idTask.use[atomIndex.atom[i]] = false;
                    }
                    atomIndex.atom.resize(0);
                }
                idTask.nuse = 0;
            }

            /* To avoid large f_buf allocations of #threads*vsite_atom_range
             * we don't use task2 with more than 200000 atoms. This doesn't
             * affect performance, since with such a large range relatively few
             * vsites will end up in the separate task.
             * Note that useTask2 should be the same for all threads.
             */
            tData.useInterdependentTask = (vsite_atom_range <= 200000);
            if (tData.useInterdependentTask)
            {
                size_t              natoms_use_in_vsites = vsite_atom_range;
                InterdependentTask& idTask               = tData.idTask;
                /* To avoid resizing and re-clearing every nstlist steps,
                 * we never down size the force buffer.
                 */
                if (natoms_use_in_vsites > idTask.force.size() || natoms_use_in_vsites > idTask.use.size())
                {
                    idTask.force.resize(natoms_use_in_vsites, { 0, 0, 0 });
                    idTask.use.resize(natoms_use_in_vsites, false);
                }
            }

            /* Assign all vsites that can execute independently on threads */
            tData.rangeStart = thread * natperthread;
            if (thread < numThreads_ - 1)
            {
                tData.rangeEnd = (thread + 1) * natperthread;
            }
            else
            {
                /* The last thread should cover up to the end of the range */
                tData.rangeEnd = mdatoms.nr;
            }
            assignVsitesToThread(&tData, thread, numThreads_, natperthread, taskIndex_, ilists,
                                 iparams, mdatoms.ptype);

            if (tData.useInterdependentTask)
            {
                /* In the worst case, all tasks write to force ranges of
                 * all other tasks, leading to #tasks^2 scaling (this is only
                 * the overhead, the actual flops remain constant).
                 * But in most cases there is far less coupling. To improve
                 * scaling at high thread counts we therefore construct
                 * an index to only loop over the actually affected tasks.
                 */
                InterdependentTask& idTask = tData.idTask;

                /* Ensure assignVsitesToThread finished on other threads */
#pragma omp barrier

                idTask.spreadTask.resize(0);
                idTask.reduceTask.resize(0);
                for (int t = 0; t < numThreads_; t++)
                {
                    /* Do we write to the force buffer of task t? */
                    if (!idTask.atomIndex[t].atom.empty())
                    {
                        idTask.spreadTask.push_back(t);
                    }
                    /* Does task t write to our force buffer? */
                    if (!tData_[t]->idTask.atomIndex[thread].atom.empty())
                    {
                        idTask.reduceTask.push_back(t);
                    }
                }
            }
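
            /* Illustrative example (hypothetical numbers): with 4 threads,
             * if thread 1 flagged atoms only in the ranges of threads 0 and 1,
             * then its idTask.spreadTask == {0, 1}; if threads 1 and 2 were
             * the only ones to flag atoms inside thread 1's range, then its
             * idTask.reduceTask == {1, 2}. Looping over these short lists
             * replaces the worst-case #tasks^2 sweep during spreading.
             */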
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
    }

    /* Assign all remaining vsites, which have taskIndex[]=2*numThreads_,
     * to a single task that will not run in parallel with other tasks.
     */
    assignVsitesToSingleTask(tData_[numThreads_].get(), 2 * numThreads_, taskIndex_, ilists, iparams);

    if (debug && numThreads_ > 1)
    {
        fprintf(debug, "virtual site useInterdependentTask %d, nuse:\n",
                static_cast<int>(tData_[0]->useInterdependentTask));
        for (int th = 0; th < numThreads_ + 1; th++)
        {
            fprintf(debug, " %4d", tData_[th]->idTask.nuse);
        }
        fprintf(debug, "\n");

        for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
        {
            if (!ilists[ftype].empty())
            {
                fprintf(debug, "%-20s thread dist:", interaction_function[ftype].longname);
                for (int th = 0; th < numThreads_ + 1; th++)
                {
                    fprintf(debug, " %4d %4d ", tData_[th]->ilist[ftype].size(),
                            tData_[th]->idTask.ilist[ftype].size());
                }
                fprintf(debug, "\n");
            }
        }
    }

#ifndef NDEBUG
    int nrOrig     = vsiteIlistNrCount(ilists);
    int nrThreaded = 0;
    for (int th = 0; th < numThreads_ + 1; th++)
    {
        nrThreaded += vsiteIlistNrCount(tData_[th]->ilist) + vsiteIlistNrCount(tData_[th]->idTask.ilist);
    }
    GMX_ASSERT(nrThreaded == nrOrig,
               "The number of virtual sites assigned to all thread tasks has to match the total "
               "number of virtual sites");
#endif
}

void VirtualSitesHandler::Impl::setVirtualSites(ArrayRef<const InteractionList> ilists,
                                                const t_mdatoms&                mdatoms)
{
    threadingInfo_.setVirtualSites(ilists, iparams_, mdatoms, domainInfo_.useDomdec());
}

void VirtualSitesHandler::setVirtualSites(ArrayRef<const InteractionList> ilists, const t_mdatoms& mdatoms)
{
    impl_->setVirtualSites(ilists, mdatoms);
}
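
/* Usage sketch (illustrative only; names of the surrounding objects such as
 * localTopology and mdatoms are assumptions, and their setup is not shown):
 *
 *   gmx::VirtualSitesHandler vsiteHandler(mtop, domdec, pbcType);
 *   // After each domain (re)partitioning, rebuild the thread tasks from
 *   // the current local interaction lists:
 *   vsiteHandler.setVirtualSites(localTopology.idef.il, mdatoms);
 *
 * setVirtualSites has to be rerun whenever the local interaction lists
 * change, because taskIndex_ and the per-thread ilists are derived from them.
 */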