2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2018,2019, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
37 * \brief This file defines functions for mdrun to call to make a new
38 * domain decomposition, and check it.
40 * \author Berk Hess <hess@kth.se>
41 * \ingroup module_domdec
46 #include "partition.h"
55 #include "gromacs/domdec/collect.h"
56 #include "gromacs/domdec/dlb.h"
57 #include "gromacs/domdec/dlbtiming.h"
58 #include "gromacs/domdec/domdec.h"
59 #include "gromacs/domdec/domdec_network.h"
60 #include "gromacs/domdec/ga2la.h"
61 #include "gromacs/domdec/localatomsetmanager.h"
62 #include "gromacs/domdec/mdsetup.h"
63 #include "gromacs/ewald/pme.h"
64 #include "gromacs/gmxlib/chargegroup.h"
65 #include "gromacs/gmxlib/network.h"
66 #include "gromacs/gmxlib/nrnb.h"
67 #include "gromacs/imd/imd.h"
68 #include "gromacs/math/functions.h"
69 #include "gromacs/math/vec.h"
70 #include "gromacs/mdlib/forcerec.h"
71 #include "gromacs/mdlib/gmx_omp_nthreads.h"
72 #include "gromacs/mdlib/mdatoms.h"
73 #include "gromacs/mdlib/nsgrid.h"
74 #include "gromacs/mdlib/vsite.h"
75 #include "gromacs/mdtypes/commrec.h"
76 #include "gromacs/mdtypes/forcerec.h"
77 #include "gromacs/mdtypes/inputrec.h"
78 #include "gromacs/mdtypes/md_enums.h"
79 #include "gromacs/mdtypes/nblist.h"
80 #include "gromacs/mdtypes/state.h"
81 #include "gromacs/nbnxm/nbnxm.h"
82 #include "gromacs/pulling/pull.h"
83 #include "gromacs/timing/wallcycle.h"
84 #include "gromacs/topology/mtop_util.h"
85 #include "gromacs/topology/topology.h"
86 #include "gromacs/utility/cstringutil.h"
87 #include "gromacs/utility/fatalerror.h"
88 #include "gromacs/utility/logger.h"
89 #include "gromacs/utility/real.h"
90 #include "gromacs/utility/smalloc.h"
91 #include "gromacs/utility/strconvert.h"
92 #include "gromacs/utility/stringstream.h"
93 #include "gromacs/utility/stringutil.h"
94 #include "gromacs/utility/textwriter.h"
97 #include "cellsizes.h"
98 #include "distribute.h"
99 #include "domdec_constraints.h"
100 #include "domdec_internal.h"
101 #include "domdec_vsite.h"
103 #include "redistribute.h"
106 /*! \brief Turn on DLB when the load imbalance causes this amount of total loss.
108 * There is a bit of overhead with DLB and it's difficult to achieve
109 * a load imbalance of less than 2% with DLB.
111 #define DD_PERF_LOSS_DLB_ON 0.02
113 //! Warn about imbalance due to PP or PP/PME load imbalance at this loss.
114 #define DD_PERF_LOSS_WARN 0.05
117 //! Debug helper printing a DD zone
118 static void print_ddzone(FILE *fp
, int d
, int i
, int j
, gmx_ddzone_t
*zone
)
120 fprintf(fp
, "zone d0 %d d1 %d d2 %d min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
122 zone
->min0
, zone
->max1
,
123 zone
->mch0
, zone
->mch0
,
124 zone
->p1_0
, zone
->p1_1
);
127 /*! \brief Using the home grid size as input in cell_ns_x0 and cell_ns_x1
128 * takes the extremes over all home and remote zones in the halo
129 * and returns the results in cell_ns_x0 and cell_ns_x1.
130 * Note: only used with the group cut-off scheme.
132 static void dd_move_cellx(gmx_domdec_t
*dd
,
133 const gmx_ddbox_t
*ddbox
,
137 constexpr int c_ddZoneCommMaxNumZones
= 5;
138 gmx_ddzone_t buf_s
[c_ddZoneCommMaxNumZones
];
139 gmx_ddzone_t buf_r
[c_ddZoneCommMaxNumZones
];
140 gmx_ddzone_t buf_e
[c_ddZoneCommMaxNumZones
];
141 gmx_domdec_comm_t
*comm
= dd
->comm
;
145 for (int d
= 1; d
< dd
->ndim
; d
++)
147 int dim
= dd
->dim
[d
];
148 gmx_ddzone_t
&zp
= (d
== 1) ? comm
->zone_d1
[0] : comm
->zone_d2
[0][0];
150 /* Copy the base sizes of the home zone */
151 zp
.min0
= cell_ns_x0
[dim
];
152 zp
.max1
= cell_ns_x1
[dim
];
153 zp
.min1
= cell_ns_x1
[dim
];
154 zp
.mch0
= cell_ns_x0
[dim
];
155 zp
.mch1
= cell_ns_x1
[dim
];
156 zp
.p1_0
= cell_ns_x0
[dim
];
157 zp
.p1_1
= cell_ns_x1
[dim
];
161 gmx::ArrayRef
<DDCellsizesWithDlb
> cellsizes
= comm
->cellsizesWithDlb
;
163 /* Loop backward over the dimensions and aggregate the extremes
166 for (int d
= dd
->ndim
- 2; d
>= 0; d
--)
168 const int dim
= dd
->dim
[d
];
169 const bool applyPbc
= (dim
< ddbox
->npbcdim
);
171 /* Use an rvec to store two reals */
172 extr_s
[d
][0] = cellsizes
[d
+ 1].fracLower
;
173 extr_s
[d
][1] = cellsizes
[d
+ 1].fracUpper
;
174 extr_s
[d
][2] = cellsizes
[d
+ 1].fracUpper
;
177 GMX_ASSERT(pos
< c_ddZoneCommMaxNumZones
, "The buffers should be sufficiently large");
178 /* Store the extremes in the backward sending buffer,
179 * so they get updated separately from the forward communication.
181 for (int d1
= d
; d1
< dd
->ndim
-1; d1
++)
183 gmx_ddzone_t
&buf
= buf_s
[pos
];
185 /* We invert the order to be able to use the same loop for buf_e */
186 buf
.min0
= extr_s
[d1
][1];
187 buf
.max1
= extr_s
[d1
][0];
188 buf
.min1
= extr_s
[d1
][2];
191 /* Store the cell corner of the dimension we communicate along */
192 buf
.p1_0
= comm
->cell_x0
[dim
];
198 buf_s
[pos
] = (dd
->ndim
== 2) ? comm
->zone_d1
[0] : comm
->zone_d2
[0][0];
201 if (dd
->ndim
== 3 && d
== 0)
203 buf_s
[pos
] = comm
->zone_d2
[0][1];
205 buf_s
[pos
] = comm
->zone_d1
[0];
209 /* We only need to communicate the extremes
210 * in the forward direction
212 int numPulses
= comm
->cd
[d
].numPulses();
216 /* Take the minimum to avoid double communication */
217 numPulsesMin
= std::min(numPulses
, dd
->nc
[dim
] - 1 - numPulses
);
221 /* Without PBC we should really not communicate over
222 * the boundaries, but implementing that complicates
223 * the communication setup and therefore we simply
224 * do all communication, but ignore some data.
226 numPulsesMin
= numPulses
;
228 for (int pulse
= 0; pulse
< numPulsesMin
; pulse
++)
230 /* Communicate the extremes forward */
231 bool receiveValidData
= (applyPbc
|| dd
->ci
[dim
] > 0);
233 int numElements
= dd
->ndim
- d
- 1;
234 ddSendrecv(dd
, d
, dddirForward
,
235 extr_s
+ d
, numElements
,
236 extr_r
+ d
, numElements
);
238 if (receiveValidData
)
240 for (int d1
= d
; d1
< dd
->ndim
- 1; d1
++)
242 extr_s
[d1
][0] = std::max(extr_s
[d1
][0], extr_r
[d1
][0]);
243 extr_s
[d1
][1] = std::min(extr_s
[d1
][1], extr_r
[d1
][1]);
244 extr_s
[d1
][2] = std::min(extr_s
[d1
][2], extr_r
[d1
][2]);
249 const int numElementsInBuffer
= pos
;
250 for (int pulse
= 0; pulse
< numPulses
; pulse
++)
252 /* Communicate all the zone information backward */
253 bool receiveValidData
= (applyPbc
|| dd
->ci
[dim
] < dd
->nc
[dim
] - 1);
255 static_assert(sizeof(gmx_ddzone_t
) == c_ddzoneNumReals
*sizeof(real
), "Here we expect gmx_ddzone_t to consist of c_ddzoneNumReals reals (only)");
257 int numReals
= numElementsInBuffer
*c_ddzoneNumReals
;
258 ddSendrecv(dd
, d
, dddirBackward
,
259 gmx::arrayRefFromArray(&buf_s
[0].min0
, numReals
),
260 gmx::arrayRefFromArray(&buf_r
[0].min0
, numReals
));
265 for (int d1
= d
+ 1; d1
< dd
->ndim
; d1
++)
267 /* Determine the decrease of maximum required
268 * communication height along d1 due to the distance along d,
269 * this avoids a lot of useless atom communication.
271 real dist_d
= comm
->cell_x1
[dim
] - buf_r
[0].p1_0
;
274 if (ddbox
->tric_dir
[dim
])
276 /* c is the off-diagonal coupling between the cell planes
277 * along directions d and d1.
279 c
= ddbox
->v
[dim
][dd
->dim
[d1
]][dim
];
285 real det
= (1 + c
*c
)*comm
->cutoff
*comm
->cutoff
- dist_d
*dist_d
;
288 dh
[d1
] = comm
->cutoff
- (c
*dist_d
+ std::sqrt(det
))/(1 + c
*c
);
292 /* A negative value signals out of range */
298 /* Accumulate the extremes over all pulses */
299 for (int i
= 0; i
< numElementsInBuffer
; i
++)
307 if (receiveValidData
)
309 buf_e
[i
].min0
= std::min(buf_e
[i
].min0
, buf_r
[i
].min0
);
310 buf_e
[i
].max1
= std::max(buf_e
[i
].max1
, buf_r
[i
].max1
);
311 buf_e
[i
].min1
= std::min(buf_e
[i
].min1
, buf_r
[i
].min1
);
315 if (dd
->ndim
== 3 && d
== 0 && i
== numElementsInBuffer
- 1)
323 if (receiveValidData
&& dh
[d1
] >= 0)
325 buf_e
[i
].mch0
= std::max(buf_e
[i
].mch0
, buf_r
[i
].mch0
-dh
[d1
]);
326 buf_e
[i
].mch1
= std::max(buf_e
[i
].mch1
, buf_r
[i
].mch1
-dh
[d1
]);
329 /* Copy the received buffer to the send buffer,
330 * to pass the data through with the next pulse.
334 if (((applyPbc
|| dd
->ci
[dim
] + numPulses
< dd
->nc
[dim
]) && pulse
== numPulses
- 1) ||
335 (!applyPbc
&& dd
->ci
[dim
] + 1 + pulse
== dd
->nc
[dim
] - 1))
337 /* Store the extremes */
340 for (int d1
= d
; d1
< dd
->ndim
-1; d1
++)
342 extr_s
[d1
][1] = std::min(extr_s
[d1
][1], buf_e
[pos
].min0
);
343 extr_s
[d1
][0] = std::max(extr_s
[d1
][0], buf_e
[pos
].max1
);
344 extr_s
[d1
][2] = std::min(extr_s
[d1
][2], buf_e
[pos
].min1
);
348 if (d
== 1 || (d
== 0 && dd
->ndim
== 3))
350 for (int i
= d
; i
< 2; i
++)
352 comm
->zone_d2
[1-d
][i
] = buf_e
[pos
];
358 comm
->zone_d1
[1] = buf_e
[pos
];
364 if (d
== 1 || (d
== 0 && dd
->ndim
== 3))
366 for (int i
= d
; i
< 2; i
++)
368 comm
->zone_d2
[1 - d
][i
].dataSet
= 0;
373 comm
->zone_d1
[1].dataSet
= 0;
381 int dim
= dd
->dim
[1];
382 for (int i
= 0; i
< 2; i
++)
384 if (comm
->zone_d1
[i
].dataSet
!= 0)
388 print_ddzone(debug
, 1, i
, 0, &comm
->zone_d1
[i
]);
390 cell_ns_x0
[dim
] = std::min(cell_ns_x0
[dim
], comm
->zone_d1
[i
].min0
);
391 cell_ns_x1
[dim
] = std::max(cell_ns_x1
[dim
], comm
->zone_d1
[i
].max1
);
397 int dim
= dd
->dim
[2];
398 for (int i
= 0; i
< 2; i
++)
400 for (int j
= 0; j
< 2; j
++)
402 if (comm
->zone_d2
[i
][j
].dataSet
!= 0)
406 print_ddzone(debug
, 2, i
, j
, &comm
->zone_d2
[i
][j
]);
408 cell_ns_x0
[dim
] = std::min(cell_ns_x0
[dim
], comm
->zone_d2
[i
][j
].min0
);
409 cell_ns_x1
[dim
] = std::max(cell_ns_x1
[dim
], comm
->zone_d2
[i
][j
].max1
);
414 for (int d
= 1; d
< dd
->ndim
; d
++)
416 cellsizes
[d
].fracLowerMax
= extr_s
[d
-1][0];
417 cellsizes
[d
].fracUpperMin
= extr_s
[d
-1][1];
420 fprintf(debug
, "Cell fraction d %d, max0 %f, min1 %f\n",
421 d
, cellsizes
[d
].fracLowerMax
, cellsizes
[d
].fracUpperMin
);
426 //! Sets the charge-group zones to be equal to the home zone.
427 static void set_zones_ncg_home(gmx_domdec_t
*dd
)
429 gmx_domdec_zones_t
*zones
;
432 zones
= &dd
->comm
->zones
;
434 zones
->cg_range
[0] = 0;
435 for (i
= 1; i
< zones
->n
+1; i
++)
437 zones
->cg_range
[i
] = dd
->ncg_home
;
439 /* zone_ncg1[0] should always be equal to ncg_home */
440 dd
->comm
->zone_ncg1
[0] = dd
->ncg_home
;
443 //! Restore atom groups for the charge groups.
444 static void restoreAtomGroups(gmx_domdec_t
*dd
,
445 const int *gcgs_index
, const t_state
*state
)
447 gmx::ArrayRef
<const int> atomGroupsState
= state
->cg_gl
;
449 std::vector
<int> &globalAtomGroupIndices
= dd
->globalAtomGroupIndices
;
450 gmx::RangePartitioning
&atomGrouping
= dd
->atomGrouping_
;
452 globalAtomGroupIndices
.resize(atomGroupsState
.size());
453 atomGrouping
.clear();
455 /* Copy back the global charge group indices from state
456 * and rebuild the local charge group to atom index.
458 for (gmx::index i
= 0; i
< atomGroupsState
.ssize(); i
++)
460 const int atomGroupGlobal
= atomGroupsState
[i
];
461 const int groupSize
= gcgs_index
[atomGroupGlobal
+ 1] - gcgs_index
[atomGroupGlobal
];
462 globalAtomGroupIndices
[i
] = atomGroupGlobal
;
463 atomGrouping
.appendBlock(groupSize
);
466 dd
->ncg_home
= atomGroupsState
.size();
467 dd
->comm
->atomRanges
.setEnd(DDAtomRanges::Type::Home
, atomGrouping
.fullRange().end());
469 set_zones_ncg_home(dd
);
472 //! Sets the cginfo structures.
473 static void dd_set_cginfo(gmx::ArrayRef
<const int> index_gl
, int cg0
, int cg1
,
474 t_forcerec
*fr
, char *bLocalCG
)
476 cginfo_mb_t
*cginfo_mb
;
482 cginfo_mb
= fr
->cginfo_mb
;
485 for (cg
= cg0
; cg
< cg1
; cg
++)
487 cginfo
[cg
] = ddcginfo(cginfo_mb
, index_gl
[cg
]);
491 if (bLocalCG
!= nullptr)
493 for (cg
= cg0
; cg
< cg1
; cg
++)
495 bLocalCG
[index_gl
[cg
]] = TRUE
;
500 //! Makes the mappings between global and local atom indices during DD repartioning.
501 static void make_dd_indices(gmx_domdec_t
*dd
,
502 const int *gcgs_index
, int cg_start
)
504 const int numZones
= dd
->comm
->zones
.n
;
505 const int *zone2cg
= dd
->comm
->zones
.cg_range
;
506 const int *zone_ncg1
= dd
->comm
->zone_ncg1
;
507 gmx::ArrayRef
<const int> globalAtomGroupIndices
= dd
->globalAtomGroupIndices
;
508 const gmx_bool bCGs
= dd
->comm
->bCGs
;
510 std::vector
<int> &globalAtomIndices
= dd
->globalAtomIndices
;
511 gmx_ga2la_t
&ga2la
= *dd
->ga2la
;
513 if (zone2cg
[1] != dd
->ncg_home
)
515 gmx_incons("dd->ncg_zone is not up to date");
518 /* Make the local to global and global to local atom index */
519 int a
= dd
->atomGrouping().subRange(cg_start
, cg_start
).begin();
520 globalAtomIndices
.resize(a
);
521 for (int zone
= 0; zone
< numZones
; zone
++)
532 int cg1
= zone2cg
[zone
+1];
533 int cg1_p1
= cg0
+ zone_ncg1
[zone
];
535 for (int cg
= cg0
; cg
< cg1
; cg
++)
540 /* Signal that this cg is from more than one pulse away */
543 int cg_gl
= globalAtomGroupIndices
[cg
];
546 for (int a_gl
= gcgs_index
[cg_gl
]; a_gl
< gcgs_index
[cg_gl
+1]; a_gl
++)
548 globalAtomIndices
.push_back(a_gl
);
549 ga2la
.insert(a_gl
, {a
, zone1
});
555 globalAtomIndices
.push_back(cg_gl
);
556 ga2la
.insert(cg_gl
, {a
, zone1
});
563 //! Checks the charge-group assignements.
564 static int check_bLocalCG(gmx_domdec_t
*dd
, int ncg_sys
, const char *bLocalCG
,
568 if (bLocalCG
== nullptr)
572 for (size_t i
= 0; i
< dd
->globalAtomGroupIndices
.size(); i
++)
574 if (!bLocalCG
[dd
->globalAtomGroupIndices
[i
]])
577 "DD rank %d, %s: atom group %zu, global atom group %d is not marked in bLocalCG (ncg_home %d)\n", dd
->rank
, where
, i
+ 1, dd
->globalAtomGroupIndices
[i
] + 1, dd
->ncg_home
);
582 for (int i
= 0; i
< ncg_sys
; i
++)
589 if (ngl
!= dd
->globalAtomGroupIndices
.size())
591 fprintf(stderr
, "DD rank %d, %s: In bLocalCG %zu atom groups are marked as local, whereas there are %zu\n", dd
->rank
, where
, ngl
, dd
->globalAtomGroupIndices
.size());
598 //! Checks whether global and local atom indices are consistent.
599 static void check_index_consistency(gmx_domdec_t
*dd
,
600 int natoms_sys
, int ncg_sys
,
605 const int numAtomsInZones
= dd
->comm
->atomRanges
.end(DDAtomRanges::Type::Zones
);
607 if (dd
->comm
->DD_debug
> 1)
609 std::vector
<int> have(natoms_sys
);
610 for (int a
= 0; a
< numAtomsInZones
; a
++)
612 int globalAtomIndex
= dd
->globalAtomIndices
[a
];
613 if (have
[globalAtomIndex
] > 0)
615 fprintf(stderr
, "DD rank %d: global atom %d occurs twice: index %d and %d\n", dd
->rank
, globalAtomIndex
+ 1, have
[globalAtomIndex
], a
+1);
619 have
[globalAtomIndex
] = a
+ 1;
624 std::vector
<int> have(numAtomsInZones
);
627 for (int i
= 0; i
< natoms_sys
; i
++)
629 if (const auto entry
= dd
->ga2la
->find(i
))
631 const int a
= entry
->la
;
632 if (a
>= numAtomsInZones
)
634 fprintf(stderr
, "DD rank %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n", dd
->rank
, i
+1, a
+1, numAtomsInZones
);
640 if (dd
->globalAtomIndices
[a
] != i
)
642 fprintf(stderr
, "DD rank %d: global atom %d marked as local atom %d, which has global atom index %d\n", dd
->rank
, i
+1, a
+1, dd
->globalAtomIndices
[a
]+1);
649 if (ngl
!= numAtomsInZones
)
652 "DD rank %d, %s: %d global atom indices, %d local atoms\n",
653 dd
->rank
, where
, ngl
, numAtomsInZones
);
655 for (int a
= 0; a
< numAtomsInZones
; a
++)
660 "DD rank %d, %s: local atom %d, global %d has no global index\n",
661 dd
->rank
, where
, a
+ 1, dd
->globalAtomIndices
[a
] + 1);
665 nerr
+= check_bLocalCG(dd
, ncg_sys
, dd
->comm
->bLocalCG
, where
);
669 gmx_fatal(FARGS
, "DD rank %d, %s: %d atom(group) index inconsistencies",
670 dd
->rank
, where
, nerr
);
674 //! Clear all DD global state indices, starting from \p atomGroupStart and \p atomStart
675 static void clearDDStateIndices(gmx_domdec_t
*dd
,
679 gmx_ga2la_t
&ga2la
= *dd
->ga2la
;
683 /* Clear the whole list without the overhead of searching */
688 const int numAtomsInZones
= dd
->comm
->atomRanges
.end(DDAtomRanges::Type::Zones
);
689 for (int i
= 0; i
< numAtomsInZones
; i
++)
691 ga2la
.erase(dd
->globalAtomIndices
[i
]);
695 char *bLocalCG
= dd
->comm
->bLocalCG
;
698 for (size_t atomGroup
= atomGroupStart
; atomGroup
< dd
->globalAtomGroupIndices
.size(); atomGroup
++)
700 bLocalCG
[dd
->globalAtomGroupIndices
[atomGroup
]] = FALSE
;
704 dd_clear_local_vsite_indices(dd
);
708 dd_clear_local_constraint_indices(dd
);
712 bool check_grid_jump(int64_t step
,
713 const gmx_domdec_t
*dd
,
715 const gmx_ddbox_t
*ddbox
,
718 gmx_domdec_comm_t
*comm
= dd
->comm
;
719 bool invalid
= false;
721 for (int d
= 1; d
< dd
->ndim
; d
++)
723 const DDCellsizesWithDlb
&cellsizes
= comm
->cellsizesWithDlb
[d
];
724 const int dim
= dd
->dim
[d
];
725 const real limit
= grid_jump_limit(comm
, cutoff
, d
);
726 real bfac
= ddbox
->box_size
[dim
];
727 if (ddbox
->tric_dir
[dim
])
729 bfac
*= ddbox
->skew_fac
[dim
];
731 if ((cellsizes
.fracUpper
- cellsizes
.fracLowerMax
)*bfac
< limit
||
732 (cellsizes
.fracLower
- cellsizes
.fracUpperMin
)*bfac
> -limit
)
740 /* This error should never be triggered under normal
741 * circumstances, but you never know ...
743 gmx_fatal(FARGS
, "step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with fewer ranks might avoid this issue.",
744 gmx_step_str(step
, buf
),
745 dim2char(dim
), dd
->ci
[XX
], dd
->ci
[YY
], dd
->ci
[ZZ
]);
752 //! Return the duration of force calculations on this rank.
753 static float dd_force_load(gmx_domdec_comm_t
*comm
)
762 load
*= 1.0 + (comm
->eFlop
- 1)*(0.1*rand()/RAND_MAX
- 0.05);
767 load
= comm
->cycl
[ddCyclF
];
768 if (comm
->cycl_n
[ddCyclF
] > 1)
770 /* Subtract the maximum of the last n cycle counts
771 * to get rid of possible high counts due to other sources,
772 * for instance system activity, that would otherwise
773 * affect the dynamic load balancing.
775 load
-= comm
->cycl_max
[ddCyclF
];
779 if (comm
->cycl_n
[ddCyclWaitGPU
] && comm
->nrank_gpu_shared
> 1)
781 float gpu_wait
, gpu_wait_sum
;
783 gpu_wait
= comm
->cycl
[ddCyclWaitGPU
];
784 if (comm
->cycl_n
[ddCyclF
] > 1)
786 /* We should remove the WaitGPU time of the same MD step
787 * as the one with the maximum F time, since the F time
788 * and the wait time are not independent.
789 * Furthermore, the step for the max F time should be chosen
790 * the same on all ranks that share the same GPU.
791 * But to keep the code simple, we remove the average instead.
792 * The main reason for artificially long times at some steps
793 * is spurious CPU activity or MPI time, so we don't expect
794 * that changes in the GPU wait time matter a lot here.
796 gpu_wait
*= (comm
->cycl_n
[ddCyclF
] - 1)/static_cast<float>(comm
->cycl_n
[ddCyclF
]);
798 /* Sum the wait times over the ranks that share the same GPU */
799 MPI_Allreduce(&gpu_wait
, &gpu_wait_sum
, 1, MPI_FLOAT
, MPI_SUM
,
800 comm
->mpi_comm_gpu_shared
);
801 /* Replace the wait time by the average over the ranks */
802 load
+= -gpu_wait
+ gpu_wait_sum
/comm
->nrank_gpu_shared
;
810 //! Runs cell size checks and communicates the boundaries.
811 static void comm_dd_ns_cell_sizes(gmx_domdec_t
*dd
,
813 rvec cell_ns_x0
, rvec cell_ns_x1
,
816 gmx_domdec_comm_t
*comm
;
821 for (dim_ind
= 0; dim_ind
< dd
->ndim
; dim_ind
++)
823 dim
= dd
->dim
[dim_ind
];
825 /* Without PBC we don't have restrictions on the outer cells */
826 if (!(dim
>= ddbox
->npbcdim
&&
827 (dd
->ci
[dim
] == 0 || dd
->ci
[dim
] == dd
->nc
[dim
] - 1)) &&
829 (comm
->cell_x1
[dim
] - comm
->cell_x0
[dim
])*ddbox
->skew_fac
[dim
] <
830 comm
->cellsize_min
[dim
])
833 gmx_fatal(FARGS
, "step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
834 gmx_step_str(step
, buf
), dim2char(dim
),
835 comm
->cell_x1
[dim
] - comm
->cell_x0
[dim
],
836 ddbox
->skew_fac
[dim
],
837 dd
->comm
->cellsize_min
[dim
],
838 dd
->ci
[XX
], dd
->ci
[YY
], dd
->ci
[ZZ
]);
842 if ((isDlbOn(dd
->comm
) && dd
->ndim
> 1) || ddbox
->nboundeddim
< DIM
)
844 /* Communicate the boundaries and update cell_ns_x0/1 */
845 dd_move_cellx(dd
, ddbox
, cell_ns_x0
, cell_ns_x1
);
846 if (isDlbOn(dd
->comm
) && dd
->ndim
> 1)
848 check_grid_jump(step
, dd
, dd
->comm
->cutoff
, ddbox
, TRUE
);
853 //! Compute and communicate to determine the load distribution across PP ranks.
854 static void get_load_distribution(gmx_domdec_t
*dd
, gmx_wallcycle_t wcycle
)
856 gmx_domdec_comm_t
*comm
;
858 float cell_frac
= 0, sbuf
[DD_NLOAD_MAX
];
863 fprintf(debug
, "get_load_distribution start\n");
866 wallcycle_start(wcycle
, ewcDDCOMMLOAD
);
870 bSepPME
= (dd
->pme_nodeid
>= 0);
872 if (dd
->ndim
== 0 && bSepPME
)
874 /* Without decomposition, but with PME nodes, we need the load */
875 comm
->load
[0].mdf
= comm
->cycl
[ddCyclPPduringPME
];
876 comm
->load
[0].pme
= comm
->cycl
[ddCyclPME
];
879 for (int d
= dd
->ndim
- 1; d
>= 0; d
--)
881 const DDCellsizesWithDlb
*cellsizes
= (isDlbOn(dd
->comm
) ? &comm
->cellsizesWithDlb
[d
] : nullptr);
882 const int dim
= dd
->dim
[d
];
883 /* Check if we participate in the communication in this dimension */
884 if (d
== dd
->ndim
-1 ||
885 (dd
->ci
[dd
->dim
[d
+1]] == 0 && dd
->ci
[dd
->dim
[dd
->ndim
-1]] == 0))
887 load
= &comm
->load
[d
];
888 if (isDlbOn(dd
->comm
))
890 cell_frac
= cellsizes
->fracUpper
- cellsizes
->fracLower
;
895 sbuf
[pos
++] = dd_force_load(comm
);
896 sbuf
[pos
++] = sbuf
[0];
897 if (isDlbOn(dd
->comm
))
899 sbuf
[pos
++] = sbuf
[0];
900 sbuf
[pos
++] = cell_frac
;
903 sbuf
[pos
++] = cellsizes
->fracLowerMax
;
904 sbuf
[pos
++] = cellsizes
->fracUpperMin
;
909 sbuf
[pos
++] = comm
->cycl
[ddCyclPPduringPME
];
910 sbuf
[pos
++] = comm
->cycl
[ddCyclPME
];
915 sbuf
[pos
++] = comm
->load
[d
+1].sum
;
916 sbuf
[pos
++] = comm
->load
[d
+1].max
;
917 if (isDlbOn(dd
->comm
))
919 sbuf
[pos
++] = comm
->load
[d
+1].sum_m
;
920 sbuf
[pos
++] = comm
->load
[d
+1].cvol_min
*cell_frac
;
921 sbuf
[pos
++] = comm
->load
[d
+1].flags
;
924 sbuf
[pos
++] = cellsizes
->fracLowerMax
;
925 sbuf
[pos
++] = cellsizes
->fracUpperMin
;
930 sbuf
[pos
++] = comm
->load
[d
+1].mdf
;
931 sbuf
[pos
++] = comm
->load
[d
+1].pme
;
935 /* Communicate a row in DD direction d.
936 * The communicators are setup such that the root always has rank 0.
939 MPI_Gather(sbuf
, load
->nload
*sizeof(float), MPI_BYTE
,
940 load
->load
, load
->nload
*sizeof(float), MPI_BYTE
,
941 0, comm
->mpi_comm_load
[d
]);
943 if (dd
->ci
[dim
] == dd
->master_ci
[dim
])
945 /* We are the master along this row, process this row */
946 RowMaster
*rowMaster
= nullptr;
950 rowMaster
= cellsizes
->rowMaster
.get();
960 for (int i
= 0; i
< dd
->nc
[dim
]; i
++)
962 load
->sum
+= load
->load
[pos
++];
963 load
->max
= std::max(load
->max
, load
->load
[pos
]);
965 if (isDlbOn(dd
->comm
))
967 if (rowMaster
->dlbIsLimited
)
969 /* This direction could not be load balanced properly,
970 * therefore we need to use the maximum iso the average load.
972 load
->sum_m
= std::max(load
->sum_m
, load
->load
[pos
]);
976 load
->sum_m
+= load
->load
[pos
];
979 load
->cvol_min
= std::min(load
->cvol_min
, load
->load
[pos
]);
983 load
->flags
= gmx::roundToInt(load
->load
[pos
++]);
987 rowMaster
->bounds
[i
].cellFracLowerMax
= load
->load
[pos
++];
988 rowMaster
->bounds
[i
].cellFracUpperMin
= load
->load
[pos
++];
993 load
->mdf
= std::max(load
->mdf
, load
->load
[pos
]);
995 load
->pme
= std::max(load
->pme
, load
->load
[pos
]);
999 if (isDlbOn(comm
) && rowMaster
->dlbIsLimited
)
1001 load
->sum_m
*= dd
->nc
[dim
];
1002 load
->flags
|= (1<<d
);
1010 comm
->nload
+= dd_load_count(comm
);
1011 comm
->load_step
+= comm
->cycl
[ddCyclStep
];
1012 comm
->load_sum
+= comm
->load
[0].sum
;
1013 comm
->load_max
+= comm
->load
[0].max
;
1016 for (int d
= 0; d
< dd
->ndim
; d
++)
1018 if (comm
->load
[0].flags
& (1<<d
))
1020 comm
->load_lim
[d
]++;
1026 comm
->load_mdf
+= comm
->load
[0].mdf
;
1027 comm
->load_pme
+= comm
->load
[0].pme
;
1031 wallcycle_stop(wcycle
, ewcDDCOMMLOAD
);
1035 fprintf(debug
, "get_load_distribution finished\n");
1039 /*! \brief Return the relative performance loss on the total run time
1040 * due to the force calculation load imbalance. */
1041 static float dd_force_load_fraction(gmx_domdec_t
*dd
)
1043 if (dd
->comm
->nload
> 0 && dd
->comm
->load_step
> 0)
1045 return dd
->comm
->load_sum
/(dd
->comm
->load_step
*dd
->nnodes
);
1053 /*! \brief Return the relative performance loss on the total run time
1054 * due to the force calculation load imbalance. */
1055 static float dd_force_imb_perf_loss(gmx_domdec_t
*dd
)
1057 if (dd
->comm
->nload
> 0 && dd
->comm
->load_step
> 0)
1060 (dd
->comm
->load_max
*dd
->nnodes
- dd
->comm
->load_sum
)/
1061 (dd
->comm
->load_step
*dd
->nnodes
);
1069 //! Print load-balance report e.g. at the end of a run.
1070 static void print_dd_load_av(FILE *fplog
, gmx_domdec_t
*dd
)
1072 gmx_domdec_comm_t
*comm
= dd
->comm
;
1074 /* Only the master rank prints loads and only if we measured loads */
1075 if (!DDMASTER(dd
) || comm
->nload
== 0)
1081 int numPpRanks
= dd
->nnodes
;
1082 int numPmeRanks
= (dd
->pme_nodeid
>= 0) ? comm
->npmenodes
: 0;
1083 int numRanks
= numPpRanks
+ numPmeRanks
;
1084 float lossFraction
= 0;
1086 /* Print the average load imbalance and performance loss */
1087 if (dd
->nnodes
> 1 && comm
->load_sum
> 0)
1089 float imbalance
= comm
->load_max
*numPpRanks
/comm
->load_sum
- 1;
1090 lossFraction
= dd_force_imb_perf_loss(dd
);
1092 std::string msg
= "\nDynamic load balancing report:\n";
1093 std::string dlbStateStr
;
1095 switch (dd
->comm
->dlbState
)
1097 case DlbState::offUser
:
1098 dlbStateStr
= "DLB was off during the run per user request.";
1100 case DlbState::offForever
:
1101 /* Currectly this can happen due to performance loss observed, cell size
1102 * limitations or incompatibility with other settings observed during
1103 * determineInitialDlbState(). */
1104 dlbStateStr
= "DLB got disabled because it was unsuitable to use.";
1106 case DlbState::offCanTurnOn
:
1107 dlbStateStr
= "DLB was off during the run due to low measured imbalance.";
1109 case DlbState::offTemporarilyLocked
:
1110 dlbStateStr
= "DLB was locked at the end of the run due to unfinished PP-PME balancing.";
1112 case DlbState::onCanTurnOff
:
1113 dlbStateStr
= "DLB was turned on during the run due to measured imbalance.";
1115 case DlbState::onUser
:
1116 dlbStateStr
= "DLB was permanently on during the run per user request.";
1119 GMX_ASSERT(false, "Undocumented DLB state");
1122 msg
+= " " + dlbStateStr
+ "\n";
1123 msg
+= gmx::formatString(" Average load imbalance: %.1f%%.\n", imbalance
*100);
1124 msg
+= gmx::formatString(" The balanceable part of the MD step is %d%%, load imbalance is computed from this.\n",
1125 gmx::roundToInt(dd_force_load_fraction(dd
)*100));
1126 msg
+= gmx::formatString(" Part of the total run time spent waiting due to load imbalance: %.1f%%.\n",
1128 fprintf(fplog
, "%s", msg
.c_str());
1129 fprintf(stderr
, "\n%s", msg
.c_str());
1132 /* Print during what percentage of steps the load balancing was limited */
1133 bool dlbWasLimited
= false;
1136 sprintf(buf
, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
1137 for (int d
= 0; d
< dd
->ndim
; d
++)
1139 int limitPercentage
= (200*comm
->load_lim
[d
] + 1)/(2*comm
->nload
);
1140 sprintf(buf
+strlen(buf
), " %c %d %%",
1141 dim2char(dd
->dim
[d
]), limitPercentage
);
1142 if (limitPercentage
>= 50)
1144 dlbWasLimited
= true;
1147 sprintf(buf
+ strlen(buf
), "\n");
1148 fprintf(fplog
, "%s", buf
);
1149 fprintf(stderr
, "%s", buf
);
1152 /* Print the performance loss due to separate PME - PP rank imbalance */
1153 float lossFractionPme
= 0;
1154 if (numPmeRanks
> 0 && comm
->load_mdf
> 0 && comm
->load_step
> 0)
1156 float pmeForceRatio
= comm
->load_pme
/comm
->load_mdf
;
1157 lossFractionPme
= (comm
->load_pme
- comm
->load_mdf
)/comm
->load_step
;
1158 if (lossFractionPme
<= 0)
1160 lossFractionPme
*= numPmeRanks
/static_cast<float>(numRanks
);
1164 lossFractionPme
*= numPpRanks
/static_cast<float>(numRanks
);
1166 sprintf(buf
, " Average PME mesh/force load: %5.3f\n", pmeForceRatio
);
1167 fprintf(fplog
, "%s", buf
);
1168 fprintf(stderr
, "%s", buf
);
1169 sprintf(buf
, " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n", std::fabs(lossFractionPme
)*100);
1170 fprintf(fplog
, "%s", buf
);
1171 fprintf(stderr
, "%s", buf
);
1173 fprintf(fplog
, "\n");
1174 fprintf(stderr
, "\n");
1176 if (lossFraction
>= DD_PERF_LOSS_WARN
)
1178 std::string message
= gmx::formatString(
1179 "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
1180 " in the domain decomposition.\n", lossFraction
*100);
1182 bool hadSuggestion
= false;
1185 message
+= " You might want to use dynamic load balancing (option -dlb.)\n";
1186 hadSuggestion
= true;
1188 else if (dlbWasLimited
)
1190 message
+= " You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n";
1191 hadSuggestion
= true;
1193 message
+= gmx::formatString(
1194 " You can %sconsider manually changing the decomposition (option -dd);\n"
1195 " e.g. by using fewer domains along the box dimension in which there is\n"
1196 " considerable inhomogeneity in the simulated system.",
1197 hadSuggestion
? "also " : "");
1200 fprintf(fplog
, "%s\n", message
.c_str());
1201 fprintf(stderr
, "%s\n", message
.c_str());
1203 if (numPmeRanks
> 0 && std::fabs(lossFractionPme
) >= DD_PERF_LOSS_WARN
)
1206 "NOTE: %.1f %% performance was lost because the PME ranks\n"
1207 " had %s work to do than the PP ranks.\n"
1208 " You might want to %s the number of PME ranks\n"
1209 " or %s the cut-off and the grid spacing.\n",
1210 std::fabs(lossFractionPme
*100),
1211 (lossFractionPme
< 0) ? "less" : "more",
1212 (lossFractionPme
< 0) ? "decrease" : "increase",
1213 (lossFractionPme
< 0) ? "decrease" : "increase");
1214 fprintf(fplog
, "%s\n", buf
);
1215 fprintf(stderr
, "%s\n", buf
);
1219 //! Return the minimum communication volume.
1220 static float dd_vol_min(gmx_domdec_t
*dd
)
1222 return dd
->comm
->load
[0].cvol_min
*dd
->nnodes
;
1225 //! Return the DD load flags.
1226 static int dd_load_flags(gmx_domdec_t
*dd
)
1228 return dd
->comm
->load
[0].flags
;
1231 //! Return the reported load imbalance in force calculations.
1232 static float dd_f_imbal(gmx_domdec_t
*dd
)
1234 if (dd
->comm
->load
[0].sum
> 0)
1236 return dd
->comm
->load
[0].max
*dd
->nnodes
/dd
->comm
->load
[0].sum
- 1.0f
;
1240 /* Something is wrong in the cycle counting, report no load imbalance */
1245 //! Returns DD load balance report.
1247 dd_print_load(gmx_domdec_t
*dd
,
1250 gmx::StringOutputStream stream
;
1251 gmx::TextWriter
log(&stream
);
1253 int flags
= dd_load_flags(dd
);
1256 log
.writeString("DD load balancing is limited by minimum cell size in dimension");
1257 for (int d
= 0; d
< dd
->ndim
; d
++)
1261 log
.writeStringFormatted(" %c", dim2char(dd
->dim
[d
]));
1264 log
.ensureLineBreak();
1266 log
.writeString("DD step " + gmx::toString(step
));
1267 if (isDlbOn(dd
->comm
))
1269 log
.writeStringFormatted(" vol min/aver %5.3f%c",
1270 dd_vol_min(dd
), flags
? '!' : ' ');
1274 log
.writeStringFormatted(" load imb.: force %4.1f%%", dd_f_imbal(dd
)*100);
1276 if (dd
->comm
->cycl_n
[ddCyclPME
])
1278 log
.writeStringFormatted(" pme mesh/force %5.3f", dd_pme_f_ratio(dd
));
1280 log
.ensureLineBreak();
1281 return stream
.toString();
1284 //! Prints DD load balance report in mdrun verbose mode.
1285 static void dd_print_load_verbose(gmx_domdec_t
*dd
)
1287 if (isDlbOn(dd
->comm
))
1289 fprintf(stderr
, "vol %4.2f%c ",
1290 dd_vol_min(dd
), dd_load_flags(dd
) ? '!' : ' ');
1294 fprintf(stderr
, "imb F %2d%% ", gmx::roundToInt(dd_f_imbal(dd
)*100));
1296 if (dd
->comm
->cycl_n
[ddCyclPME
])
1298 fprintf(stderr
, "pme/F %4.2f ", dd_pme_f_ratio(dd
));
1302 //! Turns on dynamic load balancing if possible and needed.
1303 static void turn_on_dlb(const gmx::MDLogger
&mdlog
,
1307 gmx_domdec_comm_t
*comm
= dd
->comm
;
1309 real cellsize_min
= comm
->cellsize_min
[dd
->dim
[0]];
1310 for (int d
= 1; d
< dd
->ndim
; d
++)
1312 cellsize_min
= std::min(cellsize_min
, comm
->cellsize_min
[dd
->dim
[d
]]);
1315 /* Turn off DLB if we're too close to the cell size limit. */
1316 if (cellsize_min
< comm
->cellsize_limit
*1.05)
1318 GMX_LOG(mdlog
.info
).appendTextFormatted(
1319 "step %s Measured %.1f %% performance loss due to load imbalance, "
1320 "but the minimum cell size is smaller than 1.05 times the cell size limit. "
1321 "Will no longer try dynamic load balancing.",
1322 gmx::toString(step
).c_str(), dd_force_imb_perf_loss(dd
)*100);
1324 comm
->dlbState
= DlbState::offForever
;
1328 GMX_LOG(mdlog
.info
).appendTextFormatted(
1329 "step %s Turning on dynamic load balancing, because the performance loss due to load imbalance is %.1f %%.",
1330 gmx::toString(step
).c_str(), dd_force_imb_perf_loss(dd
)*100);
1331 comm
->dlbState
= DlbState::onCanTurnOff
;
1333 /* Store the non-DLB performance, so we can check if DLB actually
1334 * improves performance.
1336 GMX_RELEASE_ASSERT(comm
->cycl_n
[ddCyclStep
] > 0, "When we turned on DLB, we should have measured cycles");
1337 comm
->cyclesPerStepBeforeDLB
= comm
->cycl
[ddCyclStep
]/comm
->cycl_n
[ddCyclStep
];
1341 /* We can set the required cell size info here,
1342 * so we do not need to communicate this.
1343 * The grid is completely uniform.
1345 for (int d
= 0; d
< dd
->ndim
; d
++)
1347 RowMaster
*rowMaster
= comm
->cellsizesWithDlb
[d
].rowMaster
.get();
1351 comm
->load
[d
].sum_m
= comm
->load
[d
].sum
;
1353 int nc
= dd
->nc
[dd
->dim
[d
]];
1354 for (int i
= 0; i
< nc
; i
++)
1356 rowMaster
->cellFrac
[i
] = i
/static_cast<real
>(nc
);
1359 rowMaster
->bounds
[i
].cellFracLowerMax
= i
/static_cast<real
>(nc
);
1360 rowMaster
->bounds
[i
].cellFracUpperMin
= (i
+ 1)/static_cast<real
>(nc
);
1363 rowMaster
->cellFrac
[nc
] = 1.0;
1368 //! Turns off dynamic load balancing (but leave it able to turn back on).
1369 static void turn_off_dlb(const gmx::MDLogger
&mdlog
,
1373 GMX_LOG(mdlog
.info
).appendText(
1374 "step " + gmx::toString(step
) + " Turning off dynamic load balancing, because it is degrading performance.");
1375 dd
->comm
->dlbState
= DlbState::offCanTurnOn
;
1376 dd
->comm
->haveTurnedOffDlb
= true;
1377 dd
->comm
->ddPartioningCountFirstDlbOff
= dd
->ddp_count
;
1380 //! Turns off dynamic load balancing permanently.
1381 static void turn_off_dlb_forever(const gmx::MDLogger
&mdlog
,
1385 GMX_RELEASE_ASSERT(dd
->comm
->dlbState
== DlbState::offCanTurnOn
, "Can only turn off DLB forever when it was in the can-turn-on state");
1386 GMX_LOG(mdlog
.info
).appendText(
1387 "step " + gmx::toString(step
) + " Will no longer try dynamic load balancing, as it degraded performance.");
1388 dd
->comm
->dlbState
= DlbState::offForever
;
1391 void set_dd_dlb_max_cutoff(t_commrec
*cr
, real cutoff
)
1393 gmx_domdec_comm_t
*comm
;
1395 comm
= cr
->dd
->comm
;
1397 /* Turn on the DLB limiting (might have been on already) */
1398 comm
->bPMELoadBalDLBLimits
= TRUE
;
1400 /* Change the cut-off limit */
1401 comm
->PMELoadBal_max_cutoff
= cutoff
;
1405 fprintf(debug
, "PME load balancing set a limit to the DLB staggering such that a %f cut-off will continue to fit\n",
1406 comm
->PMELoadBal_max_cutoff
);
1410 //! Merges charge-group buffers.
1411 static void merge_cg_buffers(int ncell
,
1412 gmx_domdec_comm_dim_t
*cd
, int pulse
,
1414 gmx::ArrayRef
<int> index_gl
,
1416 rvec
*cg_cm
, rvec
*recv_vr
,
1417 gmx::ArrayRef
<int> cgindex
,
1418 cginfo_mb_t
*cginfo_mb
, int *cginfo
)
1420 gmx_domdec_ind_t
*ind
, *ind_p
;
1421 int p
, cell
, c
, cg
, cg0
, cg1
, cg_gl
, nat
;
1422 int shift
, shift_at
;
1424 ind
= &cd
->ind
[pulse
];
1426 /* First correct the already stored data */
1427 shift
= ind
->nrecv
[ncell
];
1428 for (cell
= ncell
-1; cell
>= 0; cell
--)
1430 shift
-= ind
->nrecv
[cell
];
1433 /* Move the cg's present from previous grid pulses */
1434 cg0
= ncg_cell
[ncell
+cell
];
1435 cg1
= ncg_cell
[ncell
+cell
+1];
1436 cgindex
[cg1
+shift
] = cgindex
[cg1
];
1437 for (cg
= cg1
-1; cg
>= cg0
; cg
--)
1439 index_gl
[cg
+shift
] = index_gl
[cg
];
1440 copy_rvec(cg_cm
[cg
], cg_cm
[cg
+shift
]);
1441 cgindex
[cg
+shift
] = cgindex
[cg
];
1442 cginfo
[cg
+shift
] = cginfo
[cg
];
1444 /* Correct the already stored send indices for the shift */
1445 for (p
= 1; p
<= pulse
; p
++)
1447 ind_p
= &cd
->ind
[p
];
1449 for (c
= 0; c
< cell
; c
++)
1451 cg0
+= ind_p
->nsend
[c
];
1453 cg1
= cg0
+ ind_p
->nsend
[cell
];
1454 for (cg
= cg0
; cg
< cg1
; cg
++)
1456 ind_p
->index
[cg
] += shift
;
1462 /* Merge in the communicated buffers */
1466 for (cell
= 0; cell
< ncell
; cell
++)
1468 cg1
= ncg_cell
[ncell
+cell
+1] + shift
;
1471 /* Correct the old cg indices */
1472 for (cg
= ncg_cell
[ncell
+cell
]; cg
< cg1
; cg
++)
1474 cgindex
[cg
+1] += shift_at
;
1477 for (cg
= 0; cg
< ind
->nrecv
[cell
]; cg
++)
1479 /* Copy this charge group from the buffer */
1480 index_gl
[cg1
] = recv_i
[cg0
];
1481 copy_rvec(recv_vr
[cg0
], cg_cm
[cg1
]);
1482 /* Add it to the cgindex */
1483 cg_gl
= index_gl
[cg1
];
1484 cginfo
[cg1
] = ddcginfo(cginfo_mb
, cg_gl
);
1485 nat
= GET_CGINFO_NATOMS(cginfo
[cg1
]);
1486 cgindex
[cg1
+1] = cgindex
[cg1
] + nat
;
1491 shift
+= ind
->nrecv
[cell
];
1492 ncg_cell
[ncell
+cell
+1] = cg1
;
1496 //! Makes a range partitioning for the atom groups wthin a cell
1497 static void make_cell2at_index(gmx_domdec_comm_dim_t
*cd
,
1500 const gmx::RangePartitioning
&atomGroups
)
1502 /* Store the atom block boundaries for easy copying of communication buffers
1504 int g
= atomGroupStart
;
1505 for (int zone
= 0; zone
< nzone
; zone
++)
1507 for (gmx_domdec_ind_t
&ind
: cd
->ind
)
1509 const auto range
= atomGroups
.subRange(g
, g
+ ind
.nrecv
[zone
]);
1510 ind
.cell2at0
[zone
] = range
.begin();
1511 ind
.cell2at1
[zone
] = range
.end();
1512 g
+= ind
.nrecv
[zone
];
1517 //! Returns whether a link is missing.
1518 static gmx_bool
missing_link(t_blocka
*link
, int cg_gl
, const char *bLocalCG
)
1524 for (i
= link
->index
[cg_gl
]; i
< link
->index
[cg_gl
+1]; i
++)
1526 if (!bLocalCG
[link
->a
[i
]])
1535 //! Domain corners for communication, a maximum of 4 i-zones see a j domain
1537 //! The corners for the non-bonded communication.
1539 //! Corner for rounding.
1541 //! Corners for rounding.
1543 //! Corners for bounded communication.
1545 //! Corner for rounding for bonded communication.
1549 //! Determine the corners of the domain(s) we are communicating with.
1551 set_dd_corners(const gmx_domdec_t
*dd
,
1552 int dim0
, int dim1
, int dim2
,
1556 const gmx_domdec_comm_t
*comm
;
1557 const gmx_domdec_zones_t
*zones
;
1562 zones
= &comm
->zones
;
1564 /* Keep the compiler happy */
1568 /* The first dimension is equal for all cells */
1569 c
->c
[0][0] = comm
->cell_x0
[dim0
];
1572 c
->bc
[0] = c
->c
[0][0];
1577 /* This cell row is only seen from the first row */
1578 c
->c
[1][0] = comm
->cell_x0
[dim1
];
1579 /* All rows can see this row */
1580 c
->c
[1][1] = comm
->cell_x0
[dim1
];
1581 if (isDlbOn(dd
->comm
))
1583 c
->c
[1][1] = std::max(comm
->cell_x0
[dim1
], comm
->zone_d1
[1].mch0
);
1586 /* For the multi-body distance we need the maximum */
1587 c
->bc
[1] = std::max(comm
->cell_x0
[dim1
], comm
->zone_d1
[1].p1_0
);
1590 /* Set the upper-right corner for rounding */
1591 c
->cr0
= comm
->cell_x1
[dim0
];
1596 for (j
= 0; j
< 4; j
++)
1598 c
->c
[2][j
] = comm
->cell_x0
[dim2
];
1600 if (isDlbOn(dd
->comm
))
1602 /* Use the maximum of the i-cells that see a j-cell */
1603 for (i
= 0; i
< zones
->nizone
; i
++)
1605 for (j
= zones
->izone
[i
].j0
; j
< zones
->izone
[i
].j1
; j
++)
1610 std::max(c
->c
[2][j
-4],
1611 comm
->zone_d2
[zones
->shift
[i
][dim0
]][zones
->shift
[i
][dim1
]].mch0
);
1617 /* For the multi-body distance we need the maximum */
1618 c
->bc
[2] = comm
->cell_x0
[dim2
];
1619 for (i
= 0; i
< 2; i
++)
1621 for (j
= 0; j
< 2; j
++)
1623 c
->bc
[2] = std::max(c
->bc
[2], comm
->zone_d2
[i
][j
].p1_0
);
1629 /* Set the upper-right corner for rounding */
1630 /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
1631 * Only cell (0,0,0) can see cell 7 (1,1,1)
1633 c
->cr1
[0] = comm
->cell_x1
[dim1
];
1634 c
->cr1
[3] = comm
->cell_x1
[dim1
];
1635 if (isDlbOn(dd
->comm
))
1637 c
->cr1
[0] = std::max(comm
->cell_x1
[dim1
], comm
->zone_d1
[1].mch1
);
1640 /* For the multi-body distance we need the maximum */
1641 c
->bcr1
= std::max(comm
->cell_x1
[dim1
], comm
->zone_d1
[1].p1_1
);
1648 /*! \brief Add the atom groups we need to send in this pulse from this
1649 * zone to \p localAtomGroups and \p work. */
1651 get_zone_pulse_cgs(gmx_domdec_t
*dd
,
1652 int zonei
, int zone
,
1654 gmx::ArrayRef
<const int> globalAtomGroupIndices
,
1655 const gmx::RangePartitioning
&atomGroups
,
1656 int dim
, int dim_ind
,
1657 int dim0
, int dim1
, int dim2
,
1658 real r_comm2
, real r_bcomm2
,
1660 bool distanceIsTriclinic
,
1662 real skew_fac2_d
, real skew_fac_01
,
1663 rvec
*v_d
, rvec
*v_0
, rvec
*v_1
,
1664 const dd_corners_t
*c
,
1665 const rvec sf2_round
,
1666 gmx_bool bDistBonded
,
1672 std::vector
<int> *localAtomGroups
,
1673 dd_comm_setup_work_t
*work
)
1675 gmx_domdec_comm_t
*comm
;
1677 gmx_bool bDistMB_pulse
;
1679 real r2
, rb2
, r
, tric_sh
;
1686 bScrew
= (dd
->bScrewPBC
&& dim
== XX
);
1688 bDistMB_pulse
= (bDistMB
&& bDistBonded
);
1690 /* Unpack the work data */
1691 std::vector
<int> &ibuf
= work
->atomGroupBuffer
;
1692 std::vector
<gmx::RVec
> &vbuf
= work
->positionBuffer
;
1696 for (cg
= cg0
; cg
< cg1
; cg
++)
1700 if (!distanceIsTriclinic
)
1702 /* Rectangular direction, easy */
1703 r
= cg_cm
[cg
][dim
] - c
->c
[dim_ind
][zone
];
1710 r
= cg_cm
[cg
][dim
] - c
->bc
[dim_ind
];
1716 /* Rounding gives at most a 16% reduction
1717 * in communicated atoms
1719 if (dim_ind
>= 1 && (zonei
== 1 || zonei
== 2))
1721 r
= cg_cm
[cg
][dim0
] - c
->cr0
;
1722 /* This is the first dimension, so always r >= 0 */
1729 if (dim_ind
== 2 && (zonei
== 2 || zonei
== 3))
1731 r
= cg_cm
[cg
][dim1
] - c
->cr1
[zone
];
1738 r
= cg_cm
[cg
][dim1
] - c
->bcr1
;
1748 /* Triclinic direction, more complicated */
1751 /* Rounding, conservative as the skew_fac multiplication
1752 * will slightly underestimate the distance.
1754 if (dim_ind
>= 1 && (zonei
== 1 || zonei
== 2))
1756 rn
[dim0
] = cg_cm
[cg
][dim0
] - c
->cr0
;
1757 for (i
= dim0
+1; i
< DIM
; i
++)
1759 rn
[dim0
] -= cg_cm
[cg
][i
]*v_0
[i
][dim0
];
1761 r2
= rn
[dim0
]*rn
[dim0
]*sf2_round
[dim0
];
1764 rb
[dim0
] = rn
[dim0
];
1767 /* Take care that the cell planes along dim0 might not
1768 * be orthogonal to those along dim1 and dim2.
1770 for (i
= 1; i
<= dim_ind
; i
++)
1773 if (normal
[dim0
][dimd
] > 0)
1775 rn
[dimd
] -= rn
[dim0
]*normal
[dim0
][dimd
];
1778 rb
[dimd
] -= rb
[dim0
]*normal
[dim0
][dimd
];
1783 if (dim_ind
== 2 && (zonei
== 2 || zonei
== 3))
1785 rn
[dim1
] += cg_cm
[cg
][dim1
] - c
->cr1
[zone
];
1787 for (i
= dim1
+1; i
< DIM
; i
++)
1789 tric_sh
-= cg_cm
[cg
][i
]*v_1
[i
][dim1
];
1791 rn
[dim1
] += tric_sh
;
1794 r2
+= rn
[dim1
]*rn
[dim1
]*sf2_round
[dim1
];
1795 /* Take care of coupling of the distances
1796 * to the planes along dim0 and dim1 through dim2.
1798 r2
-= rn
[dim0
]*rn
[dim1
]*skew_fac_01
;
1799 /* Take care that the cell planes along dim1
1800 * might not be orthogonal to that along dim2.
1802 if (normal
[dim1
][dim2
] > 0)
1804 rn
[dim2
] -= rn
[dim1
]*normal
[dim1
][dim2
];
1810 cg_cm
[cg
][dim1
] - c
->bcr1
+ tric_sh
;
1813 rb2
+= rb
[dim1
]*rb
[dim1
]*sf2_round
[dim1
];
1814 /* Take care of coupling of the distances
1815 * to the planes along dim0 and dim1 through dim2.
1817 rb2
-= rb
[dim0
]*rb
[dim1
]*skew_fac_01
;
1818 /* Take care that the cell planes along dim1
1819 * might not be orthogonal to that along dim2.
1821 if (normal
[dim1
][dim2
] > 0)
1823 rb
[dim2
] -= rb
[dim1
]*normal
[dim1
][dim2
];
1828 /* The distance along the communication direction */
1829 rn
[dim
] += cg_cm
[cg
][dim
] - c
->c
[dim_ind
][zone
];
1831 for (i
= dim
+1; i
< DIM
; i
++)
1833 tric_sh
-= cg_cm
[cg
][i
]*v_d
[i
][dim
];
1838 r2
+= rn
[dim
]*rn
[dim
]*skew_fac2_d
;
1839 /* Take care of coupling of the distances
1840 * to the planes along dim0 and dim1 through dim2.
1842 if (dim_ind
== 1 && zonei
== 1)
1844 r2
-= rn
[dim0
]*rn
[dim
]*skew_fac_01
;
1850 rb
[dim
] += cg_cm
[cg
][dim
] - c
->bc
[dim_ind
] + tric_sh
;
1853 rb2
+= rb
[dim
]*rb
[dim
]*skew_fac2_d
;
1854 /* Take care of coupling of the distances
1855 * to the planes along dim0 and dim1 through dim2.
1857 if (dim_ind
== 1 && zonei
== 1)
1859 rb2
-= rb
[dim0
]*rb
[dim
]*skew_fac_01
;
1867 ((bDistMB
&& rb2
< r_bcomm2
) ||
1868 (bDist2B
&& r2
< r_bcomm2
)) &&
1870 (GET_CGINFO_BOND_INTER(cginfo
[cg
]) &&
1871 missing_link(comm
->cglink
, globalAtomGroupIndices
[cg
],
1874 /* Store the local and global atom group indices and position */
1875 localAtomGroups
->push_back(cg
);
1876 ibuf
.push_back(globalAtomGroupIndices
[cg
]);
1880 if (dd
->ci
[dim
] == 0)
1882 /* Correct cg_cm for pbc */
1883 rvec_add(cg_cm
[cg
], box
[dim
], posPbc
);
1886 posPbc
[YY
] = box
[YY
][YY
] - posPbc
[YY
];
1887 posPbc
[ZZ
] = box
[ZZ
][ZZ
] - posPbc
[ZZ
];
1892 copy_rvec(cg_cm
[cg
], posPbc
);
1894 vbuf
.emplace_back(posPbc
[XX
], posPbc
[YY
], posPbc
[ZZ
]);
1896 nat
+= atomGroups
.block(cg
).size();
1901 work
->nsend_zone
= nsend_z
;
1905 static void clearCommSetupData(dd_comm_setup_work_t
*work
)
1907 work
->localAtomGroupBuffer
.clear();
1908 work
->atomGroupBuffer
.clear();
1909 work
->positionBuffer
.clear();
1911 work
->nsend_zone
= 0;
1914 //! Prepare DD communication.
1915 static void setup_dd_communication(gmx_domdec_t
*dd
,
1916 matrix box
, gmx_ddbox_t
*ddbox
,
1919 PaddedVector
<gmx::RVec
> *f
)
1921 int dim_ind
, dim
, dim0
, dim1
, dim2
, dimd
, nat_tot
;
1922 int nzone
, nzone_send
, zone
, zonei
, cg0
, cg1
;
1923 int c
, i
, cg
, cg_gl
, nrcg
;
1924 int *zone_cg_range
, pos_cg
;
1925 gmx_domdec_comm_t
*comm
;
1926 gmx_domdec_zones_t
*zones
;
1927 gmx_domdec_comm_dim_t
*cd
;
1928 cginfo_mb_t
*cginfo_mb
;
1929 gmx_bool bBondComm
, bDist2B
, bDistMB
, bDistBonded
;
1930 dd_corners_t corners
;
1931 rvec
*cg_cm
, *normal
, *v_d
, *v_0
= nullptr, *v_1
= nullptr;
1932 real skew_fac2_d
, skew_fac_01
;
1937 fprintf(debug
, "Setting up DD communication\n");
1942 if (comm
->dth
.empty())
1944 /* Initialize the thread data.
1945 * This can not be done in init_domain_decomposition,
1946 * as the numbers of threads is determined later.
1948 int numThreads
= gmx_omp_nthreads_get(emntDomdec
);
1949 comm
->dth
.resize(numThreads
);
1952 switch (fr
->cutoff_scheme
)
1958 cg_cm
= state
->x
.rvec_array();
1961 gmx_incons("unimplemented");
1964 bBondComm
= comm
->bBondComm
;
1966 /* Do we need to determine extra distances for multi-body bondeds? */
1967 bDistMB
= (comm
->bInterCGMultiBody
&& isDlbOn(dd
->comm
) && dd
->ndim
> 1);
1969 /* Do we need to determine extra distances for only two-body bondeds? */
1970 bDist2B
= (bBondComm
&& !bDistMB
);
1972 const real r_comm2
= gmx::square(domainToDomainIntoAtomToDomainCutoff(*comm
, comm
->cutoff
));
1973 const real r_bcomm2
= gmx::square(domainToDomainIntoAtomToDomainCutoff(*comm
, comm
->cutoff_mbody
));
1977 fprintf(debug
, "bBondComm %s, r_bc %f\n", gmx::boolToString(bBondComm
), std::sqrt(r_bcomm2
));
1980 zones
= &comm
->zones
;
1983 dim1
= (dd
->ndim
>= 2 ? dd
->dim
[1] : -1);
1984 dim2
= (dd
->ndim
>= 3 ? dd
->dim
[2] : -1);
1986 set_dd_corners(dd
, dim0
, dim1
, dim2
, bDistMB
, &corners
);
1988 /* Triclinic stuff */
1989 normal
= ddbox
->normal
;
1993 v_0
= ddbox
->v
[dim0
];
1994 if (ddbox
->tric_dir
[dim0
] && ddbox
->tric_dir
[dim1
])
1996 /* Determine the coupling coefficient for the distances
1997 * to the cell planes along dim0 and dim1 through dim2.
1998 * This is required for correct rounding.
2001 ddbox
->v
[dim0
][dim1
+1][dim0
]*ddbox
->v
[dim1
][dim1
+1][dim1
];
2004 fprintf(debug
, "\nskew_fac_01 %f\n", skew_fac_01
);
2010 v_1
= ddbox
->v
[dim1
];
2013 zone_cg_range
= zones
->cg_range
;
2014 cginfo_mb
= fr
->cginfo_mb
;
2016 zone_cg_range
[0] = 0;
2017 zone_cg_range
[1] = dd
->ncg_home
;
2018 comm
->zone_ncg1
[0] = dd
->ncg_home
;
2019 pos_cg
= dd
->ncg_home
;
2021 nat_tot
= comm
->atomRanges
.numHomeAtoms();
2023 for (dim_ind
= 0; dim_ind
< dd
->ndim
; dim_ind
++)
2025 dim
= dd
->dim
[dim_ind
];
2026 cd
= &comm
->cd
[dim_ind
];
2028 /* Check if we need to compute triclinic distances along this dim */
2029 bool distanceIsTriclinic
= false;
2030 for (i
= 0; i
<= dim_ind
; i
++)
2032 if (ddbox
->tric_dir
[dd
->dim
[i
]])
2034 distanceIsTriclinic
= true;
2038 if (dim
>= ddbox
->npbcdim
&& dd
->ci
[dim
] == 0)
2040 /* No pbc in this dimension, the first node should not comm. */
2048 v_d
= ddbox
->v
[dim
];
2049 skew_fac2_d
= gmx::square(ddbox
->skew_fac
[dim
]);
2051 cd
->receiveInPlace
= true;
2052 for (int p
= 0; p
< cd
->numPulses(); p
++)
2054 /* Only atoms communicated in the first pulse are used
2055 * for multi-body bonded interactions or for bBondComm.
2057 bDistBonded
= ((bDistMB
|| bDist2B
) && p
== 0);
2059 gmx_domdec_ind_t
*ind
= &cd
->ind
[p
];
2061 /* Thread 0 writes in the global index array */
2063 clearCommSetupData(&comm
->dth
[0]);
2065 for (zone
= 0; zone
< nzone_send
; zone
++)
2067 if (dim_ind
> 0 && distanceIsTriclinic
)
2069 /* Determine slightly more optimized skew_fac's
2071 * This reduces the number of communicated atoms
2072 * by about 10% for 3D DD of rhombic dodecahedra.
2074 for (dimd
= 0; dimd
< dim
; dimd
++)
2076 sf2_round
[dimd
] = 1;
2077 if (ddbox
->tric_dir
[dimd
])
2079 for (i
= dd
->dim
[dimd
]+1; i
< DIM
; i
++)
2081 /* If we are shifted in dimension i
2082 * and the cell plane is tilted forward
2083 * in dimension i, skip this coupling.
2085 if (!(zones
->shift
[nzone
+zone
][i
] &&
2086 ddbox
->v
[dimd
][i
][dimd
] >= 0))
2089 gmx::square(ddbox
->v
[dimd
][i
][dimd
]);
2092 sf2_round
[dimd
] = 1/sf2_round
[dimd
];
2097 zonei
= zone_perm
[dim_ind
][zone
];
2100 /* Here we permutate the zones to obtain a convenient order
2101 * for neighbor searching
2103 cg0
= zone_cg_range
[zonei
];
2104 cg1
= zone_cg_range
[zonei
+1];
2108 /* Look only at the cg's received in the previous grid pulse
2110 cg1
= zone_cg_range
[nzone
+zone
+1];
2111 cg0
= cg1
- cd
->ind
[p
-1].nrecv
[zone
];
2114 const int numThreads
= gmx::ssize(comm
->dth
);
2115 #pragma omp parallel for num_threads(numThreads) schedule(static)
2116 for (int th
= 0; th
< numThreads
; th
++)
2120 dd_comm_setup_work_t
&work
= comm
->dth
[th
];
2122 /* Retain data accumulated into buffers of thread 0 */
2125 clearCommSetupData(&work
);
2128 int cg0_th
= cg0
+ ((cg1
- cg0
)* th
)/numThreads
;
2129 int cg1_th
= cg0
+ ((cg1
- cg0
)*(th
+1))/numThreads
;
2131 /* Get the cg's for this pulse in this zone */
2132 get_zone_pulse_cgs(dd
, zonei
, zone
, cg0_th
, cg1_th
,
2133 dd
->globalAtomGroupIndices
,
2135 dim
, dim_ind
, dim0
, dim1
, dim2
,
2137 box
, distanceIsTriclinic
,
2138 normal
, skew_fac2_d
, skew_fac_01
,
2139 v_d
, v_0
, v_1
, &corners
, sf2_round
,
2140 bDistBonded
, bBondComm
,
2143 th
== 0 ? &ind
->index
: &work
.localAtomGroupBuffer
,
2146 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
;
2149 std::vector
<int> &atomGroups
= comm
->dth
[0].atomGroupBuffer
;
2150 std::vector
<gmx::RVec
> &positions
= comm
->dth
[0].positionBuffer
;
2151 ind
->nsend
[zone
] = comm
->dth
[0].nsend_zone
;
2152 /* Append data of threads>=1 to the communication buffers */
2153 for (int th
= 1; th
< numThreads
; th
++)
2155 const dd_comm_setup_work_t
&dth
= comm
->dth
[th
];
2157 ind
->index
.insert(ind
->index
.end(), dth
.localAtomGroupBuffer
.begin(), dth
.localAtomGroupBuffer
.end());
2158 atomGroups
.insert(atomGroups
.end(), dth
.atomGroupBuffer
.begin(), dth
.atomGroupBuffer
.end());
2159 positions
.insert(positions
.end(), dth
.positionBuffer
.begin(), dth
.positionBuffer
.end());
2160 comm
->dth
[0].nat
+= dth
.nat
;
2161 ind
->nsend
[zone
] += dth
.nsend_zone
;
2164 /* Clear the counts in case we do not have pbc */
2165 for (zone
= nzone_send
; zone
< nzone
; zone
++)
2167 ind
->nsend
[zone
] = 0;
2169 ind
->nsend
[nzone
] = ind
->index
.size();
2170 ind
->nsend
[nzone
+ 1] = comm
->dth
[0].nat
;
2171 /* Communicate the number of cg's and atoms to receive */
2172 ddSendrecv(dd
, dim_ind
, dddirBackward
,
2173 ind
->nsend
, nzone
+2,
2174 ind
->nrecv
, nzone
+2);
2178 /* We can receive in place if only the last zone is not empty */
2179 for (zone
= 0; zone
< nzone
-1; zone
++)
2181 if (ind
->nrecv
[zone
] > 0)
2183 cd
->receiveInPlace
= false;
2188 int receiveBufferSize
= 0;
2189 if (!cd
->receiveInPlace
)
2191 receiveBufferSize
= ind
->nrecv
[nzone
];
2193 /* These buffer are actually only needed with in-place */
2194 DDBufferAccess
<int> globalAtomGroupBuffer(comm
->intBuffer
, receiveBufferSize
);
2195 DDBufferAccess
<gmx::RVec
> rvecBuffer(comm
->rvecBuffer
, receiveBufferSize
);
2197 dd_comm_setup_work_t
&work
= comm
->dth
[0];
2199 /* Make space for the global cg indices */
2200 int numAtomGroupsNew
= pos_cg
+ ind
->nrecv
[nzone
];
2201 dd
->globalAtomGroupIndices
.resize(numAtomGroupsNew
);
2202 /* Communicate the global cg indices */
2203 gmx::ArrayRef
<int> integerBufferRef
;
2204 if (cd
->receiveInPlace
)
2206 integerBufferRef
= gmx::arrayRefFromArray(dd
->globalAtomGroupIndices
.data() + pos_cg
, ind
->nrecv
[nzone
]);
2210 integerBufferRef
= globalAtomGroupBuffer
.buffer
;
2212 ddSendrecv
<int>(dd
, dim_ind
, dddirBackward
,
2213 work
.atomGroupBuffer
, integerBufferRef
);
2215 /* Make space for cg_cm */
2216 dd_check_alloc_ncg(fr
, state
, f
, pos_cg
+ ind
->nrecv
[nzone
]);
2217 if (fr
->cutoff_scheme
== ecutsGROUP
)
2223 cg_cm
= state
->x
.rvec_array();
2225 /* Communicate cg_cm */
2226 gmx::ArrayRef
<gmx::RVec
> rvecBufferRef
;
2227 if (cd
->receiveInPlace
)
2229 rvecBufferRef
= gmx::arrayRefFromArray(reinterpret_cast<gmx::RVec
*>(cg_cm
+ pos_cg
), ind
->nrecv
[nzone
]);
2233 rvecBufferRef
= rvecBuffer
.buffer
;
2235 ddSendrecv
<gmx::RVec
>(dd
, dim_ind
, dddirBackward
,
2236 work
.positionBuffer
, rvecBufferRef
);
2238 /* Make the charge group index */
2239 if (cd
->receiveInPlace
)
2241 zone
= (p
== 0 ? 0 : nzone
- 1);
2242 while (zone
< nzone
)
2244 for (cg
= 0; cg
< ind
->nrecv
[zone
]; cg
++)
2246 cg_gl
= dd
->globalAtomGroupIndices
[pos_cg
];
2247 fr
->cginfo
[pos_cg
] = ddcginfo(cginfo_mb
, cg_gl
);
2248 nrcg
= GET_CGINFO_NATOMS(fr
->cginfo
[pos_cg
]);
2249 dd
->atomGrouping_
.appendBlock(nrcg
);
2252 /* Update the charge group presence,
2253 * so we can use it in the next pass of the loop.
2255 comm
->bLocalCG
[cg_gl
] = TRUE
;
2261 comm
->zone_ncg1
[nzone
+zone
] = ind
->nrecv
[zone
];
2264 zone_cg_range
[nzone
+zone
] = pos_cg
;
2269 /* This part of the code is never executed with bBondComm. */
2270 std::vector
<int> &atomGroupsIndex
= dd
->atomGrouping_
.rawIndex();
2271 atomGroupsIndex
.resize(numAtomGroupsNew
+ 1);
2273 merge_cg_buffers(nzone
, cd
, p
, zone_cg_range
,
2274 dd
->globalAtomGroupIndices
, integerBufferRef
.data(),
2275 cg_cm
, as_rvec_array(rvecBufferRef
.data()),
2277 fr
->cginfo_mb
, fr
->cginfo
);
2278 pos_cg
+= ind
->nrecv
[nzone
];
2280 nat_tot
+= ind
->nrecv
[nzone
+1];
2282 if (!cd
->receiveInPlace
)
2284 /* Store the atom block for easy copying of communication buffers */
2285 make_cell2at_index(cd
, nzone
, zone_cg_range
[nzone
], dd
->atomGrouping());
2290 comm
->atomRanges
.setEnd(DDAtomRanges::Type::Zones
, nat_tot
);
2294 /* We don't need to update cginfo, since that was alrady done above.
2295 * So we pass NULL for the forcerec.
2297 dd_set_cginfo(dd
->globalAtomGroupIndices
,
2298 dd
->ncg_home
, dd
->globalAtomGroupIndices
.size(),
2299 nullptr, comm
->bLocalCG
);
2304 fprintf(debug
, "Finished setting up DD communication, zones:");
2305 for (c
= 0; c
< zones
->n
; c
++)
2307 fprintf(debug
, " %d", zones
->cg_range
[c
+1]-zones
->cg_range
[c
]);
2309 fprintf(debug
, "\n");
2313 //! Set boundaries for the charge group range.
2314 static void set_cg_boundaries(gmx_domdec_zones_t
*zones
)
2318 for (c
= 0; c
< zones
->nizone
; c
++)
2320 zones
->izone
[c
].cg1
= zones
->cg_range
[c
+1];
2321 zones
->izone
[c
].jcg0
= zones
->cg_range
[zones
->izone
[c
].j0
];
2322 zones
->izone
[c
].jcg1
= zones
->cg_range
[zones
->izone
[c
].j1
];
2326 /*! \brief Set zone dimensions for zones \p zone_start to \p zone_end-1
2328 * Also sets the atom density for the home zone when \p zone_start=0.
2329 * For this \p numMovedChargeGroupsInHomeZone needs to be passed to tell
2330 * how many charge groups will move but are still part of the current range.
2331 * \todo When converting domdec to use proper classes, all these variables
2332 * should be private and a method should return the correct count
2333 * depending on an internal state.
2335 * \param[in,out] dd The domain decomposition struct
2336 * \param[in] box The box
2337 * \param[in] ddbox The domain decomposition box struct
2338 * \param[in] zone_start The start of the zone range to set sizes for
2339 * \param[in] zone_end The end of the zone range to set sizes for
2340 * \param[in] numMovedChargeGroupsInHomeZone The number of charge groups in the home zone that should moved but are still present in dd->comm->zones.cg_range
2342 static void set_zones_size(gmx_domdec_t
*dd
,
2343 matrix box
, const gmx_ddbox_t
*ddbox
,
2344 int zone_start
, int zone_end
,
2345 int numMovedChargeGroupsInHomeZone
)
2347 gmx_domdec_comm_t
*comm
;
2348 gmx_domdec_zones_t
*zones
;
2357 zones
= &comm
->zones
;
2359 /* Do we need to determine extra distances for multi-body bondeds? */
2360 bDistMB
= (comm
->bInterCGMultiBody
&& isDlbOn(dd
->comm
) && dd
->ndim
> 1);
2362 for (z
= zone_start
; z
< zone_end
; z
++)
2364 /* Copy cell limits to zone limits.
2365 * Valid for non-DD dims and non-shifted dims.
2367 copy_rvec(comm
->cell_x0
, zones
->size
[z
].x0
);
2368 copy_rvec(comm
->cell_x1
, zones
->size
[z
].x1
);
2371 for (d
= 0; d
< dd
->ndim
; d
++)
2375 for (z
= 0; z
< zones
->n
; z
++)
2377 /* With a staggered grid we have different sizes
2378 * for non-shifted dimensions.
2380 if (isDlbOn(dd
->comm
) && zones
->shift
[z
][dim
] == 0)
2384 zones
->size
[z
].x0
[dim
] = comm
->zone_d1
[zones
->shift
[z
][dd
->dim
[d
-1]]].min0
;
2385 zones
->size
[z
].x1
[dim
] = comm
->zone_d1
[zones
->shift
[z
][dd
->dim
[d
-1]]].max1
;
2389 zones
->size
[z
].x0
[dim
] = comm
->zone_d2
[zones
->shift
[z
][dd
->dim
[d
-2]]][zones
->shift
[z
][dd
->dim
[d
-1]]].min0
;
2390 zones
->size
[z
].x1
[dim
] = comm
->zone_d2
[zones
->shift
[z
][dd
->dim
[d
-2]]][zones
->shift
[z
][dd
->dim
[d
-1]]].max1
;
2396 rcmbs
= comm
->cutoff_mbody
;
2397 if (ddbox
->tric_dir
[dim
])
2399 rcs
/= ddbox
->skew_fac
[dim
];
2400 rcmbs
/= ddbox
->skew_fac
[dim
];
2403 /* Set the lower limit for the shifted zone dimensions */
2404 for (z
= zone_start
; z
< zone_end
; z
++)
2406 if (zones
->shift
[z
][dim
] > 0)
2409 if (!isDlbOn(dd
->comm
) || d
== 0)
2411 zones
->size
[z
].x0
[dim
] = comm
->cell_x1
[dim
];
2412 zones
->size
[z
].x1
[dim
] = comm
->cell_x1
[dim
] + rcs
;
2416 /* Here we take the lower limit of the zone from
2417 * the lowest domain of the zone below.
2421 zones
->size
[z
].x0
[dim
] =
2422 comm
->zone_d1
[zones
->shift
[z
][dd
->dim
[d
-1]]].min1
;
2428 zones
->size
[z
].x0
[dim
] =
2429 zones
->size
[zone_perm
[2][z
-4]].x0
[dim
];
2433 zones
->size
[z
].x0
[dim
] =
2434 comm
->zone_d2
[zones
->shift
[z
][dd
->dim
[d
-2]]][zones
->shift
[z
][dd
->dim
[d
-1]]].min1
;
2437 /* A temporary limit, is updated below */
2438 zones
->size
[z
].x1
[dim
] = zones
->size
[z
].x0
[dim
];
2442 for (zi
= 0; zi
< zones
->nizone
; zi
++)
2444 if (zones
->shift
[zi
][dim
] == 0)
2446 /* This takes the whole zone into account.
2447 * With multiple pulses this will lead
2448 * to a larger zone then strictly necessary.
2450 zones
->size
[z
].x1
[dim
] = std::max(zones
->size
[z
].x1
[dim
],
2451 zones
->size
[zi
].x1
[dim
]+rcmbs
);
2459 /* Loop over the i-zones to set the upper limit of each
2462 for (zi
= 0; zi
< zones
->nizone
; zi
++)
2464 if (zones
->shift
[zi
][dim
] == 0)
2466 /* We should only use zones up to zone_end */
2467 int jZoneEnd
= std::min(zones
->izone
[zi
].j1
, zone_end
);
2468 for (z
= zones
->izone
[zi
].j0
; z
< jZoneEnd
; z
++)
2470 if (zones
->shift
[z
][dim
] > 0)
2472 zones
->size
[z
].x1
[dim
] = std::max(zones
->size
[z
].x1
[dim
],
2473 zones
->size
[zi
].x1
[dim
]+rcs
);
2480 for (z
= zone_start
; z
< zone_end
; z
++)
2482 /* Initialization only required to keep the compiler happy */
2483 rvec corner_min
= {0, 0, 0}, corner_max
= {0, 0, 0}, corner
;
2486 /* To determine the bounding box for a zone we need to find
2487 * the extreme corners of 4, 2 or 1 corners.
2489 nc
= 1 << (ddbox
->nboundeddim
- 1);
2491 for (c
= 0; c
< nc
; c
++)
2493 /* Set up a zone corner at x=0, ignoring trilinic couplings */
2497 corner
[YY
] = zones
->size
[z
].x0
[YY
];
2501 corner
[YY
] = zones
->size
[z
].x1
[YY
];
2505 corner
[ZZ
] = zones
->size
[z
].x0
[ZZ
];
2509 corner
[ZZ
] = zones
->size
[z
].x1
[ZZ
];
2511 if (dd
->ndim
== 1 && dd
->dim
[0] < ZZ
&& ZZ
< dd
->npbcdim
&&
2512 box
[ZZ
][1 - dd
->dim
[0]] != 0)
2514 /* With 1D domain decomposition the cg's are not in
2515 * the triclinic box, but triclinic x-y and rectangular y/x-z.
2516 * Shift the corner of the z-vector back to along the box
2517 * vector of dimension d, so it will later end up at 0 along d.
2518 * This can affect the location of this corner along dd->dim[0]
2519 * through the matrix operation below if box[d][dd->dim[0]]!=0.
2521 int d
= 1 - dd
->dim
[0];
2523 corner
[d
] -= corner
[ZZ
]*box
[ZZ
][d
]/box
[ZZ
][ZZ
];
2525 /* Apply the triclinic couplings */
2526 assert(ddbox
->npbcdim
<= DIM
);
2527 for (i
= YY
; i
< ddbox
->npbcdim
; i
++)
2529 for (j
= XX
; j
< i
; j
++)
2531 corner
[j
] += corner
[i
]*box
[i
][j
]/box
[i
][i
];
2536 copy_rvec(corner
, corner_min
);
2537 copy_rvec(corner
, corner_max
);
2541 for (i
= 0; i
< DIM
; i
++)
2543 corner_min
[i
] = std::min(corner_min
[i
], corner
[i
]);
2544 corner_max
[i
] = std::max(corner_max
[i
], corner
[i
]);
2548 /* Copy the extreme cornes without offset along x */
2549 for (i
= 0; i
< DIM
; i
++)
2551 zones
->size
[z
].bb_x0
[i
] = corner_min
[i
];
2552 zones
->size
[z
].bb_x1
[i
] = corner_max
[i
];
2554 /* Add the offset along x */
2555 zones
->size
[z
].bb_x0
[XX
] += zones
->size
[z
].x0
[XX
];
2556 zones
->size
[z
].bb_x1
[XX
] += zones
->size
[z
].x1
[XX
];
2559 if (zone_start
== 0)
2562 for (dim
= 0; dim
< DIM
; dim
++)
2564 vol
*= zones
->size
[0].x1
[dim
] - zones
->size
[0].x0
[dim
];
2566 zones
->dens_zone0
= (zones
->cg_range
[1] - zones
->cg_range
[0] - numMovedChargeGroupsInHomeZone
)/vol
;
2571 for (z
= zone_start
; z
< zone_end
; z
++)
2573 fprintf(debug
, "zone %d %6.3f - %6.3f %6.3f - %6.3f %6.3f - %6.3f\n",
2575 zones
->size
[z
].x0
[XX
], zones
->size
[z
].x1
[XX
],
2576 zones
->size
[z
].x0
[YY
], zones
->size
[z
].x1
[YY
],
2577 zones
->size
[z
].x0
[ZZ
], zones
->size
[z
].x1
[ZZ
]);
2578 fprintf(debug
, "zone %d bb %6.3f - %6.3f %6.3f - %6.3f %6.3f - %6.3f\n",
2580 zones
->size
[z
].bb_x0
[XX
], zones
->size
[z
].bb_x1
[XX
],
2581 zones
->size
[z
].bb_x0
[YY
], zones
->size
[z
].bb_x1
[YY
],
2582 zones
->size
[z
].bb_x0
[ZZ
], zones
->size
[z
].bb_x1
[ZZ
]);
2587 /*! \brief Order data in \p dataToSort according to \p sort
2589 * Note: both buffers should have at least \p sort.size() elements.
2591 template <typename T
>
2593 orderVector(gmx::ArrayRef
<const gmx_cgsort_t
> sort
,
2594 gmx::ArrayRef
<T
> dataToSort
,
2595 gmx::ArrayRef
<T
> sortBuffer
)
2597 GMX_ASSERT(dataToSort
.size() >= sort
.size(), "The vector needs to be sufficiently large");
2598 GMX_ASSERT(sortBuffer
.size() >= sort
.size(), "The sorting buffer needs to be sufficiently large");
2600 /* Order the data into the temporary buffer */
2602 for (const gmx_cgsort_t
&entry
: sort
)
2604 sortBuffer
[i
++] = dataToSort
[entry
.ind
];
2607 /* Copy back to the original array */
2608 std::copy(sortBuffer
.begin(), sortBuffer
.begin() + sort
.size(),
2609 dataToSort
.begin());
2612 /*! \brief Order data in \p dataToSort according to \p sort
2614 * Note: \p vectorToSort should have at least \p sort.size() elements,
2615 * \p workVector is resized when it is too small.
2617 template <typename T
>
2619 orderVector(gmx::ArrayRef
<const gmx_cgsort_t
> sort
,
2620 gmx::ArrayRef
<T
> vectorToSort
,
2621 std::vector
<T
> *workVector
)
2623 if (gmx::index(workVector
->size()) < sort
.ssize())
2625 workVector
->resize(sort
.size());
2627 orderVector
<T
>(sort
, vectorToSort
, *workVector
);
2630 //! Order vectors of atoms.
2631 static void order_vec_atom(const gmx::RangePartitioning
*atomGroups
,
2632 gmx::ArrayRef
<const gmx_cgsort_t
> sort
,
2633 gmx::ArrayRef
<gmx::RVec
> v
,
2634 gmx::ArrayRef
<gmx::RVec
> buf
)
2636 if (atomGroups
== nullptr)
2638 /* Avoid the useless loop of the atoms within a cg */
2639 orderVector(sort
, v
, buf
);
2644 /* Order the data */
2646 for (const gmx_cgsort_t
&entry
: sort
)
2648 for (int i
: atomGroups
->block(entry
.ind
))
2650 copy_rvec(v
[i
], buf
[a
]);
2656 /* Copy back to the original array */
2657 for (int a
= 0; a
< atot
; a
++)
2659 copy_rvec(buf
[a
], v
[a
]);
2663 //! Returns the sorting order for atoms based on the nbnxn grid order in sort
2664 static void dd_sort_order_nbnxn(const t_forcerec
*fr
,
2665 std::vector
<gmx_cgsort_t
> *sort
)
2667 gmx::ArrayRef
<const int> atomOrder
= fr
->nbv
->getLocalAtomOrder();
2669 /* Using push_back() instead of this resize results in much slower code */
2670 sort
->resize(atomOrder
.size());
2671 gmx::ArrayRef
<gmx_cgsort_t
> buffer
= *sort
;
2672 size_t numSorted
= 0;
2673 for (int i
: atomOrder
)
2677 /* The values of nsc and ind_gl are not used in this case */
2678 buffer
[numSorted
++].ind
= i
;
2681 sort
->resize(numSorted
);
2684 //! Returns the sorting state for DD.
2685 static void dd_sort_state(gmx_domdec_t
*dd
, rvec
*cgcm
, t_forcerec
*fr
, t_state
*state
)
2687 gmx_domdec_sort_t
*sort
= dd
->comm
->sort
.get();
2689 dd_sort_order_nbnxn(fr
, &sort
->sorted
);
2691 const gmx::RangePartitioning
&atomGrouping
= dd
->atomGrouping();
2693 /* We alloc with the old size, since cgindex is still old */
2694 GMX_ASSERT(atomGrouping
.numBlocks() == dd
->ncg_home
, "atomGroups and dd should be consistent");
2695 DDBufferAccess
<gmx::RVec
> rvecBuffer(dd
->comm
->rvecBuffer
, atomGrouping
.fullRange().end());
2697 const gmx::RangePartitioning
*atomGroupsPtr
= (dd
->comm
->bCGs
? &atomGrouping
: nullptr);
2699 /* Set the new home atom/charge group count */
2700 dd
->ncg_home
= sort
->sorted
.size();
2703 fprintf(debug
, "Set the new home %s count to %d\n",
2704 dd
->comm
->bCGs
? "charge group" : "atom",
2708 /* Reorder the state */
2709 gmx::ArrayRef
<const gmx_cgsort_t
> cgsort
= sort
->sorted
;
2710 GMX_RELEASE_ASSERT(cgsort
.ssize() == dd
->ncg_home
, "We should sort all the home atom groups");
2712 if (state
->flags
& (1 << estX
))
2714 order_vec_atom(atomGroupsPtr
, cgsort
, state
->x
, rvecBuffer
.buffer
);
2716 if (state
->flags
& (1 << estV
))
2718 order_vec_atom(atomGroupsPtr
, cgsort
, state
->v
, rvecBuffer
.buffer
);
2720 if (state
->flags
& (1 << estCGP
))
2722 order_vec_atom(atomGroupsPtr
, cgsort
, state
->cg_p
, rvecBuffer
.buffer
);
2725 if (fr
->cutoff_scheme
== ecutsGROUP
)
2728 gmx::ArrayRef
<gmx::RVec
> cgcmRef
= gmx::arrayRefFromArray(reinterpret_cast<gmx::RVec
*>(cgcm
[0]), cgsort
.size());
2729 orderVector(cgsort
, cgcmRef
, rvecBuffer
.buffer
);
2732 /* Reorder the global cg index */
2733 orderVector
<int>(cgsort
, dd
->globalAtomGroupIndices
, &sort
->intBuffer
);
2734 /* Reorder the cginfo */
2735 orderVector
<int>(cgsort
, gmx::arrayRefFromArray(fr
->cginfo
, cgsort
.size()), &sort
->intBuffer
);
2736 /* Rebuild the local cg index */
2739 /* We make a new, ordered atomGroups object and assign it to
2740 * the old one. This causes some allocation overhead, but saves
2741 * a copy back of the whole index.
2743 gmx::RangePartitioning ordered
;
2744 for (const gmx_cgsort_t
&entry
: cgsort
)
2746 ordered
.appendBlock(atomGrouping
.block(entry
.ind
).size());
2748 dd
->atomGrouping_
= ordered
;
2752 dd
->atomGrouping_
.setAllBlocksSizeOne(dd
->ncg_home
);
2754 /* Set the home atom number */
2755 dd
->comm
->atomRanges
.setEnd(DDAtomRanges::Type::Home
, dd
->atomGrouping().fullRange().end());
2757 /* The atoms are now exactly in grid order, update the grid order */
2758 fr
->nbv
->setLocalAtomOrder();
2761 //! Accumulates load statistics.
2762 static void add_dd_statistics(gmx_domdec_t
*dd
)
2764 gmx_domdec_comm_t
*comm
= dd
->comm
;
2766 for (int i
= 0; i
< static_cast<int>(DDAtomRanges::Type::Number
); i
++)
2768 auto range
= static_cast<DDAtomRanges::Type
>(i
);
2770 comm
->atomRanges
.end(range
) - comm
->atomRanges
.start(range
);
2775 void reset_dd_statistics_counters(gmx_domdec_t
*dd
)
2777 gmx_domdec_comm_t
*comm
= dd
->comm
;
2779 /* Reset all the statistics and counters for total run counting */
2780 for (int i
= 0; i
< static_cast<int>(DDAtomRanges::Type::Number
); i
++)
2782 comm
->sum_nat
[i
] = 0;
2786 comm
->load_step
= 0;
2789 clear_ivec(comm
->load_lim
);
2794 void print_dd_statistics(const t_commrec
*cr
, const t_inputrec
*ir
, FILE *fplog
)
2796 gmx_domdec_comm_t
*comm
= cr
->dd
->comm
;
2798 const int numRanges
= static_cast<int>(DDAtomRanges::Type::Number
);
2799 gmx_sumd(numRanges
, comm
->sum_nat
, cr
);
2801 if (fplog
== nullptr)
2806 fprintf(fplog
, "\n D O M A I N D E C O M P O S I T I O N S T A T I S T I C S\n\n");
2808 for (int i
= static_cast<int>(DDAtomRanges::Type::Zones
); i
< numRanges
; i
++)
2810 auto range
= static_cast<DDAtomRanges::Type
>(i
);
2811 double av
= comm
->sum_nat
[i
]/comm
->ndecomp
;
2814 case DDAtomRanges::Type::Zones
:
2816 " av. #atoms communicated per step for force: %d x %.1f\n",
2819 case DDAtomRanges::Type::Vsites
:
2820 if (cr
->dd
->vsite_comm
)
2823 " av. #atoms communicated per step for vsites: %d x %.1f\n",
2824 (EEL_PME(ir
->coulombtype
) || ir
->coulombtype
== eelEWALD
) ? 3 : 2,
2828 case DDAtomRanges::Type::Constraints
:
2829 if (cr
->dd
->constraint_comm
)
2832 " av. #atoms communicated per step for LINCS: %d x %.1f\n",
2833 1 + ir
->nLincsIter
, av
);
2837 gmx_incons(" Unknown type for DD statistics");
2840 fprintf(fplog
, "\n");
2842 if (comm
->bRecordLoad
&& EI_DYNAMICS(ir
->eI
))
2844 print_dd_load_av(fplog
, cr
->dd
);
2848 // TODO Remove fplog when group scheme and charge groups are gone
2849 void dd_partition_system(FILE *fplog
,
2850 const gmx::MDLogger
&mdlog
,
2852 const t_commrec
*cr
,
2853 gmx_bool bMasterState
,
2855 t_state
*state_global
,
2856 const gmx_mtop_t
&top_global
,
2857 const t_inputrec
*ir
,
2858 gmx::ImdSession
*imdSession
,
2859 t_state
*state_local
,
2860 PaddedVector
<gmx::RVec
> *f
,
2861 gmx::MDAtoms
*mdAtoms
,
2862 gmx_localtop_t
*top_local
,
2865 gmx::Constraints
*constr
,
2867 gmx_wallcycle
*wcycle
,
2871 gmx_domdec_comm_t
*comm
;
2872 gmx_ddbox_t ddbox
= {0};
2874 int64_t step_pcoupl
;
2875 rvec cell_ns_x0
, cell_ns_x1
;
2876 int ncgindex_set
, ncg_moved
, nat_f_novirsum
;
2877 gmx_bool bBoxChanged
, bNStGlobalComm
, bDoDLB
, bCheckWhetherToTurnDlbOn
, bLogLoad
;
2883 wallcycle_start(wcycle
, ewcDOMDEC
);
2888 // TODO if the update code becomes accessible here, use
2889 // upd->deform for this logic.
2890 bBoxChanged
= (bMasterState
|| inputrecDeform(ir
));
2891 if (ir
->epc
!= epcNO
)
2893 /* With nstpcouple > 1 pressure coupling happens.
2894 * one step after calculating the pressure.
2895 * Box scaling happens at the end of the MD step,
2896 * after the DD partitioning.
2897 * We therefore have to do DLB in the first partitioning
2898 * after an MD step where P-coupling occurred.
2899 * We need to determine the last step in which p-coupling occurred.
2900 * MRS -- need to validate this for vv?
2902 int n
= ir
->nstpcouple
;
2905 step_pcoupl
= step
- 1;
2909 step_pcoupl
= ((step
- 1)/n
)*n
+ 1;
2911 if (step_pcoupl
>= comm
->partition_step
)
2917 bNStGlobalComm
= (step
% nstglobalcomm
== 0);
2925 /* Should we do dynamic load balacing this step?
2926 * Since it requires (possibly expensive) global communication,
2927 * we might want to do DLB less frequently.
2929 if (bBoxChanged
|| ir
->epc
!= epcNO
)
2931 bDoDLB
= bBoxChanged
;
2935 bDoDLB
= bNStGlobalComm
;
2939 /* Check if we have recorded loads on the nodes */
2940 if (comm
->bRecordLoad
&& dd_load_count(comm
) > 0)
2942 bCheckWhetherToTurnDlbOn
= dd_dlb_get_should_check_whether_to_turn_dlb_on(dd
);
2944 /* Print load every nstlog, first and last step to the log file */
2945 bLogLoad
= ((ir
->nstlog
> 0 && step
% ir
->nstlog
== 0) ||
2946 comm
->n_load_collect
== 0 ||
2948 (step
+ ir
->nstlist
> ir
->init_step
+ ir
->nsteps
)));
2950 /* Avoid extra communication due to verbose screen output
2951 * when nstglobalcomm is set.
2953 if (bDoDLB
|| bLogLoad
|| bCheckWhetherToTurnDlbOn
||
2954 (bVerbose
&& (ir
->nstlist
== 0 || nstglobalcomm
<= ir
->nstlist
)))
2956 get_load_distribution(dd
, wcycle
);
2961 GMX_LOG(mdlog
.info
).asParagraph().appendText(dd_print_load(dd
, step
-1));
2965 dd_print_load_verbose(dd
);
2968 comm
->n_load_collect
++;
2974 /* Add the measured cycles to the running average */
2975 const float averageFactor
= 0.1f
;
2976 comm
->cyclesPerStepDlbExpAverage
=
2977 (1 - averageFactor
)*comm
->cyclesPerStepDlbExpAverage
+
2978 averageFactor
*comm
->cycl
[ddCyclStep
]/comm
->cycl_n
[ddCyclStep
];
2980 if (comm
->dlbState
== DlbState::onCanTurnOff
&&
2981 dd
->comm
->n_load_have
% c_checkTurnDlbOffInterval
== c_checkTurnDlbOffInterval
- 1)
2983 gmx_bool turnOffDlb
;
2986 /* If the running averaged cycles with DLB are more
2987 * than before we turned on DLB, turn off DLB.
2988 * We will again run and check the cycles without DLB
2989 * and we can then decide if to turn off DLB forever.
2991 turnOffDlb
= (comm
->cyclesPerStepDlbExpAverage
>
2992 comm
->cyclesPerStepBeforeDLB
);
2994 dd_bcast(dd
, sizeof(turnOffDlb
), &turnOffDlb
);
2997 /* To turn off DLB, we need to redistribute the atoms */
2998 dd_collect_state(dd
, state_local
, state_global
);
2999 bMasterState
= TRUE
;
3000 turn_off_dlb(mdlog
, dd
, step
);
3004 else if (bCheckWhetherToTurnDlbOn
)
3006 gmx_bool turnOffDlbForever
= FALSE
;
3007 gmx_bool turnOnDlb
= FALSE
;
3009 /* Since the timings are node dependent, the master decides */
3012 /* If we recently turned off DLB, we want to check if
3013 * performance is better without DLB. We want to do this
3014 * ASAP to minimize the chance that external factors
3015 * slowed down the DLB step are gone here and we
3016 * incorrectly conclude that DLB was causing the slowdown.
3017 * So we measure one nstlist block, no running average.
3019 if (comm
->haveTurnedOffDlb
&&
3020 comm
->cycl
[ddCyclStep
]/comm
->cycl_n
[ddCyclStep
] <
3021 comm
->cyclesPerStepDlbExpAverage
)
3023 /* After turning off DLB we ran nstlist steps in fewer
3024 * cycles than with DLB. This likely means that DLB
3025 * in not benefical, but this could be due to a one
3026 * time unlucky fluctuation, so we require two such
3027 * observations in close succession to turn off DLB
3030 if (comm
->dlbSlowerPartitioningCount
> 0 &&
3031 dd
->ddp_count
< comm
->dlbSlowerPartitioningCount
+ 10*c_checkTurnDlbOnInterval
)
3033 turnOffDlbForever
= TRUE
;
3035 comm
->haveTurnedOffDlb
= false;
3036 /* Register when we last measured DLB slowdown */
3037 comm
->dlbSlowerPartitioningCount
= dd
->ddp_count
;
3041 /* Here we check if the max PME rank load is more than 0.98
3042 * the max PP force load. If so, PP DLB will not help,
3043 * since we are (almost) limited by PME. Furthermore,
3044 * DLB will cause a significant extra x/f redistribution
3045 * cost on the PME ranks, which will then surely result
3046 * in lower total performance.
3048 if (cr
->npmenodes
> 0 &&
3049 dd_pme_f_ratio(dd
) > 1 - DD_PERF_LOSS_DLB_ON
)
3055 turnOnDlb
= (dd_force_imb_perf_loss(dd
) >= DD_PERF_LOSS_DLB_ON
);
3061 gmx_bool turnOffDlbForever
;
3065 turnOffDlbForever
, turnOnDlb
3067 dd_bcast(dd
, sizeof(bools
), &bools
);
3068 if (bools
.turnOffDlbForever
)
3070 turn_off_dlb_forever(mdlog
, dd
, step
);
3072 else if (bools
.turnOnDlb
)
3074 turn_on_dlb(mdlog
, dd
, step
);
3079 comm
->n_load_have
++;
3082 cgs_gl
= &comm
->cgs_gl
;
3087 /* Clear the old state */
3088 clearDDStateIndices(dd
, 0, 0);
3091 auto xGlobal
= positionsFromStatePointer(state_global
);
3093 set_ddbox(*dd
, true,
3094 DDMASTER(dd
) ? state_global
->box
: nullptr,
3098 distributeState(mdlog
, dd
, top_global
, state_global
, ddbox
, state_local
, f
);
3100 dd_make_local_cgs(dd
, &top_local
->cgs
);
3102 /* Ensure that we have space for the new distribution */
3103 dd_check_alloc_ncg(fr
, state_local
, f
, dd
->ncg_home
);
3105 if (fr
->cutoff_scheme
== ecutsGROUP
)
3107 calc_cgcm(fplog
, 0, dd
->ncg_home
,
3108 &top_local
->cgs
, state_local
->x
.rvec_array(), fr
->cg_cm
);
3111 inc_nrnb(nrnb
, eNR_CGCM
, comm
->atomRanges
.numHomeAtoms());
3113 dd_set_cginfo(dd
->globalAtomGroupIndices
, 0, dd
->ncg_home
, fr
, comm
->bLocalCG
);
3115 else if (state_local
->ddp_count
!= dd
->ddp_count
)
3117 if (state_local
->ddp_count
> dd
->ddp_count
)
3119 gmx_fatal(FARGS
, "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%" PRId64
")", state_local
->ddp_count
, dd
->ddp_count
);
3122 if (state_local
->ddp_count_cg_gl
!= state_local
->ddp_count
)
3124 gmx_fatal(FARGS
, "Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)", state_local
->ddp_count_cg_gl
, state_local
->ddp_count
);
3127 /* Clear the old state */
3128 clearDDStateIndices(dd
, 0, 0);
3130 /* Restore the atom group indices from state_local */
3131 restoreAtomGroups(dd
, cgs_gl
->index
, state_local
);
3132 make_dd_indices(dd
, cgs_gl
->index
, 0);
3133 ncgindex_set
= dd
->ncg_home
;
3135 if (fr
->cutoff_scheme
== ecutsGROUP
)
3137 /* Redetermine the cg COMs */
3138 calc_cgcm(fplog
, 0, dd
->ncg_home
,
3139 &top_local
->cgs
, state_local
->x
.rvec_array(), fr
->cg_cm
);
3142 inc_nrnb(nrnb
, eNR_CGCM
, comm
->atomRanges
.numHomeAtoms());
3144 dd_set_cginfo(dd
->globalAtomGroupIndices
, 0, dd
->ncg_home
, fr
, comm
->bLocalCG
);
3146 set_ddbox(*dd
, bMasterState
, state_local
->box
,
3147 true, state_local
->x
, &ddbox
);
3149 bRedist
= isDlbOn(comm
);
3153 /* We have the full state, only redistribute the cgs */
3155 /* Clear the non-home indices */
3156 clearDDStateIndices(dd
, dd
->ncg_home
, comm
->atomRanges
.numHomeAtoms());
3159 /* To avoid global communication, we do not recompute the extent
3160 * of the system for dims without pbc. Therefore we need to copy
3161 * the previously computed values when we do not communicate.
3163 if (!bNStGlobalComm
)
3165 copy_rvec(comm
->box0
, ddbox
.box0
);
3166 copy_rvec(comm
->box_size
, ddbox
.box_size
);
3168 set_ddbox(*dd
, bMasterState
, state_local
->box
,
3169 bNStGlobalComm
, state_local
->x
, &ddbox
);
3174 /* Copy needed for dim's without pbc when avoiding communication */
3175 copy_rvec(ddbox
.box0
, comm
->box0
);
3176 copy_rvec(ddbox
.box_size
, comm
->box_size
);
3178 set_dd_cell_sizes(dd
, &ddbox
, dynamic_dd_box(*dd
), bMasterState
, bDoDLB
,
3181 if (comm
->nstDDDumpGrid
> 0 && step
% comm
->nstDDDumpGrid
== 0)
3183 write_dd_grid_pdb("dd_grid", step
, dd
, state_local
->box
, &ddbox
);
3186 if (comm
->useUpdateGroups
)
3188 comm
->updateGroupsCog
->addCogs(gmx::arrayRefFromArray(dd
->globalAtomGroupIndices
.data(), dd
->ncg_home
),
3192 /* Check if we should sort the charge groups */
3193 const bool bSortCG
= (bMasterState
|| bRedist
);
3195 /* When repartitioning we mark atom groups that will move to neighboring
3196 * DD cells, but we do not move them right away for performance reasons.
3197 * Thus we need to keep track of how many charge groups will move for
3198 * obtaining correct local charge group / atom counts.
3203 wallcycle_sub_start(wcycle
, ewcsDD_REDIST
);
3205 ncgindex_set
= dd
->ncg_home
;
3206 dd_redistribute_cg(fplog
, step
, dd
, ddbox
.tric_dir
,
3210 GMX_RELEASE_ASSERT(bSortCG
, "Sorting is required after redistribution");
3212 if (comm
->useUpdateGroups
)
3214 comm
->updateGroupsCog
->addCogs(gmx::arrayRefFromArray(dd
->globalAtomGroupIndices
.data(), dd
->ncg_home
),
3218 wallcycle_sub_stop(wcycle
, ewcsDD_REDIST
);
3221 get_nsgrid_boundaries(ddbox
.nboundeddim
, state_local
->box
,
3223 &comm
->cell_x0
, &comm
->cell_x1
,
3224 dd
->ncg_home
, fr
->cg_cm
,
3225 cell_ns_x0
, cell_ns_x1
, &grid_density
);
3229 comm_dd_ns_cell_sizes(dd
, &ddbox
, cell_ns_x0
, cell_ns_x1
, step
);
3232 /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
3233 copy_ivec(ddbox
.tric_dir
, comm
->tric_dir
);
3237 wallcycle_sub_start(wcycle
, ewcsDD_GRID
);
3239 /* Sort the state on charge group position.
3240 * This enables exact restarts from this step.
3241 * It also improves performance by about 15% with larger numbers
3242 * of atoms per node.
3245 /* Fill the ns grid with the home cell,
3246 * so we can sort with the indices.
3248 set_zones_ncg_home(dd
);
3250 set_zones_size(dd
, state_local
->box
, &ddbox
, 0, 1, ncg_moved
);
3252 nbnxn_put_on_grid(fr
->nbv
.get(), state_local
->box
,
3254 comm
->zones
.size
[0].bb_x0
,
3255 comm
->zones
.size
[0].bb_x1
,
3256 comm
->updateGroupsCog
.get(),
3258 comm
->zones
.dens_zone0
,
3261 ncg_moved
, bRedist
? comm
->movedBuffer
.data() : nullptr);
3265 fprintf(debug
, "Step %s, sorting the %d home charge groups\n",
3266 gmx_step_str(step
, sbuf
), dd
->ncg_home
);
3268 dd_sort_state(dd
, fr
->cg_cm
, fr
, state_local
);
3270 /* After sorting and compacting we set the correct size */
3271 dd_resize_state(state_local
, f
, comm
->atomRanges
.numHomeAtoms());
3273 /* Rebuild all the indices */
3277 wallcycle_sub_stop(wcycle
, ewcsDD_GRID
);
3281 /* With the group scheme the sorting array is part of the DD state,
3282 * but it just got out of sync, so mark as invalid by emptying it.
3284 if (ir
->cutoff_scheme
== ecutsGROUP
)
3286 comm
->sort
->sorted
.clear();
3290 if (comm
->useUpdateGroups
)
3292 /* The update groups cog's are invalid after sorting
3293 * and need to be cleared before the next partitioning anyhow.
3295 comm
->updateGroupsCog
->clear();
3298 wallcycle_sub_start(wcycle
, ewcsDD_SETUPCOMM
);
3300 /* Setup up the communication and communicate the coordinates */
3301 setup_dd_communication(dd
, state_local
->box
, &ddbox
, fr
, state_local
, f
);
3303 /* Set the indices */
3304 make_dd_indices(dd
, cgs_gl
->index
, ncgindex_set
);
3306 /* Set the charge group boundaries for neighbor searching */
3307 set_cg_boundaries(&comm
->zones
);
3309 if (fr
->cutoff_scheme
== ecutsVERLET
)
3311 /* When bSortCG=true, we have already set the size for zone 0 */
3312 set_zones_size(dd
, state_local
->box
, &ddbox
,
3313 bSortCG
? 1 : 0, comm
->zones
.n
,
3317 wallcycle_sub_stop(wcycle
, ewcsDD_SETUPCOMM
);
3320 write_dd_pdb("dd_home",step,"dump",top_global,cr,
3321 -1,state_local->x.rvec_array(),state_local->box);
3324 wallcycle_sub_start(wcycle
, ewcsDD_MAKETOP
);
3326 /* Extract a local topology from the global topology */
3327 for (int i
= 0; i
< dd
->ndim
; i
++)
3329 np
[dd
->dim
[i
]] = comm
->cd
[i
].numPulses();
3331 dd_make_local_top(dd
, &comm
->zones
, dd
->npbcdim
, state_local
->box
,
3332 comm
->cellsize_min
, np
,
3334 fr
->cutoff_scheme
== ecutsGROUP
? fr
->cg_cm
: state_local
->x
.rvec_array(),
3335 top_global
, top_local
);
3337 wallcycle_sub_stop(wcycle
, ewcsDD_MAKETOP
);
3339 wallcycle_sub_start(wcycle
, ewcsDD_MAKECONSTR
);
3341 /* Set up the special atom communication */
3342 int n
= comm
->atomRanges
.end(DDAtomRanges::Type::Zones
);
3343 for (int i
= static_cast<int>(DDAtomRanges::Type::Zones
) + 1; i
< static_cast<int>(DDAtomRanges::Type::Number
); i
++)
3345 auto range
= static_cast<DDAtomRanges::Type
>(i
);
3348 case DDAtomRanges::Type::Vsites
:
3349 if (vsite
&& vsite
->numInterUpdategroupVsites
)
3351 n
= dd_make_local_vsites(dd
, n
, top_local
->idef
.il
);
3354 case DDAtomRanges::Type::Constraints
:
3355 if (dd
->splitConstraints
|| dd
->splitSettles
)
3357 /* Only for inter-cg constraints we need special code */
3358 n
= dd_make_local_constraints(dd
, n
, &top_global
, fr
->cginfo
,
3359 constr
, ir
->nProjOrder
,
3360 top_local
->idef
.il
);
3364 gmx_incons("Unknown special atom type setup");
3366 comm
->atomRanges
.setEnd(range
, n
);
3369 wallcycle_sub_stop(wcycle
, ewcsDD_MAKECONSTR
);
3371 wallcycle_sub_start(wcycle
, ewcsDD_TOPOTHER
);
3373 /* Make space for the extra coordinates for virtual site
3374 * or constraint communication.
3376 state_local
->natoms
= comm
->atomRanges
.numAtomsTotal();
3378 dd_resize_state(state_local
, f
, state_local
->natoms
);
3380 if (fr
->haveDirectVirialContributions
)
3382 if (vsite
&& vsite
->numInterUpdategroupVsites
)
3384 nat_f_novirsum
= comm
->atomRanges
.end(DDAtomRanges::Type::Vsites
);
3388 if (EEL_FULL(ir
->coulombtype
) && dd
->n_intercg_excl
> 0)
3390 nat_f_novirsum
= comm
->atomRanges
.end(DDAtomRanges::Type::Zones
);
3394 nat_f_novirsum
= comm
->atomRanges
.numHomeAtoms();
3403 /* Set the number of atoms required for the force calculation.
3404 * Forces need to be constrained when doing energy
3405 * minimization. For simple simulations we could avoid some
3406 * allocation, zeroing and copying, but this is probably not worth
3407 * the complications and checking.
3409 forcerec_set_ranges(fr
, dd
->ncg_home
, dd
->globalAtomGroupIndices
.size(),
3410 comm
->atomRanges
.end(DDAtomRanges::Type::Zones
),
3411 comm
->atomRanges
.end(DDAtomRanges::Type::Constraints
),
3414 /* Update atom data for mdatoms and several algorithms */
3415 mdAlgorithmsSetupAtomData(cr
, ir
, top_global
, top_local
, fr
,
3416 nullptr, mdAtoms
, constr
, vsite
, nullptr);
3418 auto mdatoms
= mdAtoms
->mdatoms();
3419 if (!thisRankHasDuty(cr
, DUTY_PME
))
3421 /* Send the charges and/or c6/sigmas to our PME only node */
3422 gmx_pme_send_parameters(cr
,
3424 mdatoms
->nChargePerturbed
!= 0, mdatoms
->nTypePerturbed
!= 0,
3425 mdatoms
->chargeA
, mdatoms
->chargeB
,
3426 mdatoms
->sqrt_c6A
, mdatoms
->sqrt_c6B
,
3427 mdatoms
->sigmaA
, mdatoms
->sigmaB
,
3428 dd_pme_maxshift_x(dd
), dd_pme_maxshift_y(dd
));
3433 /* Update the local pull groups */
3434 dd_make_local_pull_groups(cr
, ir
->pull_work
);
3437 if (dd
->atomSets
!= nullptr)
3439 /* Update the local atom sets */
3440 dd
->atomSets
->setIndicesInDomainDecomposition(*(dd
->ga2la
));
3443 /* Update the local atoms to be communicated via the IMD protocol if bIMD is TRUE. */
3444 imdSession
->dd_make_local_IMD_atoms(dd
);
3446 add_dd_statistics(dd
);
3448 /* Make sure we only count the cycles for this DD partitioning */
3449 clear_dd_cycle_counts(dd
);
3451 /* Because the order of the atoms might have changed since
3452 * the last vsite construction, we need to communicate the constructing
3453 * atom coordinates again (for spreading the forces this MD step).
3455 dd_move_x_vsites(dd
, state_local
->box
, state_local
->x
.rvec_array());
3457 wallcycle_sub_stop(wcycle
, ewcsDD_TOPOTHER
);
3459 if (comm
->nstDDDump
> 0 && step
% comm
->nstDDDump
== 0)
3461 dd_move_x(dd
, state_local
->box
, state_local
->x
, nullWallcycle
);
3462 write_dd_pdb("dd_dump", step
, "dump", &top_global
, cr
,
3463 -1, state_local
->x
.rvec_array(), state_local
->box
);
3466 /* Store the partitioning step */
3467 comm
->partition_step
= step
;
3469 /* Increase the DD partitioning counter */
3471 /* The state currently matches this DD partitioning count, store it */
3472 state_local
->ddp_count
= dd
->ddp_count
;
3475 /* The DD master node knows the complete cg distribution,
3476 * store the count so we can possibly skip the cg info communication.
3478 comm
->master_cg_ddp_count
= (bSortCG
? 0 : dd
->ddp_count
);
3481 if (comm
->DD_debug
> 0)
3483 /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
3484 check_index_consistency(dd
, top_global
.natoms
, ncg_mtop(&top_global
),
3485 "after partitioning");
3488 wallcycle_stop(wcycle
, ewcDOMDEC
);
3491 /*! \brief Check whether bonded interactions are missing, if appropriate */
3492 void checkNumberOfBondedInteractions(const gmx::MDLogger
&mdlog
,
3494 int totalNumberOfBondedInteractions
,
3495 const gmx_mtop_t
*top_global
,
3496 const gmx_localtop_t
*top_local
,
3497 const t_state
*state
,
3498 bool *shouldCheckNumberOfBondedInteractions
)
3500 if (*shouldCheckNumberOfBondedInteractions
)
3502 if (totalNumberOfBondedInteractions
!= cr
->dd
->nbonded_global
)
3504 dd_print_missing_interactions(mdlog
, cr
, totalNumberOfBondedInteractions
, top_global
, top_local
, state
); // Does not return
3506 *shouldCheckNumberOfBondedInteractions
= false;