src/gromacs/ewald/pme.cpp
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015,2016,2017, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
37 /*! \internal \file
39 * \brief This file contains function definitions necessary for
40 * computing energies and forces for the PME long-ranged part (Coulomb
41 * and LJ).
43 * \author Erik Lindahl <erik@kth.se>
44 * \author Berk Hess <hess@kth.se>
45 * \ingroup module_ewald
47 /* IMPORTANT FOR DEVELOPERS:
49 * Triclinic pme stuff isn't entirely trivial, and we've experienced
50 * some bugs during development (many of them due to me). To avoid
51 * this in the future, please check the following things if you make
52 * changes in this file:
54 * 1. You should obtain identical (at least to the PME precision)
55 * energies, forces, and virial for
56 * a rectangular box and a triclinic one where the z (or y) axis is
57 * tilted a whole box side. For instance you could use these boxes:
59 * rectangular triclinic
60 * 2 0 0 2 0 0
61 * 0 2 0 0 2 0
62 * 0 0 6 2 2 6
64 * 2. You should check the energy conservation in a triclinic box.
66 * It might seem like overkill, but better safe than sorry.
67 * /Erik 001109
70 #include "gmxpre.h"
72 #include "pme.h"
74 #include "config.h"
76 #include <assert.h>
77 #include <math.h>
78 #include <stdio.h>
79 #include <stdlib.h>
80 #include <string.h>
82 #include <algorithm>
84 #include "gromacs/fft/parallel_3dfft.h"
85 #include "gromacs/fileio/pdbio.h"
86 #include "gromacs/gmxlib/network.h"
87 #include "gromacs/gmxlib/nrnb.h"
88 #include "gromacs/math/gmxcomplex.h"
89 #include "gromacs/math/invertmatrix.h"
90 #include "gromacs/math/units.h"
91 #include "gromacs/math/vec.h"
92 #include "gromacs/math/vectypes.h"
93 #include "gromacs/mdtypes/commrec.h"
94 #include "gromacs/mdtypes/forcerec.h"
95 #include "gromacs/mdtypes/inputrec.h"
96 #include "gromacs/mdtypes/md_enums.h"
97 #include "gromacs/pbcutil/pbc.h"
98 #include "gromacs/timing/cyclecounter.h"
99 #include "gromacs/timing/wallcycle.h"
100 #include "gromacs/timing/walltime_accounting.h"
101 #include "gromacs/utility/basedefinitions.h"
102 #include "gromacs/utility/exceptions.h"
103 #include "gromacs/utility/fatalerror.h"
104 #include "gromacs/utility/gmxmpi.h"
105 #include "gromacs/utility/gmxomp.h"
106 #include "gromacs/utility/real.h"
107 #include "gromacs/utility/smalloc.h"
108 #include "gromacs/utility/stringutil.h"
109 #include "gromacs/utility/unique_cptr.h"
111 #include "calculate-spline-moduli.h"
112 #include "pme-gather.h"
113 #include "pme-grid.h"
114 #include "pme-internal.h"
115 #include "pme-redistribute.h"
116 #include "pme-solve.h"
117 #include "pme-spline-work.h"
118 #include "pme-spread.h"
120 /*! \brief Number of bytes in a cache line.
122 * Must also be a multiple of the SIMD and SIMD4 register size, to
123 * preserve alignment.
125 const int gmxCacheLineSize = 64;
127 //! Set up coordinate communication
128 static void setup_coordinate_communication(pme_atomcomm_t *atc)
130 int nslab, n, i;
131 int fw, bw;
133 nslab = atc->nslab;
135 n = 0;
136 for (i = 1; i <= nslab/2; i++)
138 fw = (atc->nodeid + i) % nslab;
139 bw = (atc->nodeid - i + nslab) % nslab;
140 if (n < nslab - 1)
142 atc->node_dest[n] = fw;
143 atc->node_src[n] = bw;
144 n++;
146 if (n < nslab - 1)
148 atc->node_dest[n] = bw;
149 atc->node_src[n] = fw;
150 n++;
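/* A worked example of the pattern above (values chosen for illustration):
 * with nslab = 4 and nodeid = 0 the loop fills
 *   node_dest = { 1, 3, 2 },  node_src = { 3, 1, 2 },
 * i.e. the exchanges alternate one slab forward and one slab backward,
 * with the slab nslab/2 away visited once at the end.  Each entry pairs a
 * forward send with a backward receive (or vice versa), which is presumably
 * the pairing the redistribution code expects for its pairwise exchanges.
 */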
155 /*! \brief Round \p n up to the next multiple of \p f */
156 static int mult_up(int n, int f)
158 return ((n + f - 1)/f)*f;
161 /*! \brief Return estimate of the load imbalance from the PME grid not being a good match for the number of PME ranks */
162 static double estimate_pme_load_imbalance(struct gmx_pme_t *pme)
164 int nma, nmi;
165 double n1, n2, n3;
167 nma = pme->nnodes_major;
168 nmi = pme->nnodes_minor;
170 n1 = mult_up(pme->nkx, nma)*mult_up(pme->nky, nmi)*pme->nkz;
171 n2 = mult_up(pme->nkx, nma)*mult_up(pme->nkz, nmi)*pme->nky;
172 n3 = mult_up(pme->nky, nma)*mult_up(pme->nkz, nmi)*pme->nkx;
174 /* pme_solve is roughly double the cost of an fft */
176 return (n1 + n2 + 3*n3)/(double)(6*pme->nkx*pme->nky*pme->nkz);
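/* A worked example of the estimate above (grid sizes assumed for
 * illustration): with nkx = nky = nkz = 60, nnodes_major = 16 and
 * nnodes_minor = 1, mult_up(60, 16) = 64 and mult_up(60, 1) = 60, so
 * n1 = n2 = n3 = 64*60*60 and the function returns
 *   5*64*60*60 / (6*60*60*60) = 5*64/(6*60) ~= 0.89,
 * while a perfectly divisible grid gives exactly 5/6 ~= 0.83.  The caller
 * in gmx_pme_init() only prints a load-balance note once this value
 * reaches 1.2.
 */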
179 /*! \brief Initialize atom communication data structure */
180 static void init_atomcomm(struct gmx_pme_t *pme, pme_atomcomm_t *atc,
181 int dimind, gmx_bool bSpread)
183 int thread;
185 atc->dimind = dimind;
186 atc->nslab = 1;
187 atc->nodeid = 0;
188 atc->pd_nalloc = 0;
189 #if GMX_MPI
190 if (pme->nnodes > 1)
192 atc->mpi_comm = pme->mpi_comm_d[dimind];
193 MPI_Comm_size(atc->mpi_comm, &atc->nslab);
194 MPI_Comm_rank(atc->mpi_comm, &atc->nodeid);
196 if (debug)
198 fprintf(debug, "For PME atom communication in dimind %d: nslab %d rank %d\n", atc->dimind, atc->nslab, atc->nodeid);
200 #endif
202 atc->bSpread = bSpread;
203 atc->pme_order = pme->pme_order;
205 if (atc->nslab > 1)
207 snew(atc->node_dest, atc->nslab);
208 snew(atc->node_src, atc->nslab);
209 setup_coordinate_communication(atc);
211 snew(atc->count_thread, pme->nthread);
212 for (thread = 0; thread < pme->nthread; thread++)
214 snew(atc->count_thread[thread], atc->nslab);
216 atc->count = atc->count_thread[0];
217 snew(atc->rcount, atc->nslab);
218 snew(atc->buf_index, atc->nslab);
221 atc->nthread = pme->nthread;
222 if (atc->nthread > 1)
224 snew(atc->thread_plist, atc->nthread);
226 snew(atc->spline, atc->nthread);
227 for (thread = 0; thread < atc->nthread; thread++)
229 if (atc->nthread > 1)
231 snew(atc->thread_plist[thread].n, atc->nthread+2*gmxCacheLineSize);
232 atc->thread_plist[thread].n += gmxCacheLineSize;
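/* The padding of gmxCacheLineSize elements on both sides of the n array
 * is presumably there to keep each thread's counters on their own cache
 * lines and so avoid false sharing; since gmxCacheLineSize is a byte
 * count and n is an int array, offsetting by that many ints covers more
 * than one full cache line.
 */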
237 /*! \brief Destroy an atom communication data structure and its child structs */
238 static void destroy_atomcomm(pme_atomcomm_t *atc)
240 sfree(atc->pd);
241 if (atc->nslab > 1)
243 sfree(atc->node_dest);
244 sfree(atc->node_src);
245 for (int i = 0; i < atc->nthread; i++)
247 sfree(atc->count_thread[i]);
249 sfree(atc->count_thread);
250 sfree(atc->rcount);
251 sfree(atc->buf_index);
253 sfree(atc->x);
254 sfree(atc->coefficient);
255 sfree(atc->f);
257 sfree(atc->idx);
258 sfree(atc->fractx);
260 sfree(atc->thread_idx);
261 for (int i = 0; i < atc->nthread; i++)
263 if (atc->nthread > 1)
265 int *n_ptr = atc->thread_plist[i].n - gmxCacheLineSize;
266 sfree(n_ptr);
267 sfree(atc->thread_plist[i].i);
269 sfree(atc->spline[i].ind);
270 for (int d = 0; d < ZZ; d++)
272 sfree(atc->spline[i].theta[d]);
273 sfree(atc->spline[i].dtheta[d]);
275 sfree_aligned(atc->spline[i].ptr_dtheta_z);
276 sfree_aligned(atc->spline[i].ptr_theta_z);
278 if (atc->nthread > 1)
280 sfree(atc->thread_plist);
282 sfree(atc->spline);
285 /*! \brief Initialize data structure for communication */
286 static void
287 init_overlap_comm(pme_overlap_t * ol,
288 int norder,
289 #if GMX_MPI
290 MPI_Comm comm,
291 #endif
292 int nnodes,
293 int nodeid,
294 int ndata,
295 int commplainsize)
297 int b, i;
298 pme_grid_comm_t *pgc;
299 gmx_bool bCont;
300 int fft_start, fft_end, send_index1, recv_index1;
301 #if GMX_MPI
302 MPI_Status stat;
304 ol->mpi_comm = comm;
305 #endif
307 ol->nnodes = nnodes;
308 ol->nodeid = nodeid;
310 /* Linear translation of the PME grid won't affect reciprocal space
311 * calculations, so to optimize we only interpolate "upwards",
312 * which also means we only have to consider overlap in one direction.
313 * I.e., particles on this node might also be spread to grid indices
314 * that belong to higher nodes (modulo nnodes)
317 snew(ol->s2g0, ol->nnodes+1);
318 snew(ol->s2g1, ol->nnodes);
319 if (debug)
321 fprintf(debug, "PME slab boundaries:");
323 for (i = 0; i < nnodes; i++)
325 /* s2g0 the local interpolation grid start.
326 * s2g1 the local interpolation grid end.
327 * Since in calc_pidx we divide particles, and not grid lines,
328 * spatially uniform along dimension x or y, we need to round
329 * s2g0 down and s2g1 up.
331 ol->s2g0[i] = ( i *ndata + 0 )/nnodes;
332 ol->s2g1[i] = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1;
334 if (debug)
336 fprintf(debug, " %3d %3d", ol->s2g0[i], ol->s2g1[i]);
339 ol->s2g0[nnodes] = ndata;
340 if (debug)
342 fprintf(debug, "\n");
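/* A worked example of the slab boundaries above (values assumed for
 * illustration): with ndata = 25, nnodes = 4 and norder = 4 the formulas
 * give
 *   s2g0 = { 0, 6, 12, 18, 25 }   (first grid line owned by each slab)
 *   s2g1 = { 10, 16, 22, 28 }     (one past the last line each slab spreads to)
 * so slab 0 owns lines 0-5 but, because of the norder-wide spreading
 * stencil, also writes into lines 6-9, which belong to slab 1.  For this
 * case the do/while search below finds noverlap_nodes = 1.
 */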
345 /* Determine with how many nodes we need to communicate the grid overlap */
346 b = 0;
349 b++;
350 bCont = FALSE;
351 for (i = 0; i < nnodes; i++)
353 if ((i+b < nnodes && ol->s2g1[i] > ol->s2g0[i+b]) ||
354 (i+b >= nnodes && ol->s2g1[i] > ol->s2g0[i+b-nnodes] + ndata))
356 bCont = TRUE;
360 while (bCont && b < nnodes);
361 ol->noverlap_nodes = b - 1;
363 snew(ol->send_id, ol->noverlap_nodes);
364 snew(ol->recv_id, ol->noverlap_nodes);
365 for (b = 0; b < ol->noverlap_nodes; b++)
367 ol->send_id[b] = (ol->nodeid + (b + 1)) % ol->nnodes;
368 ol->recv_id[b] = (ol->nodeid - (b + 1) + ol->nnodes) % ol->nnodes;
370 snew(ol->comm_data, ol->noverlap_nodes);
372 ol->send_size = 0;
373 for (b = 0; b < ol->noverlap_nodes; b++)
375 pgc = &ol->comm_data[b];
376 /* Send */
377 fft_start = ol->s2g0[ol->send_id[b]];
378 fft_end = ol->s2g0[ol->send_id[b]+1];
379 if (ol->send_id[b] < nodeid)
381 fft_start += ndata;
382 fft_end += ndata;
384 send_index1 = ol->s2g1[nodeid];
385 send_index1 = std::min(send_index1, fft_end);
386 pgc->send_index0 = fft_start;
387 pgc->send_nindex = std::max(0, send_index1 - pgc->send_index0);
388 ol->send_size += pgc->send_nindex;
390 /* We always start receiving to the first index of our slab */
391 fft_start = ol->s2g0[ol->nodeid];
392 fft_end = ol->s2g0[ol->nodeid+1];
393 recv_index1 = ol->s2g1[ol->recv_id[b]];
394 if (ol->recv_id[b] > nodeid)
396 recv_index1 -= ndata;
398 recv_index1 = std::min(recv_index1, fft_end);
399 pgc->recv_index0 = fft_start;
400 pgc->recv_nindex = std::max(0, recv_index1 - pgc->recv_index0);
403 #if GMX_MPI
404 /* Communicate the buffer sizes to receive */
405 for (b = 0; b < ol->noverlap_nodes; b++)
407 MPI_Sendrecv(&ol->send_size, 1, MPI_INT, ol->send_id[b], b,
408 &ol->comm_data[b].recv_size, 1, MPI_INT, ol->recv_id[b], b,
409 ol->mpi_comm, &stat);
411 #endif
413 /* For a non-divisible grid we need pme_order instead of pme_order-1 */
414 snew(ol->sendbuf, norder*commplainsize);
415 snew(ol->recvbuf, norder*commplainsize);
418 /*! \brief Destroy data structure for communication */
419 static void
420 destroy_overlap_comm(const pme_overlap_t *ol)
422 sfree(ol->s2g0);
423 sfree(ol->s2g1);
424 sfree(ol->send_id);
425 sfree(ol->recv_id);
426 sfree(ol->comm_data);
427 sfree(ol->sendbuf);
428 sfree(ol->recvbuf);
431 int minimalPmeGridSize(int pmeOrder)
433 /* The actual grid size limitations are:
434 * serial: >= pme_order
435 * DD, no OpenMP: >= 2*(pme_order - 1)
436 * DD, OpenMP: >= pme_order + 1
437 * But we use the maximum for simplicity since in practice there is not
438 * much performance difference between pme_order and 2*(pme_order -1).
440 int minimalSize = 2*(pmeOrder - 1);
442 GMX_RELEASE_ASSERT(pmeOrder >= 3, "pmeOrder has to be >= 3");
443 GMX_RELEASE_ASSERT(minimalSize >= pmeOrder + 1, "The grid size should be >= pmeOrder + 1");
445 return minimalSize;
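/* For example, with the default pme_order of 4 this returns 6; grids with
 * fewer than 6 lines in any dimension are rejected by
 * gmx_pme_check_restrictions() below.
 */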
448 void gmx_pme_check_restrictions(int pme_order,
449 int nkx, int nky, int nkz,
450 int nnodes_major,
451 gmx_bool bUseThreads,
452 gmx_bool bFatal,
453 gmx_bool *bValidSettings)
455 if (pme_order > PME_ORDER_MAX)
457 if (!bFatal)
459 *bValidSettings = FALSE;
460 return;
463 std::string message = gmx::formatString(
464 "pme_order (%d) is larger than the maximum allowed value (%d). Modify and recompile the code if you really need such a high order.",
465 pme_order, PME_ORDER_MAX);
466 GMX_THROW(InconsistentInputError(message));
469 const int minGridSize = minimalPmeGridSize(pme_order);
470 if (nkx < minGridSize ||
471 nky < minGridSize ||
472 nkz < minGridSize)
474 if (!bFatal)
476 *bValidSettings = FALSE;
477 return;
479 std::string message = gmx::formatString(
480 "The PME grid sizes need to be >= 2*(pme_order-1) (%d)",
481 minGridSize);
482 GMX_THROW(InconsistentInputError(message));
485 /* Check for a limitation of the (current) sum_fftgrid_dd code.
486 * We only allow multiple communication pulses in dim 1, not in dim 0.
488 if (bUseThreads && (nkx < nnodes_major*pme_order &&
489 nkx != nnodes_major*(pme_order - 1)))
491 if (!bFatal)
493 *bValidSettings = FALSE;
494 return;
496 gmx_fatal(FARGS, "The number of PME grid lines per rank along x is %g. But when using OpenMP threads, the number of grid lines per rank along x should be >= pme_order (%d) or = pme_order - 1. To resolve this issue, use fewer ranks along x (and possibly more along y and/or z) by specifying -dd manually.",
497 nkx/(double)nnodes_major, pme_order);
500 if (bValidSettings != nullptr)
502 *bValidSettings = TRUE;
505 return;
508 /*! \brief Divide \p enumerator by \p denominator, rounding up to the nearest integer */
509 static int div_round_up(int enumerator, int denominator)
511 return (enumerator + denominator - 1)/denominator;
514 int gmx_pme_init(struct gmx_pme_t **pmedata,
515 t_commrec * cr,
516 int nnodes_major,
517 int nnodes_minor,
518 const t_inputrec * ir,
519 int homenr,
520 gmx_bool bFreeEnergy_q,
521 gmx_bool bFreeEnergy_lj,
522 gmx_bool bReproducible,
523 real ewaldcoeff_q,
524 real ewaldcoeff_lj,
525 int nthread)
527 int use_threads, sum_use_threads, i;
528 ivec ndata;
530 if (debug)
532 fprintf(debug, "Creating PME data structures.\n");
535 gmx_pme_t *pmeRaw = nullptr;
536 snew(pmeRaw, 1);
537 unique_cptr<gmx_pme_t, gmx_pme_destroy> pme(pmeRaw);
539 pme->sum_qgrid_tmp = nullptr;
540 pme->sum_qgrid_dd_tmp = nullptr;
542 pme->buf_nalloc = 0;
544 pme->nnodes = 1;
545 pme->bPPnode = TRUE;
547 pme->nnodes_major = nnodes_major;
548 pme->nnodes_minor = nnodes_minor;
550 #if GMX_MPI
551 if (nnodes_major*nnodes_minor > 1)
553 pme->mpi_comm = cr->mpi_comm_mygroup;
555 MPI_Comm_rank(pme->mpi_comm, &pme->nodeid);
556 MPI_Comm_size(pme->mpi_comm, &pme->nnodes);
557 if (pme->nnodes != nnodes_major*nnodes_minor)
559 gmx_incons("PME rank count mismatch");
562 else
564 pme->mpi_comm = MPI_COMM_NULL;
566 #endif
568 if (pme->nnodes == 1)
570 #if GMX_MPI
571 pme->mpi_comm_d[0] = MPI_COMM_NULL;
572 pme->mpi_comm_d[1] = MPI_COMM_NULL;
573 #endif
574 pme->ndecompdim = 0;
575 pme->nodeid_major = 0;
576 pme->nodeid_minor = 0;
577 #if GMX_MPI
578 pme->mpi_comm_d[0] = pme->mpi_comm_d[1] = MPI_COMM_NULL;
579 #endif
581 else
583 if (nnodes_minor == 1)
585 #if GMX_MPI
586 pme->mpi_comm_d[0] = pme->mpi_comm;
587 pme->mpi_comm_d[1] = MPI_COMM_NULL;
588 #endif
589 pme->ndecompdim = 1;
590 pme->nodeid_major = pme->nodeid;
591 pme->nodeid_minor = 0;
594 else if (nnodes_major == 1)
596 #if GMX_MPI
597 pme->mpi_comm_d[0] = MPI_COMM_NULL;
598 pme->mpi_comm_d[1] = pme->mpi_comm;
599 #endif
600 pme->ndecompdim = 1;
601 pme->nodeid_major = 0;
602 pme->nodeid_minor = pme->nodeid;
604 else
606 if (pme->nnodes % nnodes_major != 0)
608 gmx_incons("For 2D PME decomposition, #PME ranks must be divisible by the number of ranks in the major dimension");
610 pme->ndecompdim = 2;
612 #if GMX_MPI
613 MPI_Comm_split(pme->mpi_comm, pme->nodeid % nnodes_minor,
614 pme->nodeid, &pme->mpi_comm_d[0]); /* My communicator along major dimension */
615 MPI_Comm_split(pme->mpi_comm, pme->nodeid/nnodes_minor,
616 pme->nodeid, &pme->mpi_comm_d[1]); /* My communicator along minor dimension */
618 MPI_Comm_rank(pme->mpi_comm_d[0], &pme->nodeid_major);
619 MPI_Comm_size(pme->mpi_comm_d[0], &pme->nnodes_major);
620 MPI_Comm_rank(pme->mpi_comm_d[1], &pme->nodeid_minor);
621 MPI_Comm_size(pme->mpi_comm_d[1], &pme->nnodes_minor);
622 #endif
624 pme->bPPnode = (cr->duty & DUTY_PP);
627 pme->nthread = nthread;
629 /* Check if any of the PME MPI ranks uses threads */
630 use_threads = (pme->nthread > 1 ? 1 : 0);
631 #if GMX_MPI
632 if (pme->nnodes > 1)
634 MPI_Allreduce(&use_threads, &sum_use_threads, 1, MPI_INT,
635 MPI_SUM, pme->mpi_comm);
637 else
638 #endif
640 sum_use_threads = use_threads;
642 pme->bUseThreads = (sum_use_threads > 0);
644 if (ir->ePBC == epbcSCREW)
646 gmx_fatal(FARGS, "pme does not (yet) work with pbc = screw");
649 /* NOTE:
650 * It is likely that the current gmx_pme_do() routine supports calculating
651 * only Coulomb or LJ while gmx_pme_init() configures for both,
652 * but that has never been tested.
653 * It is likely that the current gmx_pme_do() routine supports calculating
654 * without free-energy for Coulomb and/or LJ even when gmx_pme_init()
655 * configures for free-energy, but that has never been tested.
657 pme->doCoulomb = EEL_PME(ir->coulombtype);
658 pme->doLJ = EVDW_PME(ir->vdwtype);
659 pme->bFEP_q = ((ir->efep != efepNO) && bFreeEnergy_q);
660 pme->bFEP_lj = ((ir->efep != efepNO) && bFreeEnergy_lj);
661 pme->bFEP = (pme->bFEP_q || pme->bFEP_lj);
662 pme->nkx = ir->nkx;
663 pme->nky = ir->nky;
664 pme->nkz = ir->nkz;
665 pme->bP3M = (ir->coulombtype == eelP3M_AD || getenv("GMX_PME_P3M") != nullptr);
666 pme->pme_order = ir->pme_order;
667 pme->ewaldcoeff_q = ewaldcoeff_q;
668 pme->ewaldcoeff_lj = ewaldcoeff_lj;
670 /* Always constant electrostatics coefficients */
671 pme->epsilon_r = ir->epsilon_r;
673 /* Always constant LJ coefficients */
674 pme->ljpme_combination_rule = ir->ljpme_combination_rule;
676 /* If we violate restrictions, generate a fatal error here */
677 gmx_pme_check_restrictions(pme->pme_order,
678 pme->nkx, pme->nky, pme->nkz,
679 pme->nnodes_major,
680 pme->bUseThreads,
681 TRUE,
682 nullptr);
684 if (pme->nnodes > 1)
686 double imbal;
688 #if GMX_MPI
689 MPI_Type_contiguous(DIM, GMX_MPI_REAL, &(pme->rvec_mpi));
690 MPI_Type_commit(&(pme->rvec_mpi));
691 #endif
693 /* Note that the coefficient spreading and force gathering, which usually
694 * takes about the same amount of time as FFT+solve_pme,
695 * is always fully load balanced
696 * (unless the coefficient distribution is inhomogeneous).
699 imbal = estimate_pme_load_imbalance(pme.get());
700 if (imbal >= 1.2 && pme->nodeid_major == 0 && pme->nodeid_minor == 0)
702 fprintf(stderr,
703 "\n"
704 "NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
705 " For optimal PME load balancing\n"
706 " PME grid_x (%d) and grid_y (%d) should be divisible by #PME_ranks_x (%d)\n"
707 " and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_ranks_y (%d)\n"
708 "\n",
709 (int)((imbal-1)*100 + 0.5),
710 pme->nkx, pme->nky, pme->nnodes_major,
711 pme->nky, pme->nkz, pme->nnodes_minor);
715 /* For a non-divisible grid we need pme_order instead of pme_order-1 */
716 /* In sum_qgrid_dd x overlap is copied in place: take padding into account.
717 * y is always copied through a buffer: we don't need padding in z,
718 * but we do need the overlap in x because of the communication order.
720 init_overlap_comm(&pme->overlap[0], pme->pme_order,
721 #if GMX_MPI
722 pme->mpi_comm_d[0],
723 #endif
724 pme->nnodes_major, pme->nodeid_major,
725 pme->nkx,
726 (div_round_up(pme->nky, pme->nnodes_minor)+pme->pme_order)*(pme->nkz+pme->pme_order-1));
728 /* Along overlap dim 1 we can send in multiple pulses in sum_fftgrid_dd.
729 * We do this with an offset buffer of equal size, so we need to allocate
730 * extra for the offset. That's what the (+1)*pme->nkz is for.
732 init_overlap_comm(&pme->overlap[1], pme->pme_order,
733 #if GMX_MPI
734 pme->mpi_comm_d[1],
735 #endif
736 pme->nnodes_minor, pme->nodeid_minor,
737 pme->nky,
738 (div_round_up(pme->nkx, pme->nnodes_major)+pme->pme_order+1)*pme->nkz);
740 /* Double-check for a limitation of the (current) sum_fftgrid_dd code.
741 * Note that gmx_pme_check_restrictions checked for this already.
743 if (pme->bUseThreads && pme->overlap[0].noverlap_nodes > 1)
745 gmx_incons("More than one communication pulse required for grid overlap communication along the major dimension while using threads");
748 snew(pme->bsp_mod[XX], pme->nkx);
749 snew(pme->bsp_mod[YY], pme->nky);
750 snew(pme->bsp_mod[ZZ], pme->nkz);
752 /* The required size of the interpolation grid, including overlap.
753 * The allocated size (pmegrid_n?) might be slightly larger.
755 pme->pmegrid_nx = pme->overlap[0].s2g1[pme->nodeid_major] -
756 pme->overlap[0].s2g0[pme->nodeid_major];
757 pme->pmegrid_ny = pme->overlap[1].s2g1[pme->nodeid_minor] -
758 pme->overlap[1].s2g0[pme->nodeid_minor];
759 pme->pmegrid_nz_base = pme->nkz;
760 pme->pmegrid_nz = pme->pmegrid_nz_base + pme->pme_order - 1;
761 set_grid_alignment(&pme->pmegrid_nz, pme->pme_order);
763 pme->pmegrid_start_ix = pme->overlap[0].s2g0[pme->nodeid_major];
764 pme->pmegrid_start_iy = pme->overlap[1].s2g0[pme->nodeid_minor];
765 pme->pmegrid_start_iz = 0;
767 make_gridindex_to_localindex(pme->nkx,
768 pme->pmegrid_start_ix,
769 pme->pmegrid_nx - (pme->pme_order-1),
770 &pme->nnx, &pme->fshx);
771 make_gridindex_to_localindex(pme->nky,
772 pme->pmegrid_start_iy,
773 pme->pmegrid_ny - (pme->pme_order-1),
774 &pme->nny, &pme->fshy);
775 make_gridindex_to_localindex(pme->nkz,
776 pme->pmegrid_start_iz,
777 pme->pmegrid_nz_base,
778 &pme->nnz, &pme->fshz);
780 pme->spline_work = make_pme_spline_work(pme->pme_order);
782 ndata[0] = pme->nkx;
783 ndata[1] = pme->nky;
784 ndata[2] = pme->nkz;
785 /* It doesn't matter if we allocate too many grids here,
786 * we only allocate and use the ones we need.
788 if (pme->doLJ)
790 pme->ngrids = ((ir->ljpme_combination_rule == eljpmeLB) ? DO_Q_AND_LJ_LB : DO_Q_AND_LJ);
792 else
794 pme->ngrids = DO_Q;
796 snew(pme->fftgrid, pme->ngrids);
797 snew(pme->cfftgrid, pme->ngrids);
798 snew(pme->pfft_setup, pme->ngrids);
800 for (i = 0; i < pme->ngrids; ++i)
802 if ((i < DO_Q && pme->doCoulomb && (i == 0 ||
803 bFreeEnergy_q)) ||
804 (i >= DO_Q && pme->doLJ && (i == 2 ||
805 bFreeEnergy_lj ||
806 ir->ljpme_combination_rule == eljpmeLB)))
808 pmegrids_init(&pme->pmegrid[i],
809 pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz,
810 pme->pmegrid_nz_base,
811 pme->pme_order,
812 pme->bUseThreads,
813 pme->nthread,
814 pme->overlap[0].s2g1[pme->nodeid_major]-pme->overlap[0].s2g0[pme->nodeid_major+1],
815 pme->overlap[1].s2g1[pme->nodeid_minor]-pme->overlap[1].s2g0[pme->nodeid_minor+1]);
816 /* This routine will allocate the grid data to fit the FFTs */
817 gmx_parallel_3dfft_init(&pme->pfft_setup[i], ndata,
818 &pme->fftgrid[i], &pme->cfftgrid[i],
819 pme->mpi_comm_d,
820 bReproducible, pme->nthread);
825 if (!pme->bP3M)
827 /* Use plain SPME B-spline interpolation */
828 make_bspline_moduli(pme->bsp_mod, pme->nkx, pme->nky, pme->nkz, pme->pme_order);
830 else
832 /* Use the P3M grid-optimized influence function */
833 make_p3m_bspline_moduli(pme->bsp_mod, pme->nkx, pme->nky, pme->nkz, pme->pme_order);
836 /* Use atc[0] for spreading */
837 init_atomcomm(pme.get(), &pme->atc[0], nnodes_major > 1 ? 0 : 1, TRUE);
838 if (pme->ndecompdim >= 2)
840 init_atomcomm(pme.get(), &pme->atc[1], 1, FALSE);
843 if (pme->nnodes == 1)
845 pme->atc[0].n = homenr;
846 pme_realloc_atomcomm_things(&pme->atc[0]);
849 pme->lb_buf1 = nullptr;
850 pme->lb_buf2 = nullptr;
851 pme->lb_buf_nalloc = 0;
853 pme_init_all_work(&pme->solve_work, pme->nthread, pme->nkx);
855 // no exception was thrown during the init, so we hand over the PME structure handle
856 *pmedata = pme.release();
858 return 0;
861 int gmx_pme_reinit(struct gmx_pme_t **pmedata,
862 t_commrec * cr,
863 struct gmx_pme_t * pme_src,
864 const t_inputrec * ir,
865 ivec grid_size,
866 real ewaldcoeff_q,
867 real ewaldcoeff_lj)
869 int homenr;
870 int ret;
872 // Create a copy of t_inputrec fields that are used in gmx_pme_init().
873 // TODO: This would be better as just copying a sub-structure that contains
874 // all the PME parameters and nothing else.
875 t_inputrec irc;
876 irc.ePBC = ir->ePBC;
877 irc.coulombtype = ir->coulombtype;
878 irc.vdwtype = ir->vdwtype;
879 irc.efep = ir->efep;
880 irc.pme_order = ir->pme_order;
881 irc.epsilon_r = ir->epsilon_r;
882 irc.ljpme_combination_rule = ir->ljpme_combination_rule;
883 irc.nkx = grid_size[XX];
884 irc.nky = grid_size[YY];
885 irc.nkz = grid_size[ZZ];
887 if (pme_src->nnodes == 1)
889 homenr = pme_src->atc[0].n;
891 else
893 homenr = -1;
898 ret = gmx_pme_init(pmedata, cr, pme_src->nnodes_major, pme_src->nnodes_minor,
899 &irc, homenr, pme_src->bFEP_q, pme_src->bFEP_lj, FALSE, ewaldcoeff_q, ewaldcoeff_lj, pme_src->nthread);
901 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
903 if (ret == 0)
905 /* We can easily reuse the allocated pme grids in pme_src */
906 reuse_pmegrids(&pme_src->pmegrid[PME_GRID_QA], &(*pmedata)->pmegrid[PME_GRID_QA]);
907 /* We would like to reuse the fft grids, but that's harder */
910 return ret;
913 void gmx_pme_calc_energy(struct gmx_pme_t *pme, int n, rvec *x, real *q, real *V)
915 pme_atomcomm_t *atc;
916 pmegrids_t *grid;
918 if (pme->nnodes > 1)
920 gmx_incons("gmx_pme_calc_energy called in parallel");
922 if (pme->bFEP_q)
924 gmx_incons("gmx_pme_calc_energy with free energy");
927 atc = &pme->atc_energy;
928 atc->nthread = 1;
929 if (atc->spline == nullptr)
931 snew(atc->spline, atc->nthread);
933 atc->nslab = 1;
934 atc->bSpread = TRUE;
935 atc->pme_order = pme->pme_order;
936 atc->n = n;
937 pme_realloc_atomcomm_things(atc);
938 atc->x = x;
939 atc->coefficient = q;
941 /* We only use the A-charges grid */
942 grid = &pme->pmegrid[PME_GRID_QA];
944 /* Only calculate the spline coefficients, don't actually spread */
945 spread_on_grid(pme, atc, nullptr, TRUE, FALSE, pme->fftgrid[PME_GRID_QA], FALSE, PME_GRID_QA);
947 *V = gather_energy_bsplines(pme, grid->grid.grid, atc);
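/* A minimal usage sketch (hypothetical caller and variable names, not taken
 * from the GROMACS sources): this entry point is meant for serial,
 * energy-only evaluations such as test-particle insertion, using only the
 * A-state charges.
 *
 *   real meshEnergy = 0;
 *   gmx_pme_calc_energy(pme, numAtoms, x, charges, &meshEnergy);
 *   // meshEnergy now holds the reciprocal-space Coulomb energy of the
 *   // numAtoms charges; no forces or virial are computed here.
 */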
950 /*! \brief Calculate initial Lorentz-Berthelot coefficients for LJ-PME */
951 static void
952 calc_initial_lb_coeffs(struct gmx_pme_t *pme, real *local_c6, real *local_sigma)
954 int i;
955 for (i = 0; i < pme->atc[0].n; ++i)
957 real sigma4;
958 sigma4 = local_sigma[i];
959 sigma4 = sigma4*sigma4;
960 sigma4 = sigma4*sigma4;
961 pme->atc[0].coefficient[i] = local_c6[i] / sigma4;
965 /*! \brief Calculate next Lorentz-Berthelot coefficients for LJ-PME */
966 static void
967 calc_next_lb_coeffs(struct gmx_pme_t *pme, real *local_sigma)
969 int i;
971 for (i = 0; i < pme->atc[0].n; ++i)
973 pme->atc[0].coefficient[i] *= local_sigma[i];
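/* Background for the two helpers above (a sketch of the reasoning; the
 * normalisation details live in lb_scale_factor and solve_pme_lj_yzx):
 * with Lorentz-Berthelot rules the pair dispersion term involves
 * sigma_ij = (sigma_i + sigma_j)/2, and expanding (sigma_i + sigma_j)^6
 * with the binomial theorem gives seven terms,
 *   sum_{k=0..6} C(6,k) * sigma_i^k * sigma_j^(6-k),
 * each of which factorizes into a per-atom-i times a per-atom-j part and
 * can therefore be handled as an ordinary separable PME grid.  In
 * gmx_pme_do() below, calc_initial_lb_coeffs() resets the per-atom
 * coefficient to local_c6[i]/sigma_i^4 and calc_next_lb_coeffs() multiplies
 * in one more power of sigma before each of the seven grids
 * (grid_index 2..8) is spread.
 */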
977 int gmx_pme_do(struct gmx_pme_t *pme,
978 int start, int homenr,
979 rvec x[], rvec f[],
980 real chargeA[], real chargeB[],
981 real c6A[], real c6B[],
982 real sigmaA[], real sigmaB[],
983 matrix box, t_commrec *cr,
984 int maxshift_x, int maxshift_y,
985 t_nrnb *nrnb, gmx_wallcycle_t wcycle,
986 matrix vir_q, matrix vir_lj,
987 real *energy_q, real *energy_lj,
988 real lambda_q, real lambda_lj,
989 real *dvdlambda_q, real *dvdlambda_lj,
990 int flags)
992 int d, i, j, npme, grid_index, max_grid_index;
993 int n_d;
994 pme_atomcomm_t *atc = nullptr;
995 pmegrids_t *pmegrid = nullptr;
996 real *grid = nullptr;
997 rvec *f_d;
998 real *coefficient = nullptr;
999 real energy_AB[4];
1000 matrix vir_AB[4];
1001 real scale, lambda;
1002 gmx_bool bClearF;
1003 gmx_parallel_3dfft_t pfft_setup;
1004 real * fftgrid;
1005 t_complex * cfftgrid;
1006 int thread;
1007 gmx_bool bFirst, bDoSplines;
1008 int fep_state;
1009 int fep_states_lj = pme->bFEP_lj ? 2 : 1;
1010 const gmx_bool bCalcEnerVir = flags & GMX_PME_CALC_ENER_VIR;
1011 const gmx_bool bBackFFT = flags & (GMX_PME_CALC_F | GMX_PME_CALC_POT);
1012 const gmx_bool bCalcF = flags & GMX_PME_CALC_F;
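/* The flags bits are assumed to be OR-ed together by the caller; a regular
 * MD step would pass something like
 *   GMX_PME_SPREAD | GMX_PME_SOLVE | GMX_PME_CALC_F | GMX_PME_CALC_ENER_VIR
 * while an energy-only evaluation would leave out GMX_PME_CALC_F.
 */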
1014 assert(pme->nnodes > 0);
1015 assert(pme->nnodes == 1 || pme->ndecompdim > 0);
1017 if (pme->nnodes > 1)
1019 atc = &pme->atc[0];
1020 atc->npd = homenr;
1021 if (atc->npd > atc->pd_nalloc)
1023 atc->pd_nalloc = over_alloc_dd(atc->npd);
1024 srenew(atc->pd, atc->pd_nalloc);
1026 for (d = pme->ndecompdim-1; d >= 0; d--)
1028 atc = &pme->atc[d];
1029 atc->maxshift = (atc->dimind == 0 ? maxshift_x : maxshift_y);
1032 else
1034 atc = &pme->atc[0];
1035 /* This could be necessary for TPI */
1036 pme->atc[0].n = homenr;
1037 if (DOMAINDECOMP(cr))
1039 pme_realloc_atomcomm_things(atc);
1041 atc->x = x;
1042 atc->f = f;
1045 gmx::invertBoxMatrix(box, pme->recipbox);
1046 bFirst = TRUE;
1048 /* For simplicity, we construct the splines for all particles if
1049 * more than one PME calculation is needed. Some optimization
1050 * could be done by keeping track of which atoms have splines
1051 * constructed, and construct new splines on each pass for atoms
1052 * that don't yet have them.
1055 bDoSplines = pme->bFEP || (pme->doCoulomb && pme->doLJ);
1057 /* We need a maximum of four separate PME calculations:
1058 * grid_index=0: Coulomb PME with charges from state A
1059 * grid_index=1: Coulomb PME with charges from state B
1060 * grid_index=2: LJ PME with C6 from state A
1061 * grid_index=3: LJ PME with C6 from state B
1062 * For Lorentz-Berthelot combination rules, a separate loop is used to
1063 * calculate all the terms
1066 /* If we are doing LJ-PME with LB, we only do Q here */
1067 max_grid_index = (pme->ljpme_combination_rule == eljpmeLB) ? DO_Q : DO_Q_AND_LJ;
1069 for (grid_index = 0; grid_index < max_grid_index; ++grid_index)
1071 /* Check if we should do calculations at this grid_index
1072 * If grid_index is odd we should be doing FEP
1073 * If grid_index < 2 we should be doing electrostatic PME
1074 * If grid_index >= 2 we should be doing LJ-PME
1076 if ((grid_index < DO_Q && (!pme->doCoulomb ||
1077 (grid_index == 1 && !pme->bFEP_q))) ||
1078 (grid_index >= DO_Q && (!pme->doLJ ||
1079 (grid_index == 3 && !pme->bFEP_lj))))
1081 continue;
1083 /* Unpack structure */
1084 pmegrid = &pme->pmegrid[grid_index];
1085 fftgrid = pme->fftgrid[grid_index];
1086 cfftgrid = pme->cfftgrid[grid_index];
1087 pfft_setup = pme->pfft_setup[grid_index];
1088 switch (grid_index)
1090 case 0: coefficient = chargeA + start; break;
1091 case 1: coefficient = chargeB + start; break;
1092 case 2: coefficient = c6A + start; break;
1093 case 3: coefficient = c6B + start; break;
1096 grid = pmegrid->grid.grid;
1098 if (debug)
1100 fprintf(debug, "PME: number of ranks = %d, rank = %d\n",
1101 cr->nnodes, cr->nodeid);
1102 fprintf(debug, "Grid = %p\n", (void*)grid);
1103 if (grid == nullptr)
1105 gmx_fatal(FARGS, "No grid!");
1108 where();
1110 if (pme->nnodes == 1)
1112 atc->coefficient = coefficient;
1114 else
1116 wallcycle_start(wcycle, ewcPME_REDISTXF);
1117 do_redist_pos_coeffs(pme, cr, start, homenr, bFirst, x, coefficient);
1118 where();
1120 wallcycle_stop(wcycle, ewcPME_REDISTXF);
1123 if (debug)
1125 fprintf(debug, "Rank= %6d, pme local particles=%6d\n",
1126 cr->nodeid, atc->n);
1129 if (flags & GMX_PME_SPREAD)
1131 wallcycle_start(wcycle, ewcPME_SPREADGATHER);
1133 /* Spread the coefficients on a grid */
1134 spread_on_grid(pme, &pme->atc[0], pmegrid, bFirst, TRUE, fftgrid, bDoSplines, grid_index);
1136 if (bFirst)
1138 inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
1140 inc_nrnb(nrnb, eNR_SPREADBSP,
1141 pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
1143 if (!pme->bUseThreads)
1145 wrap_periodic_pmegrid(pme, grid);
1147 /* sum contributions to local grid from other nodes */
1148 #if GMX_MPI
1149 if (pme->nnodes > 1)
1151 gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_FORWARD);
1152 where();
1154 #endif
1156 copy_pmegrid_to_fftgrid(pme, grid, fftgrid, grid_index);
1159 wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
1161 /* TODO If the OpenMP and single-threaded implementations
1162 converge, then spread_on_grid() and
1163 copy_pmegrid_to_fftgrid() will perhaps live in the same
1164 source file and the following debugging function can live
1165 there too. */
/*
1167 dump_local_fftgrid(pme,fftgrid);
1168 exit(0);
*/
1172 /* Here we start a large thread parallel region */
1173 #pragma omp parallel num_threads(pme->nthread) private(thread)
1177 thread = gmx_omp_get_thread_num();
1178 if (flags & GMX_PME_SOLVE)
1180 int loop_count;
1182 /* do 3d-fft */
1183 if (thread == 0)
1185 wallcycle_start(wcycle, ewcPME_FFT);
1187 gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX,
1188 thread, wcycle);
1189 if (thread == 0)
1191 wallcycle_stop(wcycle, ewcPME_FFT);
1193 where();
1195 /* solve in k-space for our local cells */
1196 if (thread == 0)
1198 wallcycle_start(wcycle, (grid_index < DO_Q ? ewcPME_SOLVE : ewcLJPME));
1200 if (grid_index < DO_Q)
1202 loop_count =
1203 solve_pme_yzx(pme, cfftgrid,
1204 box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
1205 bCalcEnerVir,
1206 pme->nthread, thread);
1208 else
1210 loop_count =
1211 solve_pme_lj_yzx(pme, &cfftgrid, FALSE,
1212 box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
1213 bCalcEnerVir,
1214 pme->nthread, thread);
1217 if (thread == 0)
1219 wallcycle_stop(wcycle, (grid_index < DO_Q ? ewcPME_SOLVE : ewcLJPME));
1220 where();
1221 inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
1225 if (bBackFFT)
1227 /* do 3d-invfft */
1228 if (thread == 0)
1230 where();
1231 wallcycle_start(wcycle, ewcPME_FFT);
1233 gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL,
1234 thread, wcycle);
1235 if (thread == 0)
1237 wallcycle_stop(wcycle, ewcPME_FFT);
1239 where();
1241 if (pme->nodeid == 0)
1243 real ntot = pme->nkx*pme->nky*pme->nkz;
1244 npme = static_cast<int>(ntot*std::log(ntot)/std::log(2.0));
1245 inc_nrnb(nrnb, eNR_FFT, 2*npme);
1248 /* Note: this wallcycle region is closed below
1249 outside an OpenMP region, so take care if
1250 refactoring code here. */
1251 wallcycle_start(wcycle, ewcPME_SPREADGATHER);
1254 copy_fftgrid_to_pmegrid(pme, fftgrid, grid, grid_index, pme->nthread, thread);
1256 } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1258 /* End of thread parallel section.
1259 * With MPI we have to synchronize here before gmx_sum_qgrid_dd.
1262 if (bBackFFT)
1264 /* distribute local grid to all nodes */
1265 #if GMX_MPI
1266 if (pme->nnodes > 1)
1268 gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_BACKWARD);
1270 #endif
1271 where();
1273 unwrap_periodic_pmegrid(pme, grid);
1276 if (bCalcF)
1278 /* interpolate forces for our local atoms */
1280 where();
1282 /* If we are running without parallelization,
1283 * atc->f is the actual force array, not a buffer,
1284 * therefore we should not clear it.
1286 lambda = grid_index < DO_Q ? lambda_q : lambda_lj;
1287 bClearF = (bFirst && PAR(cr));
1288 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
1289 for (thread = 0; thread < pme->nthread; thread++)
1293 gather_f_bsplines(pme, grid, bClearF, atc,
1294 &atc->spline[thread],
1295 pme->bFEP ? (grid_index % 2 == 0 ? 1.0-lambda : lambda) : 1.0);
1297 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1300 where();
1302 inc_nrnb(nrnb, eNR_GATHERFBSP,
1303 pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
1304 /* Note: this wallcycle region is opened above inside an OpenMP
1305 region, so take care if refactoring code here. */
1306 wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
1309 if (bCalcEnerVir)
1311 /* This should only be called on the master thread
1312 * and after the threads have synchronized.
1314 if (grid_index < 2)
1316 get_pme_ener_vir_q(pme->solve_work, pme->nthread, &energy_AB[grid_index], vir_AB[grid_index]);
1318 else
1320 get_pme_ener_vir_lj(pme->solve_work, pme->nthread, &energy_AB[grid_index], vir_AB[grid_index]);
1323 bFirst = FALSE;
1324 } /* of grid_index-loop */
1326 /* For Lorentz-Berthelot combination rules in LJ-PME, we need to calculate
1327 * seven terms. */
1329 if (pme->doLJ && pme->ljpme_combination_rule == eljpmeLB)
1331 /* Loop over A- and B-state if we are doing FEP */
1332 for (fep_state = 0; fep_state < fep_states_lj; ++fep_state)
1334 real *local_c6 = nullptr, *local_sigma = nullptr, *RedistC6 = nullptr, *RedistSigma = nullptr;
1335 if (pme->nnodes == 1)
1337 if (pme->lb_buf1 == nullptr)
1339 pme->lb_buf_nalloc = pme->atc[0].n;
1340 snew(pme->lb_buf1, pme->lb_buf_nalloc);
1342 pme->atc[0].coefficient = pme->lb_buf1;
1343 switch (fep_state)
1345 case 0:
1346 local_c6 = c6A;
1347 local_sigma = sigmaA;
1348 break;
1349 case 1:
1350 local_c6 = c6B;
1351 local_sigma = sigmaB;
1352 break;
1353 default:
1354 gmx_incons("Trying to access wrong FEP-state in LJ-PME routine");
1357 else
1359 atc = &pme->atc[0];
1360 switch (fep_state)
1362 case 0:
1363 RedistC6 = c6A;
1364 RedistSigma = sigmaA;
1365 break;
1366 case 1:
1367 RedistC6 = c6B;
1368 RedistSigma = sigmaB;
1369 break;
1370 default:
1371 gmx_incons("Trying to access wrong FEP-state in LJ-PME routine");
1373 wallcycle_start(wcycle, ewcPME_REDISTXF);
1375 do_redist_pos_coeffs(pme, cr, start, homenr, bFirst, x, RedistC6);
1376 if (pme->lb_buf_nalloc < atc->n)
1378 pme->lb_buf_nalloc = atc->nalloc;
1379 srenew(pme->lb_buf1, pme->lb_buf_nalloc);
1380 srenew(pme->lb_buf2, pme->lb_buf_nalloc);
1382 local_c6 = pme->lb_buf1;
1383 for (i = 0; i < atc->n; ++i)
1385 local_c6[i] = atc->coefficient[i];
1387 where();
1389 do_redist_pos_coeffs(pme, cr, start, homenr, FALSE, x, RedistSigma);
1390 local_sigma = pme->lb_buf2;
1391 for (i = 0; i < atc->n; ++i)
1393 local_sigma[i] = atc->coefficient[i];
1395 where();
1397 wallcycle_stop(wcycle, ewcPME_REDISTXF);
1399 calc_initial_lb_coeffs(pme, local_c6, local_sigma);
1401 /*Seven terms in LJ-PME with LB, grid_index < 2 reserved for electrostatics*/
1402 for (grid_index = 2; grid_index < 9; ++grid_index)
1404 /* Unpack structure */
1405 pmegrid = &pme->pmegrid[grid_index];
1406 fftgrid = pme->fftgrid[grid_index];
1407 pfft_setup = pme->pfft_setup[grid_index];
1408 calc_next_lb_coeffs(pme, local_sigma);
1409 grid = pmegrid->grid.grid;
1410 where();
1412 if (flags & GMX_PME_SPREAD)
1414 wallcycle_start(wcycle, ewcPME_SPREADGATHER);
1415 /* Spread the c6 on a grid */
1416 spread_on_grid(pme, &pme->atc[0], pmegrid, bFirst, TRUE, fftgrid, bDoSplines, grid_index);
1418 if (bFirst)
1420 inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
1423 inc_nrnb(nrnb, eNR_SPREADBSP,
1424 pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
1425 if (pme->nthread == 1)
1427 wrap_periodic_pmegrid(pme, grid);
1428 /* sum contributions to local grid from other nodes */
1429 #if GMX_MPI
1430 if (pme->nnodes > 1)
1432 gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_FORWARD);
1433 where();
1435 #endif
1436 copy_pmegrid_to_fftgrid(pme, grid, fftgrid, grid_index);
1438 wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
1440 /*Here we start a large thread parallel region*/
1441 #pragma omp parallel num_threads(pme->nthread) private(thread)
1445 thread = gmx_omp_get_thread_num();
1446 if (flags & GMX_PME_SOLVE)
1448 /* do 3d-fft */
1449 if (thread == 0)
1451 wallcycle_start(wcycle, ewcPME_FFT);
1454 gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX,
1455 thread, wcycle);
1456 if (thread == 0)
1458 wallcycle_stop(wcycle, ewcPME_FFT);
1460 where();
1463 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1465 bFirst = FALSE;
1467 if (flags & GMX_PME_SOLVE)
1469 /* solve in k-space for our local cells */
1470 #pragma omp parallel num_threads(pme->nthread) private(thread)
1474 int loop_count;
1475 thread = gmx_omp_get_thread_num();
1476 if (thread == 0)
1478 wallcycle_start(wcycle, ewcLJPME);
1481 loop_count =
1482 solve_pme_lj_yzx(pme, &pme->cfftgrid[2], TRUE,
1483 box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
1484 bCalcEnerVir,
1485 pme->nthread, thread);
1486 if (thread == 0)
1488 wallcycle_stop(wcycle, ewcLJPME);
1489 where();
1490 inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
1493 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1497 if (bCalcEnerVir)
1499 /* This should only be called on the master thread and
1500 * after the threads have synchronized.
1502 get_pme_ener_vir_lj(pme->solve_work, pme->nthread, &energy_AB[2+fep_state], vir_AB[2+fep_state]);
1505 if (bBackFFT)
1507 bFirst = !pme->doCoulomb;
1508 calc_initial_lb_coeffs(pme, local_c6, local_sigma);
1509 for (grid_index = 8; grid_index >= 2; --grid_index)
1511 /* Unpack structure */
1512 pmegrid = &pme->pmegrid[grid_index];
1513 fftgrid = pme->fftgrid[grid_index];
1514 pfft_setup = pme->pfft_setup[grid_index];
1515 grid = pmegrid->grid.grid;
1516 calc_next_lb_coeffs(pme, local_sigma);
1517 where();
1518 #pragma omp parallel num_threads(pme->nthread) private(thread)
1522 thread = gmx_omp_get_thread_num();
1523 /* do 3d-invfft */
1524 if (thread == 0)
1526 where();
1527 wallcycle_start(wcycle, ewcPME_FFT);
1530 gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL,
1531 thread, wcycle);
1532 if (thread == 0)
1534 wallcycle_stop(wcycle, ewcPME_FFT);
1536 where();
1538 if (pme->nodeid == 0)
1540 real ntot = pme->nkx*pme->nky*pme->nkz;
1541 npme = static_cast<int>(ntot*std::log(ntot)/std::log(2.0));
1542 inc_nrnb(nrnb, eNR_FFT, 2*npme);
1544 wallcycle_start(wcycle, ewcPME_SPREADGATHER);
1547 copy_fftgrid_to_pmegrid(pme, fftgrid, grid, grid_index, pme->nthread, thread);
1549 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1550 } /*#pragma omp parallel*/
1552 /* distribute local grid to all nodes */
1553 #if GMX_MPI
1554 if (pme->nnodes > 1)
1556 gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_BACKWARD);
1558 #endif
1559 where();
1561 unwrap_periodic_pmegrid(pme, grid);
1563 if (bCalcF)
1565 /* interpolate forces for our local atoms */
1566 where();
1567 bClearF = (bFirst && PAR(cr));
1568 scale = pme->bFEP ? (fep_state < 1 ? 1.0-lambda_lj : lambda_lj) : 1.0;
1569 scale *= lb_scale_factor[grid_index-2];
1571 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
1572 for (thread = 0; thread < pme->nthread; thread++)
1576 gather_f_bsplines(pme, grid, bClearF, &pme->atc[0],
1577 &pme->atc[0].spline[thread],
1578 scale);
1580 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1583 where();
1585 inc_nrnb(nrnb, eNR_GATHERFBSP,
1586 pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
1588 wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
1590 bFirst = FALSE;
1591 } /* for (grid_index = 8; grid_index >= 2; --grid_index) */
1592 } /* if (bCalcF) */
1593 } /* for (fep_state = 0; fep_state < fep_states_lj; ++fep_state) */
1594 } /* if (pme->doLJ && pme->ljpme_combination_rule == eljpmeLB) */
1596 if (bCalcF && pme->nnodes > 1)
1598 wallcycle_start(wcycle, ewcPME_REDISTXF);
1599 for (d = 0; d < pme->ndecompdim; d++)
1601 atc = &pme->atc[d];
1602 if (d == pme->ndecompdim - 1)
1604 n_d = homenr;
1605 f_d = f + start;
1607 else
1609 n_d = pme->atc[d+1].n;
1610 f_d = pme->atc[d+1].f;
1612 if (DOMAINDECOMP(cr))
1614 dd_pmeredist_f(pme, atc, n_d, f_d,
1615 d == pme->ndecompdim-1 && pme->bPPnode);
1619 wallcycle_stop(wcycle, ewcPME_REDISTXF);
1621 where();
1623 if (bCalcEnerVir)
1625 if (pme->doCoulomb)
1627 if (!pme->bFEP_q)
1629 *energy_q = energy_AB[0];
1630 m_add(vir_q, vir_AB[0], vir_q);
1632 else
1634 *energy_q = (1.0-lambda_q)*energy_AB[0] + lambda_q*energy_AB[1];
1635 *dvdlambda_q += energy_AB[1] - energy_AB[0];
1636 for (i = 0; i < DIM; i++)
1638 for (j = 0; j < DIM; j++)
1640 vir_q[i][j] += (1.0-lambda_q)*vir_AB[0][i][j] +
1641 lambda_q*vir_AB[1][i][j];
1645 if (debug)
1647 fprintf(debug, "Electrostatic PME mesh energy: %g\n", *energy_q);
1650 else
1652 *energy_q = 0;
1655 if (pme->doLJ)
1657 if (!pme->bFEP_lj)
1659 *energy_lj = energy_AB[2];
1660 m_add(vir_lj, vir_AB[2], vir_lj);
1662 else
1664 *energy_lj = (1.0-lambda_lj)*energy_AB[2] + lambda_lj*energy_AB[3];
1665 *dvdlambda_lj += energy_AB[3] - energy_AB[2];
1666 for (i = 0; i < DIM; i++)
1668 for (j = 0; j < DIM; j++)
1670 vir_lj[i][j] += (1.0-lambda_lj)*vir_AB[2][i][j] + lambda_lj*vir_AB[3][i][j];
1674 if (debug)
1676 fprintf(debug, "Lennard-Jones PME mesh energy: %g\n", *energy_lj);
1679 else
1681 *energy_lj = 0;
1684 return 0;
1687 void gmx_pme_destroy(gmx_pme_t *pme)
1689 if (!pme)
1691 return;
1694 sfree(pme->nnx);
1695 sfree(pme->nny);
1696 sfree(pme->nnz);
1697 sfree(pme->fshx);
1698 sfree(pme->fshy);
1699 sfree(pme->fshz);
1701 for (int i = 0; i < pme->ngrids; ++i)
1703 pmegrids_destroy(&pme->pmegrid[i]);
1705 if (pme->pfft_setup)
1707 for (int i = 0; i < pme->ngrids; ++i)
1709 gmx_parallel_3dfft_destroy(pme->pfft_setup[i]);
1712 sfree(pme->fftgrid);
1713 sfree(pme->cfftgrid);
1714 sfree(pme->pfft_setup);
1716 for (int i = 0; i < std::max(1, pme->ndecompdim); i++) //pme->atc[0] is always allocated
1718 destroy_atomcomm(&pme->atc[i]);
1721 for (int i = 0; i < DIM; i++)
1723 sfree(pme->bsp_mod[i]);
1726 destroy_overlap_comm(&pme->overlap[0]);
1727 destroy_overlap_comm(&pme->overlap[1]);
1729 sfree(pme->lb_buf1);
1730 sfree(pme->lb_buf2);
1732 sfree(pme->bufv);
1733 sfree(pme->bufr);
1735 pme_free_all_work(&pme->solve_work, pme->nthread);
1737 sfree(pme->sum_qgrid_tmp);
1738 sfree(pme->sum_qgrid_dd_tmp);
1740 sfree(pme);