src/gromacs/ewald/pme.cpp
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015,2016,2017, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
37 /*! \internal \file
39 * \brief This file contains function definitions necessary for
40 * computing energies and forces for the PME long-ranged part (Coulomb
41 * and LJ).
43 * \author Erik Lindahl <erik@kth.se>
44 * \author Berk Hess <hess@kth.se>
45 * \ingroup module_ewald
47 /* IMPORTANT FOR DEVELOPERS:
49 * Triclinic pme stuff isn't entirely trivial, and we've experienced
50 * some bugs during development (many of them due to me). To avoid
51 * this in the future, please check the following things if you make
52 * changes in this file:
54 * 1. You should obtain identical (at least to the PME precision)
55 * energies, forces, and virial for
56 * a rectangular box and a triclinic one where the z (or y) axis is
57 * tilted a whole box side. For instance you could use these boxes:
59 * rectangular triclinic
60 * 2 0 0 2 0 0
61 * 0 2 0 0 2 0
62 * 0 0 6 2 2 6
64 * 2. You should check the energy conservation in a triclinic box.
66 * It might seem like overkill, but better safe than sorry.
67 * /Erik 001109
70 #include "gmxpre.h"
72 #include "pme.h"
74 #include "config.h"
76 #include <assert.h>
77 #include <math.h>
78 #include <stdio.h>
79 #include <stdlib.h>
80 #include <string.h>
82 #include <algorithm>
84 #include "gromacs/fft/parallel_3dfft.h"
85 #include "gromacs/fileio/pdbio.h"
86 #include "gromacs/gmxlib/network.h"
87 #include "gromacs/gmxlib/nrnb.h"
88 #include "gromacs/math/gmxcomplex.h"
89 #include "gromacs/math/invertmatrix.h"
90 #include "gromacs/math/units.h"
91 #include "gromacs/math/vec.h"
92 #include "gromacs/math/vectypes.h"
93 #include "gromacs/mdtypes/commrec.h"
94 #include "gromacs/mdtypes/forcerec.h"
95 #include "gromacs/mdtypes/inputrec.h"
96 #include "gromacs/mdtypes/md_enums.h"
97 #include "gromacs/pbcutil/pbc.h"
98 #include "gromacs/timing/cyclecounter.h"
99 #include "gromacs/timing/wallcycle.h"
100 #include "gromacs/timing/walltime_accounting.h"
101 #include "gromacs/utility/basedefinitions.h"
102 #include "gromacs/utility/exceptions.h"
103 #include "gromacs/utility/fatalerror.h"
104 #include "gromacs/utility/gmxmpi.h"
105 #include "gromacs/utility/gmxomp.h"
106 #include "gromacs/utility/real.h"
107 #include "gromacs/utility/smalloc.h"
108 #include "gromacs/utility/stringutil.h"
109 #include "gromacs/utility/unique_cptr.h"
111 #include "calculate-spline-moduli.h"
112 #include "pme-gather.h"
113 #include "pme-grid.h"
114 #include "pme-internal.h"
115 #include "pme-redistribute.h"
116 #include "pme-solve.h"
117 #include "pme-spline-work.h"
118 #include "pme-spread.h"
120 /*! \brief Number of bytes in a cache line.
122 * Must also be a multiple of the SIMD and SIMD4 register size, to
123 * preserve alignment.
125 const int gmxCacheLineSize = 64;
127 //! Set up coordinate communication
128 static void setup_coordinate_communication(pme_atomcomm_t *atc)
130 int nslab, n, i;
131 int fw, bw;
133 nslab = atc->nslab;
135 n = 0;
136 for (i = 1; i <= nslab/2; i++)
138 fw = (atc->nodeid + i) % nslab;
139 bw = (atc->nodeid - i + nslab) % nslab;
140 if (n < nslab - 1)
142 atc->node_dest[n] = fw;
143 atc->node_src[n] = bw;
144 n++;
146 if (n < nslab - 1)
148 atc->node_dest[n] = bw;
149 atc->node_src[n] = fw;
150 n++;
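/* A worked example of the pattern above (values chosen for illustration):
 * with nslab = 4 and nodeid = 0 the loop fills
 *   node_dest = { 1, 3, 2 },  node_src = { 3, 1, 2 },
 * i.e. the exchanges alternate one slab forward and one slab backward,
 * with the slab nslab/2 away visited once at the end.  Each entry pairs a
 * forward send with a backward receive (or vice versa), which is presumably
 * the pairing the redistribution code expects for its pairwise exchanges.
 */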
155 /*! \brief Round \p n up to the next multiple of \p f */
156 static int mult_up(int n, int f)
158 return ((n + f - 1)/f)*f;
161 /*! \brief Return estimate of the load imbalance from the PME grid not being a good match for the number of PME ranks */
162 static double estimate_pme_load_imbalance(struct gmx_pme_t *pme)
164 int nma, nmi;
165 double n1, n2, n3;
167 nma = pme->nnodes_major;
168 nmi = pme->nnodes_minor;
170 n1 = mult_up(pme->nkx, nma)*mult_up(pme->nky, nmi)*pme->nkz;
171 n2 = mult_up(pme->nkx, nma)*mult_up(pme->nkz, nmi)*pme->nky;
172 n3 = mult_up(pme->nky, nma)*mult_up(pme->nkz, nmi)*pme->nkx;
174 /* pme_solve is roughly double the cost of an fft */
176 return (n1 + n2 + 3*n3)/(double)(6*pme->nkx*pme->nky*pme->nkz);
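/* A worked example of the estimate above (grid sizes assumed for
 * illustration): with nkx = nky = nkz = 60, nnodes_major = 16 and
 * nnodes_minor = 1, mult_up(60, 16) = 64 and mult_up(60, 1) = 60, so
 * n1 = n2 = n3 = 64*60*60 and the function returns
 *   5*64*60*60 / (6*60*60*60) = 5*64/(6*60) ~= 0.89,
 * while a perfectly divisible grid gives exactly 5/6 ~= 0.83.  The caller
 * in gmx_pme_init() only prints a load-balance note once this value
 * reaches 1.2.
 */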
179 /*! \brief Initialize atom communication data structure */
180 static void init_atomcomm(struct gmx_pme_t *pme, pme_atomcomm_t *atc,
181 int dimind, gmx_bool bSpread)
183 int thread;
185 atc->dimind = dimind;
186 atc->nslab = 1;
187 atc->nodeid = 0;
188 atc->pd_nalloc = 0;
189 #if GMX_MPI
190 if (pme->nnodes > 1)
192 atc->mpi_comm = pme->mpi_comm_d[dimind];
193 MPI_Comm_size(atc->mpi_comm, &atc->nslab);
194 MPI_Comm_rank(atc->mpi_comm, &atc->nodeid);
196 if (debug)
198 fprintf(debug, "For PME atom communication in dimind %d: nslab %d rank %d\n", atc->dimind, atc->nslab, atc->nodeid);
200 #endif
202 atc->bSpread = bSpread;
203 atc->pme_order = pme->pme_order;
205 if (atc->nslab > 1)
207 snew(atc->node_dest, atc->nslab);
208 snew(atc->node_src, atc->nslab);
209 setup_coordinate_communication(atc);
211 snew(atc->count_thread, pme->nthread);
212 for (thread = 0; thread < pme->nthread; thread++)
214 snew(atc->count_thread[thread], atc->nslab);
216 atc->count = atc->count_thread[0];
217 snew(atc->rcount, atc->nslab);
218 snew(atc->buf_index, atc->nslab);
221 atc->nthread = pme->nthread;
222 if (atc->nthread > 1)
224 snew(atc->thread_plist, atc->nthread);
226 snew(atc->spline, atc->nthread);
227 for (thread = 0; thread < atc->nthread; thread++)
229 if (atc->nthread > 1)
231 snew(atc->thread_plist[thread].n, atc->nthread+2*gmxCacheLineSize);
232 atc->thread_plist[thread].n += gmxCacheLineSize;
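/* The padding of gmxCacheLineSize elements on both sides of the n array
 * is presumably there to keep each thread's counters on their own cache
 * lines and so avoid false sharing; since gmxCacheLineSize is a byte
 * count and n is an int array, offsetting by that many ints covers more
 * than one full cache line.
 */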
237 /*! \brief Destroy an atom communication data structure and its child structs */
238 static void destroy_atomcomm(pme_atomcomm_t *atc)
240 sfree(atc->pd);
241 if (atc->nslab > 1)
243 sfree(atc->node_dest);
244 sfree(atc->node_src);
245 for (int i = 0; i < atc->nthread; i++)
247 sfree(atc->count_thread[i]);
249 sfree(atc->count_thread);
250 sfree(atc->rcount);
251 sfree(atc->buf_index);
253 sfree(atc->x);
254 sfree(atc->coefficient);
255 sfree(atc->f);
257 sfree(atc->idx);
258 sfree(atc->fractx);
260 sfree(atc->thread_idx);
261 for (int i = 0; i < atc->nthread; i++)
263 if (atc->nthread > 1)
265 int *n_ptr = atc->thread_plist[i].n - gmxCacheLineSize;
266 sfree(n_ptr);
267 sfree(atc->thread_plist[i].i);
269 sfree(atc->spline[i].ind);
270 for (int d = 0; d < ZZ; d++)
272 sfree(atc->spline[i].theta[d]);
273 sfree(atc->spline[i].dtheta[d]);
275 sfree_aligned(atc->spline[i].ptr_dtheta_z);
276 sfree_aligned(atc->spline[i].ptr_theta_z);
278 if (atc->nthread > 1)
280 sfree(atc->thread_plist);
282 sfree(atc->spline);
285 /*! \brief Initialize data structure for communication */
286 static void
287 init_overlap_comm(pme_overlap_t * ol,
288 int norder,
289 #if GMX_MPI
290 MPI_Comm comm,
291 #endif
292 int nnodes,
293 int nodeid,
294 int ndata,
295 int commplainsize)
297 int b, i;
298 pme_grid_comm_t *pgc;
299 gmx_bool bCont;
300 int fft_start, fft_end, send_index1, recv_index1;
301 #if GMX_MPI
302 MPI_Status stat;
304 ol->mpi_comm = comm;
305 #endif
307 ol->nnodes = nnodes;
308 ol->nodeid = nodeid;
310 /* Linear translation of the PME grid won't affect reciprocal space
311 * calculations, so to optimize we only interpolate "upwards",
312 * which also means we only have to consider overlap in one direction.
313 * I.e., particles on this node might also be spread to grid indices
314 * that belong to higher nodes (modulo nnodes)
317 snew(ol->s2g0, ol->nnodes+1);
318 snew(ol->s2g1, ol->nnodes);
319 if (debug)
321 fprintf(debug, "PME slab boundaries:");
323 for (i = 0; i < nnodes; i++)
325 /* s2g0 the local interpolation grid start.
326 * s2g1 the local interpolation grid end.
327 * Since in calc_pidx we divide particles, and not grid lines,
328 * spatially uniform along dimension x or y, we need to round
329 * s2g0 down and s2g1 up.
331 ol->s2g0[i] = ( i *ndata + 0 )/nnodes;
332 ol->s2g1[i] = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1;
334 if (debug)
336 fprintf(debug, " %3d %3d", ol->s2g0[i], ol->s2g1[i]);
339 ol->s2g0[nnodes] = ndata;
340 if (debug)
342 fprintf(debug, "\n");
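/* A worked example of the slab boundaries above (values assumed for
 * illustration): with ndata = 25, nnodes = 4 and norder = 4 the formulas
 * give
 *   s2g0 = { 0, 6, 12, 18, 25 }   (first grid line owned by each slab)
 *   s2g1 = { 10, 16, 22, 28 }     (one past the last line each slab spreads to)
 * so slab 0 owns lines 0-5 but, because of the norder-wide spreading
 * stencil, also writes into lines 6-9, which belong to slab 1.  For this
 * case the do/while search below finds noverlap_nodes = 1.
 */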
345 /* Determine with how many nodes we need to communicate the grid overlap */
346 b = 0;
349 b++;
350 bCont = FALSE;
351 for (i = 0; i < nnodes; i++)
353 if ((i+b < nnodes && ol->s2g1[i] > ol->s2g0[i+b]) ||
354 (i+b >= nnodes && ol->s2g1[i] > ol->s2g0[i+b-nnodes] + ndata))
356 bCont = TRUE;
360 while (bCont && b < nnodes);
361 ol->noverlap_nodes = b - 1;
363 snew(ol->send_id, ol->noverlap_nodes);
364 snew(ol->recv_id, ol->noverlap_nodes);
365 for (b = 0; b < ol->noverlap_nodes; b++)
367 ol->send_id[b] = (ol->nodeid + (b + 1)) % ol->nnodes;
368 ol->recv_id[b] = (ol->nodeid - (b + 1) + ol->nnodes) % ol->nnodes;
370 snew(ol->comm_data, ol->noverlap_nodes);
372 ol->send_size = 0;
373 for (b = 0; b < ol->noverlap_nodes; b++)
375 pgc = &ol->comm_data[b];
376 /* Send */
377 fft_start = ol->s2g0[ol->send_id[b]];
378 fft_end = ol->s2g0[ol->send_id[b]+1];
379 if (ol->send_id[b] < nodeid)
381 fft_start += ndata;
382 fft_end += ndata;
384 send_index1 = ol->s2g1[nodeid];
385 send_index1 = std::min(send_index1, fft_end);
386 pgc->send_index0 = fft_start;
387 pgc->send_nindex = std::max(0, send_index1 - pgc->send_index0);
388 ol->send_size += pgc->send_nindex;
390 /* We always start receiving to the first index of our slab */
391 fft_start = ol->s2g0[ol->nodeid];
392 fft_end = ol->s2g0[ol->nodeid+1];
393 recv_index1 = ol->s2g1[ol->recv_id[b]];
394 if (ol->recv_id[b] > nodeid)
396 recv_index1 -= ndata;
398 recv_index1 = std::min(recv_index1, fft_end);
399 pgc->recv_index0 = fft_start;
400 pgc->recv_nindex = std::max(0, recv_index1 - pgc->recv_index0);
403 #if GMX_MPI
404 /* Communicate the buffer sizes to receive */
405 for (b = 0; b < ol->noverlap_nodes; b++)
407 MPI_Sendrecv(&ol->send_size, 1, MPI_INT, ol->send_id[b], b,
408 &ol->comm_data[b].recv_size, 1, MPI_INT, ol->recv_id[b], b,
409 ol->mpi_comm, &stat);
411 #endif
413 /* For a non-divisible grid we need pme_order instead of pme_order-1 */
414 snew(ol->sendbuf, norder*commplainsize);
415 snew(ol->recvbuf, norder*commplainsize);
418 /*! \brief Destroy data structure for communication */
419 static void
420 destroy_overlap_comm(const pme_overlap_t *ol)
422 sfree(ol->s2g0);
423 sfree(ol->s2g1);
424 sfree(ol->send_id);
425 sfree(ol->recv_id);
426 sfree(ol->comm_data);
427 sfree(ol->sendbuf);
428 sfree(ol->recvbuf);
431 int minimalPmeGridSize(int pmeOrder)
433 /* The actual grid size limitations are:
434 * serial: >= pme_order
435 * DD, no OpenMP: >= 2*(pme_order - 1)
436 * DD, OpenMP: >= pme_order + 1
437 * But we use the maximum for simplicity since in practice there is not
438 * much performance difference between pme_order and 2*(pme_order -1).
440 int minimalSize = 2*(pmeOrder - 1);
442 GMX_RELEASE_ASSERT(pmeOrder >= 3, "pmeOrder has to be >= 3");
443 GMX_RELEASE_ASSERT(minimalSize >= pmeOrder + 1, "The grid size should be >= pmeOrder + 1");
445 return minimalSize;
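/* For example, with the default pme_order of 4 this returns 6; grids with
 * fewer than 6 lines in any dimension are rejected by
 * gmx_pme_check_restrictions() below.
 */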
448 void gmx_pme_check_restrictions(int pme_order,
449 int nkx, int nky, int nkz,
450 int nnodes_major,
451 gmx_bool bUseThreads,
452 gmx_bool bFatal,
453 gmx_bool *bValidSettings)
455 if (pme_order > PME_ORDER_MAX)
457 if (!bFatal)
459 *bValidSettings = FALSE;
460 return;
463 std::string message = gmx::formatString(
464 "pme_order (%d) is larger than the maximum allowed value (%d). Modify and recompile the code if you really need such a high order.",
465 pme_order, PME_ORDER_MAX);
466 GMX_THROW(InconsistentInputError(message));
469 const int minGridSize = minimalPmeGridSize(pme_order);
470 if (nkx < minGridSize ||
471 nky < minGridSize ||
472 nkz < minGridSize)
474 if (!bFatal)
476 *bValidSettings = FALSE;
477 return;
479 std::string message = gmx::formatString(
480 "The PME grid sizes need to be >= 2*(pme_order-1) (%d)",
481 minGridSize);
482 GMX_THROW(InconsistentInputError(message));
485 /* Check for a limitation of the (current) sum_fftgrid_dd code.
486 * We only allow multiple communication pulses in dim 1, not in dim 0.
488 if (bUseThreads && (nkx < nnodes_major*pme_order &&
489 nkx != nnodes_major*(pme_order - 1)))
491 if (!bFatal)
493 *bValidSettings = FALSE;
494 return;
496 gmx_fatal(FARGS, "The number of PME grid lines per rank along x is %g. But when using OpenMP threads, the number of grid lines per rank along x should be >= pme_order (%d) or = pme_order - 1. To resolve this issue, use fewer ranks along x (and possibly more along y and/or z) by specifying -dd manually.",
497 nkx/(double)nnodes_major, pme_order);
500 if (bValidSettings != nullptr)
502 *bValidSettings = TRUE;
505 return;
508 /*! \brief Divide \p enumerator by \p denominator, rounding up to the nearest integer */
509 static int div_round_up(int enumerator, int denominator)
511 return (enumerator + denominator - 1)/denominator;
514 int gmx_pme_init(struct gmx_pme_t **pmedata,
515 t_commrec * cr,
516 int nnodes_major,
517 int nnodes_minor,
518 const t_inputrec * ir,
519 int homenr,
520 gmx_bool bFreeEnergy_q,
521 gmx_bool bFreeEnergy_lj,
522 gmx_bool bReproducible,
523 real ewaldcoeff_q,
524 real ewaldcoeff_lj,
525 int nthread)
527 int use_threads, sum_use_threads, i;
528 ivec ndata;
530 if (debug)
532 fprintf(debug, "Creating PME data structures.\n");
535 gmx_pme_t *pmeRaw = nullptr;
536 snew(pmeRaw, 1);
537 unique_cptr<gmx_pme_t, gmx_pme_destroy> pme(pmeRaw);
539 pme->sum_qgrid_tmp = nullptr;
540 pme->sum_qgrid_dd_tmp = nullptr;
542 pme->buf_nalloc = 0;
544 pme->nnodes = 1;
545 pme->bPPnode = TRUE;
547 pme->nnodes_major = nnodes_major;
548 pme->nnodes_minor = nnodes_minor;
550 #if GMX_MPI
551 if (nnodes_major*nnodes_minor > 1)
553 pme->mpi_comm = cr->mpi_comm_mygroup;
555 MPI_Comm_rank(pme->mpi_comm, &pme->nodeid);
556 MPI_Comm_size(pme->mpi_comm, &pme->nnodes);
557 if (pme->nnodes != nnodes_major*nnodes_minor)
559 gmx_incons("PME rank count mismatch");
562 else
564 pme->mpi_comm = MPI_COMM_NULL;
566 #endif
568 if (pme->nnodes == 1)
570 #if GMX_MPI
571 pme->mpi_comm_d[0] = MPI_COMM_NULL;
572 pme->mpi_comm_d[1] = MPI_COMM_NULL;
573 #endif
574 pme->ndecompdim = 0;
575 pme->nodeid_major = 0;
576 pme->nodeid_minor = 0;
577 #if GMX_MPI
578 pme->mpi_comm_d[0] = pme->mpi_comm_d[1] = MPI_COMM_NULL;
579 #endif
581 else
583 if (nnodes_minor == 1)
585 #if GMX_MPI
586 pme->mpi_comm_d[0] = pme->mpi_comm;
587 pme->mpi_comm_d[1] = MPI_COMM_NULL;
588 #endif
589 pme->ndecompdim = 1;
590 pme->nodeid_major = pme->nodeid;
591 pme->nodeid_minor = 0;
594 else if (nnodes_major == 1)
596 #if GMX_MPI
597 pme->mpi_comm_d[0] = MPI_COMM_NULL;
598 pme->mpi_comm_d[1] = pme->mpi_comm;
599 #endif
600 pme->ndecompdim = 1;
601 pme->nodeid_major = 0;
602 pme->nodeid_minor = pme->nodeid;
604 else
606 if (pme->nnodes % nnodes_major != 0)
608 gmx_incons("For 2D PME decomposition, #PME ranks must be divisible by the number of ranks in the major dimension");
610 pme->ndecompdim = 2;
612 #if GMX_MPI
613 MPI_Comm_split(pme->mpi_comm, pme->nodeid % nnodes_minor,
614 pme->nodeid, &pme->mpi_comm_d[0]); /* My communicator along major dimension */
615 MPI_Comm_split(pme->mpi_comm, pme->nodeid/nnodes_minor,
616 pme->nodeid, &pme->mpi_comm_d[1]); /* My communicator along minor dimension */
618 MPI_Comm_rank(pme->mpi_comm_d[0], &pme->nodeid_major);
619 MPI_Comm_size(pme->mpi_comm_d[0], &pme->nnodes_major);
620 MPI_Comm_rank(pme->mpi_comm_d[1], &pme->nodeid_minor);
621 MPI_Comm_size(pme->mpi_comm_d[1], &pme->nnodes_minor);
622 #endif
624 pme->bPPnode = (cr->duty & DUTY_PP);
627 pme->nthread = nthread;
629 /* Check if any of the PME MPI ranks uses threads */
630 use_threads = (pme->nthread > 1 ? 1 : 0);
631 #if GMX_MPI
632 if (pme->nnodes > 1)
634 MPI_Allreduce(&use_threads, &sum_use_threads, 1, MPI_INT,
635 MPI_SUM, pme->mpi_comm);
637 else
638 #endif
640 sum_use_threads = use_threads;
642 pme->bUseThreads = (sum_use_threads > 0);
644 if (ir->ePBC == epbcSCREW)
646 gmx_fatal(FARGS, "pme does not (yet) work with pbc = screw");
649 /* NOTE:
650 * It is likely that the current gmx_pme_do() routine supports calculating
651 * only Coulomb or LJ while gmx_pme_init() configures for both,
652 * but that has never been tested.
653 * It is likely that the current gmx_pme_do() routine supports calculating
654 * without free-energy for Coulomb and/or LJ even when gmx_pme_init()
655 * configures for free-energy, but that has never been tested.
657 pme->doCoulomb = EEL_PME(ir->coulombtype);
658 pme->doLJ = EVDW_PME(ir->vdwtype);
659 pme->bFEP_q = ((ir->efep != efepNO) && bFreeEnergy_q);
660 pme->bFEP_lj = ((ir->efep != efepNO) && bFreeEnergy_lj);
661 pme->bFEP = (pme->bFEP_q || pme->bFEP_lj);
662 pme->nkx = ir->nkx;
663 pme->nky = ir->nky;
664 pme->nkz = ir->nkz;
665 pme->bP3M = (ir->coulombtype == eelP3M_AD || getenv("GMX_PME_P3M") != nullptr);
666 pme->pme_order = ir->pme_order;
667 pme->ewaldcoeff_q = ewaldcoeff_q;
668 pme->ewaldcoeff_lj = ewaldcoeff_lj;
670 /* Always constant electrostatics coefficients */
671 pme->epsilon_r = ir->epsilon_r;
673 /* Always constant LJ coefficients */
674 pme->ljpme_combination_rule = ir->ljpme_combination_rule;
676 /* If we violate restrictions, generate a fatal error here */
677 gmx_pme_check_restrictions(pme->pme_order,
678 pme->nkx, pme->nky, pme->nkz,
679 pme->nnodes_major,
680 pme->bUseThreads,
681 TRUE,
682 nullptr);
684 if (pme->nnodes > 1)
686 double imbal;
688 #if GMX_MPI
689 MPI_Type_contiguous(DIM, GMX_MPI_REAL, &(pme->rvec_mpi));
690 MPI_Type_commit(&(pme->rvec_mpi));
691 #endif
693 /* Note that the coefficient spreading and force gathering, which usually
694 * takes about the same amount of time as FFT+solve_pme,
695 * is always fully load balanced
696 * (unless the coefficient distribution is inhomogeneous).
699 imbal = estimate_pme_load_imbalance(pme.get());
700 if (imbal >= 1.2 && pme->nodeid_major == 0 && pme->nodeid_minor == 0)
702 fprintf(stderr,
703 "\n"
704 "NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
705 " For optimal PME load balancing\n"
706 " PME grid_x (%d) and grid_y (%d) should be divisible by #PME_ranks_x (%d)\n"
707 " and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_ranks_y (%d)\n"
708 "\n",
709 (int)((imbal-1)*100 + 0.5),
710 pme->nkx, pme->nky, pme->nnodes_major,
711 pme->nky, pme->nkz, pme->nnodes_minor);
715 /* For a non-divisible grid we need pme_order instead of pme_order-1 */
716 /* In sum_qgrid_dd x overlap is copied in place: take padding into account.
717 * y is always copied through a buffer: we don't need padding in z,
718 * but we do need the overlap in x because of the communication order.
720 init_overlap_comm(&pme->overlap[0], pme->pme_order,
721 #if GMX_MPI
722 pme->mpi_comm_d[0],
723 #endif
724 pme->nnodes_major, pme->nodeid_major,
725 pme->nkx,
726 (div_round_up(pme->nky, pme->nnodes_minor)+pme->pme_order)*(pme->nkz+pme->pme_order-1));
728 /* Along overlap dim 1 we can send in multiple pulses in sum_fftgrid_dd.
729 * We do this with an offset buffer of equal size, so we need to allocate
730 * extra for the offset. That's what the (+1)*pme->nkz is for.
732 init_overlap_comm(&pme->overlap[1], pme->pme_order,
733 #if GMX_MPI
734 pme->mpi_comm_d[1],
735 #endif
736 pme->nnodes_minor, pme->nodeid_minor,
737 pme->nky,
738 (div_round_up(pme->nkx, pme->nnodes_major)+pme->pme_order+1)*pme->nkz);
740 /* Double-check for a limitation of the (current) sum_fftgrid_dd code.
741 * Note that gmx_pme_check_restrictions checked for this already.
743 if (pme->bUseThreads && pme->overlap[0].noverlap_nodes > 1)
745 gmx_incons("More than one communication pulse required for grid overlap communication along the major dimension while using threads");
748 snew(pme->bsp_mod[XX], pme->nkx);
749 snew(pme->bsp_mod[YY], pme->nky);
750 snew(pme->bsp_mod[ZZ], pme->nkz);
752 /* The required size of the interpolation grid, including overlap.
753 * The allocated size (pmegrid_n?) might be slightly larger.
755 pme->pmegrid_nx = pme->overlap[0].s2g1[pme->nodeid_major] -
756 pme->overlap[0].s2g0[pme->nodeid_major];
757 pme->pmegrid_ny = pme->overlap[1].s2g1[pme->nodeid_minor] -
758 pme->overlap[1].s2g0[pme->nodeid_minor];
759 pme->pmegrid_nz_base = pme->nkz;
760 pme->pmegrid_nz = pme->pmegrid_nz_base + pme->pme_order - 1;
761 set_grid_alignment(&pme->pmegrid_nz, pme->pme_order);
763 pme->pmegrid_start_ix = pme->overlap[0].s2g0[pme->nodeid_major];
764 pme->pmegrid_start_iy = pme->overlap[1].s2g0[pme->nodeid_minor];
765 pme->pmegrid_start_iz = 0;
767 make_gridindex_to_localindex(pme->nkx,
768 pme->pmegrid_start_ix,
769 pme->pmegrid_nx - (pme->pme_order-1),
770 &pme->nnx, &pme->fshx);
771 make_gridindex_to_localindex(pme->nky,
772 pme->pmegrid_start_iy,
773 pme->pmegrid_ny - (pme->pme_order-1),
774 &pme->nny, &pme->fshy);
775 make_gridindex_to_localindex(pme->nkz,
776 pme->pmegrid_start_iz,
777 pme->pmegrid_nz_base,
778 &pme->nnz, &pme->fshz);
780 pme->spline_work = make_pme_spline_work(pme->pme_order);
782 ndata[0] = pme->nkx;
783 ndata[1] = pme->nky;
784 ndata[2] = pme->nkz;
785 /* It doesn't matter if we allocate too many grids here,
786 * we only allocate and use the ones we need.
788 if (pme->doLJ)
790 pme->ngrids = ((ir->ljpme_combination_rule == eljpmeLB) ? DO_Q_AND_LJ_LB : DO_Q_AND_LJ);
792 else
794 pme->ngrids = DO_Q;
796 snew(pme->fftgrid, pme->ngrids);
797 snew(pme->cfftgrid, pme->ngrids);
798 snew(pme->pfft_setup, pme->ngrids);
800 for (i = 0; i < pme->ngrids; ++i)
802 if ((i < DO_Q && pme->doCoulomb && (i == 0 ||
803 bFreeEnergy_q)) ||
804 (i >= DO_Q && pme->doLJ && (i == 2 ||
805 bFreeEnergy_lj ||
806 ir->ljpme_combination_rule == eljpmeLB)))
808 pmegrids_init(&pme->pmegrid[i],
809 pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz,
810 pme->pmegrid_nz_base,
811 pme->pme_order,
812 pme->bUseThreads,
813 pme->nthread,
814 pme->overlap[0].s2g1[pme->nodeid_major]-pme->overlap[0].s2g0[pme->nodeid_major+1],
815 pme->overlap[1].s2g1[pme->nodeid_minor]-pme->overlap[1].s2g0[pme->nodeid_minor+1]);
816 /* This routine will allocate the grid data to fit the FFTs */
817 gmx_parallel_3dfft_init(&pme->pfft_setup[i], ndata,
818 &pme->fftgrid[i], &pme->cfftgrid[i],
819 pme->mpi_comm_d,
820 bReproducible, pme->nthread);
825 if (!pme->bP3M)
827 /* Use plain SPME B-spline interpolation */
828 make_bspline_moduli(pme->bsp_mod, pme->nkx, pme->nky, pme->nkz, pme->pme_order);
830 else
832 /* Use the P3M grid-optimized influence function */
833 make_p3m_bspline_moduli(pme->bsp_mod, pme->nkx, pme->nky, pme->nkz, pme->pme_order);
836 /* Use atc[0] for spreading */
837 init_atomcomm(pme.get(), &pme->atc[0], nnodes_major > 1 ? 0 : 1, TRUE);
838 if (pme->ndecompdim >= 2)
840 init_atomcomm(pme.get(), &pme->atc[1], 1, FALSE);
843 if (pme->nnodes == 1)
845 pme->atc[0].n = homenr;
846 pme_realloc_atomcomm_things(&pme->atc[0]);
849 pme->lb_buf1 = nullptr;
850 pme->lb_buf2 = nullptr;
851 pme->lb_buf_nalloc = 0;
853 pme_init_all_work(&pme->solve_work, pme->nthread, pme->nkx);
855 // no exception was thrown during the init, so we hand over the PME structure handle
856 *pmedata = pme.release();
858 return 0;
861 int gmx_pme_reinit(struct gmx_pme_t **pmedata,
862 t_commrec * cr,
863 struct gmx_pme_t * pme_src,
864 const t_inputrec * ir,
865 ivec grid_size,
866 real ewaldcoeff_q,
867 real ewaldcoeff_lj)
869 int homenr;
870 int ret;
872 // Create a copy of t_inputrec fields that are used in gmx_pme_init().
873 // TODO: This would be better as just copying a sub-structure that contains
874 // all the PME parameters and nothing else.
875 t_inputrec irc;
876 irc.ePBC = ir->ePBC;
877 irc.coulombtype = ir->coulombtype;
878 irc.vdwtype = ir->vdwtype;
879 irc.efep = ir->efep;
880 irc.pme_order = ir->pme_order;
881 irc.epsilon_r = ir->epsilon_r;
882 irc.ljpme_combination_rule = ir->ljpme_combination_rule;
883 irc.nkx = grid_size[XX];
884 irc.nky = grid_size[YY];
885 irc.nkz = grid_size[ZZ];
887 if (pme_src->nnodes == 1)
889 homenr = pme_src->atc[0].n;
891 else
893 homenr = -1;
898 ret = gmx_pme_init(pmedata, cr, pme_src->nnodes_major, pme_src->nnodes_minor,
899 &irc, homenr, pme_src->bFEP_q, pme_src->bFEP_lj, FALSE, ewaldcoeff_q, ewaldcoeff_lj, pme_src->nthread);
901 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
903 if (ret == 0)
905 /* We can easily reuse the allocated pme grids in pme_src */
906 reuse_pmegrids(&pme_src->pmegrid[PME_GRID_QA], &(*pmedata)->pmegrid[PME_GRID_QA]);
907 /* We would like to reuse the fft grids, but that's harder */
910 return ret;
913 void gmx_pme_calc_energy(struct gmx_pme_t *pme, int n, rvec *x, real *q, real *V)
915 pme_atomcomm_t *atc;
916 pmegrids_t *grid;
918 if (pme->nnodes > 1)
920 gmx_incons("gmx_pme_calc_energy called in parallel");
922 if (pme->bFEP_q)
924 gmx_incons("gmx_pme_calc_energy with free energy");
927 atc = &pme->atc_energy;
928 atc->nthread = 1;
929 if (atc->spline == nullptr)
931 snew(atc->spline, atc->nthread);
933 atc->nslab = 1;
934 atc->bSpread = TRUE;
935 atc->pme_order = pme->pme_order;
936 atc->n = n;
937 pme_realloc_atomcomm_things(atc);
938 atc->x = x;
939 atc->coefficient = q;
941 /* We only use the A-charges grid */
942 grid = &pme->pmegrid[PME_GRID_QA];
944 /* Only calculate the spline coefficients, don't actually spread */
945 spread_on_grid(pme, atc, nullptr, TRUE, FALSE, pme->fftgrid[PME_GRID_QA], FALSE, PME_GRID_QA);
947 *V = gather_energy_bsplines(pme, grid->grid.grid, atc);
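/* A minimal usage sketch (hypothetical caller and variable names, not taken
 * from the GROMACS sources): this entry point is meant for serial,
 * energy-only evaluations such as test-particle insertion, using only the
 * A-state charges.
 *
 *   real meshEnergy = 0;
 *   gmx_pme_calc_energy(pme, numAtoms, x, charges, &meshEnergy);
 *   // meshEnergy now holds the reciprocal-space Coulomb energy of the
 *   // numAtoms charges; no forces or virial are computed here.
 */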
950 /*! \brief Calculate initial Lorentz-Berthelot coefficients for LJ-PME */
951 static void
952 calc_initial_lb_coeffs(struct gmx_pme_t *pme, real *local_c6, real *local_sigma)
954 int i;
955 for (i = 0; i < pme->atc[0].n; ++i)
957 real sigma4;
958 sigma4 = local_sigma[i];
959 sigma4 = sigma4*sigma4;
960 sigma4 = sigma4*sigma4;
961 pme->atc[0].coefficient[i] = local_c6[i] / sigma4;
965 /*! \brief Calculate next Lorentz-Berthelot coefficients for LJ-PME */
966 static void
967 calc_next_lb_coeffs(struct gmx_pme_t *pme, real *local_sigma)
969 int i;
971 for (i = 0; i < pme->atc[0].n; ++i)
973 pme->atc[0].coefficient[i] *= local_sigma[i];
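/* Background for the two helpers above (a sketch of the reasoning; the
 * normalisation details live in lb_scale_factor and solve_pme_lj_yzx):
 * with Lorentz-Berthelot rules the pair dispersion term involves
 * sigma_ij = (sigma_i + sigma_j)/2, and expanding (sigma_i + sigma_j)^6
 * with the binomial theorem gives seven terms,
 *   sum_{k=0..6} C(6,k) * sigma_i^k * sigma_j^(6-k),
 * each of which factorizes into a per-atom-i times a per-atom-j part and
 * can therefore be handled as an ordinary separable PME grid.  In
 * gmx_pme_do() below, calc_initial_lb_coeffs() resets the per-atom
 * coefficient to local_c6[i]/sigma_i^4 and calc_next_lb_coeffs() multiplies
 * in one more power of sigma before each of the seven grids
 * (grid_index 2..8) is spread.
 */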
977 int gmx_pme_do(struct gmx_pme_t *pme,
978 int start, int homenr,
979 rvec x[], rvec f[],
980 real chargeA[], real chargeB[],
981 real c6A[], real c6B[],
982 real sigmaA[], real sigmaB[],
983 matrix box, t_commrec *cr,
984 int maxshift_x, int maxshift_y,
985 t_nrnb *nrnb, gmx_wallcycle_t wcycle,
986 matrix vir_q, matrix vir_lj,
987 real *energy_q, real *energy_lj,
988 real lambda_q, real lambda_lj,
989 real *dvdlambda_q, real *dvdlambda_lj,
990 int flags)
992 int d, i, j, npme, grid_index, max_grid_index;
993 int n_d;
994 pme_atomcomm_t *atc = nullptr;
995 pmegrids_t *pmegrid = nullptr;
996 real *grid = nullptr;
997 rvec *f_d;
998 real *coefficient = nullptr;
999 real energy_AB[4];
1000 matrix vir_AB[4];
1001 real scale, lambda;
1002 gmx_bool bClearF;
1003 gmx_parallel_3dfft_t pfft_setup;
1004 real * fftgrid;
1005 t_complex * cfftgrid;
1006 int thread;
1007 gmx_bool bFirst, bDoSplines;
1008 int fep_state;
1009 int fep_states_lj = pme->bFEP_lj ? 2 : 1;
1010 const gmx_bool bCalcEnerVir = flags & GMX_PME_CALC_ENER_VIR;
1011 const gmx_bool bBackFFT = flags & (GMX_PME_CALC_F | GMX_PME_CALC_POT);
1012 const gmx_bool bCalcF = flags & GMX_PME_CALC_F;
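/* The flags bits are assumed to be OR-ed together by the caller; a regular
 * MD step would pass something like
 *   GMX_PME_SPREAD | GMX_PME_SOLVE | GMX_PME_CALC_F | GMX_PME_CALC_ENER_VIR
 * while an energy-only evaluation would leave out GMX_PME_CALC_F.
 */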
1014 assert(pme->nnodes > 0);
1015 assert(pme->nnodes == 1 || pme->ndecompdim > 0);
1017 if (pme->nnodes > 1)
1019 atc = &pme->atc[0];
1020 atc->npd = homenr;
1021 if (atc->npd > atc->pd_nalloc)
1023 atc->pd_nalloc = over_alloc_dd(atc->npd);
1024 srenew(atc->pd, atc->pd_nalloc);
1026 for (d = pme->ndecompdim-1; d >= 0; d--)
1028 atc = &pme->atc[d];
1029 atc->maxshift = (atc->dimind == 0 ? maxshift_x : maxshift_y);
1032 else
1034 atc = &pme->atc[0];
1035 /* This could be necessary for TPI */
1036 pme->atc[0].n = homenr;
1037 if (DOMAINDECOMP(cr))
1039 pme_realloc_atomcomm_things(atc);
1041 atc->x = x;
1042 atc->f = f;
1045 gmx::invertBoxMatrix(box, pme->recipbox);
1046 bFirst = TRUE;
1048 /* For simplicity, we construct the splines for all particles if
1049 * more than one PME calculation is needed. Some optimization
1050 * could be done by keeping track of which atoms have splines
1051 * constructed, and construct new splines on each pass for atoms
1052 * that don't yet have them.
1055 bDoSplines = pme->bFEP || (pme->doCoulomb && pme->doLJ);
1057 /* We need a maximum of four separate PME calculations:
1058 * grid_index=0: Coulomb PME with charges from state A
1059 * grid_index=1: Coulomb PME with charges from state B
1060 * grid_index=2: LJ PME with C6 from state A
1061 * grid_index=3: LJ PME with C6 from state B
1062 * For Lorentz-Berthelot combination rules, a separate loop is used to
1063 * calculate all the terms
1066 /* If we are doing LJ-PME with LB, we only do Q here */
1067 max_grid_index = (pme->ljpme_combination_rule == eljpmeLB) ? DO_Q : DO_Q_AND_LJ;
1069 for (grid_index = 0; grid_index < max_grid_index; ++grid_index)
1071 /* Check if we should do calculations at this grid_index
1072 * If grid_index is odd we should be doing FEP
1073 * If grid_index < 2 we should be doing electrostatic PME
1074 * If grid_index >= 2 we should be doing LJ-PME
1076 if ((grid_index < DO_Q && (!pme->doCoulomb ||
1077 (grid_index == 1 && !pme->bFEP_q))) ||
1078 (grid_index >= DO_Q && (!pme->doLJ ||
1079 (grid_index == 3 && !pme->bFEP_lj))))
1081 continue;
1083 /* Unpack structure */
1084 pmegrid = &pme->pmegrid[grid_index];
1085 fftgrid = pme->fftgrid[grid_index];
1086 cfftgrid = pme->cfftgrid[grid_index];
1087 pfft_setup = pme->pfft_setup[grid_index];
1088 switch (grid_index)
1090 case 0: coefficient = chargeA + start; break;
1091 case 1: coefficient = chargeB + start; break;
1092 case 2: coefficient = c6A + start; break;
1093 case 3: coefficient = c6B + start; break;
1096 grid = pmegrid->grid.grid;
1098 if (debug)
1100 fprintf(debug, "PME: number of ranks = %d, rank = %d\n",
1101 cr->nnodes, cr->nodeid);
1102 fprintf(debug, "Grid = %p\n", (void*)grid);
1103 if (grid == nullptr)
1105 gmx_fatal(FARGS, "No grid!");
1108 where();
1110 if (pme->nnodes == 1)
1112 atc->coefficient = coefficient;
1114 else
1116 wallcycle_start(wcycle, ewcPME_REDISTXF);
1117 do_redist_pos_coeffs(pme, cr, start, homenr, bFirst, x, coefficient);
1118 where();
1120 wallcycle_stop(wcycle, ewcPME_REDISTXF);
1123 if (debug)
1125 fprintf(debug, "Rank= %6d, pme local particles=%6d\n",
1126 cr->nodeid, atc->n);
1129 if (flags & GMX_PME_SPREAD)
1131 wallcycle_start(wcycle, ewcPME_SPREADGATHER);
1133 /* Spread the coefficients on a grid */
1134 spread_on_grid(pme, &pme->atc[0], pmegrid, bFirst, TRUE, fftgrid, bDoSplines, grid_index);
1136 if (bFirst)
1138 inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
1140 inc_nrnb(nrnb, eNR_SPREADBSP,
1141 pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
1143 if (!pme->bUseThreads)
1145 wrap_periodic_pmegrid(pme, grid);
1147 /* sum contributions to local grid from other nodes */
1148 #if GMX_MPI
1149 if (pme->nnodes > 1)
1151 gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_FORWARD);
1152 where();
1154 #endif
1156 copy_pmegrid_to_fftgrid(pme, grid, fftgrid, grid_index);
1159 wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
1161 /* TODO If the OpenMP and single-threaded implementations
1162 converge, then spread_on_grid() and
1163 copy_pmegrid_to_fftgrid() will perhaps live in the same
1164 source file and the following debugging function can live
1165 there too. */
/*
1167 dump_local_fftgrid(pme,fftgrid);
1168 exit(0);
*/
1172 /* Here we start a large thread parallel region */
1173 #pragma omp parallel num_threads(pme->nthread) private(thread)
1177 thread = gmx_omp_get_thread_num();
1178 if (flags & GMX_PME_SOLVE)
1180 int loop_count;
1182 /* do 3d-fft */
1183 if (thread == 0)
1185 wallcycle_start(wcycle, ewcPME_FFT);
1187 gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX,
1188 thread, wcycle);
1189 if (thread == 0)
1191 wallcycle_stop(wcycle, ewcPME_FFT);
1193 where();
1195 /* solve in k-space for our local cells */
1196 if (thread == 0)
1198 wallcycle_start(wcycle, (grid_index < DO_Q ? ewcPME_SOLVE : ewcLJPME));
1200 if (grid_index < DO_Q)
1202 loop_count =
1203 solve_pme_yzx(pme, cfftgrid,
1204 box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
1205 bCalcEnerVir,
1206 pme->nthread, thread);
1208 else
1210 loop_count =
1211 solve_pme_lj_yzx(pme, &cfftgrid, FALSE,
1212 box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
1213 bCalcEnerVir,
1214 pme->nthread, thread);
1217 if (thread == 0)
1219 wallcycle_stop(wcycle, (grid_index < DO_Q ? ewcPME_SOLVE : ewcLJPME));
1220 where();
1221 inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
1225 if (bBackFFT)
1227 /* do 3d-invfft */
1228 if (thread == 0)
1230 where();
1231 wallcycle_start(wcycle, ewcPME_FFT);
1233 gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL,
1234 thread, wcycle);
1235 if (thread == 0)
1237 wallcycle_stop(wcycle, ewcPME_FFT);
1239 where();
1241 if (pme->nodeid == 0)
1243 real ntot = pme->nkx*pme->nky*pme->nkz;
1244 npme = static_cast<int>(ntot*std::log(ntot)/std::log(2.0));
1245 inc_nrnb(nrnb, eNR_FFT, 2*npme);
1248 /* Note: this wallcycle region is closed below
1249 outside an OpenMP region, so take care if
1250 refactoring code here. */
1251 wallcycle_start(wcycle, ewcPME_SPREADGATHER);
1254 copy_fftgrid_to_pmegrid(pme, fftgrid, grid, grid_index, pme->nthread, thread);
1256 } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1258 /* End of thread parallel section.
1259 * With MPI we have to synchronize here before gmx_sum_qgrid_dd.
1262 if (bBackFFT)
1264 /* distribute local grid to all nodes */
1265 #if GMX_MPI
1266 if (pme->nnodes > 1)
1268 gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_BACKWARD);
1270 #endif
1271 where();
1273 unwrap_periodic_pmegrid(pme, grid);
1276 if (bCalcF)
1278 /* interpolate forces for our local atoms */
1280 where();
1282 /* If we are running without parallelization,
1283 * atc->f is the actual force array, not a buffer,
1284 * therefore we should not clear it.
1286 lambda = grid_index < DO_Q ? lambda_q : lambda_lj;
1287 bClearF = (bFirst && PAR(cr));
1288 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
1289 for (thread = 0; thread < pme->nthread; thread++)
1293 gather_f_bsplines(pme, grid, bClearF, atc,
1294 &atc->spline[thread],
1295 pme->bFEP ? (grid_index % 2 == 0 ? 1.0-lambda : lambda) : 1.0);
1297 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1300 where();
1302 inc_nrnb(nrnb, eNR_GATHERFBSP,
1303 pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
1304 /* Note: this wallcycle region is opened above inside an OpenMP
1305 region, so take care if refactoring code here. */
1306 wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
1309 if (bCalcEnerVir)
1311 /* This should only be called on the master thread
1312 * and after the threads have synchronized.
1314 if (grid_index < 2)
1316 get_pme_ener_vir_q(pme->solve_work, pme->nthread, &energy_AB[grid_index], vir_AB[grid_index]);
1318 else
1320 get_pme_ener_vir_lj(pme->solve_work, pme->nthread, &energy_AB[grid_index], vir_AB[grid_index]);
1323 bFirst = FALSE;
1324 } /* of grid_index-loop */
1326 /* For Lorentz-Berthelot combination rules in LJ-PME, we need to calculate
1327 * seven terms. */
1329 if (pme->doLJ && pme->ljpme_combination_rule == eljpmeLB)
1331 /* Loop over A- and B-state if we are doing FEP */
1332 for (fep_state = 0; fep_state < fep_states_lj; ++fep_state)
1334 real *local_c6 = nullptr, *local_sigma = nullptr, *RedistC6 = nullptr, *RedistSigma = nullptr;
1335 if (pme->nnodes == 1)
1337 if (pme->lb_buf1 == nullptr)
1339 pme->lb_buf_nalloc = pme->atc[0].n;
1340 snew(pme->lb_buf1, pme->lb_buf_nalloc);
1342 pme->atc[0].coefficient = pme->lb_buf1;
1343 switch (fep_state)
1345 case 0:
1346 local_c6 = c6A;
1347 local_sigma = sigmaA;
1348 break;
1349 case 1:
1350 local_c6 = c6B;
1351 local_sigma = sigmaB;
1352 break;
1353 default:
1354 gmx_incons("Trying to access wrong FEP-state in LJ-PME routine");
1357 else
1359 atc = &pme->atc[0];
1360 switch (fep_state)
1362 case 0:
1363 RedistC6 = c6A;
1364 RedistSigma = sigmaA;
1365 break;
1366 case 1:
1367 RedistC6 = c6B;
1368 RedistSigma = sigmaB;
1369 break;
1370 default:
1371 gmx_incons("Trying to access wrong FEP-state in LJ-PME routine");
1373 wallcycle_start(wcycle, ewcPME_REDISTXF);
1375 do_redist_pos_coeffs(pme, cr, start, homenr, bFirst, x, RedistC6);
1376 if (pme->lb_buf_nalloc < atc->n)
1378 pme->lb_buf_nalloc = atc->nalloc;
1379 srenew(pme->lb_buf1, pme->lb_buf_nalloc);
1380 srenew(pme->lb_buf2, pme->lb_buf_nalloc);
1382 local_c6 = pme->lb_buf1;
1383 for (i = 0; i < atc->n; ++i)
1385 local_c6[i] = atc->coefficient[i];
1387 where();
1389 do_redist_pos_coeffs(pme, cr, start, homenr, FALSE, x, RedistSigma);
1390 local_sigma = pme->lb_buf2;
1391 for (i = 0; i < atc->n; ++i)
1393 local_sigma[i] = atc->coefficient[i];
1395 where();
1397 wallcycle_stop(wcycle, ewcPME_REDISTXF);
1399 calc_initial_lb_coeffs(pme, local_c6, local_sigma);
1401 /*Seven terms in LJ-PME with LB, grid_index < 2 reserved for electrostatics*/
1402 for (grid_index = 2; grid_index < 9; ++grid_index)
1404 /* Unpack structure */
1405 pmegrid = &pme->pmegrid[grid_index];
1406 fftgrid = pme->fftgrid[grid_index];
1407 pfft_setup = pme->pfft_setup[grid_index];
1408 calc_next_lb_coeffs(pme, local_sigma);
1409 grid = pmegrid->grid.grid;
1410 where();
1412 if (flags & GMX_PME_SPREAD)
1414 wallcycle_start(wcycle, ewcPME_SPREADGATHER);
1415 /* Spread the c6 on a grid */
1416 spread_on_grid(pme, &pme->atc[0], pmegrid, bFirst, TRUE, fftgrid, bDoSplines, grid_index);
1418 if (bFirst)
1420 inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
1423 inc_nrnb(nrnb, eNR_SPREADBSP,
1424 pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
1425 if (pme->nthread == 1)
1427 wrap_periodic_pmegrid(pme, grid);
1428 /* sum contributions to local grid from other nodes */
1429 #if GMX_MPI
1430 if (pme->nnodes > 1)
1432 gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_FORWARD);
1433 where();
1435 #endif
1436 copy_pmegrid_to_fftgrid(pme, grid, fftgrid, grid_index);
1438 wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
1440 /*Here we start a large thread parallel region*/
1441 #pragma omp parallel num_threads(pme->nthread) private(thread)
1445 thread = gmx_omp_get_thread_num();
1446 if (flags & GMX_PME_SOLVE)
1448 /* do 3d-fft */
1449 if (thread == 0)
1451 wallcycle_start(wcycle, ewcPME_FFT);
1454 gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX,
1455 thread, wcycle);
1456 if (thread == 0)
1458 wallcycle_stop(wcycle, ewcPME_FFT);
1460 where();
1463 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1465 bFirst = FALSE;
1467 if (flags & GMX_PME_SOLVE)
1469 /* solve in k-space for our local cells */
1470 #pragma omp parallel num_threads(pme->nthread) private(thread)
1474 int loop_count;
1475 thread = gmx_omp_get_thread_num();
1476 if (thread == 0)
1478 wallcycle_start(wcycle, ewcLJPME);
1481 loop_count =
1482 solve_pme_lj_yzx(pme, &pme->cfftgrid[2], TRUE,
1483 box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
1484 bCalcEnerVir,
1485 pme->nthread, thread);
1486 if (thread == 0)
1488 wallcycle_stop(wcycle, ewcLJPME);
1489 where();
1490 inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
1493 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1497 if (bCalcEnerVir)
1499 /* This should only be called on the master thread and
1500 * after the threads have synchronized.
1502 get_pme_ener_vir_lj(pme->solve_work, pme->nthread, &energy_AB[2+fep_state], vir_AB[2+fep_state]);
1505 if (bBackFFT)
1507 bFirst = !pme->doCoulomb;
1508 calc_initial_lb_coeffs(pme, local_c6, local_sigma);
1509 for (grid_index = 8; grid_index >= 2; --grid_index)
1511 /* Unpack structure */
1512 pmegrid = &pme->pmegrid[grid_index];
1513 fftgrid = pme->fftgrid[grid_index];
1514 pfft_setup = pme->pfft_setup[grid_index];
1515 grid = pmegrid->grid.grid;
1516 calc_next_lb_coeffs(pme, local_sigma);
1517 where();
1518 #pragma omp parallel num_threads(pme->nthread) private(thread)
1522 thread = gmx_omp_get_thread_num();
1523 /* do 3d-invfft */
1524 if (thread == 0)
1526 where();
1527 wallcycle_start(wcycle, ewcPME_FFT);
1530 gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL,
1531 thread, wcycle);
1532 if (thread == 0)
1534 wallcycle_stop(wcycle, ewcPME_FFT);
1536 where();
1538 if (pme->nodeid == 0)
1540 real ntot = pme->nkx*pme->nky*pme->nkz;
1541 npme = static_cast<int>(ntot*std::log(ntot)/std::log(2.0));
1542 inc_nrnb(nrnb, eNR_FFT, 2*npme);
1544 wallcycle_start(wcycle, ewcPME_SPREADGATHER);
1547 copy_fftgrid_to_pmegrid(pme, fftgrid, grid, grid_index, pme->nthread, thread);
1549 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1550 } /*#pragma omp parallel*/
1552 /* distribute local grid to all nodes */
1553 #if GMX_MPI
1554 if (pme->nnodes > 1)
1556 gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_BACKWARD);
1558 #endif
1559 where();
1561 unwrap_periodic_pmegrid(pme, grid);
1563 if (bCalcF)
1565 /* interpolate forces for our local atoms */
1566 where();
1567 bClearF = (bFirst && PAR(cr));
1568 scale = pme->bFEP ? (fep_state < 1 ? 1.0-lambda_lj : lambda_lj) : 1.0;
1569 scale *= lb_scale_factor[grid_index-2];
1571 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
1572 for (thread = 0; thread < pme->nthread; thread++)
1576 gather_f_bsplines(pme, grid, bClearF, &pme->atc[0],
1577 &pme->atc[0].spline[thread],
1578 scale);
1580 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1583 where();
1585 inc_nrnb(nrnb, eNR_GATHERFBSP,
1586 pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
1588 wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
1590 bFirst = FALSE;
1591 } /* for (grid_index = 8; grid_index >= 2; --grid_index) */
1592 } /* if (bCalcF) */
1593 } /* for (fep_state = 0; fep_state < fep_states_lj; ++fep_state) */
1594 } /* if (pme->doLJ && pme->ljpme_combination_rule == eljpmeLB) */
1596 if (bCalcF && pme->nnodes > 1)
1598 wallcycle_start(wcycle, ewcPME_REDISTXF);
1599 for (d = 0; d < pme->ndecompdim; d++)
1601 atc = &pme->atc[d];
1602 if (d == pme->ndecompdim - 1)
1604 n_d = homenr;
1605 f_d = f + start;
1607 else
1609 n_d = pme->atc[d+1].n;
1610 f_d = pme->atc[d+1].f;
1612 if (DOMAINDECOMP(cr))
1614 dd_pmeredist_f(pme, atc, n_d, f_d,
1615 d == pme->ndecompdim-1 && pme->bPPnode);
1619 wallcycle_stop(wcycle, ewcPME_REDISTXF);
1621 where();
1623 if (bCalcEnerVir)
1625 if (pme->doCoulomb)
1627 if (!pme->bFEP_q)
1629 *energy_q = energy_AB[0];
1630 m_add(vir_q, vir_AB[0], vir_q);
1632 else
1634 *energy_q = (1.0-lambda_q)*energy_AB[0] + lambda_q*energy_AB[1];
1635 *dvdlambda_q += energy_AB[1] - energy_AB[0];
1636 for (i = 0; i < DIM; i++)
1638 for (j = 0; j < DIM; j++)
1640 vir_q[i][j] += (1.0-lambda_q)*vir_AB[0][i][j] +
1641 lambda_q*vir_AB[1][i][j];
1645 if (debug)
1647 fprintf(debug, "Electrostatic PME mesh energy: %g\n", *energy_q);
1650 else
1652 *energy_q = 0;
1655 if (pme->doLJ)
1657 if (!pme->bFEP_lj)
1659 *energy_lj = energy_AB[2];
1660 m_add(vir_lj, vir_AB[2], vir_lj);
1662 else
1664 *energy_lj = (1.0-lambda_lj)*energy_AB[2] + lambda_lj*energy_AB[3];
1665 *dvdlambda_lj += energy_AB[3] - energy_AB[2];
1666 for (i = 0; i < DIM; i++)
1668 for (j = 0; j < DIM; j++)
1670 vir_lj[i][j] += (1.0-lambda_lj)*vir_AB[2][i][j] + lambda_lj*vir_AB[3][i][j];
1674 if (debug)
1676 fprintf(debug, "Lennard-Jones PME mesh energy: %g\n", *energy_lj);
1679 else
1681 *energy_lj = 0;
1684 return 0;
1687 void gmx_pme_destroy(gmx_pme_t *pme)
1689 if (!pme)
1691 return;
1694 sfree(pme->nnx);
1695 sfree(pme->nny);
1696 sfree(pme->nnz);
1697 sfree(pme->fshx);
1698 sfree(pme->fshy);
1699 sfree(pme->fshz);
1701 for (int i = 0; i < pme->ngrids; ++i)
1703 pmegrids_destroy(&pme->pmegrid[i]);
1705 if (pme->pfft_setup)
1707 for (int i = 0; i < pme->ngrids; ++i)
1709 gmx_parallel_3dfft_destroy(pme->pfft_setup[i]);
1712 sfree(pme->fftgrid);
1713 sfree(pme->cfftgrid);
1714 sfree(pme->pfft_setup);
1716 for (int i = 0; i < std::max(1, pme->ndecompdim); i++) //pme->atc[0] is always allocated
1718 destroy_atomcomm(&pme->atc[i]);
1721 for (int i = 0; i < DIM; i++)
1723 sfree(pme->bsp_mod[i]);
1726 destroy_overlap_comm(&pme->overlap[0]);
1727 destroy_overlap_comm(&pme->overlap[1]);
1729 sfree(pme->lb_buf1);
1730 sfree(pme->lb_buf2);
1732 sfree(pme->bufv);
1733 sfree(pme->bufr);
1735 pme_free_all_work(&pme->solve_work, pme->nthread);
1737 sfree(pme->sum_qgrid_tmp);
1738 sfree(pme->sum_qgrid_dd_tmp);
1740 sfree(pme);