src/gromacs/ewald/pme-grid.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015,2016,2017, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 /* TODO find out what this file should be called */
  38 #include "gmxpre.h"
  39
  40 #include "pme-grid.h"
  41
  42 #include "config.h"
  43
  44 #include <cstdlib>
  45
  46 #include "gromacs/ewald/pme.h"
  47 #include "gromacs/fft/parallel_3dfft.h"
  48 #include "gromacs/math/vec.h"
  49 #include "gromacs/timing/cyclecounter.h"
  50 #include "gromacs/utility/fatalerror.h"
  51 #include "gromacs/utility/smalloc.h"
  52
  53 #ifdef DEBUG_PME
  54 #include "gromacs/fileio/pdbio.h"
  55 #include "gromacs/utility/cstringutil.h"
  56 #include "gromacs/utility/futil.h"
  57 #endif
  58
  59 #include "pme-simd.h"
  60
  61 /* GMX_CACHE_SEP should be a multiple of the SIMD and SIMD4 register size
  62  * to preserve alignment.
  63  */
  64 #define GMX_CACHE_SEP 64
  65
  66 #if GMX_MPI
  67 void gmx_sum_qgrid_dd(struct gmx_pme_t *pme, real *grid, int direction)
  68 {
  69     pme_overlap_t *overlap;
  70     int            send_index0, send_nindex;
  71     int            recv_index0, recv_nindex;
  72     MPI_Status     stat;
  73     int            i, j, k, ix, iy, iz, icnt;
  74     int            ipulse, send_id, recv_id, datasize;
  75     real          *p;
  76     real          *sendptr, *recvptr;
  77
  78     /* Start with minor-rank communication. This is a bit of a pain since it is not contiguous */
  79     overlap = &pme->overlap[1];
  80
  81     for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
  82     {
  83         /* Since we have already (un)wrapped the overlap in the z-dimension,
  84          * we only have to communicate 0 to nkz (not pmegrid_nz).
  85          */
  86         if (direction == GMX_SUM_GRID_FORWARD)
  87         {
  88             send_id       = overlap->send_id[ipulse];
  89             recv_id       = overlap->recv_id[ipulse];
  90             send_index0   = overlap->comm_data[ipulse].send_index0;
  91             send_nindex   = overlap->comm_data[ipulse].send_nindex;
  92             recv_index0   = overlap->comm_data[ipulse].recv_index0;
  93             recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
  94         }
  95         else
  96         {
  97             send_id       = overlap->recv_id[ipulse];
  98             recv_id       = overlap->send_id[ipulse];
  99             send_index0   = overlap->comm_data[ipulse].recv_index0;
 100             send_nindex   = overlap->comm_data[ipulse].recv_nindex;
 101             recv_index0   = overlap->comm_data[ipulse].send_index0;
 102             recv_nindex   = overlap->comm_data[ipulse].send_nindex;
 103         }
 104
 105         /* Copy data to contiguous send buffer */
 106         if (debug)
 107         {
 108             fprintf(debug, "PME send rank %d %d -> %d grid start %d Communicating %d to %d\n",
 109                     pme->nodeid, overlap->nodeid, send_id,
 110                     pme->pmegrid_start_iy,
 111                     send_index0-pme->pmegrid_start_iy,
 112                     send_index0-pme->pmegrid_start_iy+send_nindex);
 113         }
 114         icnt = 0;
 115         for (i = 0; i < pme->pmegrid_nx; i++)
 116         {
 117             ix = i;
 118             for (j = 0; j < send_nindex; j++)
 119             {
 120                 iy = j + send_index0 - pme->pmegrid_start_iy;
 121                 for (k = 0; k < pme->nkz; k++)
 122                 {
 123                     iz = k;
 124                     overlap->sendbuf[icnt++] = grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz];
 125                 }
 126             }
 127         }
 128
 129         datasize      = pme->pmegrid_nx * pme->nkz;
 130
 131         MPI_Sendrecv(overlap->sendbuf, send_nindex*datasize, GMX_MPI_REAL,
 132                      send_id, ipulse,
 133                      overlap->recvbuf, recv_nindex*datasize, GMX_MPI_REAL,
 134                      recv_id, ipulse,
 135                      overlap->mpi_comm, &stat);
 136
 137         /* Get data from contiguous recv buffer */
 138         if (debug)
 139         {
 140             fprintf(debug, "PME recv rank %d %d <- %d grid start %d Communicating %d to %d\n",
 141                     pme->nodeid, overlap->nodeid, recv_id,
 142                     pme->pmegrid_start_iy,
 143                     recv_index0-pme->pmegrid_start_iy,
 144                     recv_index0-pme->pmegrid_start_iy+recv_nindex);
 145         }
 146         icnt = 0;
 147         for (i = 0; i < pme->pmegrid_nx; i++)
 148         {
 149             ix = i;
 150             for (j = 0; j < recv_nindex; j++)
 151             {
 152                 iy = j + recv_index0 - pme->pmegrid_start_iy;
 153                 for (k = 0; k < pme->nkz; k++)
 154                 {
 155                     iz = k;
 156                     if (direction == GMX_SUM_GRID_FORWARD)
 157                     {
 158                         grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] += overlap->recvbuf[icnt++];
 159                     }
 160                     else
 161                     {
 162                         grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz]  = overlap->recvbuf[icnt++];
 163                     }
 164                 }
 165             }
 166         }
 167     }
 168
 169     /* Major dimension is easier, no copying required,
 170      * but we might have to sum to separate array.
 171      * Since we don't copy, we have to communicate up to pmegrid_nz,
 172      * not nkz as for the minor direction.
 173      */
 174     overlap = &pme->overlap[0];
 175
 176     for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
 177     {
 178         if (direction == GMX_SUM_GRID_FORWARD)
 179         {
 180             send_id       = overlap->send_id[ipulse];
 181             recv_id       = overlap->recv_id[ipulse];
 182             send_index0   = overlap->comm_data[ipulse].send_index0;
 183             send_nindex   = overlap->comm_data[ipulse].send_nindex;
 184             recv_index0   = overlap->comm_data[ipulse].recv_index0;
 185             recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
 186             recvptr       = overlap->recvbuf;
 187         }
 188         else
 189         {
 190             send_id       = overlap->recv_id[ipulse];
 191             recv_id       = overlap->send_id[ipulse];
 192             send_index0   = overlap->comm_data[ipulse].recv_index0;
 193             send_nindex   = overlap->comm_data[ipulse].recv_nindex;
 194             recv_index0   = overlap->comm_data[ipulse].send_index0;
 195             recv_nindex   = overlap->comm_data[ipulse].send_nindex;
 196             recvptr       = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
 197         }
 198
 199         sendptr       = grid + (send_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
 200         datasize      = pme->pmegrid_ny * pme->pmegrid_nz;
 201
 202         if (debug)
 203         {
 204             fprintf(debug, "PME send rank %d %d -> %d grid start %d Communicating %d to %d\n",
 205                     pme->nodeid, overlap->nodeid, send_id,
 206                     pme->pmegrid_start_ix,
 207                     send_index0-pme->pmegrid_start_ix,
 208                     send_index0-pme->pmegrid_start_ix+send_nindex);
 209             fprintf(debug, "PME recv rank %d %d <- %d grid start %d Communicating %d to %d\n",
 210                     pme->nodeid, overlap->nodeid, recv_id,
 211                     pme->pmegrid_start_ix,
 212                     recv_index0-pme->pmegrid_start_ix,
 213                     recv_index0-pme->pmegrid_start_ix+recv_nindex);
 214         }
 215
 216         MPI_Sendrecv(sendptr, send_nindex*datasize, GMX_MPI_REAL,
 217                      send_id, ipulse,
 218                      recvptr, recv_nindex*datasize, GMX_MPI_REAL,
 219                      recv_id, ipulse,
 220                      overlap->mpi_comm, &stat);
 221
 222         /* ADD data from contiguous recv buffer */
 223         if (direction == GMX_SUM_GRID_FORWARD)
 224         {
 225             p = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
 226             for (i = 0; i < recv_nindex*datasize; i++)
 227             {
 228                 p[i] += overlap->recvbuf[i];
 229             }
 230         }
 231     }
 232 }
 233 #endif
 234
 235
 236 int copy_pmegrid_to_fftgrid(const gmx_pme_t *pme, real *pmegrid, real *fftgrid, int grid_index)
 237 {
 238     ivec    local_fft_ndata, local_fft_offset, local_fft_size;
 239     ivec    local_pme_size;
 240     int     ix, iy, iz;
 241     int     pmeidx, fftidx;
 242
 243     /* Dimensions should be identical for A/B grid, so we just use A here */
 244     gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
 245                                    local_fft_ndata,
 246                                    local_fft_offset,
 247                                    local_fft_size);
 248
 249     local_pme_size[0] = pme->pmegrid_nx;
 250     local_pme_size[1] = pme->pmegrid_ny;
 251     local_pme_size[2] = pme->pmegrid_nz;
 252
 253     /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
 254        the offset is identical, and the PME grid always has more data (due to overlap)
 255      */
 256     {
 257 #ifdef DEBUG_PME
 258         FILE *fp, *fp2;
 259         char  fn[STRLEN];
 260         real  val;
 261         sprintf(fn, "pmegrid%d.pdb", pme->nodeid);
 262         fp = gmx_ffopen(fn, "w");
 263         sprintf(fn, "pmegrid%d.txt", pme->nodeid);
 264         fp2 = gmx_ffopen(fn, "w");
 265 #endif
 266
 267         for (ix = 0; ix < local_fft_ndata[XX]; ix++)
 268         {
 269             for (iy = 0; iy < local_fft_ndata[YY]; iy++)
 270             {
 271                 for (iz = 0; iz < local_fft_ndata[ZZ]; iz++)
 272                 {
 273                     pmeidx          = ix*(local_pme_size[YY]*local_pme_size[ZZ])+iy*(local_pme_size[ZZ])+iz;
 274                     fftidx          = ix*(local_fft_size[YY]*local_fft_size[ZZ])+iy*(local_fft_size[ZZ])+iz;
 275                     fftgrid[fftidx] = pmegrid[pmeidx];
 276 #ifdef DEBUG_PME
 277                     val = 100*pmegrid[pmeidx];
 278                     if (pmegrid[pmeidx] != 0)
 279                     {
 280                         gmx_fprintf_pdb_atomline(fp, epdbATOM, pmeidx, "CA", ' ', "GLY", ' ', pmeidx, ' ',
 281                                                  5.0*ix, 5.0*iy, 5.0*iz, 1.0, val, "");
 282                     }
 283                     if (pmegrid[pmeidx] != 0)
 284                     {
 285                         fprintf(fp2, "%-12s  %5d  %5d  %5d  %12.5e\n",
 286                                 "qgrid",
 287                                 pme->pmegrid_start_ix + ix,
 288                                 pme->pmegrid_start_iy + iy,
 289                                 pme->pmegrid_start_iz + iz,
 290                                 pmegrid[pmeidx]);
 291                     }
 292 #endif
 293                 }
 294             }
 295         }
 296 #ifdef DEBUG_PME
 297         gmx_ffclose(fp);
 298         gmx_ffclose(fp2);
 299 #endif
 300     }
 301     return 0;
 302 }
 303
 304
 305 #ifdef PME_TIME_THREADS
 306 static gmx_cycles_t omp_cyc_start()
 307 {
 308     return gmx_cycles_read();
 309 }
 310
 311 static gmx_cycles_t omp_cyc_end(gmx_cycles_t c)
 312 {
 313     return gmx_cycles_read() - c;
 314 }
 315 #endif
 316
 317
 318 int copy_fftgrid_to_pmegrid(struct gmx_pme_t *pme, const real *fftgrid, real *pmegrid, int grid_index,
 319                             int nthread, int thread)
 320 {
 321     ivec          local_fft_ndata, local_fft_offset, local_fft_size;
 322     ivec          local_pme_size;
 323     int           ixy0, ixy1, ixy, ix, iy, iz;
 324     int           pmeidx, fftidx;
 325 #ifdef PME_TIME_THREADS
 326     gmx_cycles_t  c1;
 327     static double cs1 = 0;
 328     static int    cnt = 0;
 329 #endif
 330
 331 #ifdef PME_TIME_THREADS
 332     c1 = omp_cyc_start();
 333 #endif
 334     /* Dimensions should be identical for A/B grid, so we just use A here */
 335     gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
 336                                    local_fft_ndata,
 337                                    local_fft_offset,
 338                                    local_fft_size);
 339
 340     local_pme_size[0] = pme->pmegrid_nx;
 341     local_pme_size[1] = pme->pmegrid_ny;
 342     local_pme_size[2] = pme->pmegrid_nz;
 343
 344     /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
 345        the offset is identical, and the PME grid always has more data (due to overlap)
 346      */
 347     ixy0 = ((thread  )*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
 348     ixy1 = ((thread+1)*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
 349
 350     for (ixy = ixy0; ixy < ixy1; ixy++)
 351     {
 352         ix = ixy/local_fft_ndata[YY];
 353         iy = ixy - ix*local_fft_ndata[YY];
 354
 355         pmeidx = (ix*local_pme_size[YY] + iy)*local_pme_size[ZZ];
 356         fftidx = (ix*local_fft_size[YY] + iy)*local_fft_size[ZZ];
 357         for (iz = 0; iz < local_fft_ndata[ZZ]; iz++)
 358         {
 359             pmegrid[pmeidx+iz] = fftgrid[fftidx+iz];
 360         }
 361     }
 362
 363 #ifdef PME_TIME_THREADS
 364     c1   = omp_cyc_end(c1);
 365     cs1 += (double)c1;
 366     cnt++;
 367     if (cnt % 20 == 0)
 368     {
 369         printf("copy %.2f\n", cs1*1e-9);
 370     }
 371 #endif
 372
 373     return 0;
 374 }
 375
 376
 377 void wrap_periodic_pmegrid(const gmx_pme_t *pme, real *pmegrid)
 378 {
 379     int     nx, ny, nz, pny, pnz, ny_x, overlap, ix, iy, iz;
 380
 381     nx = pme->nkx;
 382     ny = pme->nky;
 383     nz = pme->nkz;
 384
 385     pny = pme->pmegrid_ny;
 386     pnz = pme->pmegrid_nz;
 387
 388     overlap = pme->pme_order - 1;
 389
 390     /* Add periodic overlap in z */
 391     for (ix = 0; ix < pme->pmegrid_nx; ix++)
 392     {
 393         for (iy = 0; iy < pme->pmegrid_ny; iy++)
 394         {
 395             for (iz = 0; iz < overlap; iz++)
 396             {
 397                 pmegrid[(ix*pny+iy)*pnz+iz] +=
 398                     pmegrid[(ix*pny+iy)*pnz+nz+iz];
 399             }
 400         }
 401     }
 402
 403     if (pme->nnodes_minor == 1)
 404     {
 405         for (ix = 0; ix < pme->pmegrid_nx; ix++)
 406         {
 407             for (iy = 0; iy < overlap; iy++)
 408             {
 409                 for (iz = 0; iz < nz; iz++)
 410                 {
 411                     pmegrid[(ix*pny+iy)*pnz+iz] +=
 412                         pmegrid[(ix*pny+ny+iy)*pnz+iz];
 413                 }
 414             }
 415         }
 416     }
 417
 418     if (pme->nnodes_major == 1)
 419     {
 420         ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
 421
 422         for (ix = 0; ix < overlap; ix++)
 423         {
 424             for (iy = 0; iy < ny_x; iy++)
 425             {
 426                 for (iz = 0; iz < nz; iz++)
 427                 {
 428                     pmegrid[(ix*pny+iy)*pnz+iz] +=
 429                         pmegrid[((nx+ix)*pny+iy)*pnz+iz];
 430                 }
 431             }
 432         }
 433     }
 434 }
 435
 436
 437 void unwrap_periodic_pmegrid(struct gmx_pme_t *pme, real *pmegrid)
 438 {
 439     int     nx, ny, nz, pny, pnz, ny_x, overlap, ix;
 440
 441     nx = pme->nkx;
 442     ny = pme->nky;
 443     nz = pme->nkz;
 444
 445     pny = pme->pmegrid_ny;
 446     pnz = pme->pmegrid_nz;
 447
 448     overlap = pme->pme_order - 1;
 449
 450     if (pme->nnodes_major == 1)
 451     {
 452         ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
 453
 454         for (ix = 0; ix < overlap; ix++)
 455         {
 456             int iy, iz;
 457
 458             for (iy = 0; iy < ny_x; iy++)
 459             {
 460                 for (iz = 0; iz < nz; iz++)
 461                 {
 462                     pmegrid[((nx+ix)*pny+iy)*pnz+iz] =
 463                         pmegrid[(ix*pny+iy)*pnz+iz];
 464                 }
 465             }
 466         }
 467     }
 468
 469     if (pme->nnodes_minor == 1)
 470     {
 471 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
 472         for (ix = 0; ix < pme->pmegrid_nx; ix++)
 473         {
 474             // Trivial OpenMP region that does not throw, no need for try/catch
 475             int iy, iz;
 476
 477             for (iy = 0; iy < overlap; iy++)
 478             {
 479                 for (iz = 0; iz < nz; iz++)
 480                 {
 481                     pmegrid[(ix*pny+ny+iy)*pnz+iz] =
 482                         pmegrid[(ix*pny+iy)*pnz+iz];
 483                 }
 484             }
 485         }
 486     }
 487
 488     /* Copy periodic overlap in z */
 489 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
 490     for (ix = 0; ix < pme->pmegrid_nx; ix++)
 491     {
 492         // Trivial OpenMP region that does not throw, no need for try/catch
 493         int iy, iz;
 494
 495         for (iy = 0; iy < pme->pmegrid_ny; iy++)
 496         {
 497             for (iz = 0; iz < overlap; iz++)
 498             {
 499                 pmegrid[(ix*pny+iy)*pnz+nz+iz] =
 500                     pmegrid[(ix*pny+iy)*pnz+iz];
 501             }
 502         }
 503     }
 504 }
 505
 506 void set_grid_alignment(int gmx_unused *pmegrid_nz, int gmx_unused pme_order)
 507 {
 508 #ifdef PME_SIMD4_SPREAD_GATHER
 509     if (pme_order == 5
 510 #ifndef PME_SIMD4_UNALIGNED
 511         || pme_order == 4
 512 #endif
 513         )
 514     {
 515         /* Round nz up to a multiple of 4 to ensure alignment */
 516         *pmegrid_nz = ((*pmegrid_nz + 3) & ~3);
 517     }
 518 #endif
 519 }
 520
 521 static void set_gridsize_alignment(int gmx_unused *gridsize, int gmx_unused pme_order)
 522 {
 523 #ifdef PME_SIMD4_SPREAD_GATHER
 524 #ifndef PME_SIMD4_UNALIGNED
 525     if (pme_order == 4)
 526     {
 527         /* Add extra elements to ensured aligned operations do not go
 528          * beyond the allocated grid size.
 529          * Note that for pme_order=5, the pme grid z-size alignment
 530          * ensures that we will not go beyond the grid size.
 531          */
 532         *gridsize += 4;
 533     }
 534 #endif
 535 #endif
 536 }
 537
 538 void pmegrid_init(pmegrid_t *grid,
 539                   int cx, int cy, int cz,
 540                   int x0, int y0, int z0,
 541                   int x1, int y1, int z1,
 542                   gmx_bool set_alignment,
 543                   int pme_order,
 544                   real *ptr)
 545 {
 546     int nz, gridsize;
 547
 548     grid->ci[XX]     = cx;
 549     grid->ci[YY]     = cy;
 550     grid->ci[ZZ]     = cz;
 551     grid->offset[XX] = x0;
 552     grid->offset[YY] = y0;
 553     grid->offset[ZZ] = z0;
 554     grid->n[XX]      = x1 - x0 + pme_order - 1;
 555     grid->n[YY]      = y1 - y0 + pme_order - 1;
 556     grid->n[ZZ]      = z1 - z0 + pme_order - 1;
 557     copy_ivec(grid->n, grid->s);
 558
 559     nz = grid->s[ZZ];
 560     set_grid_alignment(&nz, pme_order);
 561     if (set_alignment)
 562     {
 563         grid->s[ZZ] = nz;
 564     }
 565     else if (nz != grid->s[ZZ])
 566     {
 567         gmx_incons("pmegrid_init call with an unaligned z size");
 568     }
 569
 570     grid->order = pme_order;
 571     if (ptr == nullptr)
 572     {
 573         gridsize = grid->s[XX]*grid->s[YY]*grid->s[ZZ];
 574         set_gridsize_alignment(&gridsize, pme_order);
 575         snew_aligned(grid->grid, gridsize, SIMD4_ALIGNMENT);
 576     }
 577     else
 578     {
 579         grid->grid = ptr;
 580     }
 581 }
 582
 583 static int div_round_up(int enumerator, int denominator)
 584 {
 585     return (enumerator + denominator - 1)/denominator;
 586 }
 587
 588 static void make_subgrid_division(const ivec n, int ovl, int nthread,
 589                                   ivec nsub)
 590 {
 591     int   gsize_opt, gsize;
 592     int   nsx, nsy, nsz;
 593     char *env;
 594
 595     gsize_opt = -1;
 596     for (nsx = 1; nsx <= nthread; nsx++)
 597     {
 598         if (nthread % nsx == 0)
 599         {
 600             for (nsy = 1; nsy <= nthread; nsy++)
 601             {
 602                 if (nsx*nsy <= nthread && nthread % (nsx*nsy) == 0)
 603                 {
 604                     nsz = nthread/(nsx*nsy);
 605
 606                     /* Determine the number of grid points per thread */
 607                     gsize =
 608                         (div_round_up(n[XX], nsx) + ovl)*
 609                         (div_round_up(n[YY], nsy) + ovl)*
 610                         (div_round_up(n[ZZ], nsz) + ovl);
 611
 612                     /* Minimize the number of grids points per thread
 613                      * and, secondarily, the number of cuts in minor dimensions.
 614                      */
 615                     if (gsize_opt == -1 ||
 616                         gsize < gsize_opt ||
 617                         (gsize == gsize_opt &&
 618                          (nsz < nsub[ZZ] || (nsz == nsub[ZZ] && nsy < nsub[YY]))))
 619                     {
 620                         nsub[XX]  = nsx;
 621                         nsub[YY]  = nsy;
 622                         nsub[ZZ]  = nsz;
 623                         gsize_opt = gsize;
 624                     }
 625                 }
 626             }
 627         }
 628     }
 629
 630     env = getenv("GMX_PME_THREAD_DIVISION");
 631     if (env != nullptr)
 632     {
 633         sscanf(env, "%20d %20d %20d", &nsub[XX], &nsub[YY], &nsub[ZZ]);
 634     }
 635
 636     if (nsub[XX]*nsub[YY]*nsub[ZZ] != nthread)
 637     {
 638         gmx_fatal(FARGS, "PME grid thread division (%d x %d x %d) does not match the total number of threads (%d)", nsub[XX], nsub[YY], nsub[ZZ], nthread);
 639     }
 640 }
 641
 642 void pmegrids_init(pmegrids_t *grids,
 643                    int nx, int ny, int nz, int nz_base,
 644                    int pme_order,
 645                    gmx_bool bUseThreads,
 646                    int nthread,
 647                    int overlap_x,
 648                    int overlap_y)
 649 {
 650     ivec n, n_base;
 651     int  t, x, y, z, d, i, tfac;
 652     int  max_comm_lines = -1;
 653
 654     n[XX] = nx - (pme_order - 1);
 655     n[YY] = ny - (pme_order - 1);
 656     n[ZZ] = nz - (pme_order - 1);
 657
 658     copy_ivec(n, n_base);
 659     n_base[ZZ] = nz_base;
 660
 661     pmegrid_init(&grids->grid, 0, 0, 0, 0, 0, 0, n[XX], n[YY], n[ZZ], FALSE, pme_order,
 662                  nullptr);
 663
 664     grids->nthread = nthread;
 665
 666     make_subgrid_division(n_base, pme_order-1, grids->nthread, grids->nc);
 667
 668     if (bUseThreads)
 669     {
 670         ivec nst;
 671         int  gridsize;
 672
 673         for (d = 0; d < DIM; d++)
 674         {
 675             nst[d] = div_round_up(n[d], grids->nc[d]) + pme_order - 1;
 676         }
 677         set_grid_alignment(&nst[ZZ], pme_order);
 678
 679         if (debug)
 680         {
 681             fprintf(debug, "pmegrid thread local division: %d x %d x %d\n",
 682                     grids->nc[XX], grids->nc[YY], grids->nc[ZZ]);
 683             fprintf(debug, "pmegrid %d %d %d max thread pmegrid %d %d %d\n",
 684                     nx, ny, nz,
 685                     nst[XX], nst[YY], nst[ZZ]);
 686         }
 687
 688         snew(grids->grid_th, grids->nthread);
 689         t        = 0;
 690         gridsize = nst[XX]*nst[YY]*nst[ZZ];
 691         set_gridsize_alignment(&gridsize, pme_order);
 692         snew_aligned(grids->grid_all,
 693                      grids->nthread*gridsize+(grids->nthread+1)*GMX_CACHE_SEP,
 694                      SIMD4_ALIGNMENT);
 695
 696         for (x = 0; x < grids->nc[XX]; x++)
 697         {
 698             for (y = 0; y < grids->nc[YY]; y++)
 699             {
 700                 for (z = 0; z < grids->nc[ZZ]; z++)
 701                 {
 702                     pmegrid_init(&grids->grid_th[t],
 703                                  x, y, z,
 704                                  (n[XX]*(x  ))/grids->nc[XX],
 705                                  (n[YY]*(y  ))/grids->nc[YY],
 706                                  (n[ZZ]*(z  ))/grids->nc[ZZ],
 707                                  (n[XX]*(x+1))/grids->nc[XX],
 708                                  (n[YY]*(y+1))/grids->nc[YY],
 709                                  (n[ZZ]*(z+1))/grids->nc[ZZ],
 710                                  TRUE,
 711                                  pme_order,
 712                                  grids->grid_all+GMX_CACHE_SEP+t*(gridsize+GMX_CACHE_SEP));
 713                     t++;
 714                 }
 715             }
 716         }
 717     }
 718     else
 719     {
 720         grids->grid_th = nullptr;
 721     }
 722
 723     tfac = 1;
 724     for (d = DIM-1; d >= 0; d--)
 725     {
 726         snew(grids->g2t[d], n[d]);
 727         t = 0;
 728         for (i = 0; i < n[d]; i++)
 729         {
 730             /* The second check should match the parameters
 731              * of the pmegrid_init call above.
 732              */
 733             while (t + 1 < grids->nc[d] && i >= (n[d]*(t+1))/grids->nc[d])
 734             {
 735                 t++;
 736             }
 737             grids->g2t[d][i] = t*tfac;
 738         }
 739
 740         tfac *= grids->nc[d];
 741
 742         switch (d)
 743         {
 744             case XX: max_comm_lines = overlap_x;     break;
 745             case YY: max_comm_lines = overlap_y;     break;
 746             case ZZ: max_comm_lines = pme_order - 1; break;
 747         }
 748         grids->nthread_comm[d] = 0;
 749         while ((n[d]*grids->nthread_comm[d])/grids->nc[d] < max_comm_lines &&
 750                grids->nthread_comm[d] < grids->nc[d])
 751         {
 752             grids->nthread_comm[d]++;
 753         }
 754         if (debug != nullptr)
 755         {
 756             fprintf(debug, "pmegrid thread grid communication range in %c: %d\n",
 757                     'x'+d, grids->nthread_comm[d]);
 758         }
 759         /* It should be possible to make grids->nthread_comm[d]==grids->nc[d]
 760          * work, but this is not a problematic restriction.
 761          */
 762         if (grids->nc[d] > 1 && grids->nthread_comm[d] > grids->nc[d])
 763         {
 764             gmx_fatal(FARGS, "Too many threads for PME (%d) compared to the number of grid lines, reduce the number of threads doing PME", grids->nthread);
 765         }
 766     }
 767 }
 768
 769 void pmegrids_destroy(pmegrids_t *grids)
 770 {
 771     if (grids->grid.grid != nullptr)
 772     {
 773         sfree_aligned(grids->grid.grid);
 774
 775         if (grids->nthread > 0)
 776         {
 777             sfree_aligned(grids->grid_all);
 778             sfree(grids->grid_th);
 779         }
 780         for (int d = 0; d < DIM; d++)
 781         {
 782             sfree(grids->g2t[d]);
 783         }
 784     }
 785 }
 786
 787 void
 788 make_gridindex_to_localindex(int n, int local_start, int local_range,
 789                              int **global_to_local,
 790                              real **fraction_shift)
 791 {
 792     /* Here we construct array for looking up the grid line index and
 793      * fraction for particles. This is done because it is slighlty
 794      * faster than the modulo operation and to because we need to take
 795      * care of rounding issues, see below.
 796      * We use an array size of c_pmeNeighborUnitcellCount times the grid size
 797      * to allow for particles to be out of the triclinic unit-cell.
 798      */
 799     const int arraySize = c_pmeNeighborUnitcellCount * n;
 800     int     * gtl;
 801     real    * fsh;
 802
 803     snew(gtl, arraySize);
 804     snew(fsh, arraySize);
 805
 806     for (int i = 0; i < arraySize; i++)
 807     {
 808         /* Transform global grid index to the local grid index.
 809          * Our local grid always runs from 0 to local_range-1.
 810          */
 811         gtl[i] = (i - local_start + n) % n;
 812         /* For coordinates that fall within the local grid the fraction
 813          * is correct, we don't need to shift it.
 814          */
 815         fsh[i] = 0;
 816         /* Check if we are using domain decomposition for PME */
 817         if (local_range < n)
 818         {
 819             /* Due to rounding issues i could be 1 beyond the lower or
 820              * upper boundary of the local grid. Correct the index for this.
 821              * If we shift the index, we need to shift the fraction by
 822              * the same amount in the other direction to not affect
 823              * the weights.
 824              * Note that due to this shifting the weights at the end of
 825              * the spline might change, but that will only involve values
 826              * between zero and values close to the precision of a real,
 827              * which is anyhow the accuracy of the whole mesh calculation.
 828              */
 829             if (gtl[i] == n - 1)
 830             {
 831                 /* When this i is used, we should round the local index up */
 832                 gtl[i] = 0;
 833                 fsh[i] = -1;
 834             }
 835             else if (gtl[i] == local_range && local_range > 0)
 836             {
 837                 /* When this i is used, we should round the local index down */
 838                 gtl[i] = local_range - 1;
 839                 fsh[i] = 1;
 840             }
 841         }
 842     }
 843
 844     *global_to_local = gtl;
 845     *fraction_shift  = fsh;
 846 }
 847
 848 void reuse_pmegrids(const pmegrids_t *oldgrid, pmegrids_t *newgrid)
 849 {
 850     int d, t;
 851
 852     for (d = 0; d < DIM; d++)
 853     {
 854         if (newgrid->grid.n[d] > oldgrid->grid.n[d])
 855         {
 856             return;
 857         }
 858     }
 859
 860     sfree_aligned(newgrid->grid.grid);
 861     newgrid->grid.grid = oldgrid->grid.grid;
 862
 863     if (newgrid->grid_th != nullptr && newgrid->nthread == oldgrid->nthread)
 864     {
 865         sfree_aligned(newgrid->grid_all);
 866         newgrid->grid_all = oldgrid->grid_all;
 867         for (t = 0; t < newgrid->nthread; t++)
 868         {
 869             newgrid->grid_th[t].grid = oldgrid->grid_th[t].grid;
 870         }
 871     }
 872 }
 873
 874 static void dump_grid(FILE *fp,
 875                       int sx, int sy, int sz, int nx, int ny, int nz,
 876                       int my, int mz, const real *g)
 877 {
 878     int x, y, z;
 879
 880     for (x = 0; x < nx; x++)
 881     {
 882         for (y = 0; y < ny; y++)
 883         {
 884             for (z = 0; z < nz; z++)
 885             {
 886                 fprintf(fp, "%2d %2d %2d %6.3f\n",
 887                         sx+x, sy+y, sz+z, g[(x*my + y)*mz + z]);
 888             }
 889         }
 890     }
 891 }
 892
 893 /* This function is called from gmx_pme_do() only from debugging code
 894    that is commented out. */
 895 void dump_local_fftgrid(struct gmx_pme_t *pme, const real *fftgrid)
 896 {
 897     ivec local_fft_ndata, local_fft_offset, local_fft_size;
 898
 899     gmx_parallel_3dfft_real_limits(pme->pfft_setup[PME_GRID_QA],
 900                                    local_fft_ndata,
 901                                    local_fft_offset,
 902                                    local_fft_size);
 903
 904     dump_grid(stderr,
 905               pme->pmegrid_start_ix,
 906               pme->pmegrid_start_iy,
 907               pme->pmegrid_start_iz,
 908               pme->pmegrid_nx-pme->pme_order+1,
 909               pme->pmegrid_ny-pme->pme_order+1,
 910               pme->pmegrid_nz-pme->pme_order+1,
 911               local_fft_size[YY],
 912               local_fft_size[ZZ],
 913               fftgrid);
 914 }