src/gromacs/ewald/pme_grid.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 /* TODO find out what this file should be called */
  38 #include "gmxpre.h"
  39
  40 #include "pme_grid.h"
  41
  42 #include "config.h"
  43
  44 #include <cstdlib>
  45
  46 #include "gromacs/ewald/pme.h"
  47 #include "gromacs/fft/parallel_3dfft.h"
  48 #include "gromacs/math/vec.h"
  49 #include "gromacs/timing/cyclecounter.h"
  50 #include "gromacs/utility/fatalerror.h"
  51 #include "gromacs/utility/smalloc.h"
  52
  53 #include "pme_internal.h"
  54
  55 #ifdef DEBUG_PME
  56 #include "gromacs/fileio/pdbio.h"
  57 #include "gromacs/utility/cstringutil.h"
  58 #include "gromacs/utility/futil.h"
  59 #endif
  60
  61 #include "pme_simd.h"
  62
  63 /* GMX_CACHE_SEP should be a multiple of the SIMD and SIMD4 register size
  64  * to preserve alignment.
  65  */
  66 #define GMX_CACHE_SEP 64
  67
  68 void gmx_sum_qgrid_dd(gmx_pme_t *pme,
  69                       real      *grid,
  70                       const int  direction)
  71 {
  72 #if GMX_MPI
  73     pme_overlap_t *overlap;
  74     int            send_index0, send_nindex;
  75     int            recv_index0, recv_nindex;
  76     MPI_Status     stat;
  77     int            i, j, k, ix, iy, iz, icnt;
  78     int            send_id, recv_id, datasize;
  79     real          *p;
  80     real          *sendptr, *recvptr;
  81
  82     /* Start with minor-rank communication. This is a bit of a pain since it is not contiguous */
  83     overlap = &pme->overlap[1];
  84
  85     for (size_t ipulse = 0; ipulse < overlap->comm_data.size(); ipulse++)
  86     {
  87         /* Since we have already (un)wrapped the overlap in the z-dimension,
  88          * we only have to communicate 0 to nkz (not pmegrid_nz).
  89          */
  90         if (direction == GMX_SUM_GRID_FORWARD)
  91         {
  92             send_id       = overlap->comm_data[ipulse].send_id;
  93             recv_id       = overlap->comm_data[ipulse].recv_id;
  94             send_index0   = overlap->comm_data[ipulse].send_index0;
  95             send_nindex   = overlap->comm_data[ipulse].send_nindex;
  96             recv_index0   = overlap->comm_data[ipulse].recv_index0;
  97             recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
  98         }
  99         else
 100         {
 101             send_id       = overlap->comm_data[ipulse].recv_id;
 102             recv_id       = overlap->comm_data[ipulse].send_id;
 103             send_index0   = overlap->comm_data[ipulse].recv_index0;
 104             send_nindex   = overlap->comm_data[ipulse].recv_nindex;
 105             recv_index0   = overlap->comm_data[ipulse].send_index0;
 106             recv_nindex   = overlap->comm_data[ipulse].send_nindex;
 107         }
 108
 109         /* Copy data to contiguous send buffer */
 110         if (debug)
 111         {
 112             fprintf(debug, "PME send rank %d %d -> %d grid start %d Communicating %d to %d\n",
 113                     pme->nodeid, overlap->nodeid, send_id,
 114                     pme->pmegrid_start_iy,
 115                     send_index0-pme->pmegrid_start_iy,
 116                     send_index0-pme->pmegrid_start_iy+send_nindex);
 117         }
 118         icnt = 0;
 119         for (i = 0; i < pme->pmegrid_nx; i++)
 120         {
 121             ix = i;
 122             for (j = 0; j < send_nindex; j++)
 123             {
 124                 iy = j + send_index0 - pme->pmegrid_start_iy;
 125                 for (k = 0; k < pme->nkz; k++)
 126                 {
 127                     iz = k;
 128                     overlap->sendbuf[icnt++] = grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz];
 129                 }
 130             }
 131         }
 132
 133         datasize      = pme->pmegrid_nx * pme->nkz;
 134
 135         MPI_Sendrecv(overlap->sendbuf.data(), send_nindex*datasize, GMX_MPI_REAL,
 136                      send_id, ipulse,
 137                      overlap->recvbuf.data(), recv_nindex*datasize, GMX_MPI_REAL,
 138                      recv_id, ipulse,
 139                      overlap->mpi_comm, &stat);
 140
 141         /* Get data from contiguous recv buffer */
 142         if (debug)
 143         {
 144             fprintf(debug, "PME recv rank %d %d <- %d grid start %d Communicating %d to %d\n",
 145                     pme->nodeid, overlap->nodeid, recv_id,
 146                     pme->pmegrid_start_iy,
 147                     recv_index0-pme->pmegrid_start_iy,
 148                     recv_index0-pme->pmegrid_start_iy+recv_nindex);
 149         }
 150         icnt = 0;
 151         for (i = 0; i < pme->pmegrid_nx; i++)
 152         {
 153             ix = i;
 154             for (j = 0; j < recv_nindex; j++)
 155             {
 156                 iy = j + recv_index0 - pme->pmegrid_start_iy;
 157                 for (k = 0; k < pme->nkz; k++)
 158                 {
 159                     iz = k;
 160                     if (direction == GMX_SUM_GRID_FORWARD)
 161                     {
 162                         grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] += overlap->recvbuf[icnt++];
 163                     }
 164                     else
 165                     {
 166                         grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz]  = overlap->recvbuf[icnt++];
 167                     }
 168                 }
 169             }
 170         }
 171     }
 172
 173     /* Major dimension is easier, no copying required,
 174      * but we might have to sum to separate array.
 175      * Since we don't copy, we have to communicate up to pmegrid_nz,
 176      * not nkz as for the minor direction.
 177      */
 178     overlap = &pme->overlap[0];
 179
 180     for (size_t ipulse = 0; ipulse < overlap->comm_data.size(); ipulse++)
 181     {
 182         if (direction == GMX_SUM_GRID_FORWARD)
 183         {
 184             send_id       = overlap->comm_data[ipulse].send_id;
 185             recv_id       = overlap->comm_data[ipulse].recv_id;
 186             send_index0   = overlap->comm_data[ipulse].send_index0;
 187             send_nindex   = overlap->comm_data[ipulse].send_nindex;
 188             recv_index0   = overlap->comm_data[ipulse].recv_index0;
 189             recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
 190             recvptr       = overlap->recvbuf.data();
 191         }
 192         else
 193         {
 194             send_id       = overlap->comm_data[ipulse].recv_id;
 195             recv_id       = overlap->comm_data[ipulse].send_id;
 196             send_index0   = overlap->comm_data[ipulse].recv_index0;
 197             send_nindex   = overlap->comm_data[ipulse].recv_nindex;
 198             recv_index0   = overlap->comm_data[ipulse].send_index0;
 199             recv_nindex   = overlap->comm_data[ipulse].send_nindex;
 200             recvptr       = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
 201         }
 202
 203         sendptr       = grid + (send_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
 204         datasize      = pme->pmegrid_ny * pme->pmegrid_nz;
 205
 206         if (debug)
 207         {
 208             fprintf(debug, "PME send rank %d %d -> %d grid start %d Communicating %d to %d\n",
 209                     pme->nodeid, overlap->nodeid, send_id,
 210                     pme->pmegrid_start_ix,
 211                     send_index0-pme->pmegrid_start_ix,
 212                     send_index0-pme->pmegrid_start_ix+send_nindex);
 213             fprintf(debug, "PME recv rank %d %d <- %d grid start %d Communicating %d to %d\n",
 214                     pme->nodeid, overlap->nodeid, recv_id,
 215                     pme->pmegrid_start_ix,
 216                     recv_index0-pme->pmegrid_start_ix,
 217                     recv_index0-pme->pmegrid_start_ix+recv_nindex);
 218         }
 219
 220         MPI_Sendrecv(sendptr, send_nindex*datasize, GMX_MPI_REAL,
 221                      send_id, ipulse,
 222                      recvptr, recv_nindex*datasize, GMX_MPI_REAL,
 223                      recv_id, ipulse,
 224                      overlap->mpi_comm, &stat);
 225
 226         /* ADD data from contiguous recv buffer */
 227         if (direction == GMX_SUM_GRID_FORWARD)
 228         {
 229             p = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
 230             for (i = 0; i < recv_nindex*datasize; i++)
 231             {
 232                 p[i] += overlap->recvbuf[i];
 233             }
 234         }
 235     }
 236 #else  // GMX_MPI
 237     GMX_UNUSED_VALUE(pme);
 238     GMX_UNUSED_VALUE(grid);
 239     GMX_UNUSED_VALUE(direction);
 240
 241     GMX_RELEASE_ASSERT(false, "gmx_sum_qgrid_dd() should not be called without MPI");
 242 #endif // GMX_MPI
 243 }
 244
 245
 246 int copy_pmegrid_to_fftgrid(const gmx_pme_t *pme, const real *pmegrid, real *fftgrid, int grid_index)
 247 {
 248     ivec    local_fft_ndata, local_fft_offset, local_fft_size;
 249     ivec    local_pme_size;
 250     int     ix, iy, iz;
 251     int     pmeidx, fftidx;
 252
 253     /* Dimensions should be identical for A/B grid, so we just use A here */
 254     gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
 255                                    local_fft_ndata,
 256                                    local_fft_offset,
 257                                    local_fft_size);
 258
 259     local_pme_size[0] = pme->pmegrid_nx;
 260     local_pme_size[1] = pme->pmegrid_ny;
 261     local_pme_size[2] = pme->pmegrid_nz;
 262
 263     /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
 264        the offset is identical, and the PME grid always has more data (due to overlap)
 265      */
 266     {
 267 #ifdef DEBUG_PME
 268         FILE *fp, *fp2;
 269         char  fn[STRLEN];
 270         real  val;
 271         sprintf(fn, "pmegrid%d.pdb", pme->nodeid);
 272         fp = gmx_ffopen(fn, "w");
 273         sprintf(fn, "pmegrid%d.txt", pme->nodeid);
 274         fp2 = gmx_ffopen(fn, "w");
 275 #endif
 276
 277         for (ix = 0; ix < local_fft_ndata[XX]; ix++)
 278         {
 279             for (iy = 0; iy < local_fft_ndata[YY]; iy++)
 280             {
 281                 for (iz = 0; iz < local_fft_ndata[ZZ]; iz++)
 282                 {
 283                     pmeidx          = ix*(local_pme_size[YY]*local_pme_size[ZZ])+iy*(local_pme_size[ZZ])+iz;
 284                     fftidx          = ix*(local_fft_size[YY]*local_fft_size[ZZ])+iy*(local_fft_size[ZZ])+iz;
 285                     fftgrid[fftidx] = pmegrid[pmeidx];
 286 #ifdef DEBUG_PME
 287                     val = 100*pmegrid[pmeidx];
 288                     if (pmegrid[pmeidx] != 0)
 289                     {
 290                         gmx_fprintf_pdb_atomline(fp, epdbATOM, pmeidx, "CA", ' ', "GLY", ' ', pmeidx, ' ',
 291                                                  5.0*ix, 5.0*iy, 5.0*iz, 1.0, val, "");
 292                     }
 293                     if (pmegrid[pmeidx] != 0)
 294                     {
 295                         fprintf(fp2, "%-12s  %5d  %5d  %5d  %12.5e\n",
 296                                 "qgrid",
 297                                 pme->pmegrid_start_ix + ix,
 298                                 pme->pmegrid_start_iy + iy,
 299                                 pme->pmegrid_start_iz + iz,
 300                                 pmegrid[pmeidx]);
 301                     }
 302 #endif
 303                 }
 304             }
 305         }
 306 #ifdef DEBUG_PME
 307         gmx_ffclose(fp);
 308         gmx_ffclose(fp2);
 309 #endif
 310     }
 311     return 0;
 312 }
 313
 314
 315 #ifdef PME_TIME_THREADS
 316 static gmx_cycles_t omp_cyc_start()
 317 {
 318     return gmx_cycles_read();
 319 }
 320
 321 static gmx_cycles_t omp_cyc_end(gmx_cycles_t c)
 322 {
 323     return gmx_cycles_read() - c;
 324 }
 325 #endif
 326
 327
 328 int copy_fftgrid_to_pmegrid(struct gmx_pme_t *pme, const real *fftgrid, real *pmegrid, int grid_index,
 329                             int nthread, int thread)
 330 {
 331     ivec          local_fft_ndata, local_fft_offset, local_fft_size;
 332     ivec          local_pme_size;
 333     int           ixy0, ixy1, ixy, ix, iy, iz;
 334     int           pmeidx, fftidx;
 335 #ifdef PME_TIME_THREADS
 336     gmx_cycles_t  c1;
 337     static double cs1 = 0;
 338     static int    cnt = 0;
 339 #endif
 340
 341 #ifdef PME_TIME_THREADS
 342     c1 = omp_cyc_start();
 343 #endif
 344     /* Dimensions should be identical for A/B grid, so we just use A here */
 345     gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
 346                                    local_fft_ndata,
 347                                    local_fft_offset,
 348                                    local_fft_size);
 349
 350     local_pme_size[0] = pme->pmegrid_nx;
 351     local_pme_size[1] = pme->pmegrid_ny;
 352     local_pme_size[2] = pme->pmegrid_nz;
 353
 354     /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
 355        the offset is identical, and the PME grid always has more data (due to overlap)
 356      */
 357     ixy0 = ((thread  )*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
 358     ixy1 = ((thread+1)*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
 359
 360     for (ixy = ixy0; ixy < ixy1; ixy++)
 361     {
 362         ix = ixy/local_fft_ndata[YY];
 363         iy = ixy - ix*local_fft_ndata[YY];
 364
 365         pmeidx = (ix*local_pme_size[YY] + iy)*local_pme_size[ZZ];
 366         fftidx = (ix*local_fft_size[YY] + iy)*local_fft_size[ZZ];
 367         for (iz = 0; iz < local_fft_ndata[ZZ]; iz++)
 368         {
 369             pmegrid[pmeidx+iz] = fftgrid[fftidx+iz];
 370         }
 371     }
 372
 373 #ifdef PME_TIME_THREADS
 374     c1   = omp_cyc_end(c1);
 375     cs1 += (double)c1;
 376     cnt++;
 377     if (cnt % 20 == 0)
 378     {
 379         printf("copy %.2f\n", cs1*1e-9);
 380     }
 381 #endif
 382
 383     return 0;
 384 }
 385
 386
 387 void wrap_periodic_pmegrid(const gmx_pme_t *pme, real *pmegrid)
 388 {
 389     int     nx, ny, nz, pny, pnz, ny_x, overlap, ix, iy, iz;
 390
 391     nx = pme->nkx;
 392     ny = pme->nky;
 393     nz = pme->nkz;
 394
 395     pny = pme->pmegrid_ny;
 396     pnz = pme->pmegrid_nz;
 397
 398     overlap = pme->pme_order - 1;
 399
 400     /* Add periodic overlap in z */
 401     for (ix = 0; ix < pme->pmegrid_nx; ix++)
 402     {
 403         for (iy = 0; iy < pme->pmegrid_ny; iy++)
 404         {
 405             for (iz = 0; iz < overlap; iz++)
 406             {
 407                 pmegrid[(ix*pny+iy)*pnz+iz] +=
 408                     pmegrid[(ix*pny+iy)*pnz+nz+iz];
 409             }
 410         }
 411     }
 412
 413     if (pme->nnodes_minor == 1)
 414     {
 415         for (ix = 0; ix < pme->pmegrid_nx; ix++)
 416         {
 417             for (iy = 0; iy < overlap; iy++)
 418             {
 419                 for (iz = 0; iz < nz; iz++)
 420                 {
 421                     pmegrid[(ix*pny+iy)*pnz+iz] +=
 422                         pmegrid[(ix*pny+ny+iy)*pnz+iz];
 423                 }
 424             }
 425         }
 426     }
 427
 428     if (pme->nnodes_major == 1)
 429     {
 430         ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
 431
 432         for (ix = 0; ix < overlap; ix++)
 433         {
 434             for (iy = 0; iy < ny_x; iy++)
 435             {
 436                 for (iz = 0; iz < nz; iz++)
 437                 {
 438                     pmegrid[(ix*pny+iy)*pnz+iz] +=
 439                         pmegrid[((nx+ix)*pny+iy)*pnz+iz];
 440                 }
 441             }
 442         }
 443     }
 444 }
 445
 446
 447 void unwrap_periodic_pmegrid(struct gmx_pme_t *pme, real *pmegrid)
 448 {
 449     int     nx, ny, nz, pny, pnz, ny_x, overlap, ix;
 450
 451     nx = pme->nkx;
 452     ny = pme->nky;
 453     nz = pme->nkz;
 454
 455     pny = pme->pmegrid_ny;
 456     pnz = pme->pmegrid_nz;
 457
 458     overlap = pme->pme_order - 1;
 459
 460     if (pme->nnodes_major == 1)
 461     {
 462         ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
 463
 464         for (ix = 0; ix < overlap; ix++)
 465         {
 466             int iy, iz;
 467
 468             for (iy = 0; iy < ny_x; iy++)
 469             {
 470                 for (iz = 0; iz < nz; iz++)
 471                 {
 472                     pmegrid[((nx+ix)*pny+iy)*pnz+iz] =
 473                         pmegrid[(ix*pny+iy)*pnz+iz];
 474                 }
 475             }
 476         }
 477     }
 478
 479     if (pme->nnodes_minor == 1)
 480     {
 481 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
 482         for (ix = 0; ix < pme->pmegrid_nx; ix++)
 483         {
 484             // Trivial OpenMP region that does not throw, no need for try/catch
 485             int iy, iz;
 486
 487             for (iy = 0; iy < overlap; iy++)
 488             {
 489                 for (iz = 0; iz < nz; iz++)
 490                 {
 491                     pmegrid[(ix*pny+ny+iy)*pnz+iz] =
 492                         pmegrid[(ix*pny+iy)*pnz+iz];
 493                 }
 494             }
 495         }
 496     }
 497
 498     /* Copy periodic overlap in z */
 499 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
 500     for (ix = 0; ix < pme->pmegrid_nx; ix++)
 501     {
 502         // Trivial OpenMP region that does not throw, no need for try/catch
 503         int iy, iz;
 504
 505         for (iy = 0; iy < pme->pmegrid_ny; iy++)
 506         {
 507             for (iz = 0; iz < overlap; iz++)
 508             {
 509                 pmegrid[(ix*pny+iy)*pnz+nz+iz] =
 510                     pmegrid[(ix*pny+iy)*pnz+iz];
 511             }
 512         }
 513     }
 514 }
 515
 516 void set_grid_alignment(int gmx_unused *pmegrid_nz, int gmx_unused pme_order)
 517 {
 518 #ifdef PME_SIMD4_SPREAD_GATHER
 519     if (pme_order == 5
 520 #if !PME_4NSIMD_GATHER
 521         || pme_order == 4
 522 #endif
 523         )
 524     {
 525         /* Round nz up to a multiple of 4 to ensure alignment */
 526         *pmegrid_nz = ((*pmegrid_nz + 3) & ~3);
 527     }
 528 #endif
 529 }
 530
 531 static void set_gridsize_alignment(int gmx_unused *gridsize, int gmx_unused pme_order)
 532 {
 533 #ifdef PME_SIMD4_SPREAD_GATHER
 534 #if !PME_4NSIMD_GATHER
 535     if (pme_order == 4)
 536     {
 537         /* Add extra elements to ensured aligned operations do not go
 538          * beyond the allocated grid size.
 539          * Note that for pme_order=5, the pme grid z-size alignment
 540          * ensures that we will not go beyond the grid size.
 541          */
 542         *gridsize += 4;
 543     }
 544 #endif
 545 #endif
 546 }
 547
 548 void pmegrid_init(pmegrid_t *grid,
 549                   int cx, int cy, int cz,
 550                   int x0, int y0, int z0,
 551                   int x1, int y1, int z1,
 552                   gmx_bool set_alignment,
 553                   int pme_order,
 554                   real *ptr)
 555 {
 556     int nz, gridsize;
 557
 558     grid->ci[XX]     = cx;
 559     grid->ci[YY]     = cy;
 560     grid->ci[ZZ]     = cz;
 561     grid->offset[XX] = x0;
 562     grid->offset[YY] = y0;
 563     grid->offset[ZZ] = z0;
 564     grid->n[XX]      = x1 - x0 + pme_order - 1;
 565     grid->n[YY]      = y1 - y0 + pme_order - 1;
 566     grid->n[ZZ]      = z1 - z0 + pme_order - 1;
 567     copy_ivec(grid->n, grid->s);
 568
 569     nz = grid->s[ZZ];
 570     set_grid_alignment(&nz, pme_order);
 571     if (set_alignment)
 572     {
 573         grid->s[ZZ] = nz;
 574     }
 575     else if (nz != grid->s[ZZ])
 576     {
 577         gmx_incons("pmegrid_init call with an unaligned z size");
 578     }
 579
 580     grid->order = pme_order;
 581     if (ptr == nullptr)
 582     {
 583         gridsize = grid->s[XX]*grid->s[YY]*grid->s[ZZ];
 584         set_gridsize_alignment(&gridsize, pme_order);
 585         snew_aligned(grid->grid, gridsize, SIMD4_ALIGNMENT);
 586     }
 587     else
 588     {
 589         grid->grid = ptr;
 590     }
 591 }
 592
 593 static int div_round_up(int enumerator, int denominator)
 594 {
 595     return (enumerator + denominator - 1)/denominator;
 596 }
 597
 598 static void make_subgrid_division(const ivec n, int ovl, int nthread,
 599                                   ivec nsub)
 600 {
 601     int   gsize_opt, gsize;
 602     int   nsx, nsy, nsz;
 603     char *env;
 604
 605     gsize_opt = -1;
 606     for (nsx = 1; nsx <= nthread; nsx++)
 607     {
 608         if (nthread % nsx == 0)
 609         {
 610             for (nsy = 1; nsy <= nthread; nsy++)
 611             {
 612                 if (nsx*nsy <= nthread && nthread % (nsx*nsy) == 0)
 613                 {
 614                     nsz = nthread/(nsx*nsy);
 615
 616                     /* Determine the number of grid points per thread */
 617                     gsize =
 618                         (div_round_up(n[XX], nsx) + ovl)*
 619                         (div_round_up(n[YY], nsy) + ovl)*
 620                         (div_round_up(n[ZZ], nsz) + ovl);
 621
 622                     /* Minimize the number of grids points per thread
 623                      * and, secondarily, the number of cuts in minor dimensions.
 624                      */
 625                     if (gsize_opt == -1 ||
 626                         gsize < gsize_opt ||
 627                         (gsize == gsize_opt &&
 628                          (nsz < nsub[ZZ] || (nsz == nsub[ZZ] && nsy < nsub[YY]))))
 629                     {
 630                         nsub[XX]  = nsx;
 631                         nsub[YY]  = nsy;
 632                         nsub[ZZ]  = nsz;
 633                         gsize_opt = gsize;
 634                     }
 635                 }
 636             }
 637         }
 638     }
 639
 640     env = getenv("GMX_PME_THREAD_DIVISION");
 641     if (env != nullptr)
 642     {
 643         sscanf(env, "%20d %20d %20d", &nsub[XX], &nsub[YY], &nsub[ZZ]);
 644     }
 645
 646     if (nsub[XX]*nsub[YY]*nsub[ZZ] != nthread)
 647     {
 648         gmx_fatal(FARGS, "PME grid thread division (%d x %d x %d) does not match the total number of threads (%d)", nsub[XX], nsub[YY], nsub[ZZ], nthread);
 649     }
 650 }
 651
 652 void pmegrids_init(pmegrids_t *grids,
 653                    int nx, int ny, int nz, int nz_base,
 654                    int pme_order,
 655                    gmx_bool bUseThreads,
 656                    int nthread,
 657                    int overlap_x,
 658                    int overlap_y)
 659 {
 660     ivec n, n_base;
 661     int  t, x, y, z, d, i, tfac;
 662     int  max_comm_lines = -1;
 663
 664     n[XX] = nx - (pme_order - 1);
 665     n[YY] = ny - (pme_order - 1);
 666     n[ZZ] = nz - (pme_order - 1);
 667
 668     copy_ivec(n, n_base);
 669     n_base[ZZ] = nz_base;
 670
 671     pmegrid_init(&grids->grid, 0, 0, 0, 0, 0, 0, n[XX], n[YY], n[ZZ], FALSE, pme_order,
 672                  nullptr);
 673
 674     grids->nthread = nthread;
 675
 676     make_subgrid_division(n_base, pme_order-1, grids->nthread, grids->nc);
 677
 678     if (bUseThreads)
 679     {
 680         ivec nst;
 681         int  gridsize;
 682
 683         for (d = 0; d < DIM; d++)
 684         {
 685             nst[d] = div_round_up(n[d], grids->nc[d]) + pme_order - 1;
 686         }
 687         set_grid_alignment(&nst[ZZ], pme_order);
 688
 689         if (debug)
 690         {
 691             fprintf(debug, "pmegrid thread local division: %d x %d x %d\n",
 692                     grids->nc[XX], grids->nc[YY], grids->nc[ZZ]);
 693             fprintf(debug, "pmegrid %d %d %d max thread pmegrid %d %d %d\n",
 694                     nx, ny, nz,
 695                     nst[XX], nst[YY], nst[ZZ]);
 696         }
 697
 698         snew(grids->grid_th, grids->nthread);
 699         t        = 0;
 700         gridsize = nst[XX]*nst[YY]*nst[ZZ];
 701         set_gridsize_alignment(&gridsize, pme_order);
 702         snew_aligned(grids->grid_all,
 703                      grids->nthread*gridsize+(grids->nthread+1)*GMX_CACHE_SEP,
 704                      SIMD4_ALIGNMENT);
 705
 706         for (x = 0; x < grids->nc[XX]; x++)
 707         {
 708             for (y = 0; y < grids->nc[YY]; y++)
 709             {
 710                 for (z = 0; z < grids->nc[ZZ]; z++)
 711                 {
 712                     pmegrid_init(&grids->grid_th[t],
 713                                  x, y, z,
 714                                  (n[XX]*(x  ))/grids->nc[XX],
 715                                  (n[YY]*(y  ))/grids->nc[YY],
 716                                  (n[ZZ]*(z  ))/grids->nc[ZZ],
 717                                  (n[XX]*(x+1))/grids->nc[XX],
 718                                  (n[YY]*(y+1))/grids->nc[YY],
 719                                  (n[ZZ]*(z+1))/grids->nc[ZZ],
 720                                  TRUE,
 721                                  pme_order,
 722                                  grids->grid_all+GMX_CACHE_SEP+t*(gridsize+GMX_CACHE_SEP));
 723                     t++;
 724                 }
 725             }
 726         }
 727     }
 728     else
 729     {
 730         grids->grid_th = nullptr;
 731     }
 732
 733     tfac = 1;
 734     for (d = DIM-1; d >= 0; d--)
 735     {
 736         snew(grids->g2t[d], n[d]);
 737         t = 0;
 738         for (i = 0; i < n[d]; i++)
 739         {
 740             /* The second check should match the parameters
 741              * of the pmegrid_init call above.
 742              */
 743             while (t + 1 < grids->nc[d] && i >= (n[d]*(t+1))/grids->nc[d])
 744             {
 745                 t++;
 746             }
 747             grids->g2t[d][i] = t*tfac;
 748         }
 749
 750         tfac *= grids->nc[d];
 751
 752         switch (d)
 753         {
 754             case XX: max_comm_lines = overlap_x;     break;
 755             case YY: max_comm_lines = overlap_y;     break;
 756             case ZZ: max_comm_lines = pme_order - 1; break;
 757         }
 758         grids->nthread_comm[d] = 0;
 759         while ((n[d]*grids->nthread_comm[d])/grids->nc[d] < max_comm_lines &&
 760                grids->nthread_comm[d] < grids->nc[d])
 761         {
 762             grids->nthread_comm[d]++;
 763         }
 764         if (debug != nullptr)
 765         {
 766             fprintf(debug, "pmegrid thread grid communication range in %c: %d\n",
 767                     'x'+d, grids->nthread_comm[d]);
 768         }
 769         /* It should be possible to make grids->nthread_comm[d]==grids->nc[d]
 770          * work, but this is not a problematic restriction.
 771          */
 772         if (grids->nc[d] > 1 && grids->nthread_comm[d] > grids->nc[d])
 773         {
 774             gmx_fatal(FARGS, "Too many threads for PME (%d) compared to the number of grid lines, reduce the number of threads doing PME", grids->nthread);
 775         }
 776     }
 777 }
 778
 779 void pmegrids_destroy(pmegrids_t *grids)
 780 {
 781     if (grids->grid.grid != nullptr)
 782     {
 783         sfree_aligned(grids->grid.grid);
 784
 785         if (grids->nthread > 0)
 786         {
 787             sfree_aligned(grids->grid_all);
 788             sfree(grids->grid_th);
 789         }
 790         for (int d = 0; d < DIM; d++)
 791         {
 792             sfree(grids->g2t[d]);
 793         }
 794     }
 795 }
 796
 797 void
 798 make_gridindex_to_localindex(int n, int local_start, int local_range,
 799                              int **global_to_local,
 800                              real **fraction_shift)
 801 {
 802     /* Here we construct array for looking up the grid line index and
 803      * fraction for particles. This is done because it is slighlty
 804      * faster than the modulo operation and to because we need to take
 805      * care of rounding issues, see below.
 806      * We use an array size of c_pmeNeighborUnitcellCount times the grid size
 807      * to allow for particles to be out of the triclinic unit-cell.
 808      */
 809     const int arraySize = c_pmeNeighborUnitcellCount * n;
 810     int     * gtl;
 811     real    * fsh;
 812
 813     snew(gtl, arraySize);
 814     snew(fsh, arraySize);
 815
 816     for (int i = 0; i < arraySize; i++)
 817     {
 818         /* Transform global grid index to the local grid index.
 819          * Our local grid always runs from 0 to local_range-1.
 820          */
 821         gtl[i] = (i - local_start + n) % n;
 822         /* For coordinates that fall within the local grid the fraction
 823          * is correct, we don't need to shift it.
 824          */
 825         fsh[i] = 0;
 826         /* Check if we are using domain decomposition for PME */
 827         if (local_range < n)
 828         {
 829             /* Due to rounding issues i could be 1 beyond the lower or
 830              * upper boundary of the local grid. Correct the index for this.
 831              * If we shift the index, we need to shift the fraction by
 832              * the same amount in the other direction to not affect
 833              * the weights.
 834              * Note that due to this shifting the weights at the end of
 835              * the spline might change, but that will only involve values
 836              * between zero and values close to the precision of a real,
 837              * which is anyhow the accuracy of the whole mesh calculation.
 838              */
 839             if (gtl[i] == n - 1)
 840             {
 841                 /* When this i is used, we should round the local index up */
 842                 gtl[i] = 0;
 843                 fsh[i] = -1;
 844             }
 845             else if (gtl[i] == local_range && local_range > 0)
 846             {
 847                 /* When this i is used, we should round the local index down */
 848                 gtl[i] = local_range - 1;
 849                 fsh[i] = 1;
 850             }
 851         }
 852     }
 853
 854     *global_to_local = gtl;
 855     *fraction_shift  = fsh;
 856 }
 857
 858 void reuse_pmegrids(const pmegrids_t *oldgrid, pmegrids_t *newgrid)
 859 {
 860     int d, t;
 861
 862     for (d = 0; d < DIM; d++)
 863     {
 864         if (newgrid->grid.n[d] > oldgrid->grid.n[d])
 865         {
 866             return;
 867         }
 868     }
 869
 870     sfree_aligned(newgrid->grid.grid);
 871     newgrid->grid.grid = oldgrid->grid.grid;
 872
 873     if (newgrid->grid_th != nullptr && newgrid->nthread == oldgrid->nthread)
 874     {
 875         sfree_aligned(newgrid->grid_all);
 876         newgrid->grid_all = oldgrid->grid_all;
 877         for (t = 0; t < newgrid->nthread; t++)
 878         {
 879             newgrid->grid_th[t].grid = oldgrid->grid_th[t].grid;
 880         }
 881     }
 882 }