128-bit AVX2 SIMD for AMD Ryzen
[gromacs.git] / src / gromacs / ewald / pme-grid.cpp
blob5c01efa49970f17d4b35d403a88128ebbf62cf45
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015,2016,2017, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
37 /* TODO find out what this file should be called */
38 #include "gmxpre.h"
40 #include "pme-grid.h"
42 #include "config.h"
44 #include <cstdlib>
46 #include "gromacs/ewald/pme.h"
47 #include "gromacs/fft/parallel_3dfft.h"
48 #include "gromacs/math/vec.h"
49 #include "gromacs/timing/cyclecounter.h"
50 #include "gromacs/utility/fatalerror.h"
51 #include "gromacs/utility/smalloc.h"
53 #ifdef DEBUG_PME
54 #include "gromacs/fileio/pdbio.h"
55 #include "gromacs/utility/cstringutil.h"
56 #include "gromacs/utility/futil.h"
57 #endif
59 #include "pme-simd.h"
61 /* GMX_CACHE_SEP should be a multiple of the SIMD and SIMD4 register size
62 * to preserve alignment.
64 #define GMX_CACHE_SEP 64
66 #if GMX_MPI
67 void gmx_sum_qgrid_dd(struct gmx_pme_t *pme, real *grid, int direction)
69 pme_overlap_t *overlap;
70 int send_index0, send_nindex;
71 int recv_index0, recv_nindex;
72 MPI_Status stat;
73 int i, j, k, ix, iy, iz, icnt;
74 int ipulse, send_id, recv_id, datasize;
75 real *p;
76 real *sendptr, *recvptr;
78 /* Start with minor-rank communication. This is a bit of a pain since it is not contiguous */
79 overlap = &pme->overlap[1];
81 for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
83 /* Since we have already (un)wrapped the overlap in the z-dimension,
84 * we only have to communicate 0 to nkz (not pmegrid_nz).
86 if (direction == GMX_SUM_GRID_FORWARD)
88 send_id = overlap->send_id[ipulse];
89 recv_id = overlap->recv_id[ipulse];
90 send_index0 = overlap->comm_data[ipulse].send_index0;
91 send_nindex = overlap->comm_data[ipulse].send_nindex;
92 recv_index0 = overlap->comm_data[ipulse].recv_index0;
93 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
95 else
97 send_id = overlap->recv_id[ipulse];
98 recv_id = overlap->send_id[ipulse];
99 send_index0 = overlap->comm_data[ipulse].recv_index0;
100 send_nindex = overlap->comm_data[ipulse].recv_nindex;
101 recv_index0 = overlap->comm_data[ipulse].send_index0;
102 recv_nindex = overlap->comm_data[ipulse].send_nindex;
105 /* Copy data to contiguous send buffer */
106 if (debug)
108 fprintf(debug, "PME send rank %d %d -> %d grid start %d Communicating %d to %d\n",
109 pme->nodeid, overlap->nodeid, send_id,
110 pme->pmegrid_start_iy,
111 send_index0-pme->pmegrid_start_iy,
112 send_index0-pme->pmegrid_start_iy+send_nindex);
114 icnt = 0;
115 for (i = 0; i < pme->pmegrid_nx; i++)
117 ix = i;
118 for (j = 0; j < send_nindex; j++)
120 iy = j + send_index0 - pme->pmegrid_start_iy;
121 for (k = 0; k < pme->nkz; k++)
123 iz = k;
124 overlap->sendbuf[icnt++] = grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz];
129 datasize = pme->pmegrid_nx * pme->nkz;
131 MPI_Sendrecv(overlap->sendbuf, send_nindex*datasize, GMX_MPI_REAL,
132 send_id, ipulse,
133 overlap->recvbuf, recv_nindex*datasize, GMX_MPI_REAL,
134 recv_id, ipulse,
135 overlap->mpi_comm, &stat);
137 /* Get data from contiguous recv buffer */
138 if (debug)
140 fprintf(debug, "PME recv rank %d %d <- %d grid start %d Communicating %d to %d\n",
141 pme->nodeid, overlap->nodeid, recv_id,
142 pme->pmegrid_start_iy,
143 recv_index0-pme->pmegrid_start_iy,
144 recv_index0-pme->pmegrid_start_iy+recv_nindex);
146 icnt = 0;
147 for (i = 0; i < pme->pmegrid_nx; i++)
149 ix = i;
150 for (j = 0; j < recv_nindex; j++)
152 iy = j + recv_index0 - pme->pmegrid_start_iy;
153 for (k = 0; k < pme->nkz; k++)
155 iz = k;
156 if (direction == GMX_SUM_GRID_FORWARD)
158 grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] += overlap->recvbuf[icnt++];
160 else
162 grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] = overlap->recvbuf[icnt++];
169 /* Major dimension is easier, no copying required,
170 * but we might have to sum to separate array.
171 * Since we don't copy, we have to communicate up to pmegrid_nz,
172 * not nkz as for the minor direction.
174 overlap = &pme->overlap[0];
176 for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
178 if (direction == GMX_SUM_GRID_FORWARD)
180 send_id = overlap->send_id[ipulse];
181 recv_id = overlap->recv_id[ipulse];
182 send_index0 = overlap->comm_data[ipulse].send_index0;
183 send_nindex = overlap->comm_data[ipulse].send_nindex;
184 recv_index0 = overlap->comm_data[ipulse].recv_index0;
185 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
186 recvptr = overlap->recvbuf;
188 else
190 send_id = overlap->recv_id[ipulse];
191 recv_id = overlap->send_id[ipulse];
192 send_index0 = overlap->comm_data[ipulse].recv_index0;
193 send_nindex = overlap->comm_data[ipulse].recv_nindex;
194 recv_index0 = overlap->comm_data[ipulse].send_index0;
195 recv_nindex = overlap->comm_data[ipulse].send_nindex;
196 recvptr = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
199 sendptr = grid + (send_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
200 datasize = pme->pmegrid_ny * pme->pmegrid_nz;
202 if (debug)
204 fprintf(debug, "PME send rank %d %d -> %d grid start %d Communicating %d to %d\n",
205 pme->nodeid, overlap->nodeid, send_id,
206 pme->pmegrid_start_ix,
207 send_index0-pme->pmegrid_start_ix,
208 send_index0-pme->pmegrid_start_ix+send_nindex);
209 fprintf(debug, "PME recv rank %d %d <- %d grid start %d Communicating %d to %d\n",
210 pme->nodeid, overlap->nodeid, recv_id,
211 pme->pmegrid_start_ix,
212 recv_index0-pme->pmegrid_start_ix,
213 recv_index0-pme->pmegrid_start_ix+recv_nindex);
216 MPI_Sendrecv(sendptr, send_nindex*datasize, GMX_MPI_REAL,
217 send_id, ipulse,
218 recvptr, recv_nindex*datasize, GMX_MPI_REAL,
219 recv_id, ipulse,
220 overlap->mpi_comm, &stat);
222 /* ADD data from contiguous recv buffer */
223 if (direction == GMX_SUM_GRID_FORWARD)
225 p = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
226 for (i = 0; i < recv_nindex*datasize; i++)
228 p[i] += overlap->recvbuf[i];
233 #endif
236 int copy_pmegrid_to_fftgrid(const gmx_pme_t *pme, real *pmegrid, real *fftgrid, int grid_index)
238 ivec local_fft_ndata, local_fft_offset, local_fft_size;
239 ivec local_pme_size;
240 int ix, iy, iz;
241 int pmeidx, fftidx;
243 /* Dimensions should be identical for A/B grid, so we just use A here */
244 gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
245 local_fft_ndata,
246 local_fft_offset,
247 local_fft_size);
249 local_pme_size[0] = pme->pmegrid_nx;
250 local_pme_size[1] = pme->pmegrid_ny;
251 local_pme_size[2] = pme->pmegrid_nz;
253 /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
254 the offset is identical, and the PME grid always has more data (due to overlap)
257 #ifdef DEBUG_PME
258 FILE *fp, *fp2;
259 char fn[STRLEN];
260 real val;
261 sprintf(fn, "pmegrid%d.pdb", pme->nodeid);
262 fp = gmx_ffopen(fn, "w");
263 sprintf(fn, "pmegrid%d.txt", pme->nodeid);
264 fp2 = gmx_ffopen(fn, "w");
265 #endif
267 for (ix = 0; ix < local_fft_ndata[XX]; ix++)
269 for (iy = 0; iy < local_fft_ndata[YY]; iy++)
271 for (iz = 0; iz < local_fft_ndata[ZZ]; iz++)
273 pmeidx = ix*(local_pme_size[YY]*local_pme_size[ZZ])+iy*(local_pme_size[ZZ])+iz;
274 fftidx = ix*(local_fft_size[YY]*local_fft_size[ZZ])+iy*(local_fft_size[ZZ])+iz;
275 fftgrid[fftidx] = pmegrid[pmeidx];
276 #ifdef DEBUG_PME
277 val = 100*pmegrid[pmeidx];
278 if (pmegrid[pmeidx] != 0)
280 gmx_fprintf_pdb_atomline(fp, epdbATOM, pmeidx, "CA", ' ', "GLY", ' ', pmeidx, ' ',
281 5.0*ix, 5.0*iy, 5.0*iz, 1.0, val, "");
283 if (pmegrid[pmeidx] != 0)
285 fprintf(fp2, "%-12s %5d %5d %5d %12.5e\n",
286 "qgrid",
287 pme->pmegrid_start_ix + ix,
288 pme->pmegrid_start_iy + iy,
289 pme->pmegrid_start_iz + iz,
290 pmegrid[pmeidx]);
292 #endif
296 #ifdef DEBUG_PME
297 gmx_ffclose(fp);
298 gmx_ffclose(fp2);
299 #endif
301 return 0;
305 #ifdef PME_TIME_THREADS
306 static gmx_cycles_t omp_cyc_start()
308 return gmx_cycles_read();
311 static gmx_cycles_t omp_cyc_end(gmx_cycles_t c)
313 return gmx_cycles_read() - c;
315 #endif
318 int copy_fftgrid_to_pmegrid(struct gmx_pme_t *pme, const real *fftgrid, real *pmegrid, int grid_index,
319 int nthread, int thread)
321 ivec local_fft_ndata, local_fft_offset, local_fft_size;
322 ivec local_pme_size;
323 int ixy0, ixy1, ixy, ix, iy, iz;
324 int pmeidx, fftidx;
325 #ifdef PME_TIME_THREADS
326 gmx_cycles_t c1;
327 static double cs1 = 0;
328 static int cnt = 0;
329 #endif
331 #ifdef PME_TIME_THREADS
332 c1 = omp_cyc_start();
333 #endif
334 /* Dimensions should be identical for A/B grid, so we just use A here */
335 gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
336 local_fft_ndata,
337 local_fft_offset,
338 local_fft_size);
340 local_pme_size[0] = pme->pmegrid_nx;
341 local_pme_size[1] = pme->pmegrid_ny;
342 local_pme_size[2] = pme->pmegrid_nz;
344 /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
345 the offset is identical, and the PME grid always has more data (due to overlap)
347 ixy0 = ((thread )*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
348 ixy1 = ((thread+1)*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
350 for (ixy = ixy0; ixy < ixy1; ixy++)
352 ix = ixy/local_fft_ndata[YY];
353 iy = ixy - ix*local_fft_ndata[YY];
355 pmeidx = (ix*local_pme_size[YY] + iy)*local_pme_size[ZZ];
356 fftidx = (ix*local_fft_size[YY] + iy)*local_fft_size[ZZ];
357 for (iz = 0; iz < local_fft_ndata[ZZ]; iz++)
359 pmegrid[pmeidx+iz] = fftgrid[fftidx+iz];
363 #ifdef PME_TIME_THREADS
364 c1 = omp_cyc_end(c1);
365 cs1 += (double)c1;
366 cnt++;
367 if (cnt % 20 == 0)
369 printf("copy %.2f\n", cs1*1e-9);
371 #endif
373 return 0;
377 void wrap_periodic_pmegrid(const gmx_pme_t *pme, real *pmegrid)
379 int nx, ny, nz, pny, pnz, ny_x, overlap, ix, iy, iz;
381 nx = pme->nkx;
382 ny = pme->nky;
383 nz = pme->nkz;
385 pny = pme->pmegrid_ny;
386 pnz = pme->pmegrid_nz;
388 overlap = pme->pme_order - 1;
390 /* Add periodic overlap in z */
391 for (ix = 0; ix < pme->pmegrid_nx; ix++)
393 for (iy = 0; iy < pme->pmegrid_ny; iy++)
395 for (iz = 0; iz < overlap; iz++)
397 pmegrid[(ix*pny+iy)*pnz+iz] +=
398 pmegrid[(ix*pny+iy)*pnz+nz+iz];
403 if (pme->nnodes_minor == 1)
405 for (ix = 0; ix < pme->pmegrid_nx; ix++)
407 for (iy = 0; iy < overlap; iy++)
409 for (iz = 0; iz < nz; iz++)
411 pmegrid[(ix*pny+iy)*pnz+iz] +=
412 pmegrid[(ix*pny+ny+iy)*pnz+iz];
418 if (pme->nnodes_major == 1)
420 ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
422 for (ix = 0; ix < overlap; ix++)
424 for (iy = 0; iy < ny_x; iy++)
426 for (iz = 0; iz < nz; iz++)
428 pmegrid[(ix*pny+iy)*pnz+iz] +=
429 pmegrid[((nx+ix)*pny+iy)*pnz+iz];
437 void unwrap_periodic_pmegrid(struct gmx_pme_t *pme, real *pmegrid)
439 int nx, ny, nz, pny, pnz, ny_x, overlap, ix;
441 nx = pme->nkx;
442 ny = pme->nky;
443 nz = pme->nkz;
445 pny = pme->pmegrid_ny;
446 pnz = pme->pmegrid_nz;
448 overlap = pme->pme_order - 1;
450 if (pme->nnodes_major == 1)
452 ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
454 for (ix = 0; ix < overlap; ix++)
456 int iy, iz;
458 for (iy = 0; iy < ny_x; iy++)
460 for (iz = 0; iz < nz; iz++)
462 pmegrid[((nx+ix)*pny+iy)*pnz+iz] =
463 pmegrid[(ix*pny+iy)*pnz+iz];
469 if (pme->nnodes_minor == 1)
471 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
472 for (ix = 0; ix < pme->pmegrid_nx; ix++)
474 // Trivial OpenMP region that does not throw, no need for try/catch
475 int iy, iz;
477 for (iy = 0; iy < overlap; iy++)
479 for (iz = 0; iz < nz; iz++)
481 pmegrid[(ix*pny+ny+iy)*pnz+iz] =
482 pmegrid[(ix*pny+iy)*pnz+iz];
488 /* Copy periodic overlap in z */
489 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
490 for (ix = 0; ix < pme->pmegrid_nx; ix++)
492 // Trivial OpenMP region that does not throw, no need for try/catch
493 int iy, iz;
495 for (iy = 0; iy < pme->pmegrid_ny; iy++)
497 for (iz = 0; iz < overlap; iz++)
499 pmegrid[(ix*pny+iy)*pnz+nz+iz] =
500 pmegrid[(ix*pny+iy)*pnz+iz];
506 void set_grid_alignment(int gmx_unused *pmegrid_nz, int gmx_unused pme_order)
508 #ifdef PME_SIMD4_SPREAD_GATHER
509 if (pme_order == 5
510 #ifndef PME_SIMD4_UNALIGNED
511 || pme_order == 4
512 #endif
515 /* Round nz up to a multiple of 4 to ensure alignment */
516 *pmegrid_nz = ((*pmegrid_nz + 3) & ~3);
518 #endif
521 static void set_gridsize_alignment(int gmx_unused *gridsize, int gmx_unused pme_order)
523 #ifdef PME_SIMD4_SPREAD_GATHER
524 #ifndef PME_SIMD4_UNALIGNED
525 if (pme_order == 4)
527 /* Add extra elements to ensured aligned operations do not go
528 * beyond the allocated grid size.
529 * Note that for pme_order=5, the pme grid z-size alignment
530 * ensures that we will not go beyond the grid size.
532 *gridsize += 4;
534 #endif
535 #endif
538 void pmegrid_init(pmegrid_t *grid,
539 int cx, int cy, int cz,
540 int x0, int y0, int z0,
541 int x1, int y1, int z1,
542 gmx_bool set_alignment,
543 int pme_order,
544 real *ptr)
546 int nz, gridsize;
548 grid->ci[XX] = cx;
549 grid->ci[YY] = cy;
550 grid->ci[ZZ] = cz;
551 grid->offset[XX] = x0;
552 grid->offset[YY] = y0;
553 grid->offset[ZZ] = z0;
554 grid->n[XX] = x1 - x0 + pme_order - 1;
555 grid->n[YY] = y1 - y0 + pme_order - 1;
556 grid->n[ZZ] = z1 - z0 + pme_order - 1;
557 copy_ivec(grid->n, grid->s);
559 nz = grid->s[ZZ];
560 set_grid_alignment(&nz, pme_order);
561 if (set_alignment)
563 grid->s[ZZ] = nz;
565 else if (nz != grid->s[ZZ])
567 gmx_incons("pmegrid_init call with an unaligned z size");
570 grid->order = pme_order;
571 if (ptr == nullptr)
573 gridsize = grid->s[XX]*grid->s[YY]*grid->s[ZZ];
574 set_gridsize_alignment(&gridsize, pme_order);
575 snew_aligned(grid->grid, gridsize, SIMD4_ALIGNMENT);
577 else
579 grid->grid = ptr;
583 static int div_round_up(int enumerator, int denominator)
585 return (enumerator + denominator - 1)/denominator;
588 static void make_subgrid_division(const ivec n, int ovl, int nthread,
589 ivec nsub)
591 int gsize_opt, gsize;
592 int nsx, nsy, nsz;
593 char *env;
595 gsize_opt = -1;
596 for (nsx = 1; nsx <= nthread; nsx++)
598 if (nthread % nsx == 0)
600 for (nsy = 1; nsy <= nthread; nsy++)
602 if (nsx*nsy <= nthread && nthread % (nsx*nsy) == 0)
604 nsz = nthread/(nsx*nsy);
606 /* Determine the number of grid points per thread */
607 gsize =
608 (div_round_up(n[XX], nsx) + ovl)*
609 (div_round_up(n[YY], nsy) + ovl)*
610 (div_round_up(n[ZZ], nsz) + ovl);
612 /* Minimize the number of grids points per thread
613 * and, secondarily, the number of cuts in minor dimensions.
615 if (gsize_opt == -1 ||
616 gsize < gsize_opt ||
617 (gsize == gsize_opt &&
618 (nsz < nsub[ZZ] || (nsz == nsub[ZZ] && nsy < nsub[YY]))))
620 nsub[XX] = nsx;
621 nsub[YY] = nsy;
622 nsub[ZZ] = nsz;
623 gsize_opt = gsize;
630 env = getenv("GMX_PME_THREAD_DIVISION");
631 if (env != nullptr)
633 sscanf(env, "%20d %20d %20d", &nsub[XX], &nsub[YY], &nsub[ZZ]);
636 if (nsub[XX]*nsub[YY]*nsub[ZZ] != nthread)
638 gmx_fatal(FARGS, "PME grid thread division (%d x %d x %d) does not match the total number of threads (%d)", nsub[XX], nsub[YY], nsub[ZZ], nthread);
642 void pmegrids_init(pmegrids_t *grids,
643 int nx, int ny, int nz, int nz_base,
644 int pme_order,
645 gmx_bool bUseThreads,
646 int nthread,
647 int overlap_x,
648 int overlap_y)
650 ivec n, n_base;
651 int t, x, y, z, d, i, tfac;
652 int max_comm_lines = -1;
654 n[XX] = nx - (pme_order - 1);
655 n[YY] = ny - (pme_order - 1);
656 n[ZZ] = nz - (pme_order - 1);
658 copy_ivec(n, n_base);
659 n_base[ZZ] = nz_base;
661 pmegrid_init(&grids->grid, 0, 0, 0, 0, 0, 0, n[XX], n[YY], n[ZZ], FALSE, pme_order,
662 nullptr);
664 grids->nthread = nthread;
666 make_subgrid_division(n_base, pme_order-1, grids->nthread, grids->nc);
668 if (bUseThreads)
670 ivec nst;
671 int gridsize;
673 for (d = 0; d < DIM; d++)
675 nst[d] = div_round_up(n[d], grids->nc[d]) + pme_order - 1;
677 set_grid_alignment(&nst[ZZ], pme_order);
679 if (debug)
681 fprintf(debug, "pmegrid thread local division: %d x %d x %d\n",
682 grids->nc[XX], grids->nc[YY], grids->nc[ZZ]);
683 fprintf(debug, "pmegrid %d %d %d max thread pmegrid %d %d %d\n",
684 nx, ny, nz,
685 nst[XX], nst[YY], nst[ZZ]);
688 snew(grids->grid_th, grids->nthread);
689 t = 0;
690 gridsize = nst[XX]*nst[YY]*nst[ZZ];
691 set_gridsize_alignment(&gridsize, pme_order);
692 snew_aligned(grids->grid_all,
693 grids->nthread*gridsize+(grids->nthread+1)*GMX_CACHE_SEP,
694 SIMD4_ALIGNMENT);
696 for (x = 0; x < grids->nc[XX]; x++)
698 for (y = 0; y < grids->nc[YY]; y++)
700 for (z = 0; z < grids->nc[ZZ]; z++)
702 pmegrid_init(&grids->grid_th[t],
703 x, y, z,
704 (n[XX]*(x ))/grids->nc[XX],
705 (n[YY]*(y ))/grids->nc[YY],
706 (n[ZZ]*(z ))/grids->nc[ZZ],
707 (n[XX]*(x+1))/grids->nc[XX],
708 (n[YY]*(y+1))/grids->nc[YY],
709 (n[ZZ]*(z+1))/grids->nc[ZZ],
710 TRUE,
711 pme_order,
712 grids->grid_all+GMX_CACHE_SEP+t*(gridsize+GMX_CACHE_SEP));
713 t++;
718 else
720 grids->grid_th = nullptr;
723 tfac = 1;
724 for (d = DIM-1; d >= 0; d--)
726 snew(grids->g2t[d], n[d]);
727 t = 0;
728 for (i = 0; i < n[d]; i++)
730 /* The second check should match the parameters
731 * of the pmegrid_init call above.
733 while (t + 1 < grids->nc[d] && i >= (n[d]*(t+1))/grids->nc[d])
735 t++;
737 grids->g2t[d][i] = t*tfac;
740 tfac *= grids->nc[d];
742 switch (d)
744 case XX: max_comm_lines = overlap_x; break;
745 case YY: max_comm_lines = overlap_y; break;
746 case ZZ: max_comm_lines = pme_order - 1; break;
748 grids->nthread_comm[d] = 0;
749 while ((n[d]*grids->nthread_comm[d])/grids->nc[d] < max_comm_lines &&
750 grids->nthread_comm[d] < grids->nc[d])
752 grids->nthread_comm[d]++;
754 if (debug != nullptr)
756 fprintf(debug, "pmegrid thread grid communication range in %c: %d\n",
757 'x'+d, grids->nthread_comm[d]);
759 /* It should be possible to make grids->nthread_comm[d]==grids->nc[d]
760 * work, but this is not a problematic restriction.
762 if (grids->nc[d] > 1 && grids->nthread_comm[d] > grids->nc[d])
764 gmx_fatal(FARGS, "Too many threads for PME (%d) compared to the number of grid lines, reduce the number of threads doing PME", grids->nthread);
769 void pmegrids_destroy(pmegrids_t *grids)
771 if (grids->grid.grid != nullptr)
773 sfree_aligned(grids->grid.grid);
775 if (grids->nthread > 0)
777 sfree_aligned(grids->grid_all);
778 sfree(grids->grid_th);
780 for (int d = 0; d < DIM; d++)
782 sfree(grids->g2t[d]);
787 void
788 make_gridindex_to_localindex(int n, int local_start, int local_range,
789 int **global_to_local,
790 real **fraction_shift)
792 /* Here we construct array for looking up the grid line index and
793 * fraction for particles. This is done because it is slighlty
794 * faster than the modulo operation and to because we need to take
795 * care of rounding issues, see below.
796 * We use an array size of c_pmeNeighborUnitcellCount times the grid size
797 * to allow for particles to be out of the triclinic unit-cell.
799 const int arraySize = c_pmeNeighborUnitcellCount * n;
800 int * gtl;
801 real * fsh;
803 snew(gtl, arraySize);
804 snew(fsh, arraySize);
806 for (int i = 0; i < arraySize; i++)
808 /* Transform global grid index to the local grid index.
809 * Our local grid always runs from 0 to local_range-1.
811 gtl[i] = (i - local_start + n) % n;
812 /* For coordinates that fall within the local grid the fraction
813 * is correct, we don't need to shift it.
815 fsh[i] = 0;
816 /* Check if we are using domain decomposition for PME */
817 if (local_range < n)
819 /* Due to rounding issues i could be 1 beyond the lower or
820 * upper boundary of the local grid. Correct the index for this.
821 * If we shift the index, we need to shift the fraction by
822 * the same amount in the other direction to not affect
823 * the weights.
824 * Note that due to this shifting the weights at the end of
825 * the spline might change, but that will only involve values
826 * between zero and values close to the precision of a real,
827 * which is anyhow the accuracy of the whole mesh calculation.
829 if (gtl[i] == n - 1)
831 /* When this i is used, we should round the local index up */
832 gtl[i] = 0;
833 fsh[i] = -1;
835 else if (gtl[i] == local_range && local_range > 0)
837 /* When this i is used, we should round the local index down */
838 gtl[i] = local_range - 1;
839 fsh[i] = 1;
844 *global_to_local = gtl;
845 *fraction_shift = fsh;
848 void reuse_pmegrids(const pmegrids_t *oldgrid, pmegrids_t *newgrid)
850 int d, t;
852 for (d = 0; d < DIM; d++)
854 if (newgrid->grid.n[d] > oldgrid->grid.n[d])
856 return;
860 sfree_aligned(newgrid->grid.grid);
861 newgrid->grid.grid = oldgrid->grid.grid;
863 if (newgrid->grid_th != nullptr && newgrid->nthread == oldgrid->nthread)
865 sfree_aligned(newgrid->grid_all);
866 newgrid->grid_all = oldgrid->grid_all;
867 for (t = 0; t < newgrid->nthread; t++)
869 newgrid->grid_th[t].grid = oldgrid->grid_th[t].grid;
874 static void dump_grid(FILE *fp,
875 int sx, int sy, int sz, int nx, int ny, int nz,
876 int my, int mz, const real *g)
878 int x, y, z;
880 for (x = 0; x < nx; x++)
882 for (y = 0; y < ny; y++)
884 for (z = 0; z < nz; z++)
886 fprintf(fp, "%2d %2d %2d %6.3f\n",
887 sx+x, sy+y, sz+z, g[(x*my + y)*mz + z]);
893 /* This function is called from gmx_pme_do() only from debugging code
894 that is commented out. */
895 void dump_local_fftgrid(struct gmx_pme_t *pme, const real *fftgrid)
897 ivec local_fft_ndata, local_fft_offset, local_fft_size;
899 gmx_parallel_3dfft_real_limits(pme->pfft_setup[PME_GRID_QA],
900 local_fft_ndata,
901 local_fft_offset,
902 local_fft_size);
904 dump_grid(stderr,
905 pme->pmegrid_start_ix,
906 pme->pmegrid_start_iy,
907 pme->pmegrid_start_iz,
908 pme->pmegrid_nx-pme->pme_order+1,
909 pme->pmegrid_ny-pme->pme_order+1,
910 pme->pmegrid_nz-pme->pme_order+1,
911 local_fft_size[YY],
912 local_fft_size[ZZ],
913 fftgrid);