/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
 * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/* TODO find out what this file should be called */
#include "gmxpre.h"

#include "pme_grid.h"

#include "config.h"

#include <cstdlib>

#include "gromacs/ewald/pme.h"
#include "gromacs/fft/parallel_3dfft.h"
#include "gromacs/math/vec.h"
#include "gromacs/timing/cyclecounter.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/smalloc.h"

#include "pme_internal.h"

#ifdef DEBUG_PME
#include "gromacs/fileio/pdbio.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/futil.h"
#endif

#include "pme_simd.h"

/* GMX_CACHE_SEP should be a multiple of the SIMD and SIMD4 register size
 * to preserve alignment.
 */
#define GMX_CACHE_SEP 64
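
/* Added note: with 4-byte single-precision reals, 64 elements is 256 bytes,
 * a multiple of the 16-byte SIMD4 registers, of wider SIMD registers, and of
 * a typical 64-byte cache line, so per-thread grids separated by
 * GMX_CACHE_SEP elements cannot share a cache line or lose alignment.
 */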

void gmx_sum_qgrid_dd(gmx_pme_t *pme, real *grid, int direction)
{
#if GMX_MPI
    pme_overlap_t *overlap;
    int            send_index0, send_nindex;
    int            recv_index0, recv_nindex;
    MPI_Status     stat;
    int            i, j, k, ix, iy, iz, icnt;
    int            send_id, recv_id, datasize;
    real          *p;
    real          *sendptr, *recvptr;

    /* Start with minor-rank communication. This is a bit of a pain since it is not contiguous */
    overlap = &pme->overlap[1];

    for (size_t ipulse = 0; ipulse < overlap->comm_data.size(); ipulse++)
    {
        /* Since we have already (un)wrapped the overlap in the z-dimension,
         * we only have to communicate 0 to nkz (not pmegrid_nz).
         */
        if (direction == GMX_SUM_GRID_FORWARD)
        {
            send_id     = overlap->comm_data[ipulse].send_id;
            recv_id     = overlap->comm_data[ipulse].recv_id;
            send_index0 = overlap->comm_data[ipulse].send_index0;
            send_nindex = overlap->comm_data[ipulse].send_nindex;
            recv_index0 = overlap->comm_data[ipulse].recv_index0;
            recv_nindex = overlap->comm_data[ipulse].recv_nindex;
        }
        else
        {
            send_id     = overlap->comm_data[ipulse].recv_id;
            recv_id     = overlap->comm_data[ipulse].send_id;
            send_index0 = overlap->comm_data[ipulse].recv_index0;
            send_nindex = overlap->comm_data[ipulse].recv_nindex;
            recv_index0 = overlap->comm_data[ipulse].send_index0;
            recv_nindex = overlap->comm_data[ipulse].send_nindex;
        }

        /* Copy data to contiguous send buffer */
        if (debug)
        {
            fprintf(debug, "PME send rank %d %d -> %d grid start %d Communicating %d to %d\n",
                    pme->nodeid, overlap->nodeid, send_id,
                    pme->pmegrid_start_iy,
                    send_index0-pme->pmegrid_start_iy,
                    send_index0-pme->pmegrid_start_iy+send_nindex);
        }
        icnt = 0;
        for (i = 0; i < pme->pmegrid_nx; i++)
        {
            ix = i;
            for (j = 0; j < send_nindex; j++)
            {
                iy = j + send_index0 - pme->pmegrid_start_iy;
                for (k = 0; k < pme->nkz; k++)
                {
                    iz = k;
                    overlap->sendbuf[icnt++] = grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz];
                }
            }
        }
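
        /* Added worked example: the grid is laid out row-major as
         * grid[ix*(pmegrid_ny*pmegrid_nz) + iy*pmegrid_nz + iz], so with
         * pmegrid_ny = 4 and pmegrid_nz = 8 the element (ix=2, iy=1, iz=3)
         * sits at 2*32 + 1*8 + 3 = 75. The pack loop above walks iz fastest,
         * so each (ix, iy) pair contributes one contiguous run of nkz reals.
         */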

        datasize = pme->pmegrid_nx * pme->nkz;

        MPI_Sendrecv(overlap->sendbuf.data(), send_nindex*datasize, GMX_MPI_REAL,
                     send_id, ipulse,
                     overlap->recvbuf.data(), recv_nindex*datasize, GMX_MPI_REAL,
                     recv_id, ipulse,
                     overlap->mpi_comm, &stat);
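
        /* Added note: MPI_Sendrecv pairs the send and the matching receive in
         * one call, so two ranks exchanging overlap rows in opposite
         * directions cannot deadlock the way two blocking MPI_Send calls
         * could.
         */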

        /* Get data from contiguous recv buffer */
        if (debug)
        {
            fprintf(debug, "PME recv rank %d %d <- %d grid start %d Communicating %d to %d\n",
                    pme->nodeid, overlap->nodeid, recv_id,
                    pme->pmegrid_start_iy,
                    recv_index0-pme->pmegrid_start_iy,
                    recv_index0-pme->pmegrid_start_iy+recv_nindex);
        }
        icnt = 0;
        for (i = 0; i < pme->pmegrid_nx; i++)
        {
            ix = i;
            for (j = 0; j < recv_nindex; j++)
            {
                iy = j + recv_index0 - pme->pmegrid_start_iy;
                for (k = 0; k < pme->nkz; k++)
                {
                    iz = k;
                    if (direction == GMX_SUM_GRID_FORWARD)
                    {
                        grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] += overlap->recvbuf[icnt++];
                    }
                    else
                    {
                        grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] = overlap->recvbuf[icnt++];
                    }
                }
            }
        }
    }

    /* Major dimension is easier, no copying required,
     * but we might have to sum to separate array.
     * Since we don't copy, we have to communicate up to pmegrid_nz,
     * not nkz as for the minor direction.
     */
    overlap = &pme->overlap[0];

    for (size_t ipulse = 0; ipulse < overlap->comm_data.size(); ipulse++)
    {
        if (direction == GMX_SUM_GRID_FORWARD)
        {
            send_id     = overlap->comm_data[ipulse].send_id;
            recv_id     = overlap->comm_data[ipulse].recv_id;
            send_index0 = overlap->comm_data[ipulse].send_index0;
            send_nindex = overlap->comm_data[ipulse].send_nindex;
            recv_index0 = overlap->comm_data[ipulse].recv_index0;
            recv_nindex = overlap->comm_data[ipulse].recv_nindex;
            recvptr     = overlap->recvbuf.data();
        }
        else
        {
            send_id     = overlap->comm_data[ipulse].recv_id;
            recv_id     = overlap->comm_data[ipulse].send_id;
            send_index0 = overlap->comm_data[ipulse].recv_index0;
            send_nindex = overlap->comm_data[ipulse].recv_nindex;
            recv_index0 = overlap->comm_data[ipulse].send_index0;
            recv_nindex = overlap->comm_data[ipulse].send_nindex;
            recvptr     = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
        }

        sendptr  = grid + (send_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
        datasize = pme->pmegrid_ny * pme->pmegrid_nz;

        if (debug)
        {
            fprintf(debug, "PME send rank %d %d -> %d grid start %d Communicating %d to %d\n",
                    pme->nodeid, overlap->nodeid, send_id,
                    pme->pmegrid_start_ix,
                    send_index0-pme->pmegrid_start_ix,
                    send_index0-pme->pmegrid_start_ix+send_nindex);
            fprintf(debug, "PME recv rank %d %d <- %d grid start %d Communicating %d to %d\n",
                    pme->nodeid, overlap->nodeid, recv_id,
                    pme->pmegrid_start_ix,
                    recv_index0-pme->pmegrid_start_ix,
                    recv_index0-pme->pmegrid_start_ix+recv_nindex);
        }

        MPI_Sendrecv(sendptr, send_nindex*datasize, GMX_MPI_REAL,
                     send_id, ipulse,
                     recvptr, recv_nindex*datasize, GMX_MPI_REAL,
                     recv_id, ipulse,
                     overlap->mpi_comm, &stat);

        /* ADD data from contiguous recv buffer */
        if (direction == GMX_SUM_GRID_FORWARD)
        {
            p = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
            for (i = 0; i < recv_nindex*datasize; i++)
            {
                p[i] += overlap->recvbuf[i];
            }
        }
    }
#else
    GMX_UNUSED_VALUE(pme);
    GMX_UNUSED_VALUE(grid);
    GMX_UNUSED_VALUE(direction);

    GMX_RELEASE_ASSERT(false, "gmx_sum_qgrid_dd() should not be called without MPI");
#endif
}

int copy_pmegrid_to_fftgrid(const gmx_pme_t *pme, const real *pmegrid, real *fftgrid, int grid_index)
{
    ivec local_fft_ndata, local_fft_offset, local_fft_size;
    ivec local_pme_size;
    int  ix, iy, iz;
    int  pmeidx, fftidx;

    /* Dimensions should be identical for A/B grid, so we just use A here */
    gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
                                   local_fft_ndata,
                                   local_fft_offset,
                                   local_fft_size);

    local_pme_size[0] = pme->pmegrid_nx;
    local_pme_size[1] = pme->pmegrid_ny;
    local_pme_size[2] = pme->pmegrid_nz;

    /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
       the offset is identical, and the PME grid always has more data (due to overlap)
     */
    {
#ifdef DEBUG_PME
        FILE *fp, *fp2;
        char  fn[STRLEN];
        real  val;

        sprintf(fn, "pmegrid%d.pdb", pme->nodeid);
        fp = gmx_ffopen(fn, "w");
        sprintf(fn, "pmegrid%d.txt", pme->nodeid);
        fp2 = gmx_ffopen(fn, "w");
#endif

        for (ix = 0; ix < local_fft_ndata[XX]; ix++)
        {
            for (iy = 0; iy < local_fft_ndata[YY]; iy++)
            {
                for (iz = 0; iz < local_fft_ndata[ZZ]; iz++)
                {
                    pmeidx          = ix*(local_pme_size[YY]*local_pme_size[ZZ])+iy*(local_pme_size[ZZ])+iz;
                    fftidx          = ix*(local_fft_size[YY]*local_fft_size[ZZ])+iy*(local_fft_size[ZZ])+iz;
                    fftgrid[fftidx] = pmegrid[pmeidx];
#ifdef DEBUG_PME
                    val = 100*pmegrid[pmeidx];
                    if (pmegrid[pmeidx] != 0)
                    {
                        gmx_fprintf_pdb_atomline(fp, epdbATOM, pmeidx, "CA", ' ', "GLY", ' ', pmeidx, ' ',
                                                 5.0*ix, 5.0*iy, 5.0*iz, 1.0, val, "");
                    }
                    if (pmegrid[pmeidx] != 0)
                    {
                        fprintf(fp2, "%-12s %5d %5d %5d %12.5e\n",
                                "qgrid",
                                pme->pmegrid_start_ix + ix,
                                pme->pmegrid_start_iy + iy,
                                pme->pmegrid_start_iz + iz,
                                pmegrid[pmeidx]);
                    }
#endif
                }
            }
        }
#ifdef DEBUG_PME
        gmx_ffclose(fp);
        gmx_ffclose(fp2);
#endif
    }

    return 0;
}
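
/* Added worked example: pmeidx and fftidx above differ only in the padded
 * row lengths. If local_pme_size = {20, 20, 24} while
 * local_fft_size = {18, 18, 20}, the point (1,1,1) sits at
 * 1*(20*24) + 1*24 + 1 = 505 in the PME grid but at
 * 1*(18*20) + 1*20 + 1 = 381 in the FFT grid.
 */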

#ifdef PME_TIME_THREADS
static gmx_cycles_t omp_cyc_start()
{
    return gmx_cycles_read();
}

static gmx_cycles_t omp_cyc_end(gmx_cycles_t c)
{
    return gmx_cycles_read() - c;
}
#endif

int copy_fftgrid_to_pmegrid(struct gmx_pme_t *pme, const real *fftgrid, real *pmegrid, int grid_index,
                            int nthread, int thread)
{
    ivec local_fft_ndata, local_fft_offset, local_fft_size;
    ivec local_pme_size;
    int  ixy0, ixy1, ixy, ix, iy, iz;
    int  pmeidx, fftidx;
#ifdef PME_TIME_THREADS
    gmx_cycles_t  c1;
    static double cs1 = 0;
    static int    cnt = 0;
#endif

#ifdef PME_TIME_THREADS
    c1 = omp_cyc_start();
#endif

    /* Dimensions should be identical for A/B grid, so we just use A here */
    gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
                                   local_fft_ndata,
                                   local_fft_offset,
                                   local_fft_size);

    local_pme_size[0] = pme->pmegrid_nx;
    local_pme_size[1] = pme->pmegrid_ny;
    local_pme_size[2] = pme->pmegrid_nz;

    /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
       the offset is identical, and the PME grid always has more data (due to overlap)
     */
    ixy0 = ((thread  )*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
    ixy1 = ((thread+1)*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
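
    /* Added example: the (ix,iy) columns are divided evenly over the threads.
     * With local_fft_ndata[XX]*local_fft_ndata[YY] = 10 columns and
     * nthread = 4, thread 0 gets columns [0,2), thread 1 [2,5), thread 2
     * [5,7) and thread 3 [7,10): every column is covered exactly once with
     * no remainder handling.
     */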

    for (ixy = ixy0; ixy < ixy1; ixy++)
    {
        ix = ixy/local_fft_ndata[YY];
        iy = ixy - ix*local_fft_ndata[YY];

        pmeidx = (ix*local_pme_size[YY] + iy)*local_pme_size[ZZ];
        fftidx = (ix*local_fft_size[YY] + iy)*local_fft_size[ZZ];
        for (iz = 0; iz < local_fft_ndata[ZZ]; iz++)
        {
            pmegrid[pmeidx+iz] = fftgrid[fftidx+iz];
        }
    }

#ifdef PME_TIME_THREADS
    c1   = omp_cyc_end(c1);
    cs1 += (double)c1;
    cnt++;
    if (cnt % 20 == 0)
    {
        printf("copy %.2f\n", cs1*1e-9);
    }
#endif

    return 0;
}

void wrap_periodic_pmegrid(const gmx_pme_t *pme, real *pmegrid)
{
    int nx, ny, nz, pny, pnz, ny_x, overlap, ix, iy, iz;

    nx = pme->nkx;
    ny = pme->nky;
    nz = pme->nkz;

    pny = pme->pmegrid_ny;
    pnz = pme->pmegrid_nz;

    overlap = pme->pme_order - 1;

    /* Add periodic overlap in z */
    for (ix = 0; ix < pme->pmegrid_nx; ix++)
    {
        for (iy = 0; iy < pme->pmegrid_ny; iy++)
        {
            for (iz = 0; iz < overlap; iz++)
            {
                pmegrid[(ix*pny+iy)*pnz+iz] +=
                    pmegrid[(ix*pny+iy)*pnz+nz+iz];
            }
        }
    }
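
    /* Added explanatory note: with pme_order = 4 the overlap is 3 lines, so
     * for nz = 32 the charge spread onto the padded lines iz = 32..34 is
     * folded back onto iz = 0..2, which are exactly their periodic images.
     */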

    if (pme->nnodes_minor == 1)
    {
        for (ix = 0; ix < pme->pmegrid_nx; ix++)
        {
            for (iy = 0; iy < overlap; iy++)
            {
                for (iz = 0; iz < nz; iz++)
                {
                    pmegrid[(ix*pny+iy)*pnz+iz] +=
                        pmegrid[(ix*pny+ny+iy)*pnz+iz];
                }
            }
        }
    }

    if (pme->nnodes_major == 1)
    {
        ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);

        for (ix = 0; ix < overlap; ix++)
        {
            for (iy = 0; iy < ny_x; iy++)
            {
                for (iz = 0; iz < nz; iz++)
                {
                    pmegrid[(ix*pny+iy)*pnz+iz] +=
                        pmegrid[((nx+ix)*pny+iy)*pnz+iz];
                }
            }
        }
    }
}

void unwrap_periodic_pmegrid(struct gmx_pme_t *pme, real *pmegrid)
{
    int nx, ny, nz, pny, pnz, ny_x, overlap, ix;

    nx = pme->nkx;
    ny = pme->nky;
    nz = pme->nkz;

    pny = pme->pmegrid_ny;
    pnz = pme->pmegrid_nz;

    overlap = pme->pme_order - 1;

    if (pme->nnodes_major == 1)
    {
        ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);

        for (ix = 0; ix < overlap; ix++)
        {
            int iy, iz;

            for (iy = 0; iy < ny_x; iy++)
            {
                for (iz = 0; iz < nz; iz++)
                {
                    pmegrid[((nx+ix)*pny+iy)*pnz+iz] =
                        pmegrid[(ix*pny+iy)*pnz+iz];
                }
            }
        }
    }

    if (pme->nnodes_minor == 1)
    {
#pragma omp parallel for num_threads(pme->nthread) schedule(static)
        for (ix = 0; ix < pme->pmegrid_nx; ix++)
        {
            // Trivial OpenMP region that does not throw, no need for try/catch
            int iy, iz;

            for (iy = 0; iy < overlap; iy++)
            {
                for (iz = 0; iz < nz; iz++)
                {
                    pmegrid[(ix*pny+ny+iy)*pnz+iz] =
                        pmegrid[(ix*pny+iy)*pnz+iz];
                }
            }
        }
    }

    /* Copy periodic overlap in z */
#pragma omp parallel for num_threads(pme->nthread) schedule(static)
    for (ix = 0; ix < pme->pmegrid_nx; ix++)
    {
        // Trivial OpenMP region that does not throw, no need for try/catch
        int iy, iz;

        for (iy = 0; iy < pme->pmegrid_ny; iy++)
        {
            for (iz = 0; iz < overlap; iz++)
            {
                pmegrid[(ix*pny+iy)*pnz+nz+iz] =
                    pmegrid[(ix*pny+iy)*pnz+iz];
            }
        }
    }
}

void set_grid_alignment(int gmx_unused *pmegrid_nz, int gmx_unused pme_order)
{
#ifdef PME_SIMD4_SPREAD_GATHER
    if (pme_order == 5
#if !PME_4NSIMD_GATHER
        || pme_order == 4
#endif
        )
    {
        /* Round nz up to a multiple of 4 to ensure alignment */
        *pmegrid_nz = ((*pmegrid_nz + 3) & ~3);
    }
#endif
}
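
/* Added note: (nz + 3) & ~3 rounds up to the next multiple of 4, e.g.
 * 17 -> 20 and 18 -> 20, while 16 is left unchanged, so z-rows keep
 * 16-byte (4 x single precision) SIMD4 alignment.
 */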

static void set_gridsize_alignment(int gmx_unused *gridsize, int gmx_unused pme_order)
{
#ifdef PME_SIMD4_SPREAD_GATHER
#if !PME_4NSIMD_GATHER
    if (pme_order == 4)
    {
        /* Add extra elements to ensure aligned operations do not go
         * beyond the allocated grid size.
         * Note that for pme_order=5, the pme grid z-size alignment
         * ensures that we will not go beyond the grid size.
         */
        *gridsize += 4;
    }
#endif
#endif
}

void pmegrid_init(pmegrid_t *grid,
                  int cx, int cy, int cz,
                  int x0, int y0, int z0,
                  int x1, int y1, int z1,
                  gmx_bool set_alignment,
                  int pme_order,
                  real *ptr)
{
    int nz, gridsize;

    grid->ci[XX]     = cx;
    grid->ci[YY]     = cy;
    grid->ci[ZZ]     = cz;
    grid->offset[XX] = x0;
    grid->offset[YY] = y0;
    grid->offset[ZZ] = z0;
    grid->n[XX]      = x1 - x0 + pme_order - 1;
    grid->n[YY]      = y1 - y0 + pme_order - 1;
    grid->n[ZZ]      = z1 - z0 + pme_order - 1;
    copy_ivec(grid->n, grid->s);

    nz = grid->s[ZZ];
    set_grid_alignment(&nz, pme_order);
    if (set_alignment)
    {
        grid->s[ZZ] = nz;
    }
    else if (nz != grid->s[ZZ])
    {
        gmx_incons("pmegrid_init call with an unaligned z size");
    }

    grid->order = pme_order;
    if (ptr == nullptr)
    {
        gridsize = grid->s[XX]*grid->s[YY]*grid->s[ZZ];
        set_gridsize_alignment(&gridsize, pme_order);
        snew_aligned(grid->grid, gridsize, SIMD4_ALIGNMENT);
    }
    else
    {
        grid->grid = ptr;
    }
}

static int div_round_up(int enumerator, int denominator)
{
    return (enumerator + denominator - 1)/denominator;
}
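
/* Added example: div_round_up(10, 3) = (10 + 3 - 1)/3 = 4, i.e. integer
 * division that rounds up instead of truncating.
 */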

static void make_subgrid_division(const ivec n, int ovl, int nthread,
                                  ivec nsub)
{
    int   gsize_opt, gsize;
    int   nsx, nsy, nsz;
    char *env;

    gsize_opt = -1;
    for (nsx = 1; nsx <= nthread; nsx++)
    {
        if (nthread % nsx == 0)
        {
            for (nsy = 1; nsy <= nthread; nsy++)
            {
                if (nsx*nsy <= nthread && nthread % (nsx*nsy) == 0)
                {
                    nsz = nthread/(nsx*nsy);

                    /* Determine the number of grid points per thread */
                    gsize =
                        (div_round_up(n[XX], nsx) + ovl)*
                        (div_round_up(n[YY], nsy) + ovl)*
                        (div_round_up(n[ZZ], nsz) + ovl);

                    /* Minimize the number of grid points per thread
                     * and, secondarily, the number of cuts in minor dimensions.
                     */
                    if (gsize_opt == -1 ||
                        gsize < gsize_opt ||
                        (gsize == gsize_opt &&
                         (nsz < nsub[ZZ] || (nsz == nsub[ZZ] && nsy < nsub[YY]))))
                    {
                        nsub[XX]  = nsx;
                        nsub[YY]  = nsy;
                        nsub[ZZ]  = nsz;
                        gsize_opt = gsize;
                    }
                }
            }
        }
    }

    env = getenv("GMX_PME_THREAD_DIVISION");
    if (env != nullptr)
    {
        sscanf(env, "%20d %20d %20d", &nsub[XX], &nsub[YY], &nsub[ZZ]);
    }
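
    /* Added usage note: the division can be overridden from the environment,
     * e.g. GMX_PME_THREAD_DIVISION="2 2 1" forces a 2 x 2 x 1 subgrid
     * division for 4 PME threads; the product must equal nthread or the
     * check below aborts with gmx_fatal.
     */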

    if (nsub[XX]*nsub[YY]*nsub[ZZ] != nthread)
    {
        gmx_fatal(FARGS, "PME grid thread division (%d x %d x %d) does not match the total number of threads (%d)",
                  nsub[XX], nsub[YY], nsub[ZZ], nthread);
    }
}

void pmegrids_init(pmegrids_t *grids,
                   int nx, int ny, int nz, int nz_base,
                   int pme_order,
                   gmx_bool bUseThreads,
                   int nthread,
                   int overlap_x,
                   int overlap_y)
{
    ivec n, n_base;
    int  t, x, y, z, d, i, tfac;
    int  max_comm_lines = -1;

    n[XX] = nx - (pme_order - 1);
    n[YY] = ny - (pme_order - 1);
    n[ZZ] = nz - (pme_order - 1);

    copy_ivec(n, n_base);
    n_base[ZZ] = nz_base;

    pmegrid_init(&grids->grid, 0, 0, 0, 0, 0, 0, n[XX], n[YY], n[ZZ], FALSE, pme_order,
                 nullptr);

    grids->nthread = nthread;

    make_subgrid_division(n_base, pme_order-1, grids->nthread, grids->nc);

    if (bUseThreads)
    {
        ivec nst;
        int  gridsize;

        for (d = 0; d < DIM; d++)
        {
            nst[d] = div_round_up(n[d], grids->nc[d]) + pme_order - 1;
        }
        set_grid_alignment(&nst[ZZ], pme_order);

        if (debug)
        {
            fprintf(debug, "pmegrid thread local division: %d x %d x %d\n",
                    grids->nc[XX], grids->nc[YY], grids->nc[ZZ]);
            fprintf(debug, "pmegrid %d %d %d max thread pmegrid %d %d %d\n",
                    nx, ny, nz,
                    nst[XX], nst[YY], nst[ZZ]);
        }

        snew(grids->grid_th, grids->nthread);
        t        = 0;
        gridsize = nst[XX]*nst[YY]*nst[ZZ];
        set_gridsize_alignment(&gridsize, pme_order);
        snew_aligned(grids->grid_all,
                     grids->nthread*gridsize + (grids->nthread + 1)*GMX_CACHE_SEP,
                     SIMD4_ALIGNMENT);

        for (x = 0; x < grids->nc[XX]; x++)
        {
            for (y = 0; y < grids->nc[YY]; y++)
            {
                for (z = 0; z < grids->nc[ZZ]; z++)
                {
                    pmegrid_init(&grids->grid_th[t],
                                 x, y, z,
                                 (n[XX]*(x  ))/grids->nc[XX],
                                 (n[YY]*(y  ))/grids->nc[YY],
                                 (n[ZZ]*(z  ))/grids->nc[ZZ],
                                 (n[XX]*(x+1))/grids->nc[XX],
                                 (n[YY]*(y+1))/grids->nc[YY],
                                 (n[ZZ]*(z+1))/grids->nc[ZZ],
                                 TRUE,
                                 pme_order,
                                 grids->grid_all+GMX_CACHE_SEP+t*(gridsize+GMX_CACHE_SEP));
                    t++;
                }
            }
        }
    }
    else
    {
        grids->grid_th = nullptr;
    }

    tfac = 1;
    for (d = DIM-1; d >= 0; d--)
    {
        snew(grids->g2t[d], n[d]);
        t = 0;
        for (i = 0; i < n[d]; i++)
        {
            /* The second check should match the parameters
             * of the pmegrid_init call above.
             */
            while (t + 1 < grids->nc[d] && i >= (n[d]*(t+1))/grids->nc[d])
            {
                t++;
            }
            grids->g2t[d][i] = t*tfac;
        }

        tfac *= grids->nc[d];
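
        /* Added note: g2t[d][i] stores the thread coordinate along d scaled
         * by the product of the cell counts of the dimensions after d, so
         * summing g2t[XX][ix] + g2t[YY][iy] + g2t[ZZ][iz] yields the flat
         * thread index. E.g. with nc = {2, 2, 2}, tfac is 4 for XX, 2 for YY
         * and 1 for ZZ.
         */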

        switch (d)
        {
            case XX: max_comm_lines = overlap_x;     break;
            case YY: max_comm_lines = overlap_y;     break;
            case ZZ: max_comm_lines = pme_order - 1; break;
        }
        grids->nthread_comm[d] = 0;
        while ((n[d]*grids->nthread_comm[d])/grids->nc[d] < max_comm_lines &&
               grids->nthread_comm[d] < grids->nc[d])
        {
            grids->nthread_comm[d]++;
        }
        if (debug != nullptr)
        {
            fprintf(debug, "pmegrid thread grid communication range in %c: %d\n",
                    'x'+d, grids->nthread_comm[d]);
        }
        /* It should be possible to make grids->nthread_comm[d]==grids->nc[d]
         * work, but this is not a problematic restriction.
         */
        if (grids->nc[d] > 1 && grids->nthread_comm[d] > grids->nc[d])
        {
            gmx_fatal(FARGS, "Too many threads for PME (%d) compared to the number of grid lines, reduce the number of threads doing PME",
                      grids->nthread);
        }
    }
}

void pmegrids_destroy(pmegrids_t *grids)
{
    if (grids->grid.grid != nullptr)
    {
        sfree_aligned(grids->grid.grid);

        if (grids->nthread > 0)
        {
            sfree_aligned(grids->grid_all);
            sfree(grids->grid_th);
        }
        for (int d = 0; d < DIM; d++)
        {
            sfree(grids->g2t[d]);
        }
    }
}

void
make_gridindex_to_localindex(int n, int local_start, int local_range,
                             int **global_to_local,
                             real **fraction_shift)
{
    /* Here we construct array for looking up the grid line index and
     * fraction for particles. This is done because it is slightly
     * faster than the modulo operation and because we need to take
     * care of rounding issues, see below.
     * We use an array size of c_pmeNeighborUnitcellCount times the grid size
     * to allow for particles to be out of the triclinic unit-cell.
     */
    const int arraySize = c_pmeNeighborUnitcellCount * n;
    int      *gtl;
    real     *fsh;

    snew(gtl, arraySize);
    snew(fsh, arraySize);

    for (int i = 0; i < arraySize; i++)
    {
        /* Transform global grid index to the local grid index.
         * Our local grid always runs from 0 to local_range-1.
         */
        gtl[i] = (i - local_start + n) % n;
        /* For coordinates that fall within the local grid the fraction
         * is correct, we don't need to shift it.
         */
        fsh[i] = 0;
        /* Check if we are using domain decomposition for PME */
        if (local_range < n)
        {
            /* Due to rounding issues i could be 1 beyond the lower or
             * upper boundary of the local grid. Correct the index for this.
             * If we shift the index, we need to shift the fraction by
             * the same amount in the other direction to not affect
             * the weights.
             * Note that due to this shifting the weights at the end of
             * the spline might change, but that will only involve values
             * between zero and values close to the precision of a real,
             * which is anyhow the accuracy of the whole mesh calculation.
             */
            if (gtl[i] == n - 1)
            {
                /* When this i is used, we should round the local index up */
                gtl[i] = 0;
                fsh[i] = -1;
            }
            else if (gtl[i] == local_range && local_range > 0)
            {
                /* When this i is used, we should round the local index down */
                gtl[i] = local_range - 1;
                fsh[i] = +1;
            }
        }
    }

    *global_to_local = gtl;
    *fraction_shift  = fsh;
}
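
/* Added worked example: with n = 30, local_start = 10 and local_range = 10
 * (grid lines 10..19 owned locally), index i = 25 maps to
 * (25 - 10 + 30) % 30 = 15, inside the local grid. A rounding overshoot to
 * i = 20 gives local index 10 == local_range, which is pulled back to 9
 * with fraction_shift = +1 so the spline weights stay consistent.
 */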

void reuse_pmegrids(const pmegrids_t *oldgrid, pmegrids_t *newgrid)
{
    int d, t;

    for (d = 0; d < DIM; d++)
    {
        if (newgrid->grid.n[d] > oldgrid->grid.n[d])
        {
            return;
        }
    }

    sfree_aligned(newgrid->grid.grid);
    newgrid->grid.grid = oldgrid->grid.grid;

    if (newgrid->grid_th != nullptr && newgrid->nthread == oldgrid->nthread)
    {
        sfree_aligned(newgrid->grid_all);
        newgrid->grid_all = oldgrid->grid_all;
        for (t = 0; t < newgrid->nthread; t++)
        {
            newgrid->grid_th[t].grid = oldgrid->grid_th[t].grid;
        }
    }
}