2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
37 /* TODO find out what this file should be called */
46 #include "gromacs/ewald/pme.h"
47 #include "gromacs/fft/parallel_3dfft.h"
48 #include "gromacs/math/vec.h"
49 #include "gromacs/timing/cyclecounter.h"
50 #include "gromacs/utility/fatalerror.h"
51 #include "gromacs/utility/smalloc.h"
53 #include "pme-internal.h"
56 #include "gromacs/fileio/pdbio.h"
57 #include "gromacs/utility/cstringutil.h"
58 #include "gromacs/utility/futil.h"
63 /* GMX_CACHE_SEP should be a multiple of the SIMD and SIMD4 register size
64 * to preserve alignment.
66 #define GMX_CACHE_SEP 64
69 void gmx_sum_qgrid_dd(struct gmx_pme_t
*pme
, real
*grid
, int direction
)
71 pme_overlap_t
*overlap
;
72 int send_index0
, send_nindex
;
73 int recv_index0
, recv_nindex
;
75 int i
, j
, k
, ix
, iy
, iz
, icnt
;
76 int send_id
, recv_id
, datasize
;
78 real
*sendptr
, *recvptr
;
80 /* Start with minor-rank communication. This is a bit of a pain since it is not contiguous */
81 overlap
= &pme
->overlap
[1];
83 for (size_t ipulse
= 0; ipulse
< overlap
->comm_data
.size(); ipulse
++)
85 /* Since we have already (un)wrapped the overlap in the z-dimension,
86 * we only have to communicate 0 to nkz (not pmegrid_nz).
88 if (direction
== GMX_SUM_GRID_FORWARD
)
90 send_id
= overlap
->comm_data
[ipulse
].send_id
;
91 recv_id
= overlap
->comm_data
[ipulse
].recv_id
;
92 send_index0
= overlap
->comm_data
[ipulse
].send_index0
;
93 send_nindex
= overlap
->comm_data
[ipulse
].send_nindex
;
94 recv_index0
= overlap
->comm_data
[ipulse
].recv_index0
;
95 recv_nindex
= overlap
->comm_data
[ipulse
].recv_nindex
;
99 send_id
= overlap
->comm_data
[ipulse
].recv_id
;
100 recv_id
= overlap
->comm_data
[ipulse
].send_id
;
101 send_index0
= overlap
->comm_data
[ipulse
].recv_index0
;
102 send_nindex
= overlap
->comm_data
[ipulse
].recv_nindex
;
103 recv_index0
= overlap
->comm_data
[ipulse
].send_index0
;
104 recv_nindex
= overlap
->comm_data
[ipulse
].send_nindex
;
107 /* Copy data to contiguous send buffer */
110 fprintf(debug
, "PME send rank %d %d -> %d grid start %d Communicating %d to %d\n",
111 pme
->nodeid
, overlap
->nodeid
, send_id
,
112 pme
->pmegrid_start_iy
,
113 send_index0
-pme
->pmegrid_start_iy
,
114 send_index0
-pme
->pmegrid_start_iy
+send_nindex
);
117 for (i
= 0; i
< pme
->pmegrid_nx
; i
++)
120 for (j
= 0; j
< send_nindex
; j
++)
122 iy
= j
+ send_index0
- pme
->pmegrid_start_iy
;
123 for (k
= 0; k
< pme
->nkz
; k
++)
126 overlap
->sendbuf
[icnt
++] = grid
[ix
*(pme
->pmegrid_ny
*pme
->pmegrid_nz
)+iy
*(pme
->pmegrid_nz
)+iz
];
131 datasize
= pme
->pmegrid_nx
* pme
->nkz
;
133 MPI_Sendrecv(overlap
->sendbuf
.data(), send_nindex
*datasize
, GMX_MPI_REAL
,
135 overlap
->recvbuf
.data(), recv_nindex
*datasize
, GMX_MPI_REAL
,
137 overlap
->mpi_comm
, &stat
);
139 /* Get data from contiguous recv buffer */
142 fprintf(debug
, "PME recv rank %d %d <- %d grid start %d Communicating %d to %d\n",
143 pme
->nodeid
, overlap
->nodeid
, recv_id
,
144 pme
->pmegrid_start_iy
,
145 recv_index0
-pme
->pmegrid_start_iy
,
146 recv_index0
-pme
->pmegrid_start_iy
+recv_nindex
);
149 for (i
= 0; i
< pme
->pmegrid_nx
; i
++)
152 for (j
= 0; j
< recv_nindex
; j
++)
154 iy
= j
+ recv_index0
- pme
->pmegrid_start_iy
;
155 for (k
= 0; k
< pme
->nkz
; k
++)
158 if (direction
== GMX_SUM_GRID_FORWARD
)
160 grid
[ix
*(pme
->pmegrid_ny
*pme
->pmegrid_nz
)+iy
*(pme
->pmegrid_nz
)+iz
] += overlap
->recvbuf
[icnt
++];
164 grid
[ix
*(pme
->pmegrid_ny
*pme
->pmegrid_nz
)+iy
*(pme
->pmegrid_nz
)+iz
] = overlap
->recvbuf
[icnt
++];
171 /* Major dimension is easier, no copying required,
172 * but we might have to sum to separate array.
173 * Since we don't copy, we have to communicate up to pmegrid_nz,
174 * not nkz as for the minor direction.
176 overlap
= &pme
->overlap
[0];
178 for (size_t ipulse
= 0; ipulse
< overlap
->comm_data
.size(); ipulse
++)
180 if (direction
== GMX_SUM_GRID_FORWARD
)
182 send_id
= overlap
->comm_data
[ipulse
].send_id
;
183 recv_id
= overlap
->comm_data
[ipulse
].recv_id
;
184 send_index0
= overlap
->comm_data
[ipulse
].send_index0
;
185 send_nindex
= overlap
->comm_data
[ipulse
].send_nindex
;
186 recv_index0
= overlap
->comm_data
[ipulse
].recv_index0
;
187 recv_nindex
= overlap
->comm_data
[ipulse
].recv_nindex
;
188 recvptr
= overlap
->recvbuf
.data();
192 send_id
= overlap
->comm_data
[ipulse
].recv_id
;
193 recv_id
= overlap
->comm_data
[ipulse
].send_id
;
194 send_index0
= overlap
->comm_data
[ipulse
].recv_index0
;
195 send_nindex
= overlap
->comm_data
[ipulse
].recv_nindex
;
196 recv_index0
= overlap
->comm_data
[ipulse
].send_index0
;
197 recv_nindex
= overlap
->comm_data
[ipulse
].send_nindex
;
198 recvptr
= grid
+ (recv_index0
-pme
->pmegrid_start_ix
)*(pme
->pmegrid_ny
*pme
->pmegrid_nz
);
201 sendptr
= grid
+ (send_index0
-pme
->pmegrid_start_ix
)*(pme
->pmegrid_ny
*pme
->pmegrid_nz
);
202 datasize
= pme
->pmegrid_ny
* pme
->pmegrid_nz
;
206 fprintf(debug
, "PME send rank %d %d -> %d grid start %d Communicating %d to %d\n",
207 pme
->nodeid
, overlap
->nodeid
, send_id
,
208 pme
->pmegrid_start_ix
,
209 send_index0
-pme
->pmegrid_start_ix
,
210 send_index0
-pme
->pmegrid_start_ix
+send_nindex
);
211 fprintf(debug
, "PME recv rank %d %d <- %d grid start %d Communicating %d to %d\n",
212 pme
->nodeid
, overlap
->nodeid
, recv_id
,
213 pme
->pmegrid_start_ix
,
214 recv_index0
-pme
->pmegrid_start_ix
,
215 recv_index0
-pme
->pmegrid_start_ix
+recv_nindex
);
218 MPI_Sendrecv(sendptr
, send_nindex
*datasize
, GMX_MPI_REAL
,
220 recvptr
, recv_nindex
*datasize
, GMX_MPI_REAL
,
222 overlap
->mpi_comm
, &stat
);
224 /* ADD data from contiguous recv buffer */
225 if (direction
== GMX_SUM_GRID_FORWARD
)
227 p
= grid
+ (recv_index0
-pme
->pmegrid_start_ix
)*(pme
->pmegrid_ny
*pme
->pmegrid_nz
);
228 for (i
= 0; i
< recv_nindex
*datasize
; i
++)
230 p
[i
] += overlap
->recvbuf
[i
];
238 int copy_pmegrid_to_fftgrid(const gmx_pme_t
*pme
, const real
*pmegrid
, real
*fftgrid
, int grid_index
)
240 ivec local_fft_ndata
, local_fft_offset
, local_fft_size
;
245 /* Dimensions should be identical for A/B grid, so we just use A here */
246 gmx_parallel_3dfft_real_limits(pme
->pfft_setup
[grid_index
],
251 local_pme_size
[0] = pme
->pmegrid_nx
;
252 local_pme_size
[1] = pme
->pmegrid_ny
;
253 local_pme_size
[2] = pme
->pmegrid_nz
;
255 /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
256 the offset is identical, and the PME grid always has more data (due to overlap)
263 sprintf(fn
, "pmegrid%d.pdb", pme
->nodeid
);
264 fp
= gmx_ffopen(fn
, "w");
265 sprintf(fn
, "pmegrid%d.txt", pme
->nodeid
);
266 fp2
= gmx_ffopen(fn
, "w");
269 for (ix
= 0; ix
< local_fft_ndata
[XX
]; ix
++)
271 for (iy
= 0; iy
< local_fft_ndata
[YY
]; iy
++)
273 for (iz
= 0; iz
< local_fft_ndata
[ZZ
]; iz
++)
275 pmeidx
= ix
*(local_pme_size
[YY
]*local_pme_size
[ZZ
])+iy
*(local_pme_size
[ZZ
])+iz
;
276 fftidx
= ix
*(local_fft_size
[YY
]*local_fft_size
[ZZ
])+iy
*(local_fft_size
[ZZ
])+iz
;
277 fftgrid
[fftidx
] = pmegrid
[pmeidx
];
279 val
= 100*pmegrid
[pmeidx
];
280 if (pmegrid
[pmeidx
] != 0)
282 gmx_fprintf_pdb_atomline(fp
, epdbATOM
, pmeidx
, "CA", ' ', "GLY", ' ', pmeidx
, ' ',
283 5.0*ix
, 5.0*iy
, 5.0*iz
, 1.0, val
, "");
285 if (pmegrid
[pmeidx
] != 0)
287 fprintf(fp2
, "%-12s %5d %5d %5d %12.5e\n",
289 pme
->pmegrid_start_ix
+ ix
,
290 pme
->pmegrid_start_iy
+ iy
,
291 pme
->pmegrid_start_iz
+ iz
,
307 #ifdef PME_TIME_THREADS
308 static gmx_cycles_t
omp_cyc_start()
310 return gmx_cycles_read();
313 static gmx_cycles_t
omp_cyc_end(gmx_cycles_t c
)
315 return gmx_cycles_read() - c
;
320 int copy_fftgrid_to_pmegrid(struct gmx_pme_t
*pme
, const real
*fftgrid
, real
*pmegrid
, int grid_index
,
321 int nthread
, int thread
)
323 ivec local_fft_ndata
, local_fft_offset
, local_fft_size
;
325 int ixy0
, ixy1
, ixy
, ix
, iy
, iz
;
327 #ifdef PME_TIME_THREADS
329 static double cs1
= 0;
333 #ifdef PME_TIME_THREADS
334 c1
= omp_cyc_start();
336 /* Dimensions should be identical for A/B grid, so we just use A here */
337 gmx_parallel_3dfft_real_limits(pme
->pfft_setup
[grid_index
],
342 local_pme_size
[0] = pme
->pmegrid_nx
;
343 local_pme_size
[1] = pme
->pmegrid_ny
;
344 local_pme_size
[2] = pme
->pmegrid_nz
;
346 /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
347 the offset is identical, and the PME grid always has more data (due to overlap)
349 ixy0
= ((thread
)*local_fft_ndata
[XX
]*local_fft_ndata
[YY
])/nthread
;
350 ixy1
= ((thread
+1)*local_fft_ndata
[XX
]*local_fft_ndata
[YY
])/nthread
;
352 for (ixy
= ixy0
; ixy
< ixy1
; ixy
++)
354 ix
= ixy
/local_fft_ndata
[YY
];
355 iy
= ixy
- ix
*local_fft_ndata
[YY
];
357 pmeidx
= (ix
*local_pme_size
[YY
] + iy
)*local_pme_size
[ZZ
];
358 fftidx
= (ix
*local_fft_size
[YY
] + iy
)*local_fft_size
[ZZ
];
359 for (iz
= 0; iz
< local_fft_ndata
[ZZ
]; iz
++)
361 pmegrid
[pmeidx
+iz
] = fftgrid
[fftidx
+iz
];
365 #ifdef PME_TIME_THREADS
366 c1
= omp_cyc_end(c1
);
371 printf("copy %.2f\n", cs1
*1e-9);
379 void wrap_periodic_pmegrid(const gmx_pme_t
*pme
, real
*pmegrid
)
381 int nx
, ny
, nz
, pny
, pnz
, ny_x
, overlap
, ix
, iy
, iz
;
387 pny
= pme
->pmegrid_ny
;
388 pnz
= pme
->pmegrid_nz
;
390 overlap
= pme
->pme_order
- 1;
392 /* Add periodic overlap in z */
393 for (ix
= 0; ix
< pme
->pmegrid_nx
; ix
++)
395 for (iy
= 0; iy
< pme
->pmegrid_ny
; iy
++)
397 for (iz
= 0; iz
< overlap
; iz
++)
399 pmegrid
[(ix
*pny
+iy
)*pnz
+iz
] +=
400 pmegrid
[(ix
*pny
+iy
)*pnz
+nz
+iz
];
405 if (pme
->nnodes_minor
== 1)
407 for (ix
= 0; ix
< pme
->pmegrid_nx
; ix
++)
409 for (iy
= 0; iy
< overlap
; iy
++)
411 for (iz
= 0; iz
< nz
; iz
++)
413 pmegrid
[(ix
*pny
+iy
)*pnz
+iz
] +=
414 pmegrid
[(ix
*pny
+ny
+iy
)*pnz
+iz
];
420 if (pme
->nnodes_major
== 1)
422 ny_x
= (pme
->nnodes_minor
== 1 ? ny
: pme
->pmegrid_ny
);
424 for (ix
= 0; ix
< overlap
; ix
++)
426 for (iy
= 0; iy
< ny_x
; iy
++)
428 for (iz
= 0; iz
< nz
; iz
++)
430 pmegrid
[(ix
*pny
+iy
)*pnz
+iz
] +=
431 pmegrid
[((nx
+ix
)*pny
+iy
)*pnz
+iz
];
439 void unwrap_periodic_pmegrid(struct gmx_pme_t
*pme
, real
*pmegrid
)
441 int nx
, ny
, nz
, pny
, pnz
, ny_x
, overlap
, ix
;
447 pny
= pme
->pmegrid_ny
;
448 pnz
= pme
->pmegrid_nz
;
450 overlap
= pme
->pme_order
- 1;
452 if (pme
->nnodes_major
== 1)
454 ny_x
= (pme
->nnodes_minor
== 1 ? ny
: pme
->pmegrid_ny
);
456 for (ix
= 0; ix
< overlap
; ix
++)
460 for (iy
= 0; iy
< ny_x
; iy
++)
462 for (iz
= 0; iz
< nz
; iz
++)
464 pmegrid
[((nx
+ix
)*pny
+iy
)*pnz
+iz
] =
465 pmegrid
[(ix
*pny
+iy
)*pnz
+iz
];
471 if (pme
->nnodes_minor
== 1)
473 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
474 for (ix
= 0; ix
< pme
->pmegrid_nx
; ix
++)
476 // Trivial OpenMP region that does not throw, no need for try/catch
479 for (iy
= 0; iy
< overlap
; iy
++)
481 for (iz
= 0; iz
< nz
; iz
++)
483 pmegrid
[(ix
*pny
+ny
+iy
)*pnz
+iz
] =
484 pmegrid
[(ix
*pny
+iy
)*pnz
+iz
];
490 /* Copy periodic overlap in z */
491 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
492 for (ix
= 0; ix
< pme
->pmegrid_nx
; ix
++)
494 // Trivial OpenMP region that does not throw, no need for try/catch
497 for (iy
= 0; iy
< pme
->pmegrid_ny
; iy
++)
499 for (iz
= 0; iz
< overlap
; iz
++)
501 pmegrid
[(ix
*pny
+iy
)*pnz
+nz
+iz
] =
502 pmegrid
[(ix
*pny
+iy
)*pnz
+iz
];
508 void set_grid_alignment(int gmx_unused
*pmegrid_nz
, int gmx_unused pme_order
)
510 #ifdef PME_SIMD4_SPREAD_GATHER
512 #if !PME_4NSIMD_GATHER
517 /* Round nz up to a multiple of 4 to ensure alignment */
518 *pmegrid_nz
= ((*pmegrid_nz
+ 3) & ~3);
523 static void set_gridsize_alignment(int gmx_unused
*gridsize
, int gmx_unused pme_order
)
525 #ifdef PME_SIMD4_SPREAD_GATHER
526 #if !PME_4NSIMD_GATHER
529 /* Add extra elements to ensured aligned operations do not go
530 * beyond the allocated grid size.
531 * Note that for pme_order=5, the pme grid z-size alignment
532 * ensures that we will not go beyond the grid size.
540 void pmegrid_init(pmegrid_t
*grid
,
541 int cx
, int cy
, int cz
,
542 int x0
, int y0
, int z0
,
543 int x1
, int y1
, int z1
,
544 gmx_bool set_alignment
,
553 grid
->offset
[XX
] = x0
;
554 grid
->offset
[YY
] = y0
;
555 grid
->offset
[ZZ
] = z0
;
556 grid
->n
[XX
] = x1
- x0
+ pme_order
- 1;
557 grid
->n
[YY
] = y1
- y0
+ pme_order
- 1;
558 grid
->n
[ZZ
] = z1
- z0
+ pme_order
- 1;
559 copy_ivec(grid
->n
, grid
->s
);
562 set_grid_alignment(&nz
, pme_order
);
567 else if (nz
!= grid
->s
[ZZ
])
569 gmx_incons("pmegrid_init call with an unaligned z size");
572 grid
->order
= pme_order
;
575 gridsize
= grid
->s
[XX
]*grid
->s
[YY
]*grid
->s
[ZZ
];
576 set_gridsize_alignment(&gridsize
, pme_order
);
577 snew_aligned(grid
->grid
, gridsize
, SIMD4_ALIGNMENT
);
585 static int div_round_up(int enumerator
, int denominator
)
587 return (enumerator
+ denominator
- 1)/denominator
;
590 static void make_subgrid_division(const ivec n
, int ovl
, int nthread
,
593 int gsize_opt
, gsize
;
598 for (nsx
= 1; nsx
<= nthread
; nsx
++)
600 if (nthread
% nsx
== 0)
602 for (nsy
= 1; nsy
<= nthread
; nsy
++)
604 if (nsx
*nsy
<= nthread
&& nthread
% (nsx
*nsy
) == 0)
606 nsz
= nthread
/(nsx
*nsy
);
608 /* Determine the number of grid points per thread */
610 (div_round_up(n
[XX
], nsx
) + ovl
)*
611 (div_round_up(n
[YY
], nsy
) + ovl
)*
612 (div_round_up(n
[ZZ
], nsz
) + ovl
);
614 /* Minimize the number of grids points per thread
615 * and, secondarily, the number of cuts in minor dimensions.
617 if (gsize_opt
== -1 ||
619 (gsize
== gsize_opt
&&
620 (nsz
< nsub
[ZZ
] || (nsz
== nsub
[ZZ
] && nsy
< nsub
[YY
]))))
632 env
= getenv("GMX_PME_THREAD_DIVISION");
635 sscanf(env
, "%20d %20d %20d", &nsub
[XX
], &nsub
[YY
], &nsub
[ZZ
]);
638 if (nsub
[XX
]*nsub
[YY
]*nsub
[ZZ
] != nthread
)
640 gmx_fatal(FARGS
, "PME grid thread division (%d x %d x %d) does not match the total number of threads (%d)", nsub
[XX
], nsub
[YY
], nsub
[ZZ
], nthread
);
644 void pmegrids_init(pmegrids_t
*grids
,
645 int nx
, int ny
, int nz
, int nz_base
,
647 gmx_bool bUseThreads
,
653 int t
, x
, y
, z
, d
, i
, tfac
;
654 int max_comm_lines
= -1;
656 n
[XX
] = nx
- (pme_order
- 1);
657 n
[YY
] = ny
- (pme_order
- 1);
658 n
[ZZ
] = nz
- (pme_order
- 1);
660 copy_ivec(n
, n_base
);
661 n_base
[ZZ
] = nz_base
;
663 pmegrid_init(&grids
->grid
, 0, 0, 0, 0, 0, 0, n
[XX
], n
[YY
], n
[ZZ
], FALSE
, pme_order
,
666 grids
->nthread
= nthread
;
668 make_subgrid_division(n_base
, pme_order
-1, grids
->nthread
, grids
->nc
);
675 for (d
= 0; d
< DIM
; d
++)
677 nst
[d
] = div_round_up(n
[d
], grids
->nc
[d
]) + pme_order
- 1;
679 set_grid_alignment(&nst
[ZZ
], pme_order
);
683 fprintf(debug
, "pmegrid thread local division: %d x %d x %d\n",
684 grids
->nc
[XX
], grids
->nc
[YY
], grids
->nc
[ZZ
]);
685 fprintf(debug
, "pmegrid %d %d %d max thread pmegrid %d %d %d\n",
687 nst
[XX
], nst
[YY
], nst
[ZZ
]);
690 snew(grids
->grid_th
, grids
->nthread
);
692 gridsize
= nst
[XX
]*nst
[YY
]*nst
[ZZ
];
693 set_gridsize_alignment(&gridsize
, pme_order
);
694 snew_aligned(grids
->grid_all
,
695 grids
->nthread
*gridsize
+(grids
->nthread
+1)*GMX_CACHE_SEP
,
698 for (x
= 0; x
< grids
->nc
[XX
]; x
++)
700 for (y
= 0; y
< grids
->nc
[YY
]; y
++)
702 for (z
= 0; z
< grids
->nc
[ZZ
]; z
++)
704 pmegrid_init(&grids
->grid_th
[t
],
706 (n
[XX
]*(x
))/grids
->nc
[XX
],
707 (n
[YY
]*(y
))/grids
->nc
[YY
],
708 (n
[ZZ
]*(z
))/grids
->nc
[ZZ
],
709 (n
[XX
]*(x
+1))/grids
->nc
[XX
],
710 (n
[YY
]*(y
+1))/grids
->nc
[YY
],
711 (n
[ZZ
]*(z
+1))/grids
->nc
[ZZ
],
714 grids
->grid_all
+GMX_CACHE_SEP
+t
*(gridsize
+GMX_CACHE_SEP
));
722 grids
->grid_th
= nullptr;
726 for (d
= DIM
-1; d
>= 0; d
--)
728 snew(grids
->g2t
[d
], n
[d
]);
730 for (i
= 0; i
< n
[d
]; i
++)
732 /* The second check should match the parameters
733 * of the pmegrid_init call above.
735 while (t
+ 1 < grids
->nc
[d
] && i
>= (n
[d
]*(t
+1))/grids
->nc
[d
])
739 grids
->g2t
[d
][i
] = t
*tfac
;
742 tfac
*= grids
->nc
[d
];
746 case XX
: max_comm_lines
= overlap_x
; break;
747 case YY
: max_comm_lines
= overlap_y
; break;
748 case ZZ
: max_comm_lines
= pme_order
- 1; break;
750 grids
->nthread_comm
[d
] = 0;
751 while ((n
[d
]*grids
->nthread_comm
[d
])/grids
->nc
[d
] < max_comm_lines
&&
752 grids
->nthread_comm
[d
] < grids
->nc
[d
])
754 grids
->nthread_comm
[d
]++;
756 if (debug
!= nullptr)
758 fprintf(debug
, "pmegrid thread grid communication range in %c: %d\n",
759 'x'+d
, grids
->nthread_comm
[d
]);
761 /* It should be possible to make grids->nthread_comm[d]==grids->nc[d]
762 * work, but this is not a problematic restriction.
764 if (grids
->nc
[d
] > 1 && grids
->nthread_comm
[d
] > grids
->nc
[d
])
766 gmx_fatal(FARGS
, "Too many threads for PME (%d) compared to the number of grid lines, reduce the number of threads doing PME", grids
->nthread
);
771 void pmegrids_destroy(pmegrids_t
*grids
)
773 if (grids
->grid
.grid
!= nullptr)
775 sfree_aligned(grids
->grid
.grid
);
777 if (grids
->nthread
> 0)
779 sfree_aligned(grids
->grid_all
);
780 sfree(grids
->grid_th
);
782 for (int d
= 0; d
< DIM
; d
++)
784 sfree(grids
->g2t
[d
]);
790 make_gridindex_to_localindex(int n
, int local_start
, int local_range
,
791 int **global_to_local
,
792 real
**fraction_shift
)
794 /* Here we construct array for looking up the grid line index and
795 * fraction for particles. This is done because it is slighlty
796 * faster than the modulo operation and to because we need to take
797 * care of rounding issues, see below.
798 * We use an array size of c_pmeNeighborUnitcellCount times the grid size
799 * to allow for particles to be out of the triclinic unit-cell.
801 const int arraySize
= c_pmeNeighborUnitcellCount
* n
;
805 snew(gtl
, arraySize
);
806 snew(fsh
, arraySize
);
808 for (int i
= 0; i
< arraySize
; i
++)
810 /* Transform global grid index to the local grid index.
811 * Our local grid always runs from 0 to local_range-1.
813 gtl
[i
] = (i
- local_start
+ n
) % n
;
814 /* For coordinates that fall within the local grid the fraction
815 * is correct, we don't need to shift it.
818 /* Check if we are using domain decomposition for PME */
821 /* Due to rounding issues i could be 1 beyond the lower or
822 * upper boundary of the local grid. Correct the index for this.
823 * If we shift the index, we need to shift the fraction by
824 * the same amount in the other direction to not affect
826 * Note that due to this shifting the weights at the end of
827 * the spline might change, but that will only involve values
828 * between zero and values close to the precision of a real,
829 * which is anyhow the accuracy of the whole mesh calculation.
833 /* When this i is used, we should round the local index up */
837 else if (gtl
[i
] == local_range
&& local_range
> 0)
839 /* When this i is used, we should round the local index down */
840 gtl
[i
] = local_range
- 1;
846 *global_to_local
= gtl
;
847 *fraction_shift
= fsh
;
850 void reuse_pmegrids(const pmegrids_t
*oldgrid
, pmegrids_t
*newgrid
)
854 for (d
= 0; d
< DIM
; d
++)
856 if (newgrid
->grid
.n
[d
] > oldgrid
->grid
.n
[d
])
862 sfree_aligned(newgrid
->grid
.grid
);
863 newgrid
->grid
.grid
= oldgrid
->grid
.grid
;
865 if (newgrid
->grid_th
!= nullptr && newgrid
->nthread
== oldgrid
->nthread
)
867 sfree_aligned(newgrid
->grid_all
);
868 newgrid
->grid_all
= oldgrid
->grid_all
;
869 for (t
= 0; t
< newgrid
->nthread
; t
++)
871 newgrid
->grid_th
[t
].grid
= oldgrid
->grid_th
[t
].grid
;