2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015,2016,2017, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
37 /* TODO find out what this file should be called */
46 #include "gromacs/ewald/pme.h"
47 #include "gromacs/fft/parallel_3dfft.h"
48 #include "gromacs/math/vec.h"
49 #include "gromacs/timing/cyclecounter.h"
50 #include "gromacs/utility/fatalerror.h"
51 #include "gromacs/utility/smalloc.h"
54 #include "gromacs/fileio/pdbio.h"
55 #include "gromacs/utility/cstringutil.h"
56 #include "gromacs/utility/futil.h"
61 /* GMX_CACHE_SEP should be a multiple of the SIMD and SIMD4 register size
62 * to preserve alignment.
64 #define GMX_CACHE_SEP 64
67 void gmx_sum_qgrid_dd(struct gmx_pme_t
*pme
, real
*grid
, int direction
)
69 pme_overlap_t
*overlap
;
70 int send_index0
, send_nindex
;
71 int recv_index0
, recv_nindex
;
73 int i
, j
, k
, ix
, iy
, iz
, icnt
;
74 int ipulse
, send_id
, recv_id
, datasize
;
76 real
*sendptr
, *recvptr
;
78 /* Start with minor-rank communication. This is a bit of a pain since it is not contiguous */
79 overlap
= &pme
->overlap
[1];
81 for (ipulse
= 0; ipulse
< overlap
->noverlap_nodes
; ipulse
++)
83 /* Since we have already (un)wrapped the overlap in the z-dimension,
84 * we only have to communicate 0 to nkz (not pmegrid_nz).
86 if (direction
== GMX_SUM_GRID_FORWARD
)
88 send_id
= overlap
->send_id
[ipulse
];
89 recv_id
= overlap
->recv_id
[ipulse
];
90 send_index0
= overlap
->comm_data
[ipulse
].send_index0
;
91 send_nindex
= overlap
->comm_data
[ipulse
].send_nindex
;
92 recv_index0
= overlap
->comm_data
[ipulse
].recv_index0
;
93 recv_nindex
= overlap
->comm_data
[ipulse
].recv_nindex
;
97 send_id
= overlap
->recv_id
[ipulse
];
98 recv_id
= overlap
->send_id
[ipulse
];
99 send_index0
= overlap
->comm_data
[ipulse
].recv_index0
;
100 send_nindex
= overlap
->comm_data
[ipulse
].recv_nindex
;
101 recv_index0
= overlap
->comm_data
[ipulse
].send_index0
;
102 recv_nindex
= overlap
->comm_data
[ipulse
].send_nindex
;
105 /* Copy data to contiguous send buffer */
108 fprintf(debug
, "PME send rank %d %d -> %d grid start %d Communicating %d to %d\n",
109 pme
->nodeid
, overlap
->nodeid
, send_id
,
110 pme
->pmegrid_start_iy
,
111 send_index0
-pme
->pmegrid_start_iy
,
112 send_index0
-pme
->pmegrid_start_iy
+send_nindex
);
115 for (i
= 0; i
< pme
->pmegrid_nx
; i
++)
118 for (j
= 0; j
< send_nindex
; j
++)
120 iy
= j
+ send_index0
- pme
->pmegrid_start_iy
;
121 for (k
= 0; k
< pme
->nkz
; k
++)
124 overlap
->sendbuf
[icnt
++] = grid
[ix
*(pme
->pmegrid_ny
*pme
->pmegrid_nz
)+iy
*(pme
->pmegrid_nz
)+iz
];
129 datasize
= pme
->pmegrid_nx
* pme
->nkz
;
131 MPI_Sendrecv(overlap
->sendbuf
, send_nindex
*datasize
, GMX_MPI_REAL
,
133 overlap
->recvbuf
, recv_nindex
*datasize
, GMX_MPI_REAL
,
135 overlap
->mpi_comm
, &stat
);
137 /* Get data from contiguous recv buffer */
140 fprintf(debug
, "PME recv rank %d %d <- %d grid start %d Communicating %d to %d\n",
141 pme
->nodeid
, overlap
->nodeid
, recv_id
,
142 pme
->pmegrid_start_iy
,
143 recv_index0
-pme
->pmegrid_start_iy
,
144 recv_index0
-pme
->pmegrid_start_iy
+recv_nindex
);
147 for (i
= 0; i
< pme
->pmegrid_nx
; i
++)
150 for (j
= 0; j
< recv_nindex
; j
++)
152 iy
= j
+ recv_index0
- pme
->pmegrid_start_iy
;
153 for (k
= 0; k
< pme
->nkz
; k
++)
156 if (direction
== GMX_SUM_GRID_FORWARD
)
158 grid
[ix
*(pme
->pmegrid_ny
*pme
->pmegrid_nz
)+iy
*(pme
->pmegrid_nz
)+iz
] += overlap
->recvbuf
[icnt
++];
162 grid
[ix
*(pme
->pmegrid_ny
*pme
->pmegrid_nz
)+iy
*(pme
->pmegrid_nz
)+iz
] = overlap
->recvbuf
[icnt
++];
169 /* Major dimension is easier, no copying required,
170 * but we might have to sum to separate array.
171 * Since we don't copy, we have to communicate up to pmegrid_nz,
172 * not nkz as for the minor direction.
174 overlap
= &pme
->overlap
[0];
176 for (ipulse
= 0; ipulse
< overlap
->noverlap_nodes
; ipulse
++)
178 if (direction
== GMX_SUM_GRID_FORWARD
)
180 send_id
= overlap
->send_id
[ipulse
];
181 recv_id
= overlap
->recv_id
[ipulse
];
182 send_index0
= overlap
->comm_data
[ipulse
].send_index0
;
183 send_nindex
= overlap
->comm_data
[ipulse
].send_nindex
;
184 recv_index0
= overlap
->comm_data
[ipulse
].recv_index0
;
185 recv_nindex
= overlap
->comm_data
[ipulse
].recv_nindex
;
186 recvptr
= overlap
->recvbuf
;
190 send_id
= overlap
->recv_id
[ipulse
];
191 recv_id
= overlap
->send_id
[ipulse
];
192 send_index0
= overlap
->comm_data
[ipulse
].recv_index0
;
193 send_nindex
= overlap
->comm_data
[ipulse
].recv_nindex
;
194 recv_index0
= overlap
->comm_data
[ipulse
].send_index0
;
195 recv_nindex
= overlap
->comm_data
[ipulse
].send_nindex
;
196 recvptr
= grid
+ (recv_index0
-pme
->pmegrid_start_ix
)*(pme
->pmegrid_ny
*pme
->pmegrid_nz
);
199 sendptr
= grid
+ (send_index0
-pme
->pmegrid_start_ix
)*(pme
->pmegrid_ny
*pme
->pmegrid_nz
);
200 datasize
= pme
->pmegrid_ny
* pme
->pmegrid_nz
;
204 fprintf(debug
, "PME send rank %d %d -> %d grid start %d Communicating %d to %d\n",
205 pme
->nodeid
, overlap
->nodeid
, send_id
,
206 pme
->pmegrid_start_ix
,
207 send_index0
-pme
->pmegrid_start_ix
,
208 send_index0
-pme
->pmegrid_start_ix
+send_nindex
);
209 fprintf(debug
, "PME recv rank %d %d <- %d grid start %d Communicating %d to %d\n",
210 pme
->nodeid
, overlap
->nodeid
, recv_id
,
211 pme
->pmegrid_start_ix
,
212 recv_index0
-pme
->pmegrid_start_ix
,
213 recv_index0
-pme
->pmegrid_start_ix
+recv_nindex
);
216 MPI_Sendrecv(sendptr
, send_nindex
*datasize
, GMX_MPI_REAL
,
218 recvptr
, recv_nindex
*datasize
, GMX_MPI_REAL
,
220 overlap
->mpi_comm
, &stat
);
222 /* ADD data from contiguous recv buffer */
223 if (direction
== GMX_SUM_GRID_FORWARD
)
225 p
= grid
+ (recv_index0
-pme
->pmegrid_start_ix
)*(pme
->pmegrid_ny
*pme
->pmegrid_nz
);
226 for (i
= 0; i
< recv_nindex
*datasize
; i
++)
228 p
[i
] += overlap
->recvbuf
[i
];
236 int copy_pmegrid_to_fftgrid(const gmx_pme_t
*pme
, real
*pmegrid
, real
*fftgrid
, int grid_index
)
238 ivec local_fft_ndata
, local_fft_offset
, local_fft_size
;
243 /* Dimensions should be identical for A/B grid, so we just use A here */
244 gmx_parallel_3dfft_real_limits(pme
->pfft_setup
[grid_index
],
249 local_pme_size
[0] = pme
->pmegrid_nx
;
250 local_pme_size
[1] = pme
->pmegrid_ny
;
251 local_pme_size
[2] = pme
->pmegrid_nz
;
253 /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
254 the offset is identical, and the PME grid always has more data (due to overlap)
261 sprintf(fn
, "pmegrid%d.pdb", pme
->nodeid
);
262 fp
= gmx_ffopen(fn
, "w");
263 sprintf(fn
, "pmegrid%d.txt", pme
->nodeid
);
264 fp2
= gmx_ffopen(fn
, "w");
267 for (ix
= 0; ix
< local_fft_ndata
[XX
]; ix
++)
269 for (iy
= 0; iy
< local_fft_ndata
[YY
]; iy
++)
271 for (iz
= 0; iz
< local_fft_ndata
[ZZ
]; iz
++)
273 pmeidx
= ix
*(local_pme_size
[YY
]*local_pme_size
[ZZ
])+iy
*(local_pme_size
[ZZ
])+iz
;
274 fftidx
= ix
*(local_fft_size
[YY
]*local_fft_size
[ZZ
])+iy
*(local_fft_size
[ZZ
])+iz
;
275 fftgrid
[fftidx
] = pmegrid
[pmeidx
];
277 val
= 100*pmegrid
[pmeidx
];
278 if (pmegrid
[pmeidx
] != 0)
280 gmx_fprintf_pdb_atomline(fp
, epdbATOM
, pmeidx
, "CA", ' ', "GLY", ' ', pmeidx
, ' ',
281 5.0*ix
, 5.0*iy
, 5.0*iz
, 1.0, val
, "");
283 if (pmegrid
[pmeidx
] != 0)
285 fprintf(fp2
, "%-12s %5d %5d %5d %12.5e\n",
287 pme
->pmegrid_start_ix
+ ix
,
288 pme
->pmegrid_start_iy
+ iy
,
289 pme
->pmegrid_start_iz
+ iz
,
305 #ifdef PME_TIME_THREADS
306 static gmx_cycles_t
omp_cyc_start()
308 return gmx_cycles_read();
311 static gmx_cycles_t
omp_cyc_end(gmx_cycles_t c
)
313 return gmx_cycles_read() - c
;
318 int copy_fftgrid_to_pmegrid(struct gmx_pme_t
*pme
, const real
*fftgrid
, real
*pmegrid
, int grid_index
,
319 int nthread
, int thread
)
321 ivec local_fft_ndata
, local_fft_offset
, local_fft_size
;
323 int ixy0
, ixy1
, ixy
, ix
, iy
, iz
;
325 #ifdef PME_TIME_THREADS
327 static double cs1
= 0;
331 #ifdef PME_TIME_THREADS
332 c1
= omp_cyc_start();
334 /* Dimensions should be identical for A/B grid, so we just use A here */
335 gmx_parallel_3dfft_real_limits(pme
->pfft_setup
[grid_index
],
340 local_pme_size
[0] = pme
->pmegrid_nx
;
341 local_pme_size
[1] = pme
->pmegrid_ny
;
342 local_pme_size
[2] = pme
->pmegrid_nz
;
344 /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
345 the offset is identical, and the PME grid always has more data (due to overlap)
347 ixy0
= ((thread
)*local_fft_ndata
[XX
]*local_fft_ndata
[YY
])/nthread
;
348 ixy1
= ((thread
+1)*local_fft_ndata
[XX
]*local_fft_ndata
[YY
])/nthread
;
350 for (ixy
= ixy0
; ixy
< ixy1
; ixy
++)
352 ix
= ixy
/local_fft_ndata
[YY
];
353 iy
= ixy
- ix
*local_fft_ndata
[YY
];
355 pmeidx
= (ix
*local_pme_size
[YY
] + iy
)*local_pme_size
[ZZ
];
356 fftidx
= (ix
*local_fft_size
[YY
] + iy
)*local_fft_size
[ZZ
];
357 for (iz
= 0; iz
< local_fft_ndata
[ZZ
]; iz
++)
359 pmegrid
[pmeidx
+iz
] = fftgrid
[fftidx
+iz
];
363 #ifdef PME_TIME_THREADS
364 c1
= omp_cyc_end(c1
);
369 printf("copy %.2f\n", cs1
*1e-9);
377 void wrap_periodic_pmegrid(const gmx_pme_t
*pme
, real
*pmegrid
)
379 int nx
, ny
, nz
, pny
, pnz
, ny_x
, overlap
, ix
, iy
, iz
;
385 pny
= pme
->pmegrid_ny
;
386 pnz
= pme
->pmegrid_nz
;
388 overlap
= pme
->pme_order
- 1;
390 /* Add periodic overlap in z */
391 for (ix
= 0; ix
< pme
->pmegrid_nx
; ix
++)
393 for (iy
= 0; iy
< pme
->pmegrid_ny
; iy
++)
395 for (iz
= 0; iz
< overlap
; iz
++)
397 pmegrid
[(ix
*pny
+iy
)*pnz
+iz
] +=
398 pmegrid
[(ix
*pny
+iy
)*pnz
+nz
+iz
];
403 if (pme
->nnodes_minor
== 1)
405 for (ix
= 0; ix
< pme
->pmegrid_nx
; ix
++)
407 for (iy
= 0; iy
< overlap
; iy
++)
409 for (iz
= 0; iz
< nz
; iz
++)
411 pmegrid
[(ix
*pny
+iy
)*pnz
+iz
] +=
412 pmegrid
[(ix
*pny
+ny
+iy
)*pnz
+iz
];
418 if (pme
->nnodes_major
== 1)
420 ny_x
= (pme
->nnodes_minor
== 1 ? ny
: pme
->pmegrid_ny
);
422 for (ix
= 0; ix
< overlap
; ix
++)
424 for (iy
= 0; iy
< ny_x
; iy
++)
426 for (iz
= 0; iz
< nz
; iz
++)
428 pmegrid
[(ix
*pny
+iy
)*pnz
+iz
] +=
429 pmegrid
[((nx
+ix
)*pny
+iy
)*pnz
+iz
];
437 void unwrap_periodic_pmegrid(struct gmx_pme_t
*pme
, real
*pmegrid
)
439 int nx
, ny
, nz
, pny
, pnz
, ny_x
, overlap
, ix
;
445 pny
= pme
->pmegrid_ny
;
446 pnz
= pme
->pmegrid_nz
;
448 overlap
= pme
->pme_order
- 1;
450 if (pme
->nnodes_major
== 1)
452 ny_x
= (pme
->nnodes_minor
== 1 ? ny
: pme
->pmegrid_ny
);
454 for (ix
= 0; ix
< overlap
; ix
++)
458 for (iy
= 0; iy
< ny_x
; iy
++)
460 for (iz
= 0; iz
< nz
; iz
++)
462 pmegrid
[((nx
+ix
)*pny
+iy
)*pnz
+iz
] =
463 pmegrid
[(ix
*pny
+iy
)*pnz
+iz
];
469 if (pme
->nnodes_minor
== 1)
471 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
472 for (ix
= 0; ix
< pme
->pmegrid_nx
; ix
++)
474 // Trivial OpenMP region that does not throw, no need for try/catch
477 for (iy
= 0; iy
< overlap
; iy
++)
479 for (iz
= 0; iz
< nz
; iz
++)
481 pmegrid
[(ix
*pny
+ny
+iy
)*pnz
+iz
] =
482 pmegrid
[(ix
*pny
+iy
)*pnz
+iz
];
488 /* Copy periodic overlap in z */
489 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
490 for (ix
= 0; ix
< pme
->pmegrid_nx
; ix
++)
492 // Trivial OpenMP region that does not throw, no need for try/catch
495 for (iy
= 0; iy
< pme
->pmegrid_ny
; iy
++)
497 for (iz
= 0; iz
< overlap
; iz
++)
499 pmegrid
[(ix
*pny
+iy
)*pnz
+nz
+iz
] =
500 pmegrid
[(ix
*pny
+iy
)*pnz
+iz
];
506 void set_grid_alignment(int gmx_unused
*pmegrid_nz
, int gmx_unused pme_order
)
508 #ifdef PME_SIMD4_SPREAD_GATHER
510 #ifndef PME_SIMD4_UNALIGNED
515 /* Round nz up to a multiple of 4 to ensure alignment */
516 *pmegrid_nz
= ((*pmegrid_nz
+ 3) & ~3);
521 static void set_gridsize_alignment(int gmx_unused
*gridsize
, int gmx_unused pme_order
)
523 #ifdef PME_SIMD4_SPREAD_GATHER
524 #ifndef PME_SIMD4_UNALIGNED
527 /* Add extra elements to ensured aligned operations do not go
528 * beyond the allocated grid size.
529 * Note that for pme_order=5, the pme grid z-size alignment
530 * ensures that we will not go beyond the grid size.
538 void pmegrid_init(pmegrid_t
*grid
,
539 int cx
, int cy
, int cz
,
540 int x0
, int y0
, int z0
,
541 int x1
, int y1
, int z1
,
542 gmx_bool set_alignment
,
551 grid
->offset
[XX
] = x0
;
552 grid
->offset
[YY
] = y0
;
553 grid
->offset
[ZZ
] = z0
;
554 grid
->n
[XX
] = x1
- x0
+ pme_order
- 1;
555 grid
->n
[YY
] = y1
- y0
+ pme_order
- 1;
556 grid
->n
[ZZ
] = z1
- z0
+ pme_order
- 1;
557 copy_ivec(grid
->n
, grid
->s
);
560 set_grid_alignment(&nz
, pme_order
);
565 else if (nz
!= grid
->s
[ZZ
])
567 gmx_incons("pmegrid_init call with an unaligned z size");
570 grid
->order
= pme_order
;
573 gridsize
= grid
->s
[XX
]*grid
->s
[YY
]*grid
->s
[ZZ
];
574 set_gridsize_alignment(&gridsize
, pme_order
);
575 snew_aligned(grid
->grid
, gridsize
, SIMD4_ALIGNMENT
);
583 static int div_round_up(int enumerator
, int denominator
)
585 return (enumerator
+ denominator
- 1)/denominator
;
588 static void make_subgrid_division(const ivec n
, int ovl
, int nthread
,
591 int gsize_opt
, gsize
;
596 for (nsx
= 1; nsx
<= nthread
; nsx
++)
598 if (nthread
% nsx
== 0)
600 for (nsy
= 1; nsy
<= nthread
; nsy
++)
602 if (nsx
*nsy
<= nthread
&& nthread
% (nsx
*nsy
) == 0)
604 nsz
= nthread
/(nsx
*nsy
);
606 /* Determine the number of grid points per thread */
608 (div_round_up(n
[XX
], nsx
) + ovl
)*
609 (div_round_up(n
[YY
], nsy
) + ovl
)*
610 (div_round_up(n
[ZZ
], nsz
) + ovl
);
612 /* Minimize the number of grids points per thread
613 * and, secondarily, the number of cuts in minor dimensions.
615 if (gsize_opt
== -1 ||
617 (gsize
== gsize_opt
&&
618 (nsz
< nsub
[ZZ
] || (nsz
== nsub
[ZZ
] && nsy
< nsub
[YY
]))))
630 env
= getenv("GMX_PME_THREAD_DIVISION");
633 sscanf(env
, "%20d %20d %20d", &nsub
[XX
], &nsub
[YY
], &nsub
[ZZ
]);
636 if (nsub
[XX
]*nsub
[YY
]*nsub
[ZZ
] != nthread
)
638 gmx_fatal(FARGS
, "PME grid thread division (%d x %d x %d) does not match the total number of threads (%d)", nsub
[XX
], nsub
[YY
], nsub
[ZZ
], nthread
);
642 void pmegrids_init(pmegrids_t
*grids
,
643 int nx
, int ny
, int nz
, int nz_base
,
645 gmx_bool bUseThreads
,
651 int t
, x
, y
, z
, d
, i
, tfac
;
652 int max_comm_lines
= -1;
654 n
[XX
] = nx
- (pme_order
- 1);
655 n
[YY
] = ny
- (pme_order
- 1);
656 n
[ZZ
] = nz
- (pme_order
- 1);
658 copy_ivec(n
, n_base
);
659 n_base
[ZZ
] = nz_base
;
661 pmegrid_init(&grids
->grid
, 0, 0, 0, 0, 0, 0, n
[XX
], n
[YY
], n
[ZZ
], FALSE
, pme_order
,
664 grids
->nthread
= nthread
;
666 make_subgrid_division(n_base
, pme_order
-1, grids
->nthread
, grids
->nc
);
673 for (d
= 0; d
< DIM
; d
++)
675 nst
[d
] = div_round_up(n
[d
], grids
->nc
[d
]) + pme_order
- 1;
677 set_grid_alignment(&nst
[ZZ
], pme_order
);
681 fprintf(debug
, "pmegrid thread local division: %d x %d x %d\n",
682 grids
->nc
[XX
], grids
->nc
[YY
], grids
->nc
[ZZ
]);
683 fprintf(debug
, "pmegrid %d %d %d max thread pmegrid %d %d %d\n",
685 nst
[XX
], nst
[YY
], nst
[ZZ
]);
688 snew(grids
->grid_th
, grids
->nthread
);
690 gridsize
= nst
[XX
]*nst
[YY
]*nst
[ZZ
];
691 set_gridsize_alignment(&gridsize
, pme_order
);
692 snew_aligned(grids
->grid_all
,
693 grids
->nthread
*gridsize
+(grids
->nthread
+1)*GMX_CACHE_SEP
,
696 for (x
= 0; x
< grids
->nc
[XX
]; x
++)
698 for (y
= 0; y
< grids
->nc
[YY
]; y
++)
700 for (z
= 0; z
< grids
->nc
[ZZ
]; z
++)
702 pmegrid_init(&grids
->grid_th
[t
],
704 (n
[XX
]*(x
))/grids
->nc
[XX
],
705 (n
[YY
]*(y
))/grids
->nc
[YY
],
706 (n
[ZZ
]*(z
))/grids
->nc
[ZZ
],
707 (n
[XX
]*(x
+1))/grids
->nc
[XX
],
708 (n
[YY
]*(y
+1))/grids
->nc
[YY
],
709 (n
[ZZ
]*(z
+1))/grids
->nc
[ZZ
],
712 grids
->grid_all
+GMX_CACHE_SEP
+t
*(gridsize
+GMX_CACHE_SEP
));
720 grids
->grid_th
= nullptr;
724 for (d
= DIM
-1; d
>= 0; d
--)
726 snew(grids
->g2t
[d
], n
[d
]);
728 for (i
= 0; i
< n
[d
]; i
++)
730 /* The second check should match the parameters
731 * of the pmegrid_init call above.
733 while (t
+ 1 < grids
->nc
[d
] && i
>= (n
[d
]*(t
+1))/grids
->nc
[d
])
737 grids
->g2t
[d
][i
] = t
*tfac
;
740 tfac
*= grids
->nc
[d
];
744 case XX
: max_comm_lines
= overlap_x
; break;
745 case YY
: max_comm_lines
= overlap_y
; break;
746 case ZZ
: max_comm_lines
= pme_order
- 1; break;
748 grids
->nthread_comm
[d
] = 0;
749 while ((n
[d
]*grids
->nthread_comm
[d
])/grids
->nc
[d
] < max_comm_lines
&&
750 grids
->nthread_comm
[d
] < grids
->nc
[d
])
752 grids
->nthread_comm
[d
]++;
754 if (debug
!= nullptr)
756 fprintf(debug
, "pmegrid thread grid communication range in %c: %d\n",
757 'x'+d
, grids
->nthread_comm
[d
]);
759 /* It should be possible to make grids->nthread_comm[d]==grids->nc[d]
760 * work, but this is not a problematic restriction.
762 if (grids
->nc
[d
] > 1 && grids
->nthread_comm
[d
] > grids
->nc
[d
])
764 gmx_fatal(FARGS
, "Too many threads for PME (%d) compared to the number of grid lines, reduce the number of threads doing PME", grids
->nthread
);
769 void pmegrids_destroy(pmegrids_t
*grids
)
771 if (grids
->grid
.grid
!= nullptr)
773 sfree_aligned(grids
->grid
.grid
);
775 if (grids
->nthread
> 0)
777 sfree_aligned(grids
->grid_all
);
778 sfree(grids
->grid_th
);
780 for (int d
= 0; d
< DIM
; d
++)
782 sfree(grids
->g2t
[d
]);
788 make_gridindex_to_localindex(int n
, int local_start
, int local_range
,
789 int **global_to_local
,
790 real
**fraction_shift
)
792 /* Here we construct array for looking up the grid line index and
793 * fraction for particles. This is done because it is slighlty
794 * faster than the modulo operation and to because we need to take
795 * care of rounding issues, see below.
796 * We use an array size of c_pmeNeighborUnitcellCount times the grid size
797 * to allow for particles to be out of the triclinic unit-cell.
799 const int arraySize
= c_pmeNeighborUnitcellCount
* n
;
803 snew(gtl
, arraySize
);
804 snew(fsh
, arraySize
);
806 for (int i
= 0; i
< arraySize
; i
++)
808 /* Transform global grid index to the local grid index.
809 * Our local grid always runs from 0 to local_range-1.
811 gtl
[i
] = (i
- local_start
+ n
) % n
;
812 /* For coordinates that fall within the local grid the fraction
813 * is correct, we don't need to shift it.
816 /* Check if we are using domain decomposition for PME */
819 /* Due to rounding issues i could be 1 beyond the lower or
820 * upper boundary of the local grid. Correct the index for this.
821 * If we shift the index, we need to shift the fraction by
822 * the same amount in the other direction to not affect
824 * Note that due to this shifting the weights at the end of
825 * the spline might change, but that will only involve values
826 * between zero and values close to the precision of a real,
827 * which is anyhow the accuracy of the whole mesh calculation.
831 /* When this i is used, we should round the local index up */
835 else if (gtl
[i
] == local_range
&& local_range
> 0)
837 /* When this i is used, we should round the local index down */
838 gtl
[i
] = local_range
- 1;
844 *global_to_local
= gtl
;
845 *fraction_shift
= fsh
;
848 void reuse_pmegrids(const pmegrids_t
*oldgrid
, pmegrids_t
*newgrid
)
852 for (d
= 0; d
< DIM
; d
++)
854 if (newgrid
->grid
.n
[d
] > oldgrid
->grid
.n
[d
])
860 sfree_aligned(newgrid
->grid
.grid
);
861 newgrid
->grid
.grid
= oldgrid
->grid
.grid
;
863 if (newgrid
->grid_th
!= nullptr && newgrid
->nthread
== oldgrid
->nthread
)
865 sfree_aligned(newgrid
->grid_all
);
866 newgrid
->grid_all
= oldgrid
->grid_all
;
867 for (t
= 0; t
< newgrid
->nthread
; t
++)
869 newgrid
->grid_th
[t
].grid
= oldgrid
->grid_th
[t
].grid
;
874 static void dump_grid(FILE *fp
,
875 int sx
, int sy
, int sz
, int nx
, int ny
, int nz
,
876 int my
, int mz
, const real
*g
)
880 for (x
= 0; x
< nx
; x
++)
882 for (y
= 0; y
< ny
; y
++)
884 for (z
= 0; z
< nz
; z
++)
886 fprintf(fp
, "%2d %2d %2d %6.3f\n",
887 sx
+x
, sy
+y
, sz
+z
, g
[(x
*my
+ y
)*mz
+ z
]);
893 /* This function is called from gmx_pme_do() only from debugging code
894 that is commented out. */
895 void dump_local_fftgrid(struct gmx_pme_t
*pme
, const real
*fftgrid
)
897 ivec local_fft_ndata
, local_fft_offset
, local_fft_size
;
899 gmx_parallel_3dfft_real_limits(pme
->pfft_setup
[PME_GRID_QA
],
905 pme
->pmegrid_start_ix
,
906 pme
->pmegrid_start_iy
,
907 pme
->pmegrid_start_iz
,
908 pme
->pmegrid_nx
-pme
->pme_order
+1,
909 pme
->pmegrid_ny
-pme
->pme_order
+1,
910 pme
->pmegrid_nz
-pme
->pme_order
+1,