/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
 * Copyright (c) 2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "gmxpre.h"

#include "pme-spread.h"

#include "config.h"

#include <cassert>

#include <algorithm>

#include "gromacs/ewald/pme.h"
#include "gromacs/fft/parallel_3dfft.h"
#include "gromacs/simd/simd.h"
#include "gromacs/utility/basedefinitions.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/smalloc.h"

#include "pme-internal.h"
#include "pme-simd.h"
#include "pme-spline-work.h"
/* TODO consider split of pme-spline from this file */
static void calc_interpolation_idx(const gmx_pme_t *pme, const pme_atomcomm_t *atc,
                                   int start, int grid_index, int end, int thread)
{
    int             i;
    int            *idxptr, tix, tiy, tiz;
    real           *xptr, *fptr, tx, ty, tz;
    real            rxx, ryx, ryy, rzx, rzy, rzz;
    int             nx, ny, nz;
    int            *g2tx, *g2ty, *g2tz;
    gmx_bool        bThreads;
    int            *thread_idx = nullptr;
    thread_plist_t *tpl        = nullptr;
    int            *tpl_n      = nullptr;
    int             thread_i;

    nx = pme->nkx;
    ny = pme->nky;
    nz = pme->nkz;

    rxx = pme->recipbox[XX][XX];
    ryx = pme->recipbox[YY][XX];
    ryy = pme->recipbox[YY][YY];
    rzx = pme->recipbox[ZZ][XX];
    rzy = pme->recipbox[ZZ][YY];
    rzz = pme->recipbox[ZZ][ZZ];

    g2tx = pme->pmegrid[grid_index].g2t[XX];
    g2ty = pme->pmegrid[grid_index].g2t[YY];
    g2tz = pme->pmegrid[grid_index].g2t[ZZ];
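
    /* Note on the transform below: the rows of recipbox hold the
     * reciprocal box vectors. Since GROMACS keeps the simulation box
     * lower-triangular, recipbox is lower-triangular as well, which is
     * why tx picks up three terms, ty two and tz only one.
     */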

    bThreads = (atc->nthread > 1);
    if (bThreads)
    {
        thread_idx = atc->thread_idx;

        tpl   = &atc->thread_plist[thread];
        tpl_n = tpl->n;
        for (i = 0; i < atc->nthread; i++)
        {
            tpl_n[i] = 0;
        }
    }

    const real shift = c_pmeMaxUnitcellShift;
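
    /* Worked example of the shift: for a coordinate slightly outside the
     * box with x*rxx = -0.01 and nx = 10, tx = 10*(-0.01 + 2) = 19.9 and
     * tix = 19, i.e. the truncation below acts as floor(). Without the
     * shift, static_cast<int>(-0.1) would give 0 (rounding toward zero)
     * instead of the floor value -1.
     */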

    for (i = start; i < end; i++)
    {
        xptr   = atc->x[i];
        idxptr = atc->idx[i];
        fptr   = atc->fractx[i];

        /* Fractional coordinates along box vectors, add a positive shift to ensure tx/ty/tz are positive for triclinic boxes */
        tx = nx * ( xptr[XX] * rxx + xptr[YY] * ryx + xptr[ZZ] * rzx + shift );
        ty = ny * (                  xptr[YY] * ryy + xptr[ZZ] * rzy + shift );
        tz = nz * (                                   xptr[ZZ] * rzz + shift );

        tix = static_cast<int>(tx);
        tiy = static_cast<int>(ty);
        tiz = static_cast<int>(tz);

#ifdef DEBUG
        range_check(tix, 0, c_pmeNeighborUnitcellCount * nx);
        range_check(tiy, 0, c_pmeNeighborUnitcellCount * ny);
        range_check(tiz, 0, c_pmeNeighborUnitcellCount * nz);
#endif

        /* Because decomposition only occurs in x and y,
         * we never have a fraction correction in z.
         */
        fptr[XX] = tx - tix + pme->fshx[tix];
        fptr[YY] = ty - tiy + pme->fshy[tiy];
        fptr[ZZ] = tz - tiz;

        idxptr[XX] = pme->nnx[tix];
        idxptr[YY] = pme->nny[tiy];
        idxptr[ZZ] = pme->nnz[tiz];
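
        /* nnx/nny/nnz are precomputed index tables, sized to cover the
         * shifted range above, that fold tix/tiy/tiz back onto the local
         * pme grid; this avoids modulo operations in this inner loop.
         */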

#ifdef DEBUG
        range_check(idxptr[XX], 0, pme->pmegrid_nx);
        range_check(idxptr[YY], 0, pme->pmegrid_ny);
        range_check(idxptr[ZZ], 0, pme->pmegrid_nz);
#endif

        if (bThreads)
        {
            thread_i      = g2tx[idxptr[XX]] + g2ty[idxptr[YY]] + g2tz[idxptr[ZZ]];
            thread_idx[i] = thread_i;

            tpl_n[thread_i]++;
        }
    }

    if (bThreads)
    {
        /* Make a list of particle indices sorted on thread */

        /* Get the cumulative count */
        for (i = 1; i < atc->nthread; i++)
        {
            tpl_n[i] += tpl_n[i-1];
        }
        /* The current implementation distributes particles equally
         * over the threads, so we could actually allocate for that
         * in pme_realloc_atomcomm_things.
         */
        if (tpl_n[atc->nthread-1] > tpl->nalloc)
        {
            tpl->nalloc = over_alloc_large(tpl_n[atc->nthread-1]);
            srenew(tpl->i, tpl->nalloc);
        }
        /* Set tpl_n to the cumulative start */
        for (i = atc->nthread-1; i >= 1; i--)
        {
            tpl_n[i] = tpl_n[i-1];
        }
        tpl_n[0] = 0;

        /* Fill our thread local array with indices sorted on thread */
        for (i = start; i < end; i++)
        {
            tpl->i[tpl_n[atc->thread_idx[i]]++] = i;
        }
        /* Now tpl_n contains the cumulative count again */
    }
}
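
/* A note on the bThreads block above: the three passes form a counting
 * sort on thread index. Sketch with 2 threads and per-thread counts
 * {3, 2}: the first pass accumulates to {3, 5}, the shift-down pass
 * turns this into start offsets {0, 3}, and the fill pass stores each
 * particle index at its thread's offset while incrementing it, leaving
 * tpl_n as the cumulative counts {3, 5} again.
 */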

static void make_thread_local_ind(const pme_atomcomm_t *atc,
                                  int thread, splinedata_t *spline)
{
    int             n, t, i, start, end;
    thread_plist_t *tpl;

    /* Combine the indices made by each thread into one index */

    n     = 0;
    start = 0;
    for (t = 0; t < atc->nthread; t++)
    {
        tpl = &atc->thread_plist[t];
        /* Copy our part (start - end) from the list of thread t */
        if (thread > 0)
        {
            start = tpl->n[thread-1];
        }
        end = tpl->n[thread];
        for (i = start; i < end; i++)
        {
            spline->ind[n++] = tpl->i[i];
        }
    }

    spline->n = n;
}

/* Macro to force loop unrolling by fixing order.
 * This gives a significant performance gain.
 */
#define CALC_SPLINE(order)                                                  \
    {                                                                       \
        for (int j = 0; (j < DIM); j++)                                     \
        {                                                                   \
            real dr, div;                                                   \
            real data[PME_ORDER_MAX];                                       \
                                                                            \
            dr = xptr[j];                                                   \
                                                                            \
            /* dr is relative offset from lower cell limit */               \
            data[(order)-1] = 0;                                            \
            data[1]         = dr;                                           \
            data[0]         = 1 - dr;                                       \
                                                                            \
            for (int k = 3; (k < (order)); k++)                             \
            {                                                               \
                div       = 1.0/(k - 1.0);                                  \
                data[k-1] = div*dr*data[k-2];                               \
                for (int l = 1; (l < (k-1)); l++)                           \
                {                                                           \
                    data[k-l-1] = div*((dr+l)*data[k-l-2]+(k-l-dr)*         \
                                       data[k-l-1]);                        \
                }                                                           \
                data[0] = div*(1-dr)*data[0];                               \
            }                                                               \
            /* differentiate */                                             \
            dtheta[j][i*(order)+0] = -data[0];                              \
            for (int k = 1; (k < (order)); k++)                             \
            {                                                               \
                dtheta[j][i*(order)+k] = data[k-1] - data[k];               \
            }                                                               \
                                                                            \
            div             = 1.0/((order) - 1);                            \
            data[(order)-1] = div*dr*data[(order)-2];                       \
            for (int l = 1; (l < ((order)-1)); l++)                         \
            {                                                               \
                data[(order)-l-1] = div*((dr+l)*data[(order)-l-2]+          \
                                         ((order)-l-dr)*data[(order)-l-1]); \
            }                                                               \
            data[0] = div*(1 - dr)*data[0];                                 \
                                                                            \
            for (int k = 0; k < (order); k++)                               \
            {                                                               \
                theta[j][i*(order)+k] = data[k];                            \
            }                                                               \
        }                                                                   \
    }
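
/* The recursion in CALC_SPLINE builds the cardinal B-spline of the given
 * order from the order-2 (linear) start values via
 *   M_n(u) = u/(n-1) * M_{n-1}(u) + (n-u)/(n-1) * M_{n-1}(u-1),
 * and the derivative entries follow from
 *   dM_n(u)/du = M_{n-1}(u) - M_{n-1}(u-1),
 * which is why dtheta is filled from differences of the order-1 data
 * before the final recursion step.
 */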

static void make_bsplines(splinevec theta, splinevec dtheta, int order,
                          rvec fractx[], int nr, const int ind[], const real coefficient[],
                          gmx_bool bDoSplines)
{
    /* construct splines for local atoms */
    int   i, ii;
    real *xptr;

    for (i = 0; i < nr; i++)
    {
        /* With free energy we do not use the coefficient check.
         * In most cases this will be more efficient than calling make_bsplines
         * twice, since usually more than half the particles have non-zero coefficients.
         */
        ii = ind[i];
        if (bDoSplines || coefficient[ii] != 0.0)
        {
            xptr = fractx[ii];
            assert(order >= 3 && order <= PME_ORDER_MAX);
            switch (order)
            {
                case 4: CALC_SPLINE(4); break;
                case 5: CALC_SPLINE(5); break;
                default: CALC_SPLINE(order); break;
            }
        }
    }
}

/* This has to be a macro to enable full compiler optimization with xlC (and probably others too) */
#define DO_BSPLINE(order)                            \
    for (ithx = 0; (ithx < (order)); ithx++)         \
    {                                                \
        index_x = (i0+ithx)*pny*pnz;                 \
        valx    = coefficient*thx[ithx];             \
                                                     \
        for (ithy = 0; (ithy < (order)); ithy++)     \
        {                                            \
            valxy    = valx*thy[ithy];               \
            index_xy = index_x+(j0+ithy)*pnz;        \
                                                     \
            for (ithz = 0; (ithz < (order)); ithz++) \
            {                                        \
                index_xyz        = index_xy+(k0+ithz); \
                grid[index_xyz] += valxy*thz[ithz];  \
            }                                        \
        }                                            \
    }
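
/* DO_BSPLINE implements the tensor-product spread: each atom adds
 *   coefficient * thx[ithx] * thy[ithy] * thz[ithz]
 * to the order^3 grid points starting at (i0, j0, k0). Making the order
 * a compile-time constant (4 or 5) lets the compiler fully unroll the
 * three nested loops.
 */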

static void spread_coefficients_bsplines_thread(const pmegrid_t                   *pmegrid,
                                                const pme_atomcomm_t              *atc,
                                                splinedata_t                      *spline,
                                                struct pme_spline_work gmx_unused *work)
{
    /* spread coefficients from home atoms to local grid */
    real          *grid;
    int            i, nn, n, ithx, ithy, ithz, i0, j0, k0;
    const int     *idxptr;
    int            order, norder, index_x, index_xy, index_xyz;
    real           valx, valxy, coefficient;
    real          *thx, *thy, *thz;
    int            pnx, pny, pnz, ndatatot;
    int            offx, offy, offz;

#if defined PME_SIMD4_SPREAD_GATHER && !defined PME_SIMD4_UNALIGNED
    alignas(GMX_SIMD_ALIGNMENT) real  thz_aligned[GMX_SIMD4_WIDTH*2];
#endif

    pnx = pmegrid->s[XX];
    pny = pmegrid->s[YY];
    pnz = pmegrid->s[ZZ];

    offx = pmegrid->offset[XX];
    offy = pmegrid->offset[YY];
    offz = pmegrid->offset[ZZ];

    ndatatot = pnx*pny*pnz;
    grid     = pmegrid->grid;
    for (i = 0; i < ndatatot; i++)
    {
        grid[i] = 0;
    }

    order = pmegrid->order;

    for (nn = 0; nn < spline->n; nn++)
    {
        n           = spline->ind[nn];
        coefficient = atc->coefficient[n];

        if (coefficient != 0)
        {
            idxptr = atc->idx[n];
            norder = nn*order;

            i0 = idxptr[XX] - offx;
            j0 = idxptr[YY] - offy;
            k0 = idxptr[ZZ] - offz;

            thx = spline->theta[XX] + norder;
            thy = spline->theta[YY] + norder;
            thz = spline->theta[ZZ] + norder;

            switch (order)
            {
                case 4:
#ifdef PME_SIMD4_SPREAD_GATHER
#ifdef PME_SIMD4_UNALIGNED
#define PME_SPREAD_SIMD4_ORDER4
#else
#define PME_SPREAD_SIMD4_ALIGNED
#define PME_ORDER 4
#endif
#include "pme-simd4.h"
#else
                    DO_BSPLINE(4);
#endif
                    break;
                case 5:
#ifdef PME_SIMD4_SPREAD_GATHER
#define PME_SPREAD_SIMD4_ALIGNED
#define PME_ORDER 5
#include "pme-simd4.h"
#else
                    DO_BSPLINE(5);
#endif
                    break;
                default:
                    DO_BSPLINE(order);
                    break;
            }
        }
    }
}
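
/* In the switch above, pme-simd4.h acts as a poor man's template: the
 * PME_SPREAD_SIMD4_* and PME_ORDER defines select which SIMD4 kernel
 * body the header expands to at the point of inclusion.
 */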

static void copy_local_grid(const gmx_pme_t *pme, const pmegrids_t *pmegrids,
                            int grid_index, int thread, real *fftgrid)
{
    ivec  local_fft_ndata, local_fft_offset, local_fft_size;
    int   fft_my, fft_mz;
    int   nsy, nsz;
    ivec  nf;
    int   offx, offy, offz, x, y, z, i0, i0t;
    int   d;
    real *grid_th;

    gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
                                   local_fft_ndata,
                                   local_fft_offset,
                                   local_fft_size);
    fft_my = local_fft_size[YY];
    fft_mz = local_fft_size[ZZ];

    const pmegrid_t *pmegrid = &pmegrids->grid_th[thread];

    nsy = pmegrid->s[YY];
    nsz = pmegrid->s[ZZ];

    for (d = 0; d < DIM; d++)
    {
        nf[d] = std::min(pmegrid->n[d] - (pmegrid->order - 1),
                         local_fft_ndata[d] - pmegrid->offset[d]);
    }

    offx = pmegrid->offset[XX];
    offy = pmegrid->offset[YY];
    offz = pmegrid->offset[ZZ];

    /* Directly copy the non-overlapping parts of the local grids.
     * This also initializes the full grid.
     */
    grid_th = pmegrid->grid;
    for (x = 0; x < nf[XX]; x++)
    {
        for (y = 0; y < nf[YY]; y++)
        {
            i0  = ((offx + x)*fft_my + (offy + y))*fft_mz + offz;
            i0t = (x*nsy + y)*nsz;
            for (z = 0; z < nf[ZZ]; z++)
            {
                fftgrid[i0+z] = grid_th[i0t+z];
            }
        }
    }
}
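
/* Index arithmetic in copy_local_grid: both grids are stored z-fastest,
 * so point (x, y, z) sits at ((offx+x)*fft_my + (offy+y))*fft_mz + offz + z
 * in the FFT grid and at (x*nsy + y)*nsz + z in the thread-local grid;
 * i0 and i0t cache everything except the z term.
 */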

static void
reduce_threadgrid_overlap(const gmx_pme_t *pme,
                          const pmegrids_t *pmegrids, int thread,
                          real *fftgrid, real *commbuf_x, real *commbuf_y,
                          int grid_index)
{
    ivec             local_fft_ndata, local_fft_offset, local_fft_size;
    int              fft_nx, fft_ny, fft_nz;
    int              fft_my, fft_mz;
    int              buf_my = -1;
    int              nsy, nsz;
    ivec             localcopy_end, commcopy_end;
    int              offx, offy, offz, x, y, z, i0, i0t;
    int              sx, sy, sz, fx, fy, fz, tx1, ty1, tz1, ox, oy, oz;
    gmx_bool         bClearBufX, bClearBufY, bClearBufXY, bClearBuf;
    gmx_bool         bCommX, bCommY;
    int              d;
    int              thread_f;
    const pmegrid_t *pmegrid, *pmegrid_g, *pmegrid_f;
    const real      *grid_th;
    real            *commbuf = nullptr;

    gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
                                   local_fft_ndata,
                                   local_fft_offset,
                                   local_fft_size);
    fft_nx = local_fft_ndata[XX];
    fft_ny = local_fft_ndata[YY];
    fft_nz = local_fft_ndata[ZZ];

    fft_my = local_fft_size[YY];
    fft_mz = local_fft_size[ZZ];

    /* This routine is called when all threads have finished spreading.
     * Here each thread sums grid contributions calculated by other threads
     * to the thread local grid volume.
     * To minimize the number of grid copying operations,
     * this routine sums immediately from the pmegrid to the fftgrid.
     */

    /* Determine which part of the full node grid we should operate on,
     * this is our thread local part of the full grid.
     */
    pmegrid = &pmegrids->grid_th[thread];

    for (d = 0; d < DIM; d++)
    {
        /* Determine up to where our thread needs to copy from the
         * thread-local charge spreading grid to the rank-local FFT grid.
         * This is up to our spreading grid end minus order-1 and
         * not beyond the local FFT grid.
         */
        localcopy_end[d] =
            std::min(pmegrid->offset[d] + pmegrid->n[d] - (pmegrid->order - 1),
                     local_fft_ndata[d]);

        /* Determine up to where our thread needs to copy from the
         * thread-local charge spreading grid to the communication buffer.
         * Note: only relevant with communication, ignored otherwise.
         */
        commcopy_end[d] = localcopy_end[d];
        if (pmegrid->ci[d] == pmegrids->nc[d] - 1)
        {
            /* The last thread should copy up to the last pme grid line.
             * When the rank-local FFT grid is narrower than pme-order,
             * we need the max below to ensure copying of all data.
             */
            commcopy_end[d] = std::max(commcopy_end[d], pme->pme_order);
        }
    }

    offx = pmegrid->offset[XX];
    offy = pmegrid->offset[YY];
    offz = pmegrid->offset[ZZ];

    bClearBufX  = TRUE;
    bClearBufY  = TRUE;
    bClearBufXY = TRUE;

    /* Now loop over all the thread data blocks that contribute
     * to the grid region we (our thread) are operating on.
     */
    /* Note that fft_nx/y is equal to the number of grid points
     * between the first point of our node grid and the one of the next node.
     */
    for (sx = 0; sx >= -pmegrids->nthread_comm[XX]; sx--)
    {
        fx     = pmegrid->ci[XX] + sx;
        ox     = 0;
        bCommX = FALSE;
        if (fx < 0)
        {
            fx    += pmegrids->nc[XX];
            ox    -= fft_nx;
            bCommX = (pme->nnodes_major > 1);
        }
        pmegrid_g = &pmegrids->grid_th[fx*pmegrids->nc[YY]*pmegrids->nc[ZZ]];
        ox       += pmegrid_g->offset[XX];
        /* Determine the end of our part of the source grid.
         * Use our thread local source grid and target grid part.
         */
        tx1 = std::min(ox + pmegrid_g->n[XX],
                       !bCommX ? localcopy_end[XX] : commcopy_end[XX]);

        for (sy = 0; sy >= -pmegrids->nthread_comm[YY]; sy--)
        {
            fy     = pmegrid->ci[YY] + sy;
            oy     = 0;
            bCommY = FALSE;
            if (fy < 0)
            {
                fy    += pmegrids->nc[YY];
                oy    -= fft_ny;
                bCommY = (pme->nnodes_minor > 1);
            }
            pmegrid_g = &pmegrids->grid_th[fy*pmegrids->nc[ZZ]];
            oy       += pmegrid_g->offset[YY];
            /* Determine the end of our part of the source grid.
             * Use our thread local source grid and target grid part.
             */
            ty1 = std::min(oy + pmegrid_g->n[YY],
                           !bCommY ? localcopy_end[YY] : commcopy_end[YY]);

            for (sz = 0; sz >= -pmegrids->nthread_comm[ZZ]; sz--)
            {
                fz = pmegrid->ci[ZZ] + sz;
                oz = 0;
                if (fz < 0)
                {
                    fz += pmegrids->nc[ZZ];
                    oz -= fft_nz;
                }
                pmegrid_g = &pmegrids->grid_th[fz];
                oz       += pmegrid_g->offset[ZZ];
                tz1       = std::min(oz + pmegrid_g->n[ZZ], localcopy_end[ZZ]);

                if (sx == 0 && sy == 0 && sz == 0)
                {
                    /* We have already added our local contribution
                     * before calling this routine, so skip it here.
                     */
                    continue;
                }

                thread_f = (fx*pmegrids->nc[YY] + fy)*pmegrids->nc[ZZ] + fz;

                pmegrid_f = &pmegrids->grid_th[thread_f];

                grid_th = pmegrid_f->grid;

                nsy = pmegrid_f->s[YY];
                nsz = pmegrid_f->s[ZZ];

#ifdef DEBUG_PME_REDUCE
                printf("n%d t%d add %d %2d %2d %2d %2d %2d %2d %2d-%2d %2d-%2d, %2d-%2d %2d-%2d, %2d-%2d %2d-%2d\n",
                       pme->nodeid, thread, thread_f,
                       pme->pmegrid_start_ix,
                       pme->pmegrid_start_iy,
                       pme->pmegrid_start_iz,
                       sx, sy, sz,
                       offx-ox, tx1-ox, offx, tx1,
                       offy-oy, ty1-oy, offy, ty1,
                       offz-oz, tz1-oz, offz, tz1);
#endif

                if (!(bCommX || bCommY))
                {
                    /* Copy from the thread local grid to the node grid */
                    for (x = offx; x < tx1; x++)
                    {
                        for (y = offy; y < ty1; y++)
                        {
                            i0  = (x*fft_my + y)*fft_mz;
                            i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
                            for (z = offz; z < tz1; z++)
                            {
                                fftgrid[i0+z] += grid_th[i0t+z];
                            }
                        }
                    }
                }
                else
                {
                    /* The order of this conditional decides
                     * where the corner volume gets stored with x+y decomp.
                     */
                    if (bCommY)
                    {
                        commbuf = commbuf_y;
                        /* The y-size of the communication buffer is set by
                         * the overlap of the grid part of our local slab
                         * with the part starting at the next slab.
                         */
                        buf_my  =
                            pme->overlap[1].s2g1[pme->nodeid_minor] -
                            pme->overlap[1].s2g0[pme->nodeid_minor+1];
                        if (bCommX)
                        {
                            /* We index commbuf modulo the local grid size */
                            commbuf += buf_my*fft_nx*fft_nz;

                            bClearBuf   = bClearBufXY;
                            bClearBufXY = FALSE;
                        }
                        else
                        {
                            bClearBuf  = bClearBufY;
                            bClearBufY = FALSE;
                        }
                    }
                    else
                    {
                        commbuf    = commbuf_x;
                        buf_my     = fft_ny;
                        bClearBuf  = bClearBufX;
                        bClearBufX = FALSE;
                    }

                    /* Copy to the communication buffer */
                    for (x = offx; x < tx1; x++)
                    {
                        for (y = offy; y < ty1; y++)
                        {
                            i0  = (x*buf_my + y)*fft_nz;
                            i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;

                            if (bClearBuf)
                            {
                                /* First access of commbuf, initialize it */
                                for (z = offz; z < tz1; z++)
                                {
                                    commbuf[i0+z]  = grid_th[i0t+z];
                                }
                            }
                            else
                            {
                                for (z = offz; z < tz1; z++)
                                {
                                    commbuf[i0+z] += grid_th[i0t+z];
                                }
                            }
                        }
                    }
                }
            }
        }
    }
}
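
/* In the communication branch above, commbuf is laid out with the y
 * stride buf_my (the y extent of the overlap region) instead of the FFT
 * grid stride, so point (x, y, z) lands at (x*buf_my + y)*fft_nz + z.
 * The bClearBuf* flags make the first write to each buffer region assign
 * rather than accumulate, which avoids a separate clearing pass.
 */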

static void sum_fftgrid_dd(const gmx_pme_t *pme, real *fftgrid, int grid_index)
{
    ivec local_fft_ndata, local_fft_offset, local_fft_size;
    int  send_index0, send_nindex;
    int  recv_nindex;
#if GMX_MPI
    MPI_Status stat;
#endif
    int  recv_size_y;
    int  size_yx;
    int  x, y, z, indg, indb;

    /* Note that this routine is only used for forward communication.
     * Since the force gathering, unlike the coefficient spreading,
     * can be trivially parallelized over the particles,
     * the backwards process is much simpler and can use the "old"
     * communication setup.
     */
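
    /* The reduction proceeds in two stages: first the overlap in the
     * minor (y) dimension is exchanged and summed into fftgrid, with data
     * that also overlaps in x parked in the dim-0 send buffer; then the
     * overlap in the major (x) dimension is exchanged and summed.
     */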

    gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
                                   local_fft_ndata,
                                   local_fft_offset,
                                   local_fft_size);

    if (pme->nnodes_minor > 1)
    {
        /* Minor dimension */
        const pme_overlap_t *overlap = &pme->overlap[1];

        if (pme->nnodes_major > 1)
        {
            size_yx = pme->overlap[0].comm_data[0].send_nindex;
        }
        else
        {
            size_yx = 0;
        }
#if GMX_MPI
        int datasize = (local_fft_ndata[XX] + size_yx)*local_fft_ndata[ZZ];

        int send_size_y = overlap->send_size;
#endif

        for (size_t ipulse = 0; ipulse < overlap->comm_data.size(); ipulse++)
        {
            send_index0 =
                overlap->comm_data[ipulse].send_index0 -
                overlap->comm_data[0].send_index0;
            send_nindex = overlap->comm_data[ipulse].send_nindex;
            /* We don't use recv_index0, as we always receive starting at 0 */
            recv_nindex = overlap->comm_data[ipulse].recv_nindex;
            recv_size_y = overlap->comm_data[ipulse].recv_size;

            auto *sendptr = const_cast<real *>(overlap->sendbuf.data()) + send_index0 * local_fft_ndata[ZZ];
            auto *recvptr = const_cast<real *>(overlap->recvbuf.data());

            if (debug != nullptr)
            {
                fprintf(debug, "PME fftgrid comm y %2d x %2d x %2d\n",
                        local_fft_ndata[XX], send_nindex, local_fft_ndata[ZZ]);
            }

#if GMX_MPI
            int send_id = overlap->comm_data[ipulse].send_id;
            int recv_id = overlap->comm_data[ipulse].recv_id;
            MPI_Sendrecv(sendptr, send_size_y*datasize, GMX_MPI_REAL,
                         send_id, ipulse,
                         recvptr, recv_size_y*datasize, GMX_MPI_REAL,
                         recv_id, ipulse,
                         overlap->mpi_comm, &stat);
#endif

            for (x = 0; x < local_fft_ndata[XX]; x++)
            {
                for (y = 0; y < recv_nindex; y++)
                {
                    indg = (x*local_fft_size[YY] + y)*local_fft_size[ZZ];
                    indb = (x*recv_size_y + y)*local_fft_ndata[ZZ];
                    for (z = 0; z < local_fft_ndata[ZZ]; z++)
                    {
                        fftgrid[indg+z] += recvptr[indb+z];
                    }
                }
            }

            if (pme->nnodes_major > 1)
            {
                /* Copy from the received buffer to the send buffer for dim 0 */
                sendptr = const_cast<real *>(pme->overlap[0].sendbuf.data());
                for (x = 0; x < size_yx; x++)
                {
                    for (y = 0; y < recv_nindex; y++)
                    {
                        indg = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
                        indb = ((local_fft_ndata[XX] + x)*recv_size_y + y)*local_fft_ndata[ZZ];
                        for (z = 0; z < local_fft_ndata[ZZ]; z++)
                        {
                            sendptr[indg+z] += recvptr[indb+z];
                        }
                    }
                }
            }
        }
    }

    /* We only support a single pulse here.
     * This is not a severe limitation, as this code is only used
     * with OpenMP and with OpenMP the (PME) domains can be larger.
     */
    if (pme->nnodes_major > 1)
    {
        /* Major dimension */
        const pme_overlap_t *overlap = &pme->overlap[0];

        size_t ipulse = 0;

        send_nindex = overlap->comm_data[ipulse].send_nindex;
        /* We don't use recv_index0, as we always receive starting at 0 */
        recv_nindex = overlap->comm_data[ipulse].recv_nindex;

        if (debug != nullptr)
        {
            fprintf(debug, "PME fftgrid comm x %2d x %2d x %2d\n",
                    send_nindex, local_fft_ndata[YY], local_fft_ndata[ZZ]);
        }

#if GMX_MPI
        int   datasize = local_fft_ndata[YY]*local_fft_ndata[ZZ];
        int   send_id  = overlap->comm_data[ipulse].send_id;
        int   recv_id  = overlap->comm_data[ipulse].recv_id;
        auto *sendptr  = const_cast<real *>(overlap->sendbuf.data());
        auto *recvptr  = const_cast<real *>(overlap->recvbuf.data());
        MPI_Sendrecv(sendptr, send_nindex*datasize, GMX_MPI_REAL,
                     send_id, ipulse,
                     recvptr, recv_nindex*datasize, GMX_MPI_REAL,
                     recv_id, ipulse,
                     overlap->mpi_comm, &stat);
#endif

        for (x = 0; x < recv_nindex; x++)
        {
            for (y = 0; y < local_fft_ndata[YY]; y++)
            {
                indg = (x*local_fft_size[YY] + y)*local_fft_size[ZZ];
                indb = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
                for (z = 0; z < local_fft_ndata[ZZ]; z++)
                {
                    fftgrid[indg + z] += overlap->recvbuf[indb + z];
                }
            }
        }
    }
}

void spread_on_grid(const gmx_pme_t *pme,
                    const pme_atomcomm_t *atc, const pmegrids_t *grids,
                    gmx_bool bCalcSplines, gmx_bool bSpread,
                    real *fftgrid, gmx_bool bDoSplines, int grid_index)
{
    int nthread, thread;
#ifdef PME_TIME_THREADS
    gmx_cycles_t  c1, c2, c3, ct1a, ct1b, ct1c;
    static double cs1     = 0, cs2 = 0, cs3 = 0;
    static double cs1a[6] = {0, 0, 0, 0, 0, 0};
    static int    cnt     = 0;
#endif

    nthread = pme->nthread;
    assert(nthread > 0);

#ifdef PME_TIME_THREADS
    c1 = omp_cyc_start();
#endif
    if (bCalcSplines)
    {
#pragma omp parallel for num_threads(nthread) schedule(static)
        for (thread = 0; thread < nthread; thread++)
        {
            try
            {
                int start, end;

                start = atc->n* thread   /nthread;
                end   = atc->n*(thread+1)/nthread;
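
                /* Even split of the atom range over threads; e.g. with
                 * atc->n = 10 and nthread = 4 the ranges are [0,2),
                 * [2,5), [5,7) and [7,10).
                 */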

                /* Compute fftgrid index for all atoms,
                 * with help of some extra variables.
                 */
                calc_interpolation_idx(pme, atc, start, grid_index, end, thread);
            }
            GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
        }
    }
#ifdef PME_TIME_THREADS
    c1   = omp_cyc_end(c1);
    cs1 += (double)c1;
#endif

#ifdef PME_TIME_THREADS
    c2 = omp_cyc_start();
#endif
#pragma omp parallel for num_threads(nthread) schedule(static)
    for (thread = 0; thread < nthread; thread++)
    {
        try
        {
            splinedata_t *spline;

            /* make local bsplines */
            if (grids == nullptr || !pme->bUseThreads)
            {
                spline    = &atc->spline[0];
                spline->n = atc->n;
            }
            else
            {
                spline = &atc->spline[thread];

                if (grids->nthread == 1)
                {
                    /* One thread, we operate on all coefficients */
                    spline->n = atc->n;
                }
                else
                {
                    /* Get the indices our thread should operate on */
                    make_thread_local_ind(atc, thread, spline);
                }
            }

            if (bCalcSplines)
            {
                make_bsplines(spline->theta, spline->dtheta, pme->pme_order,
                              atc->fractx, spline->n, spline->ind, atc->coefficient, bDoSplines);
            }

            if (bSpread)
            {
                /* put local atoms on grid. */
                const pmegrid_t *grid = pme->bUseThreads ? &grids->grid_th[thread] : &grids->grid;
#ifdef PME_TIME_SPREAD
                ct1a = omp_cyc_start();
#endif
                spread_coefficients_bsplines_thread(grid, atc, spline, pme->spline_work);

                if (pme->bUseThreads)
                {
                    copy_local_grid(pme, grids, grid_index, thread, fftgrid);
                }
#ifdef PME_TIME_SPREAD
                ct1a          = omp_cyc_end(ct1a);
                cs1a[thread] += (double)ct1a;
#endif
            }
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }
#ifdef PME_TIME_THREADS
    c2   = omp_cyc_end(c2);
    cs2 += (double)c2;
#endif

    if (bSpread && pme->bUseThreads)
    {
#ifdef PME_TIME_THREADS
        c3 = omp_cyc_start();
#endif
#pragma omp parallel for num_threads(grids->nthread) schedule(static)
        for (thread = 0; thread < grids->nthread; thread++)
        {
            try
            {
                reduce_threadgrid_overlap(pme, grids, thread,
                                          fftgrid,
                                          const_cast<real *>(pme->overlap[0].sendbuf.data()),
                                          const_cast<real *>(pme->overlap[1].sendbuf.data()),
                                          grid_index);
            }
            GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
        }
#ifdef PME_TIME_THREADS
        c3   = omp_cyc_end(c3);
        cs3 += (double)c3;
#endif

        if (pme->nnodes > 1)
        {
            /* Communicate the overlapping part of the fftgrid.
             * For this communication call we need to check pme->bUseThreads
             * to have all ranks communicate here, regardless of pme->nthread.
             */
            sum_fftgrid_dd(pme, fftgrid, grid_index);
        }
    }

#ifdef PME_TIME_THREADS
    cnt++;
    if (cnt % 20 == 0)
    {
        printf("idx %.2f spread %.2f red %.2f",
               cs1*1e-9, cs2*1e-9, cs3*1e-9);
#ifdef PME_TIME_SPREAD
        for (thread = 0; thread < nthread; thread++)
        {
            printf(" %.2f", cs1a[thread]*1e-9);
        }
#endif
        printf("\n");
    }
#endif
}