clang-tidy modernize
[gromacs.git] / src / gromacs / fft / fft5d.cpp
blob3e0cb7857777ffbd7af8105123c4e509fd31fd59
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2009,2010,2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
35 #include "gmxpre.h"
37 #include "fft5d.h"
39 #include "config.h"
41 #include <cassert>
42 #include <cfloat>
43 #include <cmath>
44 #include <cstdio>
45 #include <cstdlib>
46 #include <cstring>
48 #include <algorithm>
50 #include "gromacs/gpu_utils/gpu_utils.h"
51 #include "gromacs/gpu_utils/hostallocator.h"
52 #include "gromacs/gpu_utils/pinning.h"
53 #include "gromacs/utility/alignedallocator.h"
54 #include "gromacs/utility/exceptions.h"
55 #include "gromacs/utility/fatalerror.h"
56 #include "gromacs/utility/gmxmpi.h"
57 #include "gromacs/utility/smalloc.h"
59 #ifdef NOGMX
60 #define GMX_PARALLEL_ENV_INITIALIZED 1
61 #else
62 #if GMX_MPI
63 #define GMX_PARALLEL_ENV_INITIALIZED 1
64 #else
65 #define GMX_PARALLEL_ENV_INITIALIZED 0
66 #endif
67 #endif
69 #if GMX_OPENMP
70 /* TODO: Do we still need this? Are we still planning ot use fftw + OpenMP? */
71 #define FFT5D_THREADS
72 /* requires fftw compiled with openmp */
73 /* #define FFT5D_FFTW_THREADS (now set by cmake) */
74 #endif
76 #ifndef __FLT_EPSILON__
77 #define __FLT_EPSILON__ FLT_EPSILON
78 #define __DBL_EPSILON__ DBL_EPSILON
79 #endif
81 #ifdef NOGMX
82 FILE* debug = 0;
83 #endif
85 #if GMX_FFT_FFTW3
87 #include "gromacs/utility/exceptions.h"
88 #include "gromacs/utility/mutex.h"
89 /* none of the fftw3 calls, except execute(), are thread-safe, so
90 we need to serialize them with this mutex. */
91 static gmx::Mutex big_fftw_mutex;
92 #define FFTW_LOCK try { big_fftw_mutex.lock(); } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
93 #define FFTW_UNLOCK try { big_fftw_mutex.unlock(); } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
94 #endif /* GMX_FFT_FFTW3 */
96 #if GMX_MPI
97 /* largest factor smaller than sqrt */
98 static int lfactor(int z)
100 int i = static_cast<int>(sqrt(static_cast<double>(z)));
101 while (z%i != 0)
103 i--;
105 return i;
107 #endif
109 #if !GMX_MPI
110 #if HAVE_GETTIMEOFDAY
111 #include <sys/time.h>
112 double MPI_Wtime()
114 struct timeval tv;
115 gettimeofday(&tv, 0);
116 return tv.tv_sec+tv.tv_usec*1e-6;
118 #else
119 double MPI_Wtime()
121 return 0.0;
123 #endif
124 #endif
126 static int vmax(const int* a, int s)
128 int i, max = 0;
129 for (i = 0; i < s; i++)
131 if (a[i] > max)
133 max = a[i];
136 return max;
140 /* NxMxK the size of the data
141 * comm communicator to use for fft5d
142 * P0 number of processor in 1st axes (can be null for automatic)
143 * lin is allocated by fft5d because size of array is only known after planning phase
144 * rlout2 is only used as intermediate buffer - only returned after allocation to reuse for back transform - should not be used by caller
146 fft5d_plan fft5d_plan_3d(int NG, int MG, int KG, MPI_Comm comm[2], int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads, gmx::PinningPolicy realGridAllocationPinningPolicy)
149 int P[2], bMaster, prank[2], i, t;
150 int rNG, rMG, rKG;
151 int *N0 = nullptr, *N1 = nullptr, *M0 = nullptr, *M1 = nullptr, *K0 = nullptr, *K1 = nullptr, *oN0 = nullptr, *oN1 = nullptr, *oM0 = nullptr, *oM1 = nullptr, *oK0 = nullptr, *oK1 = nullptr;
152 int N[3], M[3], K[3], pN[3], pM[3], pK[3], oM[3], oK[3], *iNin[3] = {nullptr}, *oNin[3] = {nullptr}, *iNout[3] = {nullptr}, *oNout[3] = {nullptr};
153 int C[3], rC[3], nP[2];
154 int lsize;
155 t_complex *lin = nullptr, *lout = nullptr, *lout2 = nullptr, *lout3 = nullptr;
156 fft5d_plan plan;
157 int s;
159 /* comm, prank and P are in the order of the decomposition (plan->cart is in the order of transposes) */
160 #if GMX_MPI
161 if (GMX_PARALLEL_ENV_INITIALIZED && comm[0] != MPI_COMM_NULL)
163 MPI_Comm_size(comm[0], &P[0]);
164 MPI_Comm_rank(comm[0], &prank[0]);
166 else
167 #endif
169 P[0] = 1;
170 prank[0] = 0;
172 #if GMX_MPI
173 if (GMX_PARALLEL_ENV_INITIALIZED && comm[1] != MPI_COMM_NULL)
175 MPI_Comm_size(comm[1], &P[1]);
176 MPI_Comm_rank(comm[1], &prank[1]);
178 else
179 #endif
181 P[1] = 1;
182 prank[1] = 0;
185 bMaster = (prank[0] == 0 && prank[1] == 0);
188 if (debug)
190 fprintf(debug, "FFT5D: Using %dx%d rank grid, rank %d,%d\n",
191 P[0], P[1], prank[0], prank[1]);
194 if (bMaster)
196 if (debug)
198 fprintf(debug, "FFT5D: N: %d, M: %d, K: %d, P: %dx%d, real2complex: %d, backward: %d, order yz: %d, debug %d\n",
199 NG, MG, KG, P[0], P[1], int((flags&FFT5D_REALCOMPLEX) > 0), int((flags&FFT5D_BACKWARD) > 0), int((flags&FFT5D_ORDER_YZ) > 0), int((flags&FFT5D_DEBUG) > 0));
201 /* The check below is not correct, one prime factor 11 or 13 is ok.
202 if (fft5d_fmax(fft5d_fmax(lpfactor(NG),lpfactor(MG)),lpfactor(KG))>7) {
203 printf("WARNING: FFT very slow with prime factors larger 7\n");
204 printf("Change FFT size or in case you cannot change it look at\n");
205 printf("http://www.fftw.org/fftw3_doc/Generating-your-own-code.html\n");
210 if (NG == 0 || MG == 0 || KG == 0)
212 if (bMaster)
214 printf("FFT5D: FATAL: Datasize cannot be zero in any dimension\n");
216 return nullptr;
219 rNG = NG; rMG = MG; rKG = KG;
221 if (flags&FFT5D_REALCOMPLEX)
223 if (!(flags&FFT5D_BACKWARD))
225 NG = NG/2+1;
227 else
229 if (!(flags&FFT5D_ORDER_YZ))
231 MG = MG/2+1;
233 else
235 KG = KG/2+1;
241 /*for transpose we need to know the size for each processor not only our own size*/
243 N0 = static_cast<int*>(malloc(P[0]*sizeof(int))); N1 = static_cast<int*>(malloc(P[1]*sizeof(int)));
244 M0 = static_cast<int*>(malloc(P[0]*sizeof(int))); M1 = static_cast<int*>(malloc(P[1]*sizeof(int)));
245 K0 = static_cast<int*>(malloc(P[0]*sizeof(int))); K1 = static_cast<int*>(malloc(P[1]*sizeof(int)));
246 oN0 = static_cast<int*>(malloc(P[0]*sizeof(int))); oN1 = static_cast<int*>(malloc(P[1]*sizeof(int)));
247 oM0 = static_cast<int*>(malloc(P[0]*sizeof(int))); oM1 = static_cast<int*>(malloc(P[1]*sizeof(int)));
248 oK0 = static_cast<int*>(malloc(P[0]*sizeof(int))); oK1 = static_cast<int*>(malloc(P[1]*sizeof(int)));
250 for (i = 0; i < P[0]; i++)
252 #define EVENDIST
253 #ifndef EVENDIST
254 oN0[i] = i*ceil((double)NG/P[0]);
255 oM0[i] = i*ceil((double)MG/P[0]);
256 oK0[i] = i*ceil((double)KG/P[0]);
257 #else
258 oN0[i] = (NG*i)/P[0];
259 oM0[i] = (MG*i)/P[0];
260 oK0[i] = (KG*i)/P[0];
261 #endif
263 for (i = 0; i < P[1]; i++)
265 #ifndef EVENDIST
266 oN1[i] = i*ceil((double)NG/P[1]);
267 oM1[i] = i*ceil((double)MG/P[1]);
268 oK1[i] = i*ceil((double)KG/P[1]);
269 #else
270 oN1[i] = (NG*i)/P[1];
271 oM1[i] = (MG*i)/P[1];
272 oK1[i] = (KG*i)/P[1];
273 #endif
275 for (i = 0; i < P[0]-1; i++)
277 N0[i] = oN0[i+1]-oN0[i];
278 M0[i] = oM0[i+1]-oM0[i];
279 K0[i] = oK0[i+1]-oK0[i];
281 N0[P[0]-1] = NG-oN0[P[0]-1];
282 M0[P[0]-1] = MG-oM0[P[0]-1];
283 K0[P[0]-1] = KG-oK0[P[0]-1];
284 for (i = 0; i < P[1]-1; i++)
286 N1[i] = oN1[i+1]-oN1[i];
287 M1[i] = oM1[i+1]-oM1[i];
288 K1[i] = oK1[i+1]-oK1[i];
290 N1[P[1]-1] = NG-oN1[P[1]-1];
291 M1[P[1]-1] = MG-oM1[P[1]-1];
292 K1[P[1]-1] = KG-oK1[P[1]-1];
294 /* for step 1-3 the local N,M,K sizes of the transposed system
295 C: contiguous dimension, and nP: number of processor in subcommunicator
296 for that step */
299 pM[0] = M0[prank[0]];
300 oM[0] = oM0[prank[0]];
301 pK[0] = K1[prank[1]];
302 oK[0] = oK1[prank[1]];
303 C[0] = NG;
304 rC[0] = rNG;
305 if (!(flags&FFT5D_ORDER_YZ))
307 N[0] = vmax(N1, P[1]);
308 M[0] = M0[prank[0]];
309 K[0] = vmax(K1, P[1]);
310 pN[0] = N1[prank[1]];
311 iNout[0] = N1;
312 oNout[0] = oN1;
313 nP[0] = P[1];
314 C[1] = KG;
315 rC[1] = rKG;
316 N[1] = vmax(K0, P[0]);
317 pN[1] = K0[prank[0]];
318 iNin[1] = K1;
319 oNin[1] = oK1;
320 iNout[1] = K0;
321 oNout[1] = oK0;
322 M[1] = vmax(M0, P[0]);
323 pM[1] = M0[prank[0]];
324 oM[1] = oM0[prank[0]];
325 K[1] = N1[prank[1]];
326 pK[1] = N1[prank[1]];
327 oK[1] = oN1[prank[1]];
328 nP[1] = P[0];
329 C[2] = MG;
330 rC[2] = rMG;
331 iNin[2] = M0;
332 oNin[2] = oM0;
333 M[2] = vmax(K0, P[0]);
334 pM[2] = K0[prank[0]];
335 oM[2] = oK0[prank[0]];
336 K[2] = vmax(N1, P[1]);
337 pK[2] = N1[prank[1]];
338 oK[2] = oN1[prank[1]];
339 free(N0); free(oN0); /*these are not used for this order*/
340 free(M1); free(oM1); /*the rest is freed in destroy*/
342 else
344 N[0] = vmax(N0, P[0]);
345 M[0] = vmax(M0, P[0]);
346 K[0] = K1[prank[1]];
347 pN[0] = N0[prank[0]];
348 iNout[0] = N0;
349 oNout[0] = oN0;
350 nP[0] = P[0];
351 C[1] = MG;
352 rC[1] = rMG;
353 N[1] = vmax(M1, P[1]);
354 pN[1] = M1[prank[1]];
355 iNin[1] = M0;
356 oNin[1] = oM0;
357 iNout[1] = M1;
358 oNout[1] = oM1;
359 M[1] = N0[prank[0]];
360 pM[1] = N0[prank[0]];
361 oM[1] = oN0[prank[0]];
362 K[1] = vmax(K1, P[1]);
363 pK[1] = K1[prank[1]];
364 oK[1] = oK1[prank[1]];
365 nP[1] = P[1];
366 C[2] = KG;
367 rC[2] = rKG;
368 iNin[2] = K1;
369 oNin[2] = oK1;
370 M[2] = vmax(N0, P[0]);
371 pM[2] = N0[prank[0]];
372 oM[2] = oN0[prank[0]];
373 K[2] = vmax(M1, P[1]);
374 pK[2] = M1[prank[1]];
375 oK[2] = oM1[prank[1]];
376 free(N1); free(oN1); /*these are not used for this order*/
377 free(K0); free(oK0); /*the rest is freed in destroy*/
379 N[2] = pN[2] = -1; /*not used*/
382 Difference between x-y-z regarding 2d decomposition is whether they are
383 distributed along axis 1, 2 or both
386 /* int lsize = fmax(N[0]*M[0]*K[0]*nP[0],N[1]*M[1]*K[1]*nP[1]); */
387 lsize = std::max(N[0]*M[0]*K[0]*nP[0], std::max(N[1]*M[1]*K[1]*nP[1], C[2]*M[2]*K[2]));
388 /* int lsize = fmax(C[0]*M[0]*K[0],fmax(C[1]*M[1]*K[1],C[2]*M[2]*K[2])); */
389 if (!(flags&FFT5D_NOMALLOC))
391 // only needed for PME GPU mixed mode
392 if (realGridAllocationPinningPolicy == gmx::PinningPolicy::CanBePinned)
394 const std::size_t numBytes = lsize * sizeof(t_complex);
395 lin = static_cast<t_complex *>(gmx::PageAlignedAllocationPolicy::malloc(numBytes));
396 gmx::pinBuffer(lin, numBytes);
398 else
400 snew_aligned(lin, lsize, 32);
402 snew_aligned(lout, lsize, 32);
403 if (nthreads > 1)
405 /* We need extra transpose buffers to avoid OpenMP barriers */
406 snew_aligned(lout2, lsize, 32);
407 snew_aligned(lout3, lsize, 32);
409 else
411 /* We can reuse the buffers to avoid cache misses */
412 lout2 = lin;
413 lout3 = lout;
416 else
418 lin = *rlin;
419 lout = *rlout;
420 if (nthreads > 1)
422 lout2 = *rlout2;
423 lout3 = *rlout3;
425 else
427 lout2 = lin;
428 lout3 = lout;
432 plan = static_cast<fft5d_plan>(calloc(1, sizeof(struct fft5d_plan_t)));
435 if (debug)
437 fprintf(debug, "Running on %d threads\n", nthreads);
440 #if GMX_FFT_FFTW3
441 /* Don't add more stuff here! We have already had at least one bug because we are reimplementing
442 * the low-level FFT interface instead of using the Gromacs FFT module. If we need more
443 * generic functionality it is far better to extend the interface so we can use it for
444 * all FFT libraries instead of writing FFTW-specific code here.
447 /*if not FFTW - then we don't do a 3d plan but instead use only 1D plans */
448 /* It is possible to use the 3d plan with OMP threads - but in that case it is not allowed to be called from
449 * within a parallel region. For now deactivated. If it should be supported it has to made sure that
450 * that the execute of the 3d plan is in a master/serial block (since it contains it own parallel region)
451 * and that the 3d plan is faster than the 1d plan.
453 if ((!(flags&FFT5D_INPLACE)) && (!(P[0] > 1 || P[1] > 1)) && nthreads == 1) /*don't do 3d plan in parallel or if in_place requested */
455 int fftwflags = FFTW_DESTROY_INPUT;
456 FFTW(iodim) dims[3];
457 int inNG = NG, outMG = MG, outKG = KG;
459 FFTW_LOCK;
461 fftwflags |= (flags & FFT5D_NOMEASURE) ? FFTW_ESTIMATE : FFTW_MEASURE;
463 if (flags&FFT5D_REALCOMPLEX)
465 if (!(flags&FFT5D_BACKWARD)) /*input pointer is not complex*/
467 inNG *= 2;
469 else /*output pointer is not complex*/
471 if (!(flags&FFT5D_ORDER_YZ))
473 outMG *= 2;
475 else
477 outKG *= 2;
482 if (!(flags&FFT5D_BACKWARD))
484 dims[0].n = KG;
485 dims[1].n = MG;
486 dims[2].n = rNG;
488 dims[0].is = inNG*MG; /*N M K*/
489 dims[1].is = inNG;
490 dims[2].is = 1;
491 if (!(flags&FFT5D_ORDER_YZ))
493 dims[0].os = MG; /*M K N*/
494 dims[1].os = 1;
495 dims[2].os = MG*KG;
497 else
499 dims[0].os = 1; /*K N M*/
500 dims[1].os = KG*NG;
501 dims[2].os = KG;
504 else
506 if (!(flags&FFT5D_ORDER_YZ))
508 dims[0].n = NG;
509 dims[1].n = KG;
510 dims[2].n = rMG;
512 dims[0].is = 1;
513 dims[1].is = NG*MG;
514 dims[2].is = NG;
516 dims[0].os = outMG*KG;
517 dims[1].os = outMG;
518 dims[2].os = 1;
520 else
522 dims[0].n = MG;
523 dims[1].n = NG;
524 dims[2].n = rKG;
526 dims[0].is = NG;
527 dims[1].is = 1;
528 dims[2].is = NG*MG;
530 dims[0].os = outKG*NG;
531 dims[1].os = outKG;
532 dims[2].os = 1;
535 #ifdef FFT5D_THREADS
536 #ifdef FFT5D_FFTW_THREADS
537 FFTW(plan_with_nthreads)(nthreads);
538 #endif
539 #endif
540 if ((flags&FFT5D_REALCOMPLEX) && !(flags&FFT5D_BACKWARD))
542 plan->p3d = FFTW(plan_guru_dft_r2c)(/*rank*/ 3, dims,
543 /*howmany*/ 0, /*howmany_dims*/ nullptr,
544 reinterpret_cast<real*>(lin), reinterpret_cast<FFTW(complex) *>(lout),
545 /*flags*/ fftwflags);
547 else if ((flags&FFT5D_REALCOMPLEX) && (flags&FFT5D_BACKWARD))
549 plan->p3d = FFTW(plan_guru_dft_c2r)(/*rank*/ 3, dims,
550 /*howmany*/ 0, /*howmany_dims*/ nullptr,
551 reinterpret_cast<FFTW(complex) *>(lin), reinterpret_cast<real*>(lout),
552 /*flags*/ fftwflags);
554 else
556 plan->p3d = FFTW(plan_guru_dft)(/*rank*/ 3, dims,
557 /*howmany*/ 0, /*howmany_dims*/ nullptr,
558 reinterpret_cast<FFTW(complex) *>(lin), reinterpret_cast<FFTW(complex) *>(lout),
559 /*sign*/ (flags&FFT5D_BACKWARD) ? 1 : -1, /*flags*/ fftwflags);
561 #ifdef FFT5D_THREADS
562 #ifdef FFT5D_FFTW_THREADS
563 FFTW(plan_with_nthreads)(1);
564 #endif
565 #endif
566 FFTW_UNLOCK;
568 if (!plan->p3d) /* for decomposition and if 3d plan did not work */
570 #endif /* GMX_FFT_FFTW3 */
571 for (s = 0; s < 3; s++)
573 if (debug)
575 fprintf(debug, "FFT5D: Plan s %d rC %d M %d pK %d C %d lsize %d\n",
576 s, rC[s], M[s], pK[s], C[s], lsize);
578 plan->p1d[s] = static_cast<gmx_fft_t*>(malloc(sizeof(gmx_fft_t)*nthreads));
580 /* Make sure that the init routines are only called by one thread at a time and in order
581 (later is only important to not confuse valgrind)
583 #pragma omp parallel for num_threads(nthreads) schedule(static) ordered
584 for (t = 0; t < nthreads; t++)
586 #pragma omp ordered
590 int tsize = ((t+1)*pM[s]*pK[s]/nthreads)-(t*pM[s]*pK[s]/nthreads);
592 if ((flags&FFT5D_REALCOMPLEX) && ((!(flags&FFT5D_BACKWARD) && s == 0) || ((flags&FFT5D_BACKWARD) && s == 2)))
594 gmx_fft_init_many_1d_real( &plan->p1d[s][t], rC[s], tsize, (flags&FFT5D_NOMEASURE) ? GMX_FFT_FLAG_CONSERVATIVE : 0 );
596 else
598 gmx_fft_init_many_1d ( &plan->p1d[s][t], C[s], tsize, (flags&FFT5D_NOMEASURE) ? GMX_FFT_FLAG_CONSERVATIVE : 0 );
601 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
606 #if GMX_FFT_FFTW3
608 #endif
609 if ((flags&FFT5D_ORDER_YZ)) /*plan->cart is in the order of transposes */
611 plan->cart[0] = comm[0]; plan->cart[1] = comm[1];
613 else
615 plan->cart[1] = comm[0]; plan->cart[0] = comm[1];
617 #ifdef FFT5D_MPI_TRANSPOSE
618 FFTW_LOCK;
619 for (s = 0; s < 2; s++)
621 if ((s == 0 && !(flags&FFT5D_ORDER_YZ)) || (s == 1 && (flags&FFT5D_ORDER_YZ)))
623 plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*K[s]*pM[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
625 else
627 plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*pK[s]*M[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
630 FFTW_UNLOCK;
631 #endif
634 plan->lin = lin;
635 plan->lout = lout;
636 plan->lout2 = lout2;
637 plan->lout3 = lout3;
639 plan->NG = NG; plan->MG = MG; plan->KG = KG;
641 for (s = 0; s < 3; s++)
643 plan->N[s] = N[s]; plan->M[s] = M[s]; plan->K[s] = K[s]; plan->pN[s] = pN[s]; plan->pM[s] = pM[s]; plan->pK[s] = pK[s];
644 plan->oM[s] = oM[s]; plan->oK[s] = oK[s];
645 plan->C[s] = C[s]; plan->rC[s] = rC[s];
646 plan->iNin[s] = iNin[s]; plan->oNin[s] = oNin[s]; plan->iNout[s] = iNout[s]; plan->oNout[s] = oNout[s];
648 for (s = 0; s < 2; s++)
650 plan->P[s] = nP[s]; plan->coor[s] = prank[s];
653 /* plan->fftorder=fftorder;
654 plan->direction=direction;
655 plan->realcomplex=realcomplex;
657 plan->flags = flags;
658 plan->nthreads = nthreads;
659 plan->pinningPolicy = realGridAllocationPinningPolicy;
660 *rlin = lin;
661 *rlout = lout;
662 *rlout2 = lout2;
663 *rlout3 = lout3;
664 return plan;
668 enum order {
669 XYZ,
670 XZY,
671 YXZ,
672 YZX,
673 ZXY,
679 /*here x,y,z and N,M,K is in rotated coordinate system!!
680 x (and N) is mayor (consecutive) dimension, y (M) middle and z (K) major
681 maxN,maxM,maxK is max size of local data
682 pN, pM, pK is local size specific to current processor (only different to max if not divisible)
683 NG, MG, KG is size of global data*/
684 static void splitaxes(t_complex* lout, const t_complex* lin,
685 int maxN, int maxM, int maxK, int pM,
686 int P, int NG, const int *N, const int* oN, int starty, int startz, int endy, int endz)
688 int x, y, z, i;
689 int in_i, out_i, in_z, out_z, in_y, out_y;
690 int s_y, e_y;
692 for (z = startz; z < endz+1; z++) /*3. z l*/
694 if (z == startz)
696 s_y = starty;
698 else
700 s_y = 0;
702 if (z == endz)
704 e_y = endy;
706 else
708 e_y = pM;
710 out_z = z*maxN*maxM;
711 in_z = z*NG*pM;
713 for (i = 0; i < P; i++) /*index cube along long axis*/
715 out_i = out_z + i*maxN*maxM*maxK;
716 in_i = in_z + oN[i];
717 for (y = s_y; y < e_y; y++) /*2. y k*/
719 out_y = out_i + y*maxN;
720 in_y = in_i + y*NG;
721 for (x = 0; x < N[i]; x++) /*1. x j*/
723 lout[out_y+x] = lin[in_y+x]; /*in=z*NG*pM+oN[i]+y*NG+x*/
724 /*after split important that each processor chunk i has size maxN*maxM*maxK and thus being the same size*/
725 /*before split data contiguos - thus if different processor get different amount oN is different*/
732 /*make axis contiguous again (after AllToAll) and also do local transpose*/
733 /*transpose mayor and major dimension
734 variables see above
735 the major, middle, minor order is only correct for x,y,z (N,M,K) for the input
736 N,M,K local dimensions
737 KG global size*/
738 static void joinAxesTrans13(t_complex* lout, const t_complex* lin,
739 int maxN, int maxM, int maxK, int pM,
740 int P, int KG, const int* K, const int* oK, int starty, int startx, int endy, int endx)
742 int i, x, y, z;
743 int out_i, in_i, out_x, in_x, out_z, in_z;
744 int s_y, e_y;
746 for (x = startx; x < endx+1; x++) /*1.j*/
748 if (x == startx)
750 s_y = starty;
752 else
754 s_y = 0;
756 if (x == endx)
758 e_y = endy;
760 else
762 e_y = pM;
765 out_x = x*KG*pM;
766 in_x = x;
768 for (i = 0; i < P; i++) /*index cube along long axis*/
770 out_i = out_x + oK[i];
771 in_i = in_x + i*maxM*maxN*maxK;
772 for (z = 0; z < K[i]; z++) /*3.l*/
774 out_z = out_i + z;
775 in_z = in_i + z*maxM*maxN;
776 for (y = s_y; y < e_y; y++) /*2.k*/
778 lout[out_z+y*KG] = lin[in_z+y*maxN]; /*out=x*KG*pM+oK[i]+z+y*KG*/
785 /*make axis contiguous again (after AllToAll) and also do local transpose
786 tranpose mayor and middle dimension
787 variables see above
788 the minor, middle, major order is only correct for x,y,z (N,M,K) for the input
789 N,M,K local size
790 MG, global size*/
791 static void joinAxesTrans12(t_complex* lout, const t_complex* lin, int maxN, int maxM, int maxK, int pN,
792 int P, int MG, const int* M, const int* oM, int startx, int startz, int endx, int endz)
794 int i, z, y, x;
795 int out_i, in_i, out_z, in_z, out_x, in_x;
796 int s_x, e_x;
798 for (z = startz; z < endz+1; z++)
800 if (z == startz)
802 s_x = startx;
804 else
806 s_x = 0;
808 if (z == endz)
810 e_x = endx;
812 else
814 e_x = pN;
816 out_z = z*MG*pN;
817 in_z = z*maxM*maxN;
819 for (i = 0; i < P; i++) /*index cube along long axis*/
821 out_i = out_z + oM[i];
822 in_i = in_z + i*maxM*maxN*maxK;
823 for (x = s_x; x < e_x; x++)
825 out_x = out_i + x*MG;
826 in_x = in_i + x;
827 for (y = 0; y < M[i]; y++)
829 lout[out_x+y] = lin[in_x+y*maxN]; /*out=z*MG*pN+oM[i]+x*MG+y*/
837 static void rotate_offsets(int x[])
839 int t = x[0];
840 /* x[0]=x[2];
841 x[2]=x[1];
842 x[1]=t;*/
843 x[0] = x[1];
844 x[1] = x[2];
845 x[2] = t;
848 /*compute the offset to compare or print transposed local data in original input coordinates
849 xs matrix dimension size, xl dimension length, xc decomposition offset
850 s: step in computation = number of transposes*/
851 static void compute_offsets(fft5d_plan plan, int xs[], int xl[], int xc[], int NG[], int s)
853 /* int direction = plan->direction;
854 int fftorder = plan->fftorder;*/
856 int o = 0;
857 int pos[3], i;
858 int *pM = plan->pM, *pK = plan->pK, *oM = plan->oM, *oK = plan->oK,
859 *C = plan->C, *rC = plan->rC;
861 NG[0] = plan->NG; NG[1] = plan->MG; NG[2] = plan->KG;
863 if (!(plan->flags&FFT5D_ORDER_YZ))
865 switch (s)
867 case 0: o = XYZ; break;
868 case 1: o = ZYX; break;
869 case 2: o = YZX; break;
870 default: assert(0);
873 else
875 switch (s)
877 case 0: o = XYZ; break;
878 case 1: o = YXZ; break;
879 case 2: o = ZXY; break;
880 default: assert(0);
884 switch (o)
886 case XYZ: pos[0] = 1; pos[1] = 2; pos[2] = 3; break;
887 case XZY: pos[0] = 1; pos[1] = 3; pos[2] = 2; break;
888 case YXZ: pos[0] = 2; pos[1] = 1; pos[2] = 3; break;
889 case YZX: pos[0] = 3; pos[1] = 1; pos[2] = 2; break;
890 case ZXY: pos[0] = 2; pos[1] = 3; pos[2] = 1; break;
891 case ZYX: pos[0] = 3; pos[1] = 2; pos[2] = 1; break;
893 /*if (debug) printf("pos: %d %d %d\n",pos[0],pos[1],pos[2]);*/
895 /*xs, xl give dimension size and data length in local transposed coordinate system
896 for 0(/1/2): x(/y/z) in original coordinate system*/
897 for (i = 0; i < 3; i++)
899 switch (pos[i])
901 case 1: xs[i] = 1; xc[i] = 0; xl[i] = C[s]; break;
902 case 2: xs[i] = C[s]; xc[i] = oM[s]; xl[i] = pM[s]; break;
903 case 3: xs[i] = C[s]*pM[s]; xc[i] = oK[s]; xl[i] = pK[s]; break;
906 /*input order is different for test program to match FFTW order
907 (important for complex to real)*/
908 if (plan->flags&FFT5D_BACKWARD)
910 rotate_offsets(xs);
911 rotate_offsets(xl);
912 rotate_offsets(xc);
913 rotate_offsets(NG);
914 if (plan->flags&FFT5D_ORDER_YZ)
916 rotate_offsets(xs);
917 rotate_offsets(xl);
918 rotate_offsets(xc);
919 rotate_offsets(NG);
922 if ((plan->flags&FFT5D_REALCOMPLEX) && ((!(plan->flags&FFT5D_BACKWARD) && s == 0) || ((plan->flags&FFT5D_BACKWARD) && s == 2)))
924 xl[0] = rC[s];
928 static void print_localdata(const t_complex* lin, const char* txt, int s, fft5d_plan plan)
930 int x, y, z, l;
931 int *coor = plan->coor;
932 int xs[3], xl[3], xc[3], NG[3];
933 int ll = (plan->flags&FFT5D_REALCOMPLEX) ? 1 : 2;
934 compute_offsets(plan, xs, xl, xc, NG, s);
935 fprintf(debug, txt, coor[0], coor[1]);
936 /*printf("xs: %d %d %d, xl: %d %d %d\n",xs[0],xs[1],xs[2],xl[0],xl[1],xl[2]);*/
937 for (z = 0; z < xl[2]; z++)
939 for (y = 0; y < xl[1]; y++)
941 fprintf(debug, "%d %d: ", coor[0], coor[1]);
942 for (x = 0; x < xl[0]; x++)
944 for (l = 0; l < ll; l++)
946 fprintf(debug, "%f ", reinterpret_cast<const real*>(lin)[(z*xs[2]+y*xs[1])*2+(x*xs[0])*ll+l]);
948 fprintf(debug, ",");
950 fprintf(debug, "\n");
955 void fft5d_execute(fft5d_plan plan, int thread, fft5d_time times)
957 t_complex *lin = plan->lin;
958 t_complex *lout = plan->lout;
959 t_complex *lout2 = plan->lout2;
960 t_complex *lout3 = plan->lout3;
961 t_complex *fftout, *joinin;
963 gmx_fft_t **p1d = plan->p1d;
964 #ifdef FFT5D_MPI_TRANSPOSE
965 FFTW(plan) *mpip = plan->mpip;
966 #endif
967 #if GMX_MPI
968 MPI_Comm *cart = plan->cart;
969 #endif
970 #ifdef NOGMX
971 double time_fft = 0, time_local = 0, time_mpi[2] = {0}, time = 0;
972 #endif
973 int *N = plan->N, *M = plan->M, *K = plan->K, *pN = plan->pN, *pM = plan->pM, *pK = plan->pK,
974 *C = plan->C, *P = plan->P, **iNin = plan->iNin, **oNin = plan->oNin, **iNout = plan->iNout, **oNout = plan->oNout;
975 int s = 0, tstart, tend, bParallelDim;
978 #if GMX_FFT_FFTW3
979 if (plan->p3d)
981 if (thread == 0)
983 #ifdef NOGMX
984 if (times != 0)
986 time = MPI_Wtime();
988 #endif
989 FFTW(execute)(plan->p3d);
990 #ifdef NOGMX
991 if (times != 0)
993 times->fft += MPI_Wtime()-time;
995 #endif
997 return;
999 #endif
1001 s = 0;
1003 /*lin: x,y,z*/
1004 if ((plan->flags&FFT5D_DEBUG) && thread == 0)
1006 print_localdata(lin, "%d %d: copy in lin\n", s, plan);
1009 for (s = 0; s < 2; s++) /*loop over first two FFT steps (corner rotations)*/
1012 #if GMX_MPI
1013 if (GMX_PARALLEL_ENV_INITIALIZED && cart[s] != MPI_COMM_NULL && P[s] > 1)
1015 bParallelDim = 1;
1017 else
1018 #endif
1020 bParallelDim = 0;
1023 /* ---------- START FFT ------------ */
1024 #ifdef NOGMX
1025 if (times != 0 && thread == 0)
1027 time = MPI_Wtime();
1029 #endif
1031 if (bParallelDim || plan->nthreads == 1)
1033 fftout = lout;
1035 else
1037 if (s == 0)
1039 fftout = lout3;
1041 else
1043 fftout = lout2;
1047 tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
1048 if ((plan->flags&FFT5D_REALCOMPLEX) && !(plan->flags&FFT5D_BACKWARD) && s == 0)
1050 gmx_fft_many_1d_real(p1d[s][thread], (plan->flags&FFT5D_BACKWARD) ? GMX_FFT_COMPLEX_TO_REAL : GMX_FFT_REAL_TO_COMPLEX, lin+tstart, fftout+tstart);
1052 else
1054 gmx_fft_many_1d( p1d[s][thread], (plan->flags&FFT5D_BACKWARD) ? GMX_FFT_BACKWARD : GMX_FFT_FORWARD, lin+tstart, fftout+tstart);
1058 #ifdef NOGMX
1059 if (times != NULL && thread == 0)
1061 time_fft += MPI_Wtime()-time;
1063 #endif
1064 if ((plan->flags&FFT5D_DEBUG) && thread == 0)
1066 print_localdata(lout, "%d %d: FFT %d\n", s, plan);
1068 /* ---------- END FFT ------------ */
1070 /* ---------- START SPLIT + TRANSPOSE------------ (if parallel in in this dimension)*/
1071 if (bParallelDim)
1073 #ifdef NOGMX
1074 if (times != NULL && thread == 0)
1076 time = MPI_Wtime();
1078 #endif
1079 /*prepare for A
1080 llToAll
1081 1. (most outer) axes (x) is split into P[s] parts of size N[s]
1082 for sending*/
1083 if (pM[s] > 0)
1085 tend = ((thread+1)*pM[s]*pK[s]/plan->nthreads);
1086 tstart /= C[s];
1087 splitaxes(lout2, lout, N[s], M[s], K[s], pM[s], P[s], C[s], iNout[s], oNout[s], tstart%pM[s], tstart/pM[s], tend%pM[s], tend/pM[s]);
1089 #pragma omp barrier /*barrier required before AllToAll (all input has to be their) - before timing to make timing more acurate*/
1090 #ifdef NOGMX
1091 if (times != NULL && thread == 0)
1093 time_local += MPI_Wtime()-time;
1095 #endif
1097 /* ---------- END SPLIT , START TRANSPOSE------------ */
1099 if (thread == 0)
1101 #ifdef NOGMX
1102 if (times != 0)
1104 time = MPI_Wtime();
1106 #else
1107 wallcycle_start(times, ewcPME_FFTCOMM);
1108 #endif
1109 #ifdef FFT5D_MPI_TRANSPOSE
1110 FFTW(execute)(mpip[s]);
1111 #else
1112 #if GMX_MPI
1113 if ((s == 0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s == 1 && (plan->flags&FFT5D_ORDER_YZ)))
1115 MPI_Alltoall(reinterpret_cast<real *>(lout2), N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real), GMX_MPI_REAL, reinterpret_cast<real *>(lout3), N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real), GMX_MPI_REAL, cart[s]);
1117 else
1119 MPI_Alltoall(reinterpret_cast<real *>(lout2), N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real), GMX_MPI_REAL, reinterpret_cast<real *>(lout3), N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real), GMX_MPI_REAL, cart[s]);
1121 #else
1122 gmx_incons("fft5d MPI call without MPI configuration");
1123 #endif /*GMX_MPI*/
1124 #endif /*FFT5D_MPI_TRANSPOSE*/
1125 #ifdef NOGMX
1126 if (times != 0)
1128 time_mpi[s] = MPI_Wtime()-time;
1130 #else
1131 wallcycle_stop(times, ewcPME_FFTCOMM);
1132 #endif
1133 } /*master*/
1134 } /* bPrallelDim */
1135 #pragma omp barrier /*both needed for parallel and non-parallel dimension (either have to wait on data from AlltoAll or from last FFT*/
1137 /* ---------- END SPLIT + TRANSPOSE------------ */
1139 /* ---------- START JOIN ------------ */
1140 #ifdef NOGMX
1141 if (times != NULL && thread == 0)
1143 time = MPI_Wtime();
1145 #endif
1147 if (bParallelDim)
1149 joinin = lout3;
1151 else
1153 joinin = fftout;
1155 /*bring back in matrix form
1156 thus make new 1. axes contiguos
1157 also local transpose 1 and 2/3
1158 runs on thread used for following FFT (thus needing a barrier before but not afterwards)
1160 if ((s == 0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s == 1 && (plan->flags&FFT5D_ORDER_YZ)))
1162 if (pM[s] > 0)
1164 tstart = ( thread *pM[s]*pN[s]/plan->nthreads);
1165 tend = ((thread+1)*pM[s]*pN[s]/plan->nthreads);
1166 joinAxesTrans13(lin, joinin, N[s], pM[s], K[s], pM[s], P[s], C[s+1], iNin[s+1], oNin[s+1], tstart%pM[s], tstart/pM[s], tend%pM[s], tend/pM[s]);
1169 else
1171 if (pN[s] > 0)
1173 tstart = ( thread *pK[s]*pN[s]/plan->nthreads);
1174 tend = ((thread+1)*pK[s]*pN[s]/plan->nthreads);
1175 joinAxesTrans12(lin, joinin, N[s], M[s], pK[s], pN[s], P[s], C[s+1], iNin[s+1], oNin[s+1], tstart%pN[s], tstart/pN[s], tend%pN[s], tend/pN[s]);
1179 #ifdef NOGMX
1180 if (times != NULL && thread == 0)
1182 time_local += MPI_Wtime()-time;
1184 #endif
1185 if ((plan->flags&FFT5D_DEBUG) && thread == 0)
1187 print_localdata(lin, "%d %d: tranposed %d\n", s+1, plan);
1189 /* ---------- END JOIN ------------ */
1191 /*if (debug) print_localdata(lin, "%d %d: transposed x-z\n", N1, M0, K, ZYX, coor);*/
1192 } /* for(s=0;s<2;s++) */
1193 #ifdef NOGMX
1194 if (times != NULL && thread == 0)
1196 time = MPI_Wtime();
1198 #endif
1200 if (plan->flags&FFT5D_INPLACE)
1202 lout = lin; /*in place currently not supported*/
1205 /* ----------- FFT ----------- */
1206 tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
1207 if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD))
1209 gmx_fft_many_1d_real(p1d[s][thread], (plan->flags&FFT5D_BACKWARD) ? GMX_FFT_COMPLEX_TO_REAL : GMX_FFT_REAL_TO_COMPLEX, lin+tstart, lout+tstart);
1211 else
1213 gmx_fft_many_1d( p1d[s][thread], (plan->flags&FFT5D_BACKWARD) ? GMX_FFT_BACKWARD : GMX_FFT_FORWARD, lin+tstart, lout+tstart);
1215 /* ------------ END FFT ---------*/
1217 #ifdef NOGMX
1218 if (times != NULL && thread == 0)
1220 time_fft += MPI_Wtime()-time;
1222 times->fft += time_fft;
1223 times->local += time_local;
1224 times->mpi2 += time_mpi[1];
1225 times->mpi1 += time_mpi[0];
1227 #endif
1229 if ((plan->flags&FFT5D_DEBUG) && thread == 0)
1231 print_localdata(lout, "%d %d: FFT %d\n", s, plan);
1235 void fft5d_destroy(fft5d_plan plan)
1237 int s, t;
1239 for (s = 0; s < 3; s++)
1241 if (plan->p1d[s])
1243 for (t = 0; t < plan->nthreads; t++)
1245 gmx_many_fft_destroy(plan->p1d[s][t]);
1247 free(plan->p1d[s]);
1249 if (plan->iNin[s])
1251 free(plan->iNin[s]);
1252 plan->iNin[s] = nullptr;
1254 if (plan->oNin[s])
1256 free(plan->oNin[s]);
1257 plan->oNin[s] = nullptr;
1259 if (plan->iNout[s])
1261 free(plan->iNout[s]);
1262 plan->iNout[s] = nullptr;
1264 if (plan->oNout[s])
1266 free(plan->oNout[s]);
1267 plan->oNout[s] = nullptr;
1270 #if GMX_FFT_FFTW3
1271 FFTW_LOCK;
1272 #ifdef FFT5D_MPI_TRANSPOS
1273 for (s = 0; s < 2; s++)
1275 FFTW(destroy_plan)(plan->mpip[s]);
1277 #endif /* FFT5D_MPI_TRANSPOS */
1278 if (plan->p3d)
1280 FFTW(destroy_plan)(plan->p3d);
1282 FFTW_UNLOCK;
1283 #endif /* GMX_FFT_FFTW3 */
1285 if (!(plan->flags&FFT5D_NOMALLOC))
1287 // only needed for PME GPU mixed mode
1288 if (plan->pinningPolicy == gmx::PinningPolicy::CanBePinned &&
1289 isHostMemoryPinned(plan->lin))
1291 gmx::unpinBuffer(plan->lin);
1293 sfree_aligned(plan->lin);
1294 sfree_aligned(plan->lout);
1295 if (plan->nthreads > 1)
1297 sfree_aligned(plan->lout2);
1298 sfree_aligned(plan->lout3);
1302 #ifdef FFT5D_THREADS
1303 #ifdef FFT5D_FFTW_THREADS
1304 /*FFTW(cleanup_threads)();*/
1305 #endif
1306 #endif
1308 free(plan);
1311 /*Is this better than direct access of plan? enough data?
1312 here 0,1 reference divided by which processor grid dimension (not FFT step!)*/
1313 void fft5d_local_size(fft5d_plan plan, int* N1, int* M0, int* K0, int* K1, int** coor)
1315 *N1 = plan->N[0];
1316 *M0 = plan->M[0];
1317 *K1 = plan->K[0];
1318 *K0 = plan->N[1];
1320 *coor = plan->coor;
1324 /*same as fft5d_plan_3d but with cartesian coordinator and automatic splitting
1325 of processor dimensions*/
1326 fft5d_plan fft5d_plan_3d_cart(int NG, int MG, int KG, MPI_Comm comm, int P0, int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads)
1328 MPI_Comm cart[2] = {MPI_COMM_NULL, MPI_COMM_NULL};
1329 #if GMX_MPI
1330 int size = 1, prank = 0;
1331 int P[2];
1332 int coor[2];
1333 int wrap[] = {0, 0};
1334 MPI_Comm gcart;
1335 int rdim1[] = {0, 1}, rdim2[] = {1, 0};
1337 MPI_Comm_size(comm, &size);
1338 MPI_Comm_rank(comm, &prank);
1340 if (P0 == 0)
1342 P0 = lfactor(size);
1344 if (size%P0 != 0)
1346 if (prank == 0)
1348 printf("FFT5D: WARNING: Number of ranks %d not evenly divisible by %d\n", size, P0);
1350 P0 = lfactor(size);
1353 P[0] = P0; P[1] = size/P0; /*number of processors in the two dimensions*/
1355 /*Difference between x-y-z regarding 2d decomposition is whether they are
1356 distributed along axis 1, 2 or both*/
1358 MPI_Cart_create(comm, 2, P, wrap, 1, &gcart); /*parameter 4: value 1: reorder*/
1359 MPI_Cart_get(gcart, 2, P, wrap, coor);
1360 MPI_Cart_sub(gcart, rdim1, &cart[0]);
1361 MPI_Cart_sub(gcart, rdim2, &cart[1]);
1362 #else
1363 (void)P0;
1364 (void)comm;
1365 #endif
1366 return fft5d_plan_3d(NG, MG, KG, cart, flags, rlin, rlout, rlout2, rlout3, nthreads);
1371 /*prints in original coordinate system of data (as the input to FFT)*/
1372 void fft5d_compare_data(const t_complex* lin, const t_complex* in, fft5d_plan plan, int bothLocal, int normalize)
1374 int xs[3], xl[3], xc[3], NG[3];
1375 int x, y, z, l;
1376 int *coor = plan->coor;
1377 int ll = 2; /*compare ll values per element (has to be 2 for complex)*/
1378 if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD))
1380 ll = 1;
1383 compute_offsets(plan, xs, xl, xc, NG, 2);
1384 if (plan->flags&FFT5D_DEBUG)
1386 printf("Compare2\n");
1388 for (z = 0; z < xl[2]; z++)
1390 for (y = 0; y < xl[1]; y++)
1392 if (plan->flags&FFT5D_DEBUG)
1394 printf("%d %d: ", coor[0], coor[1]);
1396 for (x = 0; x < xl[0]; x++)
1398 for (l = 0; l < ll; l++) /*loop over real/complex parts*/
1400 real a, b;
1401 a = reinterpret_cast<const real*>(lin)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1402 if (normalize)
1404 a /= plan->rC[0]*plan->rC[1]*plan->rC[2];
1406 if (!bothLocal)
1408 b = reinterpret_cast<const real*>(in)[((z+xc[2])*NG[0]*NG[1]+(y+xc[1])*NG[0])*2+(x+xc[0])*ll+l];
1410 else
1412 b = reinterpret_cast<const real*>(in)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1414 if (plan->flags&FFT5D_DEBUG)
1416 printf("%f %f, ", a, b);
1418 else
1420 if (std::fabs(a-b) > 2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS)
1422 printf("result incorrect on %d,%d at %d,%d,%d: FFT5D:%f reference:%f\n", coor[0], coor[1], x, y, z, a, b);
1424 /* assert(fabs(a-b)<2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS);*/
1427 if (plan->flags&FFT5D_DEBUG)
1429 printf(",");
1432 if (plan->flags&FFT5D_DEBUG)
1434 printf("\n");