changed reading hint
[gromacs/adressmacs.git] / src / fftw / fftwnd.c
blobe4ed2249aa170495027712b4da87a57386c0634a
1 /*
2 * Copyright (c) 1997-1999 Massachusetts Institute of Technology
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 /* $Id$ */
22 #include <fftw-int.h>
24 /* the number of buffers to use for buffered transforms: */
25 #define FFTWND_NBUFFERS 8
27 /* the default number of buffers to use: */
28 #define FFTWND_DEFAULT_NBUFFERS 0
30 /* the number of "padding" elements between consecutive buffer lines */
31 #define FFTWND_BUFFER_PADDING 8
33 static void destroy_plan_array(int rank, fftw_plan *plans);
35 static void init_test_array(fftw_complex *arr, int stride, int n)
37 int j;
39 for (j = 0; j < n; ++j) {
40 c_re(arr[stride * j]) = 0.0;
41 c_im(arr[stride * j]) = 0.0;
46 * Same as fftw_measure_runtime, except for fftwnd plan.
48 double fftwnd_measure_runtime(fftwnd_plan plan,
49 fftw_complex *in, int istride,
50 fftw_complex *out, int ostride)
52 fftw_time begin, end, start;
53 double t, tmax, tmin;
54 int i, iter;
55 int n;
56 int repeat;
58 if (plan->rank == 0)
59 return 0.0;
61 n = 1;
62 for (i = 0; i < plan->rank; ++i)
63 n *= plan->n[i];
65 iter = 1;
67 for (;;) {
68 tmin = 1.0E10;
69 tmax = -1.0E10;
70 init_test_array(in, istride, n);
72 start = fftw_get_time();
73 /* repeat the measurement FFTW_TIME_REPEAT times */
74 for (repeat = 0; repeat < FFTW_TIME_REPEAT; ++repeat) {
75 begin = fftw_get_time();
76 for (i = 0; i < iter; ++i) {
77 fftwnd(plan, 1, in, istride, 0, out, ostride, 0);
79 end = fftw_get_time();
81 t = fftw_time_to_sec(fftw_time_diff(end, begin));
82 if (t < tmin)
83 tmin = t;
84 if (t > tmax)
85 tmax = t;
87 /* do not run for too long */
88 t = fftw_time_to_sec(fftw_time_diff(end, start));
89 if (t > FFTW_TIME_LIMIT)
90 break;
93 if (tmin >= FFTW_TIME_MIN)
94 break;
96 iter *= 2;
99 tmin /= (double) iter;
100 tmax /= (double) iter;
102 return tmin;
105 /********************** Initializing the FFTWND Plan ***********************/
107 /* Initialize everything except for the 1D plans and the work array: */
108 fftwnd_plan fftwnd_create_plan_aux(int rank, const int *n,
109 fftw_direction dir, int flags)
111 int i;
112 fftwnd_plan p;
114 if (rank < 0)
115 return 0;
117 for (i = 0; i < rank; ++i)
118 if (n[i] <= 0)
119 return 0;
121 p = (fftwnd_plan) fftw_malloc(sizeof(fftwnd_data));
122 p->n = 0;
123 p->n_before = 0;
124 p->n_after = 0;
125 p->plans = 0;
126 p->work = 0;
127 p->dir = dir;
129 p->rank = rank;
130 p->is_in_place = flags & FFTW_IN_PLACE;
132 p->nwork = 0;
133 p->nbuffers = 0;
135 if (rank == 0)
136 return 0;
138 p->n = (int *) fftw_malloc(sizeof(int) * rank);
139 p->n_before = (int *) fftw_malloc(sizeof(int) * rank);
140 p->n_after = (int *) fftw_malloc(sizeof(int) * rank);
141 p->n_before[0] = 1;
142 p->n_after[rank - 1] = 1;
144 for (i = 0; i < rank; ++i) {
145 p->n[i] = n[i];
147 if (i) {
148 p->n_before[i] = p->n_before[i - 1] * n[i - 1];
149 p->n_after[rank - 1 - i] = p->n_after[rank - i] * n[rank - i];
153 return p;
156 /* create an empty new array of rank 1d plans */
157 fftw_plan *fftwnd_new_plan_array(int rank)
159 fftw_plan *plans;
160 int i;
162 plans = (fftw_plan *) fftw_malloc(rank * sizeof(fftw_plan));
163 if (!plans)
164 return 0;
165 for (i = 0; i < rank; ++i)
166 plans[i] = 0;
167 return plans;
171 * create an array of plans using the ordinary 1d fftw_create_plan,
172 * which allocates its own array and creates plans optimized for
173 * contiguous data.
175 fftw_plan *fftwnd_create_plans_generic(fftw_plan *plans,
176 int rank, const int *n,
177 fftw_direction dir, int flags)
179 if (rank <= 0)
180 return 0;
182 if (plans) {
183 int i, j;
184 int cur_flags;
186 for (i = 0; i < rank; ++i) {
187 if (i < rank - 1 || (flags & FFTW_IN_PLACE)) {
189 * fft's except the last dimension are always in-place
191 cur_flags = flags | FFTW_IN_PLACE;
192 for (j = i - 1; j >= 0 && n[i] != n[j]; --j);
193 } else {
194 cur_flags = flags;
196 * we must create a separate plan for the last
197 * dimension
199 j = -1;
202 if (j >= 0) {
204 * If a plan already exists for this size
205 * array, reuse it:
207 plans[i] = plans[j];
208 } else {
209 /* generate a new plan: */
210 plans[i] = fftw_create_plan(n[i], dir, cur_flags);
211 if (!plans[i]) {
212 destroy_plan_array(rank, plans);
213 return 0;
218 return plans;
221 static int get_maxdim(int rank, const int *n, int flags)
223 int i;
224 int maxdim = 0;
226 for (i = 0; i < rank - 1; ++i)
227 if (n[i] > maxdim)
228 maxdim = n[i];
229 if (rank > 0 && flags & FFTW_IN_PLACE && n[rank - 1] > maxdim)
230 maxdim = n[rank - 1];
232 return maxdim;
235 /* compute number of elements required for work array (has to
236 be big enough to hold ncopies of the largest dimension in
237 n that will need an in-place transform. */
238 int fftwnd_work_size(int rank, const int *n, int flags, int ncopies)
240 return (ncopies * get_maxdim(rank, n, flags)
241 + (ncopies - 1) * FFTWND_BUFFER_PADDING);
245 * create plans using the fftw_create_plan_specific planner, which
246 * allows us to create plans for each dimension that are specialized
247 * for the strides that we are going to use.
249 fftw_plan *fftwnd_create_plans_specific(fftw_plan *plans,
250 int rank, const int *n,
251 const int *n_after,
252 fftw_direction dir, int flags,
253 fftw_complex *in, int istride,
254 fftw_complex *out, int ostride)
256 if (rank <= 0)
257 return 0;
259 if (plans) {
260 int i, stride, cur_flags;
261 fftw_complex *work = 0;
262 int nwork;
264 nwork = fftwnd_work_size(rank, n, flags, 1);
265 if (nwork)
266 work = (fftw_complex*)fftw_malloc(nwork * sizeof(fftw_complex));
268 for (i = 0; i < rank; ++i) {
269 /* fft's except the last dimension are always in-place */
270 if (i < rank - 1)
271 cur_flags = flags | FFTW_IN_PLACE;
272 else
273 cur_flags = flags;
275 /* stride for transforming ith dimension */
276 stride = n_after[i];
278 if (cur_flags & FFTW_IN_PLACE)
279 plans[i] = fftw_create_plan_specific(n[i], dir, cur_flags,
280 in, istride * stride,
281 work, 1);
282 else
283 plans[i] = fftw_create_plan_specific(n[i], dir, cur_flags,
284 in, istride * stride,
285 out, ostride * stride);
286 if (!plans[i]) {
287 destroy_plan_array(rank, plans);
288 fftw_free(work);
289 return 0;
293 if (work)
294 fftw_free(work);
296 return plans;
300 * Create an fftwnd_plan specialized for specific arrays. (These
301 * arrays are ignored, however, if they are NULL or if the flags do
302 * not include FFTW_MEASURE.) The main advantage of being provided
303 * arrays like this is that we can do runtime timing measurements of
304 * our options, without worrying about allocating excessive scratch
305 * space.
307 fftwnd_plan fftwnd_create_plan_specific(int rank, const int *n,
308 fftw_direction dir, int flags,
309 fftw_complex *in, int istride,
310 fftw_complex *out, int ostride)
312 fftwnd_plan p;
314 if (!(p = fftwnd_create_plan_aux(rank, n, dir, flags)))
315 return 0;
317 if (!(flags & FFTW_MEASURE) || in == 0
318 || (!p->is_in_place && out == 0)) {
320 /**** use default plan ****/
322 p->plans = fftwnd_create_plans_generic(fftwnd_new_plan_array(rank),
323 rank, n, dir, flags);
324 if (!p->plans) {
325 fftwnd_destroy_plan(p);
326 return 0;
328 if (flags & FFTWND_FORCE_BUFFERED)
329 p->nbuffers = FFTWND_NBUFFERS;
330 else
331 p->nbuffers = FFTWND_DEFAULT_NBUFFERS;
333 p->nwork = fftwnd_work_size(rank, n, flags, p->nbuffers + 1);
334 if (p->nwork && !(flags & FFTW_THREADSAFE)) {
335 p->work = (fftw_complex*) fftw_malloc(p->nwork
336 * sizeof(fftw_complex));
337 if (!p->work) {
338 fftwnd_destroy_plan(p);
339 return 0;
342 } else {
343 /**** use runtime measurements to pick plan ****/
345 fftw_plan *plans_buf, *plans_nobuf;
346 double t_buf, t_nobuf;
348 p->nwork = fftwnd_work_size(rank, n, flags, FFTWND_NBUFFERS + 1);
349 if (p->nwork && !(flags & FFTW_THREADSAFE)) {
350 p->work = (fftw_complex*) fftw_malloc(p->nwork
351 * sizeof(fftw_complex));
352 if (!p->work) {
353 fftwnd_destroy_plan(p);
354 return 0;
357 else
358 p->work = (fftw_complex*) NULL;
360 /* two possible sets of 1D plans: */
361 plans_buf = fftwnd_create_plans_generic(fftwnd_new_plan_array(rank),
362 rank, n, dir, flags);
363 plans_nobuf =
364 fftwnd_create_plans_specific(fftwnd_new_plan_array(rank),
365 rank, n, p->n_after, dir,
366 flags, in, istride,
367 out, ostride);
368 if (!plans_buf || !plans_nobuf) {
369 destroy_plan_array(rank, plans_nobuf);
370 destroy_plan_array(rank, plans_buf);
371 fftwnd_destroy_plan(p);
372 return 0;
374 /* time the two possible plans */
375 p->plans = plans_nobuf;
376 p->nbuffers = 0;
377 p->nwork = fftwnd_work_size(rank, n, flags, p->nbuffers + 1);
378 t_nobuf = fftwnd_measure_runtime(p, in, istride, out, ostride);
379 p->plans = plans_buf;
380 p->nbuffers = FFTWND_NBUFFERS;
381 p->nwork = fftwnd_work_size(rank, n, flags, p->nbuffers + 1);
382 t_buf = fftwnd_measure_runtime(p, in, istride, out, ostride);
384 /* pick the better one: */
385 if (t_nobuf < t_buf) { /* use unbuffered transform */
386 p->plans = plans_nobuf;
387 p->nbuffers = 0;
389 /* work array is unnecessarily large */
390 if (p->work)
391 fftw_free(p->work);
392 p->work = 0;
394 destroy_plan_array(rank, plans_buf);
396 /* allocate a work array of the correct size: */
397 p->nwork = fftwnd_work_size(rank, n, flags, p->nbuffers + 1);
398 if (p->nwork && !(flags & FFTW_THREADSAFE)) {
399 p->work = (fftw_complex*) fftw_malloc(p->nwork
400 * sizeof(fftw_complex));
401 if (!p->work) {
402 fftwnd_destroy_plan(p);
403 return 0;
406 } else { /* use buffered transform */
407 destroy_plan_array(rank, plans_nobuf);
411 return p;
414 fftwnd_plan fftw2d_create_plan_specific(int nx, int ny,
415 fftw_direction dir, int flags,
416 fftw_complex *in, int istride,
417 fftw_complex *out, int ostride)
419 int n[2];
421 n[0] = nx;
422 n[1] = ny;
424 return fftwnd_create_plan_specific(2, n, dir, flags,
425 in, istride, out, ostride);
428 fftwnd_plan fftw3d_create_plan_specific(int nx, int ny, int nz,
429 fftw_direction dir, int flags,
430 fftw_complex *in, int istride,
431 fftw_complex *out, int ostride)
433 int n[3];
435 n[0] = nx;
436 n[1] = ny;
437 n[2] = nz;
439 return fftwnd_create_plan_specific(3, n, dir, flags,
440 in, istride, out, ostride);
443 /* Create a generic fftwnd plan: */
445 fftwnd_plan fftwnd_create_plan(int rank, const int *n,
446 fftw_direction dir, int flags)
448 return fftwnd_create_plan_specific(rank, n, dir, flags, 0, 1, 0, 1);
451 fftwnd_plan fftw2d_create_plan(int nx, int ny,
452 fftw_direction dir, int flags)
454 return fftw2d_create_plan_specific(nx, ny, dir, flags, 0, 1, 0, 1);
457 fftwnd_plan fftw3d_create_plan(int nx, int ny, int nz,
458 fftw_direction dir, int flags)
460 return fftw3d_create_plan_specific(nx, ny, nz, dir, flags, 0, 1, 0, 1);
463 /************************ Freeing the FFTWND Plan ************************/
465 static void destroy_plan_array(int rank, fftw_plan *plans)
467 if (plans) {
468 int i, j;
470 for (i = 0; i < rank; ++i) {
471 for (j = i - 1;
472 j >= 0 && plans[i] != plans[j];
473 --j);
474 if (j < 0 && plans[i])
475 fftw_destroy_plan(plans[i]);
477 fftw_free(plans);
481 void fftwnd_destroy_plan(fftwnd_plan plan)
483 if (plan) {
484 destroy_plan_array(plan->rank, plan->plans);
486 if (plan->n)
487 fftw_free(plan->n);
489 if (plan->n_before)
490 fftw_free(plan->n_before);
492 if (plan->n_after)
493 fftw_free(plan->n_after);
495 if (plan->work)
496 fftw_free(plan->work);
498 fftw_free(plan);
502 /************************ Printing the FFTWND Plan ************************/
504 void fftwnd_fprint_plan(FILE *f, fftwnd_plan plan)
506 if (plan) {
507 int i, j;
509 if (plan->rank == 0) {
510 fprintf(f, "plan for rank 0 (null) transform.\n");
511 return;
513 fprintf(f, "plan for ");
514 for (i = 0; i < plan->rank; ++i)
515 fprintf(f, "%s%d", i ? "x" : "", plan->n[i]);
516 fprintf(f, " transform:\n");
518 if (plan->nbuffers > 0)
519 fprintf(f, " -- using buffered transforms (%d buffers)\n",
520 plan->nbuffers);
521 else
522 fprintf(f, " -- using unbuffered transform\n");
524 for (i = 0; i < plan->rank; ++i) {
525 fprintf(f, "* dimension %d (size %d) ", i, plan->n[i]);
527 for (j = i - 1; j >= 0; --j)
528 if (plan->plans[j] == plan->plans[i])
529 break;
531 if (j < 0)
532 fftw_fprint_plan(f, plan->plans[i]);
533 else
534 fprintf(f, "plan is same as dimension %d plan.\n", j);
539 void fftwnd_print_plan(fftwnd_plan plan)
541 fftwnd_fprint_plan(stdout, plan);
544 /********************* Buffered FFTW (in-place) *********************/
546 void fftw_buffered(fftw_plan p, int howmany,
547 fftw_complex *in, int istride, int idist,
548 fftw_complex *work,
549 int nbuffers, fftw_complex *buffers)
551 int i = 0, n, nb;
553 n = p->n;
554 nb = n + FFTWND_BUFFER_PADDING;
556 do {
557 for (; i <= howmany - nbuffers; i += nbuffers) {
558 fftw_complex *cur_in = in + i * idist;
559 int j, buf;
562 * First, copy nbuffers strided arrays to the
563 * contiguous buffer arrays (reading consecutive
564 * locations, assuming that idist is 1):
566 for (j = 0; j < n; ++j) {
567 fftw_complex *cur_in2 = cur_in + j * istride;
568 fftw_complex *cur_buffers = buffers + j;
570 for (buf = 0; buf <= nbuffers - 4; buf += 4) {
571 *cur_buffers = *cur_in2;
572 *(cur_buffers += nb) = *(cur_in2 += idist);
573 *(cur_buffers += nb) = *(cur_in2 += idist);
574 *(cur_buffers += nb) = *(cur_in2 += idist);
575 cur_buffers += nb;
576 cur_in2 += idist;
578 for (; buf < nbuffers; ++buf) {
579 *cur_buffers = *cur_in2;
580 cur_buffers += nb;
581 cur_in2 += idist;
586 * Now, compute the FFTs in the buffers (in-place
587 * using work):
589 fftw(p, nbuffers, buffers, 1, nb, work, 1, 0);
592 * Finally, copy the results back from the contiguous
593 * buffers to the strided arrays (writing consecutive
594 * locations):
596 for (j = 0; j < n; ++j) {
597 fftw_complex *cur_in2 = cur_in + j * istride;
598 fftw_complex *cur_buffers = buffers + j;
600 for (buf = 0; buf <= nbuffers - 4; buf += 4) {
601 *cur_in2 = *cur_buffers;
602 *(cur_in2 += idist) = *(cur_buffers += nb);
603 *(cur_in2 += idist) = *(cur_buffers += nb);
604 *(cur_in2 += idist) = *(cur_buffers += nb);
605 cur_buffers += nb;
606 cur_in2 += idist;
608 for (; buf < nbuffers; ++buf) {
609 *cur_in2 = *cur_buffers;
610 cur_buffers += nb;
611 cur_in2 += idist;
617 * we skip howmany % nbuffers ffts at the end of the loop,
618 * so we have to go back and do them:
620 nbuffers = howmany - i;
621 } while (i < howmany);
624 /********************* Computing the N-Dimensional FFT *********************/
626 void fftwnd_aux(fftwnd_plan p, int cur_dim,
627 fftw_complex *in, int istride,
628 fftw_complex *out, int ostride,
629 fftw_complex *work)
631 int n_after = p->n_after[cur_dim], n = p->n[cur_dim];
633 if (cur_dim == p->rank - 2) {
634 /* just do the last dimension directly: */
635 if (p->is_in_place)
636 fftw(p->plans[p->rank - 1], n,
637 in, istride, n_after * istride,
638 work, 1, 0);
639 else
640 fftw(p->plans[p->rank - 1], n,
641 in, istride, n_after * istride,
642 out, ostride, n_after * ostride);
643 } else { /* we have at least two dimensions to go */
644 int i;
647 * process the subsequent dimensions recursively, in hyperslabs,
648 * to get maximum locality:
650 for (i = 0; i < n; ++i)
651 fftwnd_aux(p, cur_dim + 1,
652 in + i * n_after * istride, istride,
653 out + i * n_after * ostride, ostride, work);
656 /* do the current dimension (in-place): */
657 if (p->nbuffers == 0) {
658 fftw(p->plans[cur_dim], n_after,
659 out, n_after * ostride, ostride,
660 work, 1, 0);
661 } else /* using contiguous copy buffers: */
662 fftw_buffered(p->plans[cur_dim], n_after,
663 out, n_after * ostride, ostride,
664 work, p->nbuffers, work + n);
668 * alternate version of fftwnd_aux -- this version pushes the howmany
669 * loop down to the leaves of the computation, for greater locality in
670 * cases where dist < stride
672 void fftwnd_aux_howmany(fftwnd_plan p, int cur_dim,
673 int howmany,
674 fftw_complex *in, int istride, int idist,
675 fftw_complex *out, int ostride, int odist,
676 fftw_complex *work)
678 int n_after = p->n_after[cur_dim], n = p->n[cur_dim];
679 int k;
681 if (cur_dim == p->rank - 2) {
682 /* just do the last dimension directly: */
683 if (p->is_in_place)
684 for (k = 0; k < n; ++k)
685 fftw(p->plans[p->rank - 1], howmany,
686 in + k * n_after * istride, istride, idist,
687 work, 1, 0);
688 else
689 for (k = 0; k < n; ++k)
690 fftw(p->plans[p->rank - 1], howmany,
691 in + k * n_after * istride, istride, idist,
692 out + k * n_after * ostride, ostride, odist);
693 } else { /* we have at least two dimensions to go */
694 int i;
697 * process the subsequent dimensions recursively, in
698 * hyperslabs, to get maximum locality:
700 for (i = 0; i < n; ++i)
701 fftwnd_aux_howmany(p, cur_dim + 1, howmany,
702 in + i * n_after * istride, istride, idist,
703 out + i * n_after * ostride, ostride, odist,
704 work);
707 /* do the current dimension (in-place): */
708 if (p->nbuffers == 0)
709 for (k = 0; k < n_after; ++k)
710 fftw(p->plans[cur_dim], howmany,
711 out + k * ostride, n_after * ostride, odist,
712 work, 1, 0);
713 else /* using contiguous copy buffers: */
714 for (k = 0; k < n_after; ++k)
715 fftw_buffered(p->plans[cur_dim], howmany,
716 out + k * ostride, n_after * ostride, odist,
717 work, p->nbuffers, work + n);
720 void fftwnd(fftwnd_plan p, int howmany,
721 fftw_complex *in, int istride, int idist,
722 fftw_complex *out, int ostride, int odist)
724 fftw_complex *work;
726 #ifdef FFTW_DEBUG
727 if (p->rank > 0 && (p->plans[0]->flags & FFTW_THREADSAFE)
728 && p->nwork && p->work)
729 fftw_die("bug with FFTW_THREADSAFE flag");
730 #endif
732 if (p->nwork && !p->work)
733 work = (fftw_complex *) fftw_malloc(p->nwork * sizeof(fftw_complex));
734 else
735 work = p->work;
737 switch (p->rank) {
738 case 0:
739 break;
740 case 1:
741 if (p->is_in_place) /* fft is in-place */
742 fftw(p->plans[0], howmany, in, istride, idist,
743 work, 1, 0);
744 else
745 fftw(p->plans[0], howmany, in, istride, idist,
746 out, ostride, odist);
747 break;
748 default: /* rank >= 2 */
750 if (p->is_in_place) {
751 out = in;
752 ostride = istride;
753 odist = idist;
755 if (howmany > 1 && odist < ostride)
756 fftwnd_aux_howmany(p, 0, howmany,
757 in, istride, idist,
758 out, ostride, odist,
759 work);
760 else {
761 int i;
763 for (i = 0; i < howmany; ++i)
764 fftwnd_aux(p, 0,
765 in + i * idist, istride,
766 out + i * odist, ostride,
767 work);
772 if (p->nwork && !p->work)
773 fftw_free(work);
777 void fftwnd_one(fftwnd_plan p, fftw_complex *in, fftw_complex *out)
779 fftwnd(p, 1, in, 1, 1, out, 1, 1);