2 * Copyright (c) 1997-1999 Massachusetts Institute of Technology
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 /* the number of buffers to use for buffered transforms: */
25 #define FFTWND_NBUFFERS 8
27 /* the default number of buffers to use: */
28 #define FFTWND_DEFAULT_NBUFFERS 0
30 /* the number of "padding" elements between consecutive buffer lines */
31 #define FFTWND_BUFFER_PADDING 8
33 static void destroy_plan_array(int rank
, fftw_plan
*plans
);
35 static void init_test_array(fftw_complex
*arr
, int stride
, int n
)
39 for (j
= 0; j
< n
; ++j
) {
40 c_re(arr
[stride
* j
]) = 0.0;
41 c_im(arr
[stride
* j
]) = 0.0;
46 * Same as fftw_measure_runtime, except for fftwnd plan.
48 double fftwnd_measure_runtime(fftwnd_plan plan
,
49 fftw_complex
*in
, int istride
,
50 fftw_complex
*out
, int ostride
)
52 fftw_time begin
, end
, start
;
62 for (i
= 0; i
< plan
->rank
; ++i
)
70 init_test_array(in
, istride
, n
);
72 start
= fftw_get_time();
73 /* repeat the measurement FFTW_TIME_REPEAT times */
74 for (repeat
= 0; repeat
< FFTW_TIME_REPEAT
; ++repeat
) {
75 begin
= fftw_get_time();
76 for (i
= 0; i
< iter
; ++i
) {
77 fftwnd(plan
, 1, in
, istride
, 0, out
, ostride
, 0);
79 end
= fftw_get_time();
81 t
= fftw_time_to_sec(fftw_time_diff(end
, begin
));
87 /* do not run for too long */
88 t
= fftw_time_to_sec(fftw_time_diff(end
, start
));
89 if (t
> FFTW_TIME_LIMIT
)
93 if (tmin
>= FFTW_TIME_MIN
)
99 tmin
/= (double) iter
;
100 tmax
/= (double) iter
;
105 /********************** Initializing the FFTWND Plan ***********************/
107 /* Initialize everything except for the 1D plans and the work array: */
108 fftwnd_plan
fftwnd_create_plan_aux(int rank
, const int *n
,
109 fftw_direction dir
, int flags
)
117 for (i
= 0; i
< rank
; ++i
)
121 p
= (fftwnd_plan
) fftw_malloc(sizeof(fftwnd_data
));
130 p
->is_in_place
= flags
& FFTW_IN_PLACE
;
138 p
->n
= (int *) fftw_malloc(sizeof(int) * rank
);
139 p
->n_before
= (int *) fftw_malloc(sizeof(int) * rank
);
140 p
->n_after
= (int *) fftw_malloc(sizeof(int) * rank
);
142 p
->n_after
[rank
- 1] = 1;
144 for (i
= 0; i
< rank
; ++i
) {
148 p
->n_before
[i
] = p
->n_before
[i
- 1] * n
[i
- 1];
149 p
->n_after
[rank
- 1 - i
] = p
->n_after
[rank
- i
] * n
[rank
- i
];
156 /* create an empty new array of rank 1d plans */
157 fftw_plan
*fftwnd_new_plan_array(int rank
)
162 plans
= (fftw_plan
*) fftw_malloc(rank
* sizeof(fftw_plan
));
165 for (i
= 0; i
< rank
; ++i
)
171 * create an array of plans using the ordinary 1d fftw_create_plan,
172 * which allocates its own array and creates plans optimized for
175 fftw_plan
*fftwnd_create_plans_generic(fftw_plan
*plans
,
176 int rank
, const int *n
,
177 fftw_direction dir
, int flags
)
186 for (i
= 0; i
< rank
; ++i
) {
187 if (i
< rank
- 1 || (flags
& FFTW_IN_PLACE
)) {
189 * fft's except the last dimension are always in-place
191 cur_flags
= flags
| FFTW_IN_PLACE
;
192 for (j
= i
- 1; j
>= 0 && n
[i
] != n
[j
]; --j
);
196 * we must create a separate plan for the last
204 * If a plan already exists for this size
209 /* generate a new plan: */
210 plans
[i
] = fftw_create_plan(n
[i
], dir
, cur_flags
);
212 destroy_plan_array(rank
, plans
);
221 static int get_maxdim(int rank
, const int *n
, int flags
)
226 for (i
= 0; i
< rank
- 1; ++i
)
229 if (rank
> 0 && flags
& FFTW_IN_PLACE
&& n
[rank
- 1] > maxdim
)
230 maxdim
= n
[rank
- 1];
235 /* compute number of elements required for work array (has to
236 be big enough to hold ncopies of the largest dimension in
237 n that will need an in-place transform. */
238 int fftwnd_work_size(int rank
, const int *n
, int flags
, int ncopies
)
240 return (ncopies
* get_maxdim(rank
, n
, flags
)
241 + (ncopies
- 1) * FFTWND_BUFFER_PADDING
);
245 * create plans using the fftw_create_plan_specific planner, which
246 * allows us to create plans for each dimension that are specialized
247 * for the strides that we are going to use.
249 fftw_plan
*fftwnd_create_plans_specific(fftw_plan
*plans
,
250 int rank
, const int *n
,
252 fftw_direction dir
, int flags
,
253 fftw_complex
*in
, int istride
,
254 fftw_complex
*out
, int ostride
)
260 int i
, stride
, cur_flags
;
261 fftw_complex
*work
= 0;
264 nwork
= fftwnd_work_size(rank
, n
, flags
, 1);
266 work
= (fftw_complex
*)fftw_malloc(nwork
* sizeof(fftw_complex
));
268 for (i
= 0; i
< rank
; ++i
) {
269 /* fft's except the last dimension are always in-place */
271 cur_flags
= flags
| FFTW_IN_PLACE
;
275 /* stride for transforming ith dimension */
278 if (cur_flags
& FFTW_IN_PLACE
)
279 plans
[i
] = fftw_create_plan_specific(n
[i
], dir
, cur_flags
,
280 in
, istride
* stride
,
283 plans
[i
] = fftw_create_plan_specific(n
[i
], dir
, cur_flags
,
284 in
, istride
* stride
,
285 out
, ostride
* stride
);
287 destroy_plan_array(rank
, plans
);
300 * Create an fftwnd_plan specialized for specific arrays. (These
301 * arrays are ignored, however, if they are NULL or if the flags do
302 * not include FFTW_MEASURE.) The main advantage of being provided
303 * arrays like this is that we can do runtime timing measurements of
304 * our options, without worrying about allocating excessive scratch
307 fftwnd_plan
fftwnd_create_plan_specific(int rank
, const int *n
,
308 fftw_direction dir
, int flags
,
309 fftw_complex
*in
, int istride
,
310 fftw_complex
*out
, int ostride
)
314 if (!(p
= fftwnd_create_plan_aux(rank
, n
, dir
, flags
)))
317 if (!(flags
& FFTW_MEASURE
) || in
== 0
318 || (!p
->is_in_place
&& out
== 0)) {
320 /**** use default plan ****/
322 p
->plans
= fftwnd_create_plans_generic(fftwnd_new_plan_array(rank
),
323 rank
, n
, dir
, flags
);
325 fftwnd_destroy_plan(p
);
328 if (flags
& FFTWND_FORCE_BUFFERED
)
329 p
->nbuffers
= FFTWND_NBUFFERS
;
331 p
->nbuffers
= FFTWND_DEFAULT_NBUFFERS
;
333 p
->nwork
= fftwnd_work_size(rank
, n
, flags
, p
->nbuffers
+ 1);
334 if (p
->nwork
&& !(flags
& FFTW_THREADSAFE
)) {
335 p
->work
= (fftw_complex
*) fftw_malloc(p
->nwork
336 * sizeof(fftw_complex
));
338 fftwnd_destroy_plan(p
);
343 /**** use runtime measurements to pick plan ****/
345 fftw_plan
*plans_buf
, *plans_nobuf
;
346 double t_buf
, t_nobuf
;
348 p
->nwork
= fftwnd_work_size(rank
, n
, flags
, FFTWND_NBUFFERS
+ 1);
349 if (p
->nwork
&& !(flags
& FFTW_THREADSAFE
)) {
350 p
->work
= (fftw_complex
*) fftw_malloc(p
->nwork
351 * sizeof(fftw_complex
));
353 fftwnd_destroy_plan(p
);
358 p
->work
= (fftw_complex
*) NULL
;
360 /* two possible sets of 1D plans: */
361 plans_buf
= fftwnd_create_plans_generic(fftwnd_new_plan_array(rank
),
362 rank
, n
, dir
, flags
);
364 fftwnd_create_plans_specific(fftwnd_new_plan_array(rank
),
365 rank
, n
, p
->n_after
, dir
,
368 if (!plans_buf
|| !plans_nobuf
) {
369 destroy_plan_array(rank
, plans_nobuf
);
370 destroy_plan_array(rank
, plans_buf
);
371 fftwnd_destroy_plan(p
);
374 /* time the two possible plans */
375 p
->plans
= plans_nobuf
;
377 p
->nwork
= fftwnd_work_size(rank
, n
, flags
, p
->nbuffers
+ 1);
378 t_nobuf
= fftwnd_measure_runtime(p
, in
, istride
, out
, ostride
);
379 p
->plans
= plans_buf
;
380 p
->nbuffers
= FFTWND_NBUFFERS
;
381 p
->nwork
= fftwnd_work_size(rank
, n
, flags
, p
->nbuffers
+ 1);
382 t_buf
= fftwnd_measure_runtime(p
, in
, istride
, out
, ostride
);
384 /* pick the better one: */
385 if (t_nobuf
< t_buf
) { /* use unbuffered transform */
386 p
->plans
= plans_nobuf
;
389 /* work array is unnecessarily large */
394 destroy_plan_array(rank
, plans_buf
);
396 /* allocate a work array of the correct size: */
397 p
->nwork
= fftwnd_work_size(rank
, n
, flags
, p
->nbuffers
+ 1);
398 if (p
->nwork
&& !(flags
& FFTW_THREADSAFE
)) {
399 p
->work
= (fftw_complex
*) fftw_malloc(p
->nwork
400 * sizeof(fftw_complex
));
402 fftwnd_destroy_plan(p
);
406 } else { /* use buffered transform */
407 destroy_plan_array(rank
, plans_nobuf
);
414 fftwnd_plan
fftw2d_create_plan_specific(int nx
, int ny
,
415 fftw_direction dir
, int flags
,
416 fftw_complex
*in
, int istride
,
417 fftw_complex
*out
, int ostride
)
424 return fftwnd_create_plan_specific(2, n
, dir
, flags
,
425 in
, istride
, out
, ostride
);
428 fftwnd_plan
fftw3d_create_plan_specific(int nx
, int ny
, int nz
,
429 fftw_direction dir
, int flags
,
430 fftw_complex
*in
, int istride
,
431 fftw_complex
*out
, int ostride
)
439 return fftwnd_create_plan_specific(3, n
, dir
, flags
,
440 in
, istride
, out
, ostride
);
443 /* Create a generic fftwnd plan: */
445 fftwnd_plan
fftwnd_create_plan(int rank
, const int *n
,
446 fftw_direction dir
, int flags
)
448 return fftwnd_create_plan_specific(rank
, n
, dir
, flags
, 0, 1, 0, 1);
451 fftwnd_plan
fftw2d_create_plan(int nx
, int ny
,
452 fftw_direction dir
, int flags
)
454 return fftw2d_create_plan_specific(nx
, ny
, dir
, flags
, 0, 1, 0, 1);
457 fftwnd_plan
fftw3d_create_plan(int nx
, int ny
, int nz
,
458 fftw_direction dir
, int flags
)
460 return fftw3d_create_plan_specific(nx
, ny
, nz
, dir
, flags
, 0, 1, 0, 1);
463 /************************ Freeing the FFTWND Plan ************************/
465 static void destroy_plan_array(int rank
, fftw_plan
*plans
)
470 for (i
= 0; i
< rank
; ++i
) {
472 j
>= 0 && plans
[i
] != plans
[j
];
474 if (j
< 0 && plans
[i
])
475 fftw_destroy_plan(plans
[i
]);
481 void fftwnd_destroy_plan(fftwnd_plan plan
)
484 destroy_plan_array(plan
->rank
, plan
->plans
);
490 fftw_free(plan
->n_before
);
493 fftw_free(plan
->n_after
);
496 fftw_free(plan
->work
);
502 /************************ Printing the FFTWND Plan ************************/
504 void fftwnd_fprint_plan(FILE *f
, fftwnd_plan plan
)
509 if (plan
->rank
== 0) {
510 fprintf(f
, "plan for rank 0 (null) transform.\n");
513 fprintf(f
, "plan for ");
514 for (i
= 0; i
< plan
->rank
; ++i
)
515 fprintf(f
, "%s%d", i
? "x" : "", plan
->n
[i
]);
516 fprintf(f
, " transform:\n");
518 if (plan
->nbuffers
> 0)
519 fprintf(f
, " -- using buffered transforms (%d buffers)\n",
522 fprintf(f
, " -- using unbuffered transform\n");
524 for (i
= 0; i
< plan
->rank
; ++i
) {
525 fprintf(f
, "* dimension %d (size %d) ", i
, plan
->n
[i
]);
527 for (j
= i
- 1; j
>= 0; --j
)
528 if (plan
->plans
[j
] == plan
->plans
[i
])
532 fftw_fprint_plan(f
, plan
->plans
[i
]);
534 fprintf(f
, "plan is same as dimension %d plan.\n", j
);
539 void fftwnd_print_plan(fftwnd_plan plan
)
541 fftwnd_fprint_plan(stdout
, plan
);
544 /********************* Buffered FFTW (in-place) *********************/
546 void fftw_buffered(fftw_plan p
, int howmany
,
547 fftw_complex
*in
, int istride
, int idist
,
549 int nbuffers
, fftw_complex
*buffers
)
554 nb
= n
+ FFTWND_BUFFER_PADDING
;
557 for (; i
<= howmany
- nbuffers
; i
+= nbuffers
) {
558 fftw_complex
*cur_in
= in
+ i
* idist
;
562 * First, copy nbuffers strided arrays to the
563 * contiguous buffer arrays (reading consecutive
564 * locations, assuming that idist is 1):
566 for (j
= 0; j
< n
; ++j
) {
567 fftw_complex
*cur_in2
= cur_in
+ j
* istride
;
568 fftw_complex
*cur_buffers
= buffers
+ j
;
570 for (buf
= 0; buf
<= nbuffers
- 4; buf
+= 4) {
571 *cur_buffers
= *cur_in2
;
572 *(cur_buffers
+= nb
) = *(cur_in2
+= idist
);
573 *(cur_buffers
+= nb
) = *(cur_in2
+= idist
);
574 *(cur_buffers
+= nb
) = *(cur_in2
+= idist
);
578 for (; buf
< nbuffers
; ++buf
) {
579 *cur_buffers
= *cur_in2
;
586 * Now, compute the FFTs in the buffers (in-place
589 fftw(p
, nbuffers
, buffers
, 1, nb
, work
, 1, 0);
592 * Finally, copy the results back from the contiguous
593 * buffers to the strided arrays (writing consecutive
596 for (j
= 0; j
< n
; ++j
) {
597 fftw_complex
*cur_in2
= cur_in
+ j
* istride
;
598 fftw_complex
*cur_buffers
= buffers
+ j
;
600 for (buf
= 0; buf
<= nbuffers
- 4; buf
+= 4) {
601 *cur_in2
= *cur_buffers
;
602 *(cur_in2
+= idist
) = *(cur_buffers
+= nb
);
603 *(cur_in2
+= idist
) = *(cur_buffers
+= nb
);
604 *(cur_in2
+= idist
) = *(cur_buffers
+= nb
);
608 for (; buf
< nbuffers
; ++buf
) {
609 *cur_in2
= *cur_buffers
;
617 * we skip howmany % nbuffers ffts at the end of the loop,
618 * so we have to go back and do them:
620 nbuffers
= howmany
- i
;
621 } while (i
< howmany
);
624 /********************* Computing the N-Dimensional FFT *********************/
626 void fftwnd_aux(fftwnd_plan p
, int cur_dim
,
627 fftw_complex
*in
, int istride
,
628 fftw_complex
*out
, int ostride
,
631 int n_after
= p
->n_after
[cur_dim
], n
= p
->n
[cur_dim
];
633 if (cur_dim
== p
->rank
- 2) {
634 /* just do the last dimension directly: */
636 fftw(p
->plans
[p
->rank
- 1], n
,
637 in
, istride
, n_after
* istride
,
640 fftw(p
->plans
[p
->rank
- 1], n
,
641 in
, istride
, n_after
* istride
,
642 out
, ostride
, n_after
* ostride
);
643 } else { /* we have at least two dimensions to go */
647 * process the subsequent dimensions recursively, in hyperslabs,
648 * to get maximum locality:
650 for (i
= 0; i
< n
; ++i
)
651 fftwnd_aux(p
, cur_dim
+ 1,
652 in
+ i
* n_after
* istride
, istride
,
653 out
+ i
* n_after
* ostride
, ostride
, work
);
656 /* do the current dimension (in-place): */
657 if (p
->nbuffers
== 0) {
658 fftw(p
->plans
[cur_dim
], n_after
,
659 out
, n_after
* ostride
, ostride
,
661 } else /* using contiguous copy buffers: */
662 fftw_buffered(p
->plans
[cur_dim
], n_after
,
663 out
, n_after
* ostride
, ostride
,
664 work
, p
->nbuffers
, work
+ n
);
668 * alternate version of fftwnd_aux -- this version pushes the howmany
669 * loop down to the leaves of the computation, for greater locality in
670 * cases where dist < stride
672 void fftwnd_aux_howmany(fftwnd_plan p
, int cur_dim
,
674 fftw_complex
*in
, int istride
, int idist
,
675 fftw_complex
*out
, int ostride
, int odist
,
678 int n_after
= p
->n_after
[cur_dim
], n
= p
->n
[cur_dim
];
681 if (cur_dim
== p
->rank
- 2) {
682 /* just do the last dimension directly: */
684 for (k
= 0; k
< n
; ++k
)
685 fftw(p
->plans
[p
->rank
- 1], howmany
,
686 in
+ k
* n_after
* istride
, istride
, idist
,
689 for (k
= 0; k
< n
; ++k
)
690 fftw(p
->plans
[p
->rank
- 1], howmany
,
691 in
+ k
* n_after
* istride
, istride
, idist
,
692 out
+ k
* n_after
* ostride
, ostride
, odist
);
693 } else { /* we have at least two dimensions to go */
697 * process the subsequent dimensions recursively, in
698 * hyperslabs, to get maximum locality:
700 for (i
= 0; i
< n
; ++i
)
701 fftwnd_aux_howmany(p
, cur_dim
+ 1, howmany
,
702 in
+ i
* n_after
* istride
, istride
, idist
,
703 out
+ i
* n_after
* ostride
, ostride
, odist
,
707 /* do the current dimension (in-place): */
708 if (p
->nbuffers
== 0)
709 for (k
= 0; k
< n_after
; ++k
)
710 fftw(p
->plans
[cur_dim
], howmany
,
711 out
+ k
* ostride
, n_after
* ostride
, odist
,
713 else /* using contiguous copy buffers: */
714 for (k
= 0; k
< n_after
; ++k
)
715 fftw_buffered(p
->plans
[cur_dim
], howmany
,
716 out
+ k
* ostride
, n_after
* ostride
, odist
,
717 work
, p
->nbuffers
, work
+ n
);
720 void fftwnd(fftwnd_plan p
, int howmany
,
721 fftw_complex
*in
, int istride
, int idist
,
722 fftw_complex
*out
, int ostride
, int odist
)
727 if (p
->rank
> 0 && (p
->plans
[0]->flags
& FFTW_THREADSAFE
)
728 && p
->nwork
&& p
->work
)
729 fftw_die("bug with FFTW_THREADSAFE flag");
732 if (p
->nwork
&& !p
->work
)
733 work
= (fftw_complex
*) fftw_malloc(p
->nwork
* sizeof(fftw_complex
));
741 if (p
->is_in_place
) /* fft is in-place */
742 fftw(p
->plans
[0], howmany
, in
, istride
, idist
,
745 fftw(p
->plans
[0], howmany
, in
, istride
, idist
,
746 out
, ostride
, odist
);
748 default: /* rank >= 2 */
750 if (p
->is_in_place
) {
755 if (howmany
> 1 && odist
< ostride
)
756 fftwnd_aux_howmany(p
, 0, howmany
,
763 for (i
= 0; i
< howmany
; ++i
)
765 in
+ i
* idist
, istride
,
766 out
+ i
* odist
, ostride
,
772 if (p
->nwork
&& !p
->work
)
777 void fftwnd_one(fftwnd_plan p
, fftw_complex
*in
, fftw_complex
*out
)
779 fftwnd(p
, 1, in
, 1, 1, out
, 1, 1);