2 * Copyright (c) 1997-1999 Massachusetts Institute of Technology
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 * rexec_threads.c -- execute the fft in parallel
27 #include <fftw_threads-int.h>
28 #include <rfftw_threads.h>
30 extern void rfftw_strided_copy(int n
, fftw_real
*in
, int ostride
,
32 extern void rfftw_executor_simple(int n
, fftw_real
*in
,
38 static void rexec_simple_threads(int n
, fftw_real
*in
,
54 static void *rexec_simple_thread_r2c(fftw_loop_data
*ldata
)
56 int min
= ldata
->min
, max
= ldata
->max
;
57 rexec_simple_data
*d
= (rexec_simple_data
*) ldata
->data
;
58 int m
= d
->m
, r
= d
->r
;
59 fftw_real
*in
= d
->in
;
60 fftw_real
*out
= d
->out
;
61 fftw_plan_node
*p
= d
->p
;
62 int istride
= d
->istride
, ostride
= d
->ostride
;
63 int nthreads
= d
->nthreads
;
65 for (; min
< max
; ++min
)
66 rexec_simple_threads(m
, in
+ min
* istride
,
67 out
+ min
* (m
* ostride
),
75 static void *rexec_simple_thread_c2r(fftw_loop_data
*ldata
)
77 int min
= ldata
->min
, max
= ldata
->max
;
78 rexec_simple_data
*d
= (rexec_simple_data
*) ldata
->data
;
79 int m
= d
->m
, r
= d
->r
;
80 fftw_real
*in
= d
->in
;
81 fftw_real
*out
= d
->out
;
82 fftw_plan_node
*p
= d
->p
;
83 int istride
= d
->istride
, ostride
= d
->ostride
;
84 int nthreads
= d
->nthreads
;
86 for (; min
< max
; ++min
)
87 rexec_simple_threads(m
, in
+ min
* (m
* istride
),
96 static void spawn_h2hc_recurse_threads(int m
, int r
,
107 d
.in
= in
; d
.out
= out
;
108 d
.p
= p
->nodeu
.hc2hc
.recurse
;
111 d
.nthreads
= nthreads
/ r
;
113 switch (p
->nodeu
.hc2hc
.dir
) {
114 case FFTW_REAL_TO_COMPLEX
:
115 fftw_thread_spawn_loop(r
, nthreads
,
116 rexec_simple_thread_r2c
, &d
);
118 case FFTW_COMPLEX_TO_REAL
:
119 fftw_thread_spawn_loop(r
, nthreads
,
120 rexec_simple_thread_c2r
, &d
);
125 static void rexec_simple_threads(int n
, fftw_real
*in
, fftw_real
*out
,
133 HACK_ALIGN_STACK_ODD();
134 (p
->nodeu
.real2hc
.codelet
) (in
, out
, out
+ n
* ostride
,
135 istride
, ostride
, -ostride
);
139 HACK_ALIGN_STACK_ODD();
140 (p
->nodeu
.hc2real
.codelet
) (in
, in
+ n
* istride
, out
,
141 istride
, -istride
, ostride
);
146 int r
= p
->nodeu
.hc2hc
.size
;
149 fftw_hc2hc_codelet
*codelet
;
153 switch (p
->nodeu
.hc2hc
.dir
) {
154 case FFTW_REAL_TO_COMPLEX
:
155 for (i
= 0; i
< r
; ++i
)
156 rfftw_executor_simple(m
,
158 out
+ i
* (m
* ostride
),
159 p
->nodeu
.hc2hc
.recurse
,
160 istride
* r
, ostride
);
162 W
= p
->nodeu
.hc2hc
.tw
->twarray
;
163 codelet
= p
->nodeu
.hc2hc
.codelet
;
164 HACK_ALIGN_STACK_EVEN();
165 codelet(out
, W
, m
* ostride
, m
, ostride
);
167 case FFTW_COMPLEX_TO_REAL
:
168 W
= p
->nodeu
.hc2hc
.tw
->twarray
;
169 codelet
= p
->nodeu
.hc2hc
.codelet
;
170 HACK_ALIGN_STACK_EVEN();
171 codelet(in
, W
, m
* istride
, m
, istride
);
173 for (i
= 0; i
< r
; ++i
)
174 rfftw_executor_simple(m
,
175 in
+ i
* (m
* istride
),
177 p
->nodeu
.hc2hc
.recurse
,
178 istride
, ostride
* r
);
185 switch (p
->nodeu
.hc2hc
.dir
) {
186 case FFTW_REAL_TO_COMPLEX
:
187 spawn_h2hc_recurse_threads(m
, r
, in
, out
, p
,
191 W
= p
->nodeu
.hc2hc
.tw
->twarray
;
192 codelet
= p
->nodeu
.hc2hc
.codelet
;
193 HACK_ALIGN_STACK_EVEN();
194 codelet(out
, W
, m
* ostride
, m
, ostride
);
197 case FFTW_COMPLEX_TO_REAL
:
198 W
= p
->nodeu
.hc2hc
.tw
->twarray
;
199 codelet
= p
->nodeu
.hc2hc
.codelet
;
200 HACK_ALIGN_STACK_EVEN();
201 codelet(in
, W
, m
* istride
, m
, istride
);
203 spawn_h2hc_recurse_threads(m
, r
, in
, out
, p
,
214 int r
= p
->nodeu
.rgeneric
.size
;
217 fftw_rgeneric_codelet
*codelet
= p
->nodeu
.rgeneric
.codelet
;
218 fftw_complex
*W
= p
->nodeu
.rgeneric
.tw
->twarray
;
221 switch (p
->nodeu
.rgeneric
.dir
) {
222 case FFTW_REAL_TO_COMPLEX
:
223 for (i
= 0; i
< r
; ++i
)
224 rfftw_executor_simple(m
,
226 out
+ i
* (m
* ostride
),
227 p
->nodeu
.rgeneric
.recurse
,
228 istride
* r
, ostride
);
230 codelet(out
, W
, m
, r
, n
, ostride
);
232 case FFTW_COMPLEX_TO_REAL
:
233 codelet(in
, W
, m
, r
, n
, istride
);
235 for (i
= 0; i
< r
; ++i
)
236 rfftw_executor_simple(m
,
237 in
+ i
* m
* istride
,
239 p
->nodeu
.rgeneric
.recurse
,
240 istride
, ostride
* r
);
246 switch (p
->nodeu
.hc2hc
.dir
) {
247 case FFTW_REAL_TO_COMPLEX
:
248 spawn_h2hc_recurse_threads(m
, r
, in
, out
, p
,
251 codelet(out
, W
, m
, r
, n
, ostride
);
253 case FFTW_COMPLEX_TO_REAL
:
254 codelet(in
, W
, m
, r
, n
, istride
);
255 spawn_h2hc_recurse_threads(m
, r
, in
, out
, p
,
266 fftw_die("BUG in rexecutor: invalid plan\n");
271 static void rexecutor_simple_inplace_threads(int n
, fftw_real
*in
,
279 HACK_ALIGN_STACK_ODD();
280 (p
->nodeu
.real2hc
.codelet
) (in
, in
, in
+ n
* istride
,
281 istride
, istride
, -istride
);
285 HACK_ALIGN_STACK_ODD();
286 (p
->nodeu
.hc2real
.codelet
) (in
, in
+ n
* istride
, in
,
287 istride
, -istride
, istride
);
297 tmp
= (fftw_real
*) fftw_malloc(n
* sizeof(fftw_real
));
299 rexec_simple_threads(n
, in
, tmp
, p
, istride
, 1, nthreads
);
300 rfftw_strided_copy(n
, tmp
, istride
, in
);
310 fftw_real2hc_codelet
*r2c_codelet
;
311 fftw_hc2real_codelet
*c2r_codelet
;
317 int idist
, odist
, istride
, ostride
;
320 static void *rexec_many_r2c_codelet_thread(fftw_loop_data
*ldata
)
322 int min
= ldata
->min
, max
= ldata
->max
;
323 rexec_many_data
*d
= (rexec_many_data
*) ldata
->data
;
324 fftw_real2hc_codelet
*r2c_codelet
= d
->u
.r2c_codelet
;
326 fftw_real
*in
= d
->in
;
327 fftw_real
*out
= d
->out
;
328 int idist
= d
->idist
, odist
= d
->odist
;
329 int istride
= d
->istride
, ostride
= d
->ostride
;
331 HACK_ALIGN_STACK_ODD();
332 for (; min
< max
; ++min
)
333 r2c_codelet(in
+ min
* idist
,
335 out
+ n
* ostride
+ min
* odist
,
336 istride
, ostride
, -ostride
);
340 static void *rexec_many_c2r_codelet_thread(fftw_loop_data
*ldata
)
342 int min
= ldata
->min
, max
= ldata
->max
;
343 rexec_many_data
*d
= (rexec_many_data
*) ldata
->data
;
344 fftw_hc2real_codelet
*c2r_codelet
= d
->u
.c2r_codelet
;
346 fftw_real
*in
= d
->in
;
347 fftw_real
*out
= d
->out
;
348 int idist
= d
->idist
, odist
= d
->odist
;
349 int istride
= d
->istride
, ostride
= d
->ostride
;
351 HACK_ALIGN_STACK_ODD();
352 for (; min
< max
; ++min
)
353 c2r_codelet(in
+ min
* idist
,
354 in
+ n
* istride
+ min
* idist
,
356 istride
, -istride
, ostride
);
360 static void *rexec_many_simple_thread(fftw_loop_data
*ldata
)
362 int min
= ldata
->min
, max
= ldata
->max
;
363 rexec_many_data
*d
= (rexec_many_data
*) ldata
->data
;
364 fftw_plan_node
*p
= d
->u
.p
;
366 fftw_real
*in
= d
->in
;
367 fftw_real
*out
= d
->out
;
368 int idist
= d
->idist
, odist
= d
->odist
;
369 int istride
= d
->istride
, ostride
= d
->ostride
;
371 for (; min
< max
; ++min
)
372 rfftw_executor_simple(n
, in
+ min
* idist
,
374 p
, istride
, ostride
);
379 static void rexecutor_many_threads(int n
, fftw_real
*in
,
384 int howmany
, int idist
, int odist
,
387 if (nthreads
> howmany
)
394 fftw_real2hc_codelet
*codelet
= p
->nodeu
.real2hc
.codelet
;
397 HACK_ALIGN_STACK_ODD();
398 for (s
= 0; s
< howmany
; ++s
)
399 codelet(in
+ s
* idist
, out
+ s
* odist
,
400 out
+ n
* ostride
+ s
* odist
,
401 istride
, ostride
, -ostride
);
409 d
.u
.r2c_codelet
= codelet
;
414 fftw_thread_spawn_loop(howmany
, nthreads
,
415 rexec_many_r2c_codelet_thread
, &d
);
424 fftw_hc2real_codelet
*codelet
= p
->nodeu
.hc2real
.codelet
;
427 HACK_ALIGN_STACK_ODD();
428 for (s
= 0; s
< howmany
; ++s
)
429 codelet(in
+ s
* idist
,
430 in
+ n
* istride
+ s
* idist
,
432 istride
, -istride
, ostride
);
440 d
.u
.c2r_codelet
= codelet
;
445 fftw_thread_spawn_loop(howmany
, nthreads
,
446 rexec_many_c2r_codelet_thread
, &d
);
457 for (s
= 0; s
< howmany
; ++s
) {
458 rfftw_executor_simple(n
, in
+ s
* idist
,
460 p
, istride
, ostride
);
465 d
.in
= in
; d
.out
= out
;
472 fftw_thread_spawn_loop(howmany
, nthreads
,
473 rexec_many_simple_thread
, &d
);
479 static void *rexec_many_simple_inplace_thread(fftw_loop_data
*ldata
)
481 int min
= ldata
->min
, max
= ldata
->max
;
482 rexec_many_data
*d
= (rexec_many_data
*) ldata
->data
;
483 fftw_plan_node
*p
= d
->u
.p
;
485 fftw_real
*in
= d
->in
;
486 fftw_real
*out
= d
->out
+ n
* ldata
->thread_num
;
487 int idist
= d
->idist
;
488 int istride
= d
->istride
;
490 for (; min
< max
; ++min
) {
491 rfftw_executor_simple(n
, in
+ min
* idist
, out
, p
, istride
, 1);
492 rfftw_strided_copy(n
, out
, istride
, in
+ min
* idist
);
498 static void rexecutor_many_inplace_threads(int n
, fftw_real
*in
,
502 int howmany
, int idist
,
509 fftw_real2hc_codelet
*codelet
= p
->nodeu
.real2hc
.codelet
;
512 HACK_ALIGN_STACK_ODD();
513 for (s
= 0; s
< howmany
; ++s
)
514 codelet(in
+ s
* idist
, in
+ s
* idist
,
515 in
+ n
* istride
+ s
* idist
,
516 istride
, istride
, -istride
);
524 d
.u
.r2c_codelet
= codelet
;
529 fftw_thread_spawn_loop(howmany
, nthreads
,
530 rexec_many_r2c_codelet_thread
, &d
);
539 fftw_hc2real_codelet
*codelet
= p
->nodeu
.hc2real
.codelet
;
542 HACK_ALIGN_STACK_ODD();
543 for (s
= 0; s
< howmany
; ++s
)
544 codelet(in
+ s
* idist
,
545 in
+ n
* istride
+ s
* idist
,
547 istride
, -istride
, istride
);
555 d
.u
.c2r_codelet
= codelet
;
560 fftw_thread_spawn_loop(howmany
, nthreads
,
561 rexec_many_c2r_codelet_thread
, &d
);
572 if (nthreads
> howmany
)
579 tmp
=(fftw_real
*) fftw_malloc(n
*
582 for (s
= 0; s
< howmany
; ++s
) {
583 rfftw_executor_simple(n
,
587 rfftw_strided_copy(n
, tmp
, istride
,
598 fftw_malloc(nthreads
* n
* sizeof(fftw_real
));
608 fftw_thread_spawn_loop(howmany
, nthreads
,
609 rexec_many_simple_inplace_thread
,&d
);
618 void rfftw_threads(int nthreads
,
619 fftw_plan plan
, int howmany
, fftw_real
*in
, int istride
,
620 int idist
, fftw_real
*out
, int ostride
, int odist
)
624 if (plan
->flags
& FFTW_IN_PLACE
) {
626 rexecutor_simple_inplace_threads(n
, in
, out
, plan
->root
,
629 rexecutor_many_inplace_threads(n
, in
, out
, plan
->root
,
630 istride
, howmany
, idist
,
635 rexec_simple_threads(n
, in
, out
, plan
->root
, istride
, ostride
,
638 rexecutor_many_threads(n
, in
, out
, plan
->root
, istride
, ostride
,
639 howmany
, idist
, odist
, nthreads
);
644 void rfftw_threads_one(int nthreads
,
645 fftw_plan plan
, fftw_real
*in
, fftw_real
*out
)
649 if (plan
->flags
& FFTW_IN_PLACE
)
650 rexecutor_simple_inplace_threads(n
, in
, out
, plan
->root
, 1,
653 rexec_simple_threads(n
, in
, out
, plan
->root
, 1, 1, nthreads
);