changed reading hint
[gromacs/adressmacs.git] / src / fftw / rexec_threads.c
blob89dad7646d1f579c0366e9738a9ad6a53b56d2ec
1 /*
2 * Copyright (c) 1997-1999 Massachusetts Institute of Technology
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 * rexec_threads.c -- execute the fft in parallel
24 #include <stdio.h>
25 #include <stdlib.h>
27 #include <fftw_threads-int.h>
28 #include <rfftw_threads.h>
30 extern void rfftw_strided_copy(int n, fftw_real *in, int ostride,
31 fftw_real *out);
32 extern void rfftw_executor_simple(int n, fftw_real *in,
33 fftw_real *out,
34 fftw_plan_node *p,
35 int istride,
36 int ostride);
38 static void rexec_simple_threads(int n, fftw_real *in,
39 fftw_real *out,
40 fftw_plan_node *p,
41 int istride,
42 int ostride,
43 int nthreads);
45 typedef struct {
46 int m,r;
47 fftw_real *in;
48 fftw_real *out;
49 fftw_plan_node *p;
50 int istride, ostride;
51 int nthreads;
52 } rexec_simple_data;
54 static void *rexec_simple_thread_r2c(fftw_loop_data *ldata)
56 int min = ldata->min, max = ldata->max;
57 rexec_simple_data *d = (rexec_simple_data *) ldata->data;
58 int m = d->m, r = d->r;
59 fftw_real *in = d->in;
60 fftw_real *out = d->out;
61 fftw_plan_node *p = d->p;
62 int istride = d->istride, ostride = d->ostride;
63 int nthreads = d->nthreads;
65 for (; min < max; ++min)
66 rexec_simple_threads(m, in + min * istride,
67 out + min * (m * ostride),
69 istride * r, ostride,
70 nthreads);
72 return 0;
75 static void *rexec_simple_thread_c2r(fftw_loop_data *ldata)
77 int min = ldata->min, max = ldata->max;
78 rexec_simple_data *d = (rexec_simple_data *) ldata->data;
79 int m = d->m, r = d->r;
80 fftw_real *in = d->in;
81 fftw_real *out = d->out;
82 fftw_plan_node *p = d->p;
83 int istride = d->istride, ostride = d->ostride;
84 int nthreads = d->nthreads;
86 for (; min < max; ++min)
87 rexec_simple_threads(m, in + min * (m * istride),
88 out + min * ostride,
90 istride, ostride * r,
91 nthreads);
93 return 0;
96 static void spawn_h2hc_recurse_threads(int m, int r,
97 fftw_real *in,
98 fftw_real *out,
99 fftw_plan_node *p,
100 int istride,
101 int ostride,
102 int nthreads)
104 rexec_simple_data d;
106 d.m = m; d.r = r;
107 d.in = in; d.out = out;
108 d.p = p->nodeu.hc2hc.recurse;
109 d.istride = istride;
110 d.ostride = ostride;
111 d.nthreads = nthreads / r;
113 switch (p->nodeu.hc2hc.dir) {
114 case FFTW_REAL_TO_COMPLEX:
115 fftw_thread_spawn_loop(r, nthreads,
116 rexec_simple_thread_r2c, &d);
117 break;
118 case FFTW_COMPLEX_TO_REAL:
119 fftw_thread_spawn_loop(r, nthreads,
120 rexec_simple_thread_c2r, &d);
121 break;
125 static void rexec_simple_threads(int n, fftw_real *in, fftw_real *out,
126 fftw_plan_node *p,
127 int istride,
128 int ostride,
129 int nthreads)
131 switch (p->type) {
132 case FFTW_REAL2HC:
133 HACK_ALIGN_STACK_ODD();
134 (p->nodeu.real2hc.codelet) (in, out, out + n * ostride,
135 istride, ostride, -ostride);
136 break;
138 case FFTW_HC2REAL:
139 HACK_ALIGN_STACK_ODD();
140 (p->nodeu.hc2real.codelet) (in, in + n * istride, out,
141 istride, -istride, ostride);
142 break;
144 case FFTW_HC2HC:
146 int r = p->nodeu.hc2hc.size;
147 int m = n / r;
148 int i;
149 fftw_hc2hc_codelet *codelet;
150 fftw_complex *W;
152 if (nthreads <= 1) {
153 switch (p->nodeu.hc2hc.dir) {
154 case FFTW_REAL_TO_COMPLEX:
155 for (i = 0; i < r; ++i)
156 rfftw_executor_simple(m,
157 in + i * istride,
158 out + i * (m * ostride),
159 p->nodeu.hc2hc.recurse,
160 istride * r, ostride);
162 W = p->nodeu.hc2hc.tw->twarray;
163 codelet = p->nodeu.hc2hc.codelet;
164 HACK_ALIGN_STACK_EVEN();
165 codelet(out, W, m * ostride, m, ostride);
166 break;
167 case FFTW_COMPLEX_TO_REAL:
168 W = p->nodeu.hc2hc.tw->twarray;
169 codelet = p->nodeu.hc2hc.codelet;
170 HACK_ALIGN_STACK_EVEN();
171 codelet(in, W, m * istride, m, istride);
173 for (i = 0; i < r; ++i)
174 rfftw_executor_simple(m,
175 in + i * (m * istride),
176 out + i * ostride,
177 p->nodeu.hc2hc.recurse,
178 istride, ostride * r);
179 break;
180 default:
181 goto bug;
184 else
185 switch (p->nodeu.hc2hc.dir) {
186 case FFTW_REAL_TO_COMPLEX:
187 spawn_h2hc_recurse_threads(m, r, in, out, p,
188 istride, ostride,
189 nthreads);
191 W = p->nodeu.hc2hc.tw->twarray;
192 codelet = p->nodeu.hc2hc.codelet;
193 HACK_ALIGN_STACK_EVEN();
194 codelet(out, W, m * ostride, m, ostride);
196 break;
197 case FFTW_COMPLEX_TO_REAL:
198 W = p->nodeu.hc2hc.tw->twarray;
199 codelet = p->nodeu.hc2hc.codelet;
200 HACK_ALIGN_STACK_EVEN();
201 codelet(in, W, m * istride, m, istride);
203 spawn_h2hc_recurse_threads(m, r, in, out, p,
204 istride, ostride,
205 nthreads);
206 break;
209 break;
212 case FFTW_RGENERIC:
214 int r = p->nodeu.rgeneric.size;
215 int m = n / r;
216 int i;
217 fftw_rgeneric_codelet *codelet = p->nodeu.rgeneric.codelet;
218 fftw_complex *W = p->nodeu.rgeneric.tw->twarray;
220 if (nthreads <= 1)
221 switch (p->nodeu.rgeneric.dir) {
222 case FFTW_REAL_TO_COMPLEX:
223 for (i = 0; i < r; ++i)
224 rfftw_executor_simple(m,
225 in + i * istride,
226 out + i * (m * ostride),
227 p->nodeu.rgeneric.recurse,
228 istride * r, ostride);
230 codelet(out, W, m, r, n, ostride);
231 break;
232 case FFTW_COMPLEX_TO_REAL:
233 codelet(in, W, m, r, n, istride);
235 for (i = 0; i < r; ++i)
236 rfftw_executor_simple(m,
237 in + i * m * istride,
238 out + i * ostride,
239 p->nodeu.rgeneric.recurse,
240 istride, ostride * r);
241 break;
242 default:
243 goto bug;
245 else
246 switch (p->nodeu.hc2hc.dir) {
247 case FFTW_REAL_TO_COMPLEX:
248 spawn_h2hc_recurse_threads(m, r, in, out, p,
249 istride, ostride,
250 nthreads);
251 codelet(out, W, m, r, n, ostride);
252 break;
253 case FFTW_COMPLEX_TO_REAL:
254 codelet(in, W, m, r, n, istride);
255 spawn_h2hc_recurse_threads(m, r, in, out, p,
256 istride, ostride,
257 nthreads);
258 break;
261 break;
264 default:
265 bug:
266 fftw_die("BUG in rexecutor: invalid plan\n");
267 break;
271 static void rexecutor_simple_inplace_threads(int n, fftw_real *in,
272 fftw_real *out,
273 fftw_plan_node *p,
274 int istride,
275 int nthreads)
277 switch (p->type) {
278 case FFTW_REAL2HC:
279 HACK_ALIGN_STACK_ODD();
280 (p->nodeu.real2hc.codelet) (in, in, in + n * istride,
281 istride, istride, -istride);
282 break;
284 case FFTW_HC2REAL:
285 HACK_ALIGN_STACK_ODD();
286 (p->nodeu.hc2real.codelet) (in, in + n * istride, in,
287 istride, -istride, istride);
288 break;
290 default:
292 fftw_real *tmp;
294 if (out)
295 tmp = out;
296 else
297 tmp = (fftw_real *) fftw_malloc(n * sizeof(fftw_real));
299 rexec_simple_threads(n, in, tmp, p, istride, 1, nthreads);
300 rfftw_strided_copy(n, tmp, istride, in);
302 if (!out)
303 fftw_free(tmp);
308 typedef struct {
309 union {
310 fftw_real2hc_codelet *r2c_codelet;
311 fftw_hc2real_codelet *c2r_codelet;
312 fftw_plan_node *p;
313 } u;
314 int n;
315 fftw_real *in;
316 fftw_real *out;
317 int idist, odist, istride, ostride;
318 } rexec_many_data;
320 static void *rexec_many_r2c_codelet_thread(fftw_loop_data *ldata)
322 int min = ldata->min, max = ldata->max;
323 rexec_many_data *d = (rexec_many_data *) ldata->data;
324 fftw_real2hc_codelet *r2c_codelet = d->u.r2c_codelet;
325 int n = d->n;
326 fftw_real *in = d->in;
327 fftw_real *out = d->out;
328 int idist = d->idist, odist = d->odist;
329 int istride = d->istride, ostride = d->ostride;
331 HACK_ALIGN_STACK_ODD();
332 for (; min < max; ++min)
333 r2c_codelet(in + min * idist,
334 out + min * odist,
335 out + n * ostride + min * odist,
336 istride, ostride, -ostride);
337 return 0;
340 static void *rexec_many_c2r_codelet_thread(fftw_loop_data *ldata)
342 int min = ldata->min, max = ldata->max;
343 rexec_many_data *d = (rexec_many_data *) ldata->data;
344 fftw_hc2real_codelet *c2r_codelet = d->u.c2r_codelet;
345 int n = d->n;
346 fftw_real *in = d->in;
347 fftw_real *out = d->out;
348 int idist = d->idist, odist = d->odist;
349 int istride = d->istride, ostride = d->ostride;
351 HACK_ALIGN_STACK_ODD();
352 for (; min < max; ++min)
353 c2r_codelet(in + min * idist,
354 in + n * istride + min * idist,
355 out + min * odist,
356 istride, -istride, ostride);
357 return 0;
360 static void *rexec_many_simple_thread(fftw_loop_data *ldata)
362 int min = ldata->min, max = ldata->max;
363 rexec_many_data *d = (rexec_many_data *) ldata->data;
364 fftw_plan_node *p = d->u.p;
365 int n = d->n;
366 fftw_real *in = d->in;
367 fftw_real *out = d->out;
368 int idist = d->idist, odist = d->odist;
369 int istride = d->istride, ostride = d->ostride;
371 for (; min < max; ++min)
372 rfftw_executor_simple(n, in + min * idist,
373 out + min * odist,
374 p, istride, ostride);
376 return 0;
379 static void rexecutor_many_threads(int n, fftw_real *in,
380 fftw_real *out,
381 fftw_plan_node *p,
382 int istride,
383 int ostride,
384 int howmany, int idist, int odist,
385 int nthreads)
387 if (nthreads > howmany)
388 nthreads = howmany;
390 switch (p->type) {
391 case FFTW_REAL2HC:
393 int s;
394 fftw_real2hc_codelet *codelet = p->nodeu.real2hc.codelet;
396 if (nthreads <= 1) {
397 HACK_ALIGN_STACK_ODD();
398 for (s = 0; s < howmany; ++s)
399 codelet(in + s * idist, out + s * odist,
400 out + n * ostride + s * odist,
401 istride, ostride, -ostride);
403 else {
404 rexec_many_data d;
406 d.n = n;
407 d.in = in;
408 d.out = out;
409 d.u.r2c_codelet = codelet;
410 d.istride = istride;
411 d.ostride = ostride;
412 d.idist = idist;
413 d.odist = odist;
414 fftw_thread_spawn_loop(howmany, nthreads,
415 rexec_many_r2c_codelet_thread, &d);
418 break;
421 case FFTW_HC2REAL:
423 int s;
424 fftw_hc2real_codelet *codelet = p->nodeu.hc2real.codelet;
426 if (nthreads <= 1) {
427 HACK_ALIGN_STACK_ODD();
428 for (s = 0; s < howmany; ++s)
429 codelet(in + s * idist,
430 in + n * istride + s * idist,
431 out + s * odist,
432 istride, -istride, ostride);
434 else {
435 rexec_many_data d;
437 d.n = n;
438 d.in = in;
439 d.out = out;
440 d.u.c2r_codelet = codelet;
441 d.istride = istride;
442 d.ostride = ostride;
443 d.idist = idist;
444 d.odist = odist;
445 fftw_thread_spawn_loop(howmany, nthreads,
446 rexec_many_c2r_codelet_thread, &d);
449 break;
452 default:
454 int s;
456 if (nthreads <= 1)
457 for (s = 0; s < howmany; ++s) {
458 rfftw_executor_simple(n, in + s * idist,
459 out + s * odist,
460 p, istride, ostride);
462 else {
463 rexec_many_data d;
465 d.in = in; d.out = out;
466 d.n = n;
467 d.u.p = p;
468 d.istride = istride;
469 d.ostride = ostride;
470 d.idist = idist;
471 d.odist = odist;
472 fftw_thread_spawn_loop(howmany, nthreads,
473 rexec_many_simple_thread, &d);
479 static void *rexec_many_simple_inplace_thread(fftw_loop_data *ldata)
481 int min = ldata->min, max = ldata->max;
482 rexec_many_data *d = (rexec_many_data *) ldata->data;
483 fftw_plan_node *p = d->u.p;
484 int n = d->n;
485 fftw_real *in = d->in;
486 fftw_real *out = d->out + n * ldata->thread_num;
487 int idist = d->idist;
488 int istride = d->istride;
490 for (; min < max; ++min) {
491 rfftw_executor_simple(n, in + min * idist, out, p, istride, 1);
492 rfftw_strided_copy(n, out, istride, in + min * idist);
495 return 0;
498 static void rexecutor_many_inplace_threads(int n, fftw_real *in,
499 fftw_real *out,
500 fftw_plan_node *p,
501 int istride,
502 int howmany, int idist,
503 int nthreads)
505 switch (p->type) {
506 case FFTW_REAL2HC:
508 int s;
509 fftw_real2hc_codelet *codelet = p->nodeu.real2hc.codelet;
511 if (nthreads <= 1) {
512 HACK_ALIGN_STACK_ODD();
513 for (s = 0; s < howmany; ++s)
514 codelet(in + s * idist, in + s * idist,
515 in + n * istride + s * idist,
516 istride, istride, -istride);
518 else {
519 rexec_many_data d;
521 d.n = n;
522 d.in = in;
523 d.out = in;
524 d.u.r2c_codelet = codelet;
525 d.istride = istride;
526 d.ostride = istride;
527 d.idist = idist;
528 d.odist = idist;
529 fftw_thread_spawn_loop(howmany, nthreads,
530 rexec_many_r2c_codelet_thread, &d);
533 break;
536 case FFTW_HC2REAL:
538 int s;
539 fftw_hc2real_codelet *codelet = p->nodeu.hc2real.codelet;
541 if (nthreads <= 1) {
542 HACK_ALIGN_STACK_ODD();
543 for (s = 0; s < howmany; ++s)
544 codelet(in + s * idist,
545 in + n * istride + s * idist,
546 in + s * idist,
547 istride, -istride, istride);
549 else {
550 rexec_many_data d;
552 d.n = n;
553 d.in = in;
554 d.out = in;
555 d.u.c2r_codelet = codelet;
556 d.istride = istride;
557 d.ostride = istride;
558 d.idist = idist;
559 d.odist = idist;
560 fftw_thread_spawn_loop(howmany, nthreads,
561 rexec_many_c2r_codelet_thread, &d);
564 break;
567 default:
569 int s;
570 fftw_real *tmp;
572 if (nthreads > howmany)
573 nthreads = howmany;
575 if (nthreads <= 1) {
576 if (out)
577 tmp = out;
578 else
579 tmp =(fftw_real *) fftw_malloc(n *
580 sizeof(fftw_real));
582 for (s = 0; s < howmany; ++s) {
583 rfftw_executor_simple(n,
584 in + s * idist,
585 tmp,
586 p, istride, 1);
587 rfftw_strided_copy(n, tmp, istride,
588 in + s * idist);
591 if (!out)
592 fftw_free(tmp);
594 else {
595 rexec_many_data d;
597 tmp = (fftw_real *)
598 fftw_malloc(nthreads * n * sizeof(fftw_real));
600 d.in = in;
601 d.out = tmp;
602 d.n = n;
603 d.u.p = p;
604 d.istride = istride;
605 d.ostride = 1;
606 d.idist = idist;
607 d.odist = 0;
608 fftw_thread_spawn_loop(howmany, nthreads,
609 rexec_many_simple_inplace_thread,&d);
611 fftw_free(tmp);
617 /* user interface */
618 void rfftw_threads(int nthreads,
619 fftw_plan plan, int howmany, fftw_real *in, int istride,
620 int idist, fftw_real *out, int ostride, int odist)
622 int n = plan->n;
624 if (plan->flags & FFTW_IN_PLACE) {
625 if (howmany == 1) {
626 rexecutor_simple_inplace_threads(n, in, out, plan->root,
627 istride, nthreads);
628 } else {
629 rexecutor_many_inplace_threads(n, in, out, plan->root,
630 istride, howmany, idist,
631 nthreads);
633 } else {
634 if (howmany == 1) {
635 rexec_simple_threads(n, in, out, plan->root, istride, ostride,
636 nthreads);
637 } else {
638 rexecutor_many_threads(n, in, out, plan->root, istride, ostride,
639 howmany, idist, odist, nthreads);
644 void rfftw_threads_one(int nthreads,
645 fftw_plan plan, fftw_real *in, fftw_real *out)
647 int n = plan->n;
649 if (plan->flags & FFTW_IN_PLACE)
650 rexecutor_simple_inplace_threads(n, in, out, plan->root, 1,
651 nthreads);
652 else
653 rexec_simple_threads(n, in, out, plan->root, 1, 1, nthreads);