changed reading hint
[gromacs/adressmacs.git] / src / fftw / rader.c
blobe81fcb6a8ca026c9caf6b8751cc2e7cb6a203c4d
1 /*
2 * Copyright (c) 1997-1999 Massachusetts Institute of Technology
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 * Compute transforms of prime sizes using Rader's trick: turn them
22 * into convolutions of size n - 1, which you then perform via a pair
23 * of FFTs.
26 #include <stdlib.h>
27 #include <math.h>
29 #include <fftw-int.h>
31 #ifdef FFTW_USING_CILK
32 #include <cilk.h>
33 #include <cilk-compat.h>
34 #endif
36 #ifdef FFTW_DEBUG
37 #define WHEN_DEBUG(a) a
38 #else
39 #define WHEN_DEBUG(a)
40 #endif
42 /* compute n^m mod p, where m >= 0 and p > 0. */
43 static int power_mod(int n, int m, int p)
45 if (m == 0)
46 return 1;
47 else if (m % 2 == 0) {
48 int x = power_mod(n, m / 2, p);
49 return ((x * x) % p);
50 } else
51 return ((n * power_mod(n, m - 1, p)) % p);
55 * Find the period of n in the multiplicative group mod p (p prime).
56 * That is, return the smallest m such that n^m == 1 mod p.
58 static int period(int n, int p)
60 int prod = n, period = 1;
62 while (prod != 1) {
63 prod = (prod * n) % p;
64 ++period;
65 if (prod == 0)
66 fftw_die("non-prime order in Rader\n");
68 return period;
71 /* find a generator for the multiplicative group mod p, where p is prime */
72 static int find_generator(int p)
74 int g;
76 for (g = 1; g < p; ++g)
77 if (period(g, p) == p - 1)
78 break;
79 if (g == p)
80 fftw_die("couldn't find generator for Rader\n");
81 return g;
84 /***************************************************************************/
86 static fftw_rader_data *create_rader_aux(int p, int flags)
88 fftw_complex *omega, *work;
89 int g, ginv, gpower;
90 int i;
91 FFTW_TRIG_REAL twoPiOverN;
92 fftw_real scale = 1.0 / (p - 1); /* for convolution */
93 fftw_plan plan;
94 fftw_rader_data *d;
96 if (p < 2)
97 fftw_die("non-prime order in Rader\n");
99 flags &= ~FFTW_IN_PLACE;
101 d = (fftw_rader_data *) fftw_malloc(sizeof(fftw_rader_data));
103 g = find_generator(p);
104 ginv = power_mod(g, p - 2, p);
106 omega = (fftw_complex *) fftw_malloc((p - 1) * sizeof(fftw_complex));
108 plan = fftw_create_plan(p - 1, FFTW_FORWARD, flags);
110 work = (fftw_complex *) fftw_malloc((p - 1) * sizeof(fftw_complex));
112 twoPiOverN = FFTW_K2PI / (FFTW_TRIG_REAL) p;
113 gpower = 1;
114 for (i = 0; i < p - 1; ++i) {
115 c_re(work[i]) = scale * FFTW_TRIG_COS(twoPiOverN * gpower);
116 c_im(work[i]) = FFTW_FORWARD * scale * FFTW_TRIG_SIN(twoPiOverN * gpower);
117 gpower = (gpower * ginv) % p;
120 /* fft permuted roots of unity */
121 fftw_executor_simple(p - 1, work, omega, plan->root, 1, 1);
123 fftw_free(work);
125 d->plan = plan;
126 d->omega = omega;
127 d->g = g;
128 d->ginv = ginv;
129 d->p = p;
130 d->flags = flags;
131 d->refcount = 1;
132 d->next = NULL;
134 d->cdesc = (fftw_codelet_desc *) fftw_malloc(sizeof(fftw_codelet_desc));
135 d->cdesc->name = NULL;
136 d->cdesc->codelet = NULL;
137 d->cdesc->size = p;
138 d->cdesc->dir = FFTW_FORWARD;
139 d->cdesc->type = FFTW_RADER;
140 d->cdesc->signature = g;
141 d->cdesc->ntwiddle = 0;
142 d->cdesc->twiddle_order = NULL;
143 return d;
146 /***************************************************************************/
148 static fftw_rader_data *fftw_create_rader(int p, int flags)
150 fftw_rader_data *d = fftw_rader_top;
152 flags &= ~FFTW_IN_PLACE;
153 while (d && (d->p != p || d->flags != flags))
154 d = d->next;
155 if (d) {
156 d->refcount++;
157 return d;
159 d = create_rader_aux(p, flags);
160 d->next = fftw_rader_top;
161 fftw_rader_top = d;
162 return d;
165 /***************************************************************************/
167 /* Compute the prime FFTs, premultiplied by twiddle factors. Below, we
168 * extensively use the identity that fft(x*)* = ifft(x) in order to
169 * share data between forward and backward transforms and to obviate
170 * the necessity of having separate forward and backward plans. */
172 void fftw_twiddle_rader(fftw_complex *A, const fftw_complex *W,
173 int m, int r, int stride,
174 fftw_rader_data * d)
176 fftw_complex *tmp = (fftw_complex *)
177 fftw_malloc((r - 1) * sizeof(fftw_complex));
178 int i, k, gpower = 1, g = d->g, ginv = d->ginv;
179 fftw_real a0r, a0i;
180 fftw_complex *omega = d->omega;
182 for (i = 0; i < m; ++i, A += stride, W += r - 1) {
184 * Here, we fft W[k-1] * A[k*(m*stride)], using Rader.
185 * (Actually, W is pre-permuted to match the permutation that we
186 * will do on A.)
189 /* First, permute the input and multiply by W, storing in tmp: */
190 /* gpower == g^k mod r in the following loop */
191 for (k = 0; k < r - 1; ++k, gpower = (gpower * g) % r) {
192 fftw_real rA, iA, rW, iW;
193 rW = c_re(W[k]);
194 iW = c_im(W[k]);
195 rA = c_re(A[gpower * (m * stride)]);
196 iA = c_im(A[gpower * (m * stride)]);
197 c_re(tmp[k]) = rW * rA - iW * iA;
198 c_im(tmp[k]) = rW * iA + iW * rA;
201 WHEN_DEBUG( {
202 if (gpower != 1)
203 fftw_die("incorrect generator in Rader\n");
207 /* FFT tmp to A: */
208 fftw_executor_simple(r - 1, tmp, A + (m * stride),
209 d->plan->root, 1, m * stride);
211 /* set output DC component: */
212 a0r = c_re(A[0]);
213 a0i = c_im(A[0]);
214 c_re(A[0]) += c_re(A[(m * stride)]);
215 c_im(A[0]) += c_im(A[(m * stride)]);
217 /* now, multiply by omega: */
218 for (k = 0; k < r - 1; ++k) {
219 fftw_real rA, iA, rW, iW;
220 rW = c_re(omega[k]);
221 iW = c_im(omega[k]);
222 rA = c_re(A[(k + 1) * (m * stride)]);
223 iA = c_im(A[(k + 1) * (m * stride)]);
224 c_re(A[(k + 1) * (m * stride)]) = rW * rA - iW * iA;
225 c_im(A[(k + 1) * (m * stride)]) = -(rW * iA + iW * rA);
228 /* this will add A[0] to all of the outputs after the ifft */
229 c_re(A[(m * stride)]) += a0r;
230 c_im(A[(m * stride)]) -= a0i;
232 /* inverse FFT: */
233 fftw_executor_simple(r - 1, A + (m * stride), tmp,
234 d->plan->root, m * stride, 1);
236 /* finally, do inverse permutation to unshuffle the output: */
237 for (k = 0; k < r - 1; ++k, gpower = (gpower * ginv) % r) {
238 c_re(A[gpower * (m * stride)]) = c_re(tmp[k]);
239 c_im(A[gpower * (m * stride)]) = -c_im(tmp[k]);
242 WHEN_DEBUG( {
243 if (gpower != 1)
244 fftw_die("incorrect generator in Rader\n");
250 fftw_free(tmp);
253 void fftwi_twiddle_rader(fftw_complex *A, const fftw_complex *W,
254 int m, int r, int stride,
255 fftw_rader_data * d)
257 fftw_complex *tmp = (fftw_complex *)
258 fftw_malloc((r - 1) * sizeof(fftw_complex));
259 int i, k, gpower = 1, g = d->g, ginv = d->ginv;
260 fftw_real a0r, a0i;
261 fftw_complex *omega = d->omega;
263 for (i = 0; i < m; ++i, A += stride, W += r - 1) {
265 * Here, we fft W[k-1]* * A[k*(m*stride)], using Rader.
266 * (Actually, W is pre-permuted to match the permutation that
267 * we will do on A.)
270 /* First, permute the input and multiply by W*, storing in tmp: */
271 /* gpower == g^k mod r in the following loop */
272 for (k = 0; k < r - 1; ++k, gpower = (gpower * g) % r) {
273 fftw_real rA, iA, rW, iW;
274 rW = c_re(W[k]);
275 iW = c_im(W[k]);
276 rA = c_re(A[gpower * (m * stride)]);
277 iA = c_im(A[gpower * (m * stride)]);
278 c_re(tmp[k]) = rW * rA + iW * iA;
279 c_im(tmp[k]) = iW * rA - rW * iA;
282 WHEN_DEBUG( {
283 if (gpower != 1)
284 fftw_die("incorrect generator in Rader\n");
288 /* FFT tmp to A: */
289 fftw_executor_simple(r - 1, tmp, A + (m * stride),
290 d->plan->root, 1, m * stride);
292 /* set output DC component: */
293 a0r = c_re(A[0]);
294 a0i = c_im(A[0]);
295 c_re(A[0]) += c_re(A[(m * stride)]);
296 c_im(A[0]) -= c_im(A[(m * stride)]);
298 /* now, multiply by omega: */
299 for (k = 0; k < r - 1; ++k) {
300 fftw_real rA, iA, rW, iW;
301 rW = c_re(omega[k]);
302 iW = c_im(omega[k]);
303 rA = c_re(A[(k + 1) * (m * stride)]);
304 iA = c_im(A[(k + 1) * (m * stride)]);
305 c_re(A[(k + 1) * (m * stride)]) = rW * rA - iW * iA;
306 c_im(A[(k + 1) * (m * stride)]) = -(rW * iA + iW * rA);
309 /* this will add A[0] to all of the outputs after the ifft */
310 c_re(A[(m * stride)]) += a0r;
311 c_im(A[(m * stride)]) += a0i;
313 /* inverse FFT: */
314 fftw_executor_simple(r - 1, A + (m * stride), tmp,
315 d->plan->root, m * stride, 1);
317 /* finally, do inverse permutation to unshuffle the output: */
318 for (k = 0; k < r - 1; ++k, gpower = (gpower * ginv) % r) {
319 A[gpower * (m * stride)] = tmp[k];
322 WHEN_DEBUG( {
323 if (gpower != 1)
324 fftw_die("incorrect generator in Rader\n");
329 fftw_free(tmp);
332 /***************************************************************************/
335 * Make an FFTW_RADER plan node. Note that this function must go
336 * here, rather than in putils.c, because it indirectly calls the
337 * fftw_planner. If we included it in putils.c, which is also used
338 * by rfftw, then any program using rfftw would be linked with all
339 * of the FFTW codelets, even if they were not needed. I wish that the
340 * darn linkers operated on a function rather than a file granularity.
342 fftw_plan_node *fftw_make_node_rader(int n, int size, fftw_direction dir,
343 fftw_plan_node *recurse,
344 int flags)
346 fftw_plan_node *p = fftw_make_node();
348 p->type = FFTW_RADER;
349 p->nodeu.rader.size = size;
350 p->nodeu.rader.codelet = dir == FFTW_FORWARD ?
351 fftw_twiddle_rader : fftwi_twiddle_rader;
352 p->nodeu.rader.rader_data = fftw_create_rader(size, flags);
353 p->nodeu.rader.recurse = recurse;
354 fftw_use_node(recurse);
356 if (flags & FFTW_MEASURE)
357 p->nodeu.rader.tw =
358 fftw_create_twiddle(n, p->nodeu.rader.rader_data->cdesc);
359 else
360 p->nodeu.rader.tw = 0;
361 return p;