1 //$ nobt
3 /* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
4 * Copyright (c) 2023 Christopher Robinson
6 * Based on original fortran 77 code from FFTPACKv4 from NETLIB
7 * (http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber
8 * of NCAR, in 1985.
10 * As confirmed by the NCAR fftpack software curators, the following
11 * FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
12 * released under the same terms.
14 * FFTPACK license:
16 * http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
18 * Copyright (c) 2004 the University Corporation for Atmospheric
19 * Research ("UCAR"). All rights reserved. Developed by NCAR's
20 * Computational and Information Systems Laboratory, UCAR,
21 * www.cisl.ucar.edu.
23 * Redistribution and use of the Software in source and binary forms,
24 * with or without modification, is permitted provided that the
25 * following conditions are met:
27 * - Neither the names of NCAR's Computational and Information Systems
28 * Laboratory, the University Corporation for Atmospheric Research,
29 * nor the names of its sponsors or contributors may be used to
30 * endorse or promote products derived from this Software without
31 * specific prior written permission.
33 * - Redistributions of source code must retain the above copyright
34 * notices, this list of conditions, and the disclaimer below.
36 * - Redistributions in binary form must reproduce the above copyright
37 * notice, this list of conditions, and the disclaimer below in the
38 * documentation and/or other materials provided with the
39 * distribution.
41 * THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
42 * EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
43 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
44 * NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
45 * HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
46 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
47 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
48 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
49 * SOFTWARE.
52 * PFFFT : a Pretty Fast FFT.
54 * This file is largely based on the original FFTPACK implementation, modified
55 * in order to take advantage of SIMD instructions of modern CPUs.
58 #include "pffft.h"
60 #include <algorithm>
61 #include <array>
62 #include <cassert>
63 #include <cmath>
64 #include <cstdint>
65 #include <cstdio>
66 #include <cstring>
67 #include <new>
68 #include <utility>
69 #include <vector>
71 #include "albit.h"
72 #include "almalloc.h"
73 #include "alnumbers.h"
74 #include "alnumeric.h"
75 #include "alspan.h"
76 #include "opthelpers.h"
79 using uint = unsigned int;
81 namespace {
83 #if defined(__GNUC__) || defined(_MSC_VER)
84 #define RESTRICT __restrict
85 #else
86 #define RESTRICT
87 #endif
89 /* Vector support macros: the rest of the code is independent of
90 * SSE/Altivec/NEON -- adding support for other platforms with 4-element
91 * vectors should be limited to these macros
94 /* Define PFFFT_SIMD_DISABLE if you want to use scalar code instead of SIMD code */
95 //#define PFFFT_SIMD_DISABLE
97 #ifndef PFFFT_SIMD_DISABLE
99 * Altivec support macros
101 #if defined(__ppc__) || defined(__ppc64__) || defined(__powerpc__) || defined(__powerpc64__)
102 #include <altivec.h>
103 using v4sf = vector float;
104 constexpr uint SimdSize{4};
105 force_inline v4sf vzero() noexcept { return (vector float)vec_splat_u8(0); }
106 force_inline v4sf vmul(v4sf a, v4sf b) noexcept { return vec_madd(a, b, vzero()); }
107 force_inline v4sf vadd(v4sf a, v4sf b) noexcept { return vec_add(a, b); }
108 force_inline v4sf vmadd(v4sf a, v4sf b, v4sf c) noexcept { return vec_madd(a, b, c); }
109 force_inline v4sf vsub(v4sf a, v4sf b) noexcept { return vec_sub(a, b); }
110 force_inline v4sf ld_ps1(float a) noexcept { return vec_splats(a); }
112 force_inline v4sf vset4(float a, float b, float c, float d) noexcept
114 /* Is there a more efficient way to do this? */
115 alignas(16) std::array<float,4> vals{{a, b, c, d}};
116 return vec_ld(0, vals.data());
118 force_inline v4sf vinsert0(v4sf v, float a) noexcept { return vec_insert(a, v, 0); }
119 force_inline float vextract0(v4sf v) noexcept { return vec_extract(v, 0); }
121 force_inline void interleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
123 out1 = vec_mergeh(in1, in2);
124 out2 = vec_mergel(in1, in2);
126 force_inline void uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
128 v4sf tmp{vec_perm(in1, in2, (vector unsigned char){0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27})};
129 out2 = vec_perm(in1, in2, (vector unsigned char){4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31});
130 out1 = tmp;
133 force_inline void vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept
135 v4sf y0{vec_mergeh(x0, x2)};
136 v4sf y1{vec_mergel(x0, x2)};
137 v4sf y2{vec_mergeh(x1, x3)};
138 v4sf y3{vec_mergel(x1, x3)};
139 x0 = vec_mergeh(y0, y2);
140 x1 = vec_mergel(y0, y2);
141 x2 = vec_mergeh(y1, y3);
142 x3 = vec_mergel(y1, y3);
145 force_inline v4sf vswaphl(v4sf a, v4sf b) noexcept
146 { return vec_perm(a,b, (vector unsigned char){16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15}); }
149 * SSE1 support macros
151 #elif defined(__x86_64__) || defined(__SSE__) || defined(_M_X64) || \
152 (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
154 #include <xmmintrin.h>
155 using v4sf = __m128;
156 /* 4 floats per SIMD vector -- this is pretty much hardcoded in the preprocess/
157 * finalize functions anyway, so you will have some work to do if you want to
158 * enable AVX with its 256-bit vectors.
160 constexpr uint SimdSize{4};
161 force_inline v4sf vzero() noexcept { return _mm_setzero_ps(); }
162 force_inline v4sf vmul(v4sf a, v4sf b) noexcept { return _mm_mul_ps(a, b); }
163 force_inline v4sf vadd(v4sf a, v4sf b) noexcept { return _mm_add_ps(a, b); }
164 force_inline v4sf vmadd(v4sf a, v4sf b, v4sf c) noexcept { return _mm_add_ps(_mm_mul_ps(a,b), c); }
165 force_inline v4sf vsub(v4sf a, v4sf b) noexcept { return _mm_sub_ps(a, b); }
166 force_inline v4sf ld_ps1(float a) noexcept { return _mm_set1_ps(a); }
168 force_inline v4sf vset4(float a, float b, float c, float d) noexcept
169 { return _mm_setr_ps(a, b, c, d); }
170 force_inline v4sf vinsert0(const v4sf v, const float a) noexcept
171 { return _mm_move_ss(v, _mm_set_ss(a)); }
172 force_inline float vextract0(v4sf v) noexcept
173 { return _mm_cvtss_f32(v); }
175 force_inline void interleave2(const v4sf in1, const v4sf in2, v4sf &out1, v4sf &out2) noexcept
177 out1 = _mm_unpacklo_ps(in1, in2);
178 out2 = _mm_unpackhi_ps(in1, in2);
180 force_inline void uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
182 out1 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0));
183 out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1));
186 force_inline void vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept
187 { _MM_TRANSPOSE4_PS(x0, x1, x2, x3); }
189 force_inline v4sf vswaphl(v4sf a, v4sf b) noexcept
190 { return _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0)); }
193 * ARM NEON support macros
195 #elif defined(__ARM_NEON) || defined(__aarch64__) || defined(__arm64) || defined(_M_ARM64)
197 #include <arm_neon.h>
198 using v4sf = float32x4_t;
199 constexpr uint SimdSize{4};
200 force_inline v4sf vzero() noexcept { return vdupq_n_f32(0.0f); }
201 force_inline v4sf vmul(v4sf a, v4sf b) noexcept { return vmulq_f32(a, b); }
202 force_inline v4sf vadd(v4sf a, v4sf b) noexcept { return vaddq_f32(a, b); }
203 force_inline v4sf vmadd(v4sf a, v4sf b, v4sf c) noexcept { return vmlaq_f32(c, a, b); }
204 force_inline v4sf vsub(v4sf a, v4sf b) noexcept { return vsubq_f32(a, b); }
205 force_inline v4sf ld_ps1(float a) noexcept { return vdupq_n_f32(a); }
207 force_inline v4sf vset4(float a, float b, float c, float d) noexcept
209 float32x4_t ret{vmovq_n_f32(a)};
210 ret = vsetq_lane_f32(b, ret, 1);
211 ret = vsetq_lane_f32(c, ret, 2);
212 ret = vsetq_lane_f32(d, ret, 3);
213 return ret;
215 force_inline v4sf vinsert0(v4sf v, float a) noexcept
216 { return vsetq_lane_f32(a, v, 0); }
217 force_inline float vextract0(v4sf v) noexcept
218 { return vgetq_lane_f32(v, 0); }
220 force_inline void interleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
222 float32x4x2_t tmp{vzipq_f32(in1, in2)};
223 out1 = tmp.val[0];
224 out2 = tmp.val[1];
226 force_inline void uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
228 float32x4x2_t tmp{vuzpq_f32(in1, in2)};
229 out1 = tmp.val[0];
230 out2 = tmp.val[1];
233 force_inline void vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept
235 /* marginally faster version:
236 * asm("vtrn.32 %q0, %q1;\n"
237 * "vtrn.32 %q2, %q3\n"
238 * "vswp %f0, %e2\n"
239 * "vswp %f1, %e3"
240 * : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::);
242 float32x4x2_t t0_{vzipq_f32(x0, x2)};
243 float32x4x2_t t1_{vzipq_f32(x1, x3)};
244 float32x4x2_t u0_{vzipq_f32(t0_.val[0], t1_.val[0])};
245 float32x4x2_t u1_{vzipq_f32(t0_.val[1], t1_.val[1])};
246 x0 = u0_.val[0];
247 x1 = u0_.val[1];
248 x2 = u1_.val[0];
249 x3 = u1_.val[1];
252 force_inline v4sf vswaphl(v4sf a, v4sf b) noexcept
253 { return vcombine_f32(vget_low_f32(b), vget_high_f32(a)); }
256 * Generic GCC vector macros
258 #elif defined(__GNUC__)
260 using v4sf [[gnu::vector_size(16), gnu::aligned(16)]] = float;
261 constexpr uint SimdSize{4};
262 force_inline constexpr v4sf vzero() noexcept { return v4sf{0.0f, 0.0f, 0.0f, 0.0f}; }
263 force_inline constexpr v4sf vmul(v4sf a, v4sf b) noexcept { return a * b; }
264 force_inline constexpr v4sf vadd(v4sf a, v4sf b) noexcept { return a + b; }
265 force_inline constexpr v4sf vmadd(v4sf a, v4sf b, v4sf c) noexcept { return a*b + c; }
266 force_inline constexpr v4sf vsub(v4sf a, v4sf b) noexcept { return a - b; }
267 force_inline constexpr v4sf ld_ps1(float a) noexcept { return v4sf{a, a, a, a}; }
269 force_inline constexpr v4sf vset4(float a, float b, float c, float d) noexcept
270 { return v4sf{a, b, c, d}; }
271 force_inline constexpr v4sf vinsert0(v4sf v, float a) noexcept
272 { return v4sf{a, v[1], v[2], v[3]}; }
273 force_inline float vextract0(v4sf v) noexcept
274 { return v[0]; }
276 force_inline v4sf unpacklo(v4sf a, v4sf b) noexcept
277 { return v4sf{a[0], b[0], a[1], b[1]}; }
278 force_inline v4sf unpackhi(v4sf a, v4sf b) noexcept
279 { return v4sf{a[2], b[2], a[3], b[3]}; }
281 force_inline void interleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
283 out1 = unpacklo(in1, in2);
284 out2 = unpackhi(in1, in2);
286 force_inline void uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
288 out1 = v4sf{in1[0], in1[2], in2[0], in2[2]};
289 out2 = v4sf{in1[1], in1[3], in2[1], in2[3]};
292 force_inline void vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept
294 v4sf tmp0{unpacklo(x0, x1)};
295 v4sf tmp2{unpacklo(x2, x3)};
296 v4sf tmp1{unpackhi(x0, x1)};
297 v4sf tmp3{unpackhi(x2, x3)};
298 x0 = v4sf{tmp0[0], tmp0[1], tmp2[0], tmp2[1]};
299 x1 = v4sf{tmp0[2], tmp0[3], tmp2[2], tmp2[3]};
300 x2 = v4sf{tmp1[0], tmp1[1], tmp3[0], tmp3[1]};
301 x3 = v4sf{tmp1[2], tmp1[3], tmp3[2], tmp3[3]};
304 force_inline v4sf vswaphl(v4sf a, v4sf b) noexcept
305 { return v4sf{b[0], b[1], a[2], a[3]}; }
307 #else
309 #warning "building with simd disabled !\n";
310 #define PFFFT_SIMD_DISABLE // fallback to scalar code
311 #endif
313 #endif /* PFFFT_SIMD_DISABLE */
315 // fallback mode for situations where SIMD is not available, use scalar mode instead
316 #ifdef PFFFT_SIMD_DISABLE
317 using v4sf = float;
318 constexpr uint SimdSize{1};
319 force_inline constexpr v4sf vmul(v4sf a, v4sf b) noexcept { return a * b; }
320 force_inline constexpr v4sf vadd(v4sf a, v4sf b) noexcept { return a + b; }
321 force_inline constexpr v4sf vmadd(v4sf a, v4sf b, v4sf c) noexcept { return a*b + c; }
322 force_inline constexpr v4sf vsub(v4sf a, v4sf b) noexcept { return a - b; }
323 force_inline constexpr v4sf ld_ps1(float a) noexcept { return a; }
325 #else
327 [[maybe_unused]] inline
328 auto valigned(const float *ptr) noexcept -> bool
330 static constexpr uintptr_t alignmask{SimdSize*sizeof(float) - 1};
331 return (reinterpret_cast<uintptr_t>(ptr) & alignmask) == 0;
333 #endif
335 // shortcuts for complex multiplications
336 force_inline void vcplxmul(v4sf &ar, v4sf &ai, v4sf br, v4sf bi) noexcept
338 v4sf tmp{vmul(ar, bi)};
339 ar = vsub(vmul(ar, br), vmul(ai, bi));
340 ai = vmadd(ai, br, tmp);
342 force_inline void vcplxmulconj(v4sf &ar, v4sf &ai, v4sf br, v4sf bi) noexcept
344 v4sf tmp{vmul(ar, bi)};
345 ar = vmadd(ai, bi, vmul(ar, br));
346 ai = vsub(vmul(ai, br), tmp);
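
/* For reference, a scalar sketch of what the helpers above compute per lane:
 * vcplxmul performs (ar + i*ai) *= (br + i*bi), while vcplxmulconj multiplies
 * by the conjugate (br - i*bi). This function is illustrative only and is not
 * used by the transforms below.
 */
[[maybe_unused]] constexpr void cplxmul_scalar(float &ar, float &ai, const float br,
    const float bi) noexcept
{
    const float tmp{ar*bi};
    ar = ar*br - ai*bi;
    ai = ai*br + tmp;
}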
349 #if !defined(PFFFT_SIMD_DISABLE)
351 inline void assertv4(const al::span<const float,4> v_f [[maybe_unused]],
352 const float f0 [[maybe_unused]], const float f1 [[maybe_unused]],
353 const float f2 [[maybe_unused]], const float f3 [[maybe_unused]])
354 { assert(v_f[0] == f0 && v_f[1] == f1 && v_f[2] == f2 && v_f[3] == f3); }
356 template<typename T, T ...N>
357 constexpr auto make_float_array(std::integer_sequence<T,N...>)
358 { return std::array{static_cast<float>(N)...}; }
360 /* detect bugs with the vector support macros */
361 [[maybe_unused]] void validate_pffft_simd()
363 using float4 = std::array<float,4>;
364 static constexpr auto f = make_float_array(std::make_index_sequence<16>{});
366 v4sf a0_v{vset4(f[ 0], f[ 1], f[ 2], f[ 3])};
367 v4sf a1_v{vset4(f[ 4], f[ 5], f[ 6], f[ 7])};
368 v4sf a2_v{vset4(f[ 8], f[ 9], f[10], f[11])};
369 v4sf a3_v{vset4(f[12], f[13], f[14], f[15])};
370 v4sf u_v{};
372 auto t_v = vzero();
373 auto t_f = al::bit_cast<float4>(t_v);
374 printf("VZERO=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]);
375 assertv4(t_f, 0, 0, 0, 0);
377 t_v = vadd(a1_v, a2_v);
378 t_f = al::bit_cast<float4>(t_v);
379 printf("VADD(4:7,8:11)=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]);
380 assertv4(t_f, 12, 14, 16, 18);
382 t_v = vmul(a1_v, a2_v);
383 t_f = al::bit_cast<float4>(t_v);
384 printf("VMUL(4:7,8:11)=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]);
385 assertv4(t_f, 32, 45, 60, 77);
387 t_v = vmadd(a1_v, a2_v, a0_v);
388 t_f = al::bit_cast<float4>(t_v);
389 printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]);
390 assertv4(t_f, 32, 46, 62, 80);
392 interleave2(a1_v, a2_v, t_v, u_v);
393 t_f = al::bit_cast<float4>(t_v);
394 auto u_f = al::bit_cast<float4>(u_v);
395 printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n",
396 t_f[0], t_f[1], t_f[2], t_f[3], u_f[0], u_f[1], u_f[2], u_f[3]);
397 assertv4(t_f, 4, 8, 5, 9);
398 assertv4(u_f, 6, 10, 7, 11);
400 uninterleave2(a1_v, a2_v, t_v, u_v);
401 t_f = al::bit_cast<float4>(t_v);
402 u_f = al::bit_cast<float4>(u_v);
403 printf("UNINTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n",
404 t_f[0], t_f[1], t_f[2], t_f[3], u_f[0], u_f[1], u_f[2], u_f[3]);
405 assertv4(t_f, 4, 6, 8, 10);
406 assertv4(u_f, 5, 7, 9, 11);
408 t_v = ld_ps1(f[15]);
409 t_f = al::bit_cast<float4>(t_v);
410 printf("LD_PS1(15)=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]);
411 assertv4(t_f, 15, 15, 15, 15);
413 t_v = vswaphl(a1_v, a2_v);
414 t_f = al::bit_cast<float4>(t_v);
415 printf("VSWAPHL(4:7,8:11)=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]);
416 assertv4(t_f, 8, 9, 6, 7);
418 vtranspose4(a0_v, a1_v, a2_v, a3_v);
419 auto a0_f = al::bit_cast<float4>(a0_v);
420 auto a1_f = al::bit_cast<float4>(a1_v);
421 auto a2_f = al::bit_cast<float4>(a2_v);
422 auto a3_f = al::bit_cast<float4>(a3_v);
423 printf("VTRANSPOSE4(0:3,4:7,8:11,12:15)=[%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g]\n",
424 a0_f[0], a0_f[1], a0_f[2], a0_f[3], a1_f[0], a1_f[1], a1_f[2], a1_f[3],
425 a2_f[0], a2_f[1], a2_f[2], a2_f[3], a3_f[0], a3_f[1], a3_f[2], a3_f[3]);
426 assertv4(a0_f, 0, 4, 8, 12);
427 assertv4(a1_f, 1, 5, 9, 13);
428 assertv4(a2_f, 2, 6, 10, 14);
429 assertv4(a3_f, 3, 7, 11, 15);
431 #endif //!PFFFT_SIMD_DISABLE
433 /* SSE and co like 16-bytes aligned pointers */
434 /* with a 64-byte alignment, we are even aligned on L2 cache lines... */
435 constexpr auto V4sfAlignment = size_t(64);
436 constexpr auto V4sfAlignVal = std::align_val_t(V4sfAlignment);
438 /* NOLINTBEGIN(cppcoreguidelines-pro-bounds-pointer-arithmetic)
439 * FIXME: Converting this from raw pointers to spans or something will probably
440 * need significant work to maintain performance, given non-sequential range-
441 * checked accesses and lack of 'restrict' to indicate non-aliased memory. At
442 * least, some tests should be done to check the impact of using range-checked
443 * spans here before blindly switching.
446 passf2 and passb2 have been merged here, fsign = -1 for passf2, +1 for passb2
448 NOINLINE void passf2_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *RESTRICT ch,
449 const float *const wa1, const float fsign)
451 const size_t l1ido{l1*ido};
452 if(ido <= 2)
454 for(size_t k{0};k < l1ido;k += ido, ch += ido, cc += 2*ido)
456 ch[0] = vadd(cc[0], cc[ido+0]);
457 ch[l1ido] = vsub(cc[0], cc[ido+0]);
458 ch[1] = vadd(cc[1], cc[ido+1]);
459 ch[l1ido + 1] = vsub(cc[1], cc[ido+1]);
462 else
464 for(size_t k{0};k < l1ido;k += ido, ch += ido, cc += 2*ido)
466 for(size_t i{0};i < ido-1;i += 2)
468 v4sf tr2{vsub(cc[i+0], cc[i+ido+0])};
469 v4sf ti2{vsub(cc[i+1], cc[i+ido+1])};
470 v4sf wr{ld_ps1(wa1[i])}, wi{ld_ps1(wa1[i+1]*fsign)};
471 ch[i] = vadd(cc[i+0], cc[i+ido+0]);
472 ch[i+1] = vadd(cc[i+1], cc[i+ido+1]);
473 vcplxmul(tr2, ti2, wr, wi);
474 ch[i+l1ido] = tr2;
475 ch[i+l1ido+1] = ti2;
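
/* The per-element math of the merged radix-2 pass above, written in scalar
 * form as an illustrative sketch (not called by the code): fsign selects the
 * forward (-1) or backward (+1) twiddle direction, mirroring how passf2_ps
 * applies wa1.
 */
[[maybe_unused]] inline void butterfly2_scalar(float &re0, float &im0, float &re1, float &im1,
    const float wr, const float wi, const float fsign) noexcept
{
    /* The sum replaces the first element; the difference is rotated by the
     * twiddle w = wr + i*(wi*fsign) and replaces the second.
     */
    const float tr{re0 - re1}, ti{im0 - im1};
    re0 += re1; im0 += im1;
    re1 = tr*wr - ti*(wi*fsign);
    im1 = ti*wr + tr*(wi*fsign);
}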
482 passf3 and passb3 have been merged here, fsign = -1 for passf3, +1 for passb3
484 NOINLINE void passf3_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *RESTRICT ch,
485 const float *const wa1, const float fsign)
487 assert(ido > 2);
489 const v4sf taur{ld_ps1(-0.5f)};
490 const v4sf taui{ld_ps1(0.866025403784439f*fsign)};
491 const size_t l1ido{l1*ido};
492 const auto wa2 = wa1 + ido;
493 for(size_t k{0};k < l1ido;k += ido, cc += 3*ido, ch +=ido)
495 for(size_t i{0};i < ido-1;i += 2)
497 v4sf tr2{vadd(cc[i+ido], cc[i+2*ido])};
498 v4sf cr2{vmadd(taur, tr2, cc[i])};
499 ch[i] = vadd(tr2, cc[i]);
500 v4sf ti2{vadd(cc[i+ido+1], cc[i+2*ido+1])};
501 v4sf ci2{vmadd(taur, ti2, cc[i+1])};
502 ch[i+1] = vadd(cc[i+1], ti2);
503 v4sf cr3{vmul(taui, vsub(cc[i+ido], cc[i+2*ido]))};
504 v4sf ci3{vmul(taui, vsub(cc[i+ido+1], cc[i+2*ido+1]))};
505 v4sf dr2{vsub(cr2, ci3)};
506 v4sf dr3{vadd(cr2, ci3)};
507 v4sf di2{vadd(ci2, cr3)};
508 v4sf di3{vsub(ci2, cr3)};
509 float wr1{wa1[i]}, wi1{fsign*wa1[i+1]}, wr2{wa2[i]}, wi2{fsign*wa2[i+1]};
510 vcplxmul(dr2, di2, ld_ps1(wr1), ld_ps1(wi1));
511 ch[i+l1ido] = dr2;
512 ch[i+l1ido + 1] = di2;
513 vcplxmul(dr3, di3, ld_ps1(wr2), ld_ps1(wi2));
514 ch[i+2*l1ido] = dr3;
515 ch[i+2*l1ido+1] = di3;
518 } /* passf3 */
520 NOINLINE void passf4_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *RESTRICT ch,
521 const float *const wa1, const float fsign)
523 /* fsign == -1 for forward transform and +1 for backward transform */
524 const v4sf vsign{ld_ps1(fsign)};
525 const size_t l1ido{l1*ido};
526 if(ido == 2)
528 for(size_t k{0};k < l1ido;k += ido, ch += ido, cc += 4*ido)
530 v4sf tr1{vsub(cc[0], cc[2*ido + 0])};
531 v4sf tr2{vadd(cc[0], cc[2*ido + 0])};
532 v4sf ti1{vsub(cc[1], cc[2*ido + 1])};
533 v4sf ti2{vadd(cc[1], cc[2*ido + 1])};
534 v4sf ti4{vmul(vsub(cc[1*ido + 0], cc[3*ido + 0]), vsign)};
535 v4sf tr4{vmul(vsub(cc[3*ido + 1], cc[1*ido + 1]), vsign)};
536 v4sf tr3{vadd(cc[ido + 0], cc[3*ido + 0])};
537 v4sf ti3{vadd(cc[ido + 1], cc[3*ido + 1])};
539 ch[0*l1ido + 0] = vadd(tr2, tr3);
540 ch[0*l1ido + 1] = vadd(ti2, ti3);
541 ch[1*l1ido + 0] = vadd(tr1, tr4);
542 ch[1*l1ido + 1] = vadd(ti1, ti4);
543 ch[2*l1ido + 0] = vsub(tr2, tr3);
544 ch[2*l1ido + 1] = vsub(ti2, ti3);
545 ch[3*l1ido + 0] = vsub(tr1, tr4);
546 ch[3*l1ido + 1] = vsub(ti1, ti4);
549 else
551 const auto wa2 = wa1 + ido;
552 const auto wa3 = wa2 + ido;
553 for(size_t k{0};k < l1ido;k += ido, ch+=ido, cc += 4*ido)
555 for(size_t i{0};i < ido-1;i+=2)
557 v4sf tr1{vsub(cc[i + 0], cc[i + 2*ido + 0])};
558 v4sf tr2{vadd(cc[i + 0], cc[i + 2*ido + 0])};
559 v4sf ti1{vsub(cc[i + 1], cc[i + 2*ido + 1])};
560 v4sf ti2{vadd(cc[i + 1], cc[i + 2*ido + 1])};
561 v4sf tr4{vmul(vsub(cc[i + 3*ido + 1], cc[i + 1*ido + 1]), vsign)};
562 v4sf ti4{vmul(vsub(cc[i + 1*ido + 0], cc[i + 3*ido + 0]), vsign)};
563 v4sf tr3{vadd(cc[i + ido + 0], cc[i + 3*ido + 0])};
564 v4sf ti3{vadd(cc[i + ido + 1], cc[i + 3*ido + 1])};
566 ch[i] = vadd(tr2, tr3);
567 v4sf cr3{vsub(tr2, tr3)};
568 ch[i + 1] = vadd(ti2, ti3);
569 v4sf ci3{vsub(ti2, ti3)};
571 v4sf cr2{vadd(tr1, tr4)};
572 v4sf cr4{vsub(tr1, tr4)};
573 v4sf ci2{vadd(ti1, ti4)};
574 v4sf ci4{vsub(ti1, ti4)};
575 float wr1{wa1[i]}, wi1{fsign*wa1[i+1]};
576 vcplxmul(cr2, ci2, ld_ps1(wr1), ld_ps1(wi1));
577 float wr2{wa2[i]}, wi2{fsign*wa2[i+1]};
578 ch[i + l1ido] = cr2;
579 ch[i + l1ido + 1] = ci2;
581 vcplxmul(cr3, ci3, ld_ps1(wr2), ld_ps1(wi2));
582 float wr3{wa3[i]}, wi3{fsign*wa3[i+1]};
583 ch[i + 2*l1ido] = cr3;
584 ch[i + 2*l1ido + 1] = ci3;
586 vcplxmul(cr4, ci4, ld_ps1(wr3), ld_ps1(wi3));
587 ch[i + 3*l1ido] = cr4;
588 ch[i + 3*l1ido + 1] = ci4;
592 } /* passf4 */
595 * passf5 and passb5 have been merged here, fsign = -1 for passf5, +1 for passb5
597 NOINLINE void passf5_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *RESTRICT ch,
598 const float *const wa1, const float fsign)
600 const v4sf tr11{ld_ps1(0.309016994374947f)};
601 const v4sf tr12{ld_ps1(-0.809016994374947f)};
602 const v4sf ti11{ld_ps1(0.951056516295154f*fsign)};
603 const v4sf ti12{ld_ps1(0.587785252292473f*fsign)};
605 auto cc_ref = [&cc,ido](size_t a_1, size_t a_2) noexcept -> auto&
606 { return cc[(a_2-1)*ido + a_1 + 1]; };
607 auto ch_ref = [&ch,ido,l1](size_t a_1, size_t a_3) noexcept -> auto&
608 { return ch[(a_3-1)*l1*ido + a_1 + 1]; };
610 assert(ido > 2);
612 const auto wa2 = wa1 + ido;
613 const auto wa3 = wa2 + ido;
614 const auto wa4 = wa3 + ido;
615 for(size_t k{0};k < l1;++k, cc += 5*ido, ch += ido)
617 for(size_t i{0};i < ido-1;i += 2)
619 v4sf ti5{vsub(cc_ref(i , 2), cc_ref(i , 5))};
620 v4sf ti2{vadd(cc_ref(i , 2), cc_ref(i , 5))};
621 v4sf ti4{vsub(cc_ref(i , 3), cc_ref(i , 4))};
622 v4sf ti3{vadd(cc_ref(i , 3), cc_ref(i , 4))};
623 v4sf tr5{vsub(cc_ref(i-1, 2), cc_ref(i-1, 5))};
624 v4sf tr2{vadd(cc_ref(i-1, 2), cc_ref(i-1, 5))};
625 v4sf tr4{vsub(cc_ref(i-1, 3), cc_ref(i-1, 4))};
626 v4sf tr3{vadd(cc_ref(i-1, 3), cc_ref(i-1, 4))};
627 ch_ref(i-1, 1) = vadd(cc_ref(i-1, 1), vadd(tr2, tr3));
628 ch_ref(i , 1) = vadd(cc_ref(i , 1), vadd(ti2, ti3));
629 v4sf cr2{vadd(cc_ref(i-1, 1), vmadd(tr11, tr2, vmul(tr12, tr3)))};
630 v4sf ci2{vadd(cc_ref(i , 1), vmadd(tr11, ti2, vmul(tr12, ti3)))};
631 v4sf cr3{vadd(cc_ref(i-1, 1), vmadd(tr12, tr2, vmul(tr11, tr3)))};
632 v4sf ci3{vadd(cc_ref(i , 1), vmadd(tr12, ti2, vmul(tr11, ti3)))};
633 v4sf cr5{vmadd(ti11, tr5, vmul(ti12, tr4))};
634 v4sf ci5{vmadd(ti11, ti5, vmul(ti12, ti4))};
635 v4sf cr4{vsub(vmul(ti12, tr5), vmul(ti11, tr4))};
636 v4sf ci4{vsub(vmul(ti12, ti5), vmul(ti11, ti4))};
637 v4sf dr3{vsub(cr3, ci4)};
638 v4sf dr4{vadd(cr3, ci4)};
639 v4sf di3{vadd(ci3, cr4)};
640 v4sf di4{vsub(ci3, cr4)};
641 v4sf dr5{vadd(cr2, ci5)};
642 v4sf dr2{vsub(cr2, ci5)};
643 v4sf di5{vsub(ci2, cr5)};
644 v4sf di2{vadd(ci2, cr5)};
645 float wr1{wa1[i]}, wi1{fsign*wa1[i+1]}, wr2{wa2[i]}, wi2{fsign*wa2[i+1]};
646 float wr3{wa3[i]}, wi3{fsign*wa3[i+1]}, wr4{wa4[i]}, wi4{fsign*wa4[i+1]};
647 vcplxmul(dr2, di2, ld_ps1(wr1), ld_ps1(wi1));
648 ch_ref(i - 1, 2) = dr2;
649 ch_ref(i, 2) = di2;
650 vcplxmul(dr3, di3, ld_ps1(wr2), ld_ps1(wi2));
651 ch_ref(i - 1, 3) = dr3;
652 ch_ref(i, 3) = di3;
653 vcplxmul(dr4, di4, ld_ps1(wr3), ld_ps1(wi3));
654 ch_ref(i - 1, 4) = dr4;
655 ch_ref(i, 4) = di4;
656 vcplxmul(dr5, di5, ld_ps1(wr4), ld_ps1(wi4));
657 ch_ref(i - 1, 5) = dr5;
658 ch_ref(i, 5) = di5;
663 NOINLINE void radf2_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc,
664 v4sf *RESTRICT ch, const float *const wa1)
666 const size_t l1ido{l1*ido};
667 for(size_t k{0};k < l1ido;k += ido)
669 v4sf a{cc[k]}, b{cc[k + l1ido]};
670 ch[2*k] = vadd(a, b);
671 ch[2*(k+ido)-1] = vsub(a, b);
673 if(ido < 2)
674 return;
675 if(ido != 2)
677 for(size_t k{0};k < l1ido;k += ido)
679 for(size_t i{2};i < ido;i += 2)
681 v4sf tr2{cc[i - 1 + k + l1ido]}, ti2{cc[i + k + l1ido]};
682 v4sf br{cc[i - 1 + k]}, bi{cc[i + k]};
683 vcplxmulconj(tr2, ti2, ld_ps1(wa1[i - 2]), ld_ps1(wa1[i - 1]));
684 ch[i + 2*k] = vadd(bi, ti2);
685 ch[2*(k+ido) - i] = vsub(ti2, bi);
686 ch[i - 1 + 2*k] = vadd(br, tr2);
687 ch[2*(k+ido) - i -1] = vsub(br, tr2);
690 if((ido&1) == 1)
691 return;
693 const v4sf minus_one{ld_ps1(-1.0f)};
694 for(size_t k{0};k < l1ido;k += ido)
696 ch[2*k + ido] = vmul(minus_one, cc[ido-1 + k + l1ido]);
697 ch[2*k + ido-1] = cc[k + ido-1];
699 } /* radf2 */
702 NOINLINE void radb2_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *RESTRICT ch,
703 const float *const wa1)
705 const size_t l1ido{l1*ido};
706 for(size_t k{0};k < l1ido;k += ido)
708 v4sf a{cc[2*k]};
709 v4sf b{cc[2*(k+ido) - 1]};
710 ch[k] = vadd(a, b);
711 ch[k + l1ido] = vsub(a, b);
713 if(ido < 2)
714 return;
715 if(ido != 2)
717 for(size_t k{0};k < l1ido;k += ido)
719 for(size_t i{2};i < ido;i += 2)
721 v4sf a{cc[i-1 + 2*k]};
722 v4sf b{cc[2*(k + ido) - i - 1]};
723 v4sf c{cc[i+0 + 2*k]};
724 v4sf d{cc[2*(k + ido) - i + 0]};
725 ch[i-1 + k] = vadd(a, b);
726 v4sf tr2{vsub(a, b)};
727 ch[i+0 + k] = vsub(c, d);
728 v4sf ti2{vadd(c, d)};
729 vcplxmul(tr2, ti2, ld_ps1(wa1[i - 2]), ld_ps1(wa1[i - 1]));
730 ch[i-1 + k + l1ido] = tr2;
731 ch[i+0 + k + l1ido] = ti2;
734 if((ido&1) == 1)
735 return;
737 const v4sf minus_two{ld_ps1(-2.0f)};
738 for(size_t k{0};k < l1ido;k += ido)
740 v4sf a{cc[2*k + ido-1]};
741 v4sf b{cc[2*k + ido]};
742 ch[k + ido-1] = vadd(a,a);
743 ch[k + ido-1 + l1ido] = vmul(minus_two, b);
745 } /* radb2 */
747 void radf3_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
748 const float *const wa1)
750 const v4sf taur{ld_ps1(-0.5f)};
751 const v4sf taui{ld_ps1(0.866025403784439f)};
752 for(size_t k{0};k < l1;++k)
754 v4sf cr2{vadd(cc[(k + l1)*ido], cc[(k + 2*l1)*ido])};
755 ch[ (3*k )*ido] = vadd(cc[k*ido], cr2);
756 ch[ (3*k + 2)*ido] = vmul(taui, vsub(cc[(k + l1*2)*ido], cc[(k + l1)*ido]));
757 ch[ido-1 + (3*k + 1)*ido] = vmadd(taur, cr2, cc[k*ido]);
759 if(ido == 1)
760 return;
762 const auto wa2 = wa1 + ido;
763 for(size_t k{0};k < l1;++k)
765 for(size_t i{2};i < ido;i += 2)
767 const size_t ic{ido - i};
768 v4sf wr1{ld_ps1(wa1[i - 2])};
769 v4sf wi1{ld_ps1(wa1[i - 1])};
770 v4sf dr2{cc[i - 1 + (k + l1)*ido]};
771 v4sf di2{cc[i + (k + l1)*ido]};
772 vcplxmulconj(dr2, di2, wr1, wi1);
774 v4sf wr2{ld_ps1(wa2[i - 2])};
775 v4sf wi2{ld_ps1(wa2[i - 1])};
776 v4sf dr3{cc[i - 1 + (k + l1*2)*ido]};
777 v4sf di3{cc[i + (k + l1*2)*ido]};
778 vcplxmulconj(dr3, di3, wr2, wi2);
780 v4sf cr2{vadd(dr2, dr3)};
781 v4sf ci2{vadd(di2, di3)};
782 ch[i - 1 + 3*k*ido] = vadd(cc[i - 1 + k*ido], cr2);
783 ch[i + 3*k*ido] = vadd(cc[i + k*ido], ci2);
784 v4sf tr2{vmadd(taur, cr2, cc[i - 1 + k*ido])};
785 v4sf ti2{vmadd(taur, ci2, cc[i + k*ido])};
786 v4sf tr3{vmul(taui, vsub(di2, di3))};
787 v4sf ti3{vmul(taui, vsub(dr3, dr2))};
788 ch[i - 1 + (3*k + 2)*ido] = vadd(tr2, tr3);
789 ch[ic - 1 + (3*k + 1)*ido] = vsub(tr2, tr3);
790 ch[i + (3*k + 2)*ido] = vadd(ti2, ti3);
791 ch[ic + (3*k + 1)*ido] = vsub(ti3, ti2);
794 } /* radf3 */
797 void radb3_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
798 const float *const wa1)
800 static constexpr float taur{-0.5f};
801 static constexpr float taui{0.866025403784439f};
802 static constexpr float taui_2{taui*2.0f};
804 const v4sf vtaur{ld_ps1(taur)};
805 const v4sf vtaui_2{ld_ps1(taui_2)};
806 for(size_t k{0};k < l1;++k)
808 v4sf tr2 = cc[ido-1 + (3*k + 1)*ido];
809 tr2 = vadd(tr2,tr2);
810 v4sf cr2 = vmadd(vtaur, tr2, cc[3*k*ido]);
811 ch[k*ido] = vadd(cc[3*k*ido], tr2);
812 v4sf ci3 = vmul(vtaui_2, cc[(3*k + 2)*ido]);
813 ch[(k + l1)*ido] = vsub(cr2, ci3);
814 ch[(k + 2*l1)*ido] = vadd(cr2, ci3);
816 if(ido == 1)
817 return;
819 const auto wa2 = wa1 + ido;
820 const v4sf vtaui{ld_ps1(taui)};
821 for(size_t k{0};k < l1;++k)
823 for(size_t i{2};i < ido;i += 2)
825 const size_t ic{ido - i};
826 v4sf tr2{vadd(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido])};
827 v4sf cr2{vmadd(vtaur, tr2, cc[i - 1 + 3*k*ido])};
828 ch[i - 1 + k*ido] = vadd(cc[i - 1 + 3*k*ido], tr2);
829 v4sf ti2{vsub(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido])};
830 v4sf ci2{vmadd(vtaur, ti2, cc[i + 3*k*ido])};
831 ch[i + k*ido] = vadd(cc[i + 3*k*ido], ti2);
832 v4sf cr3{vmul(vtaui, vsub(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]))};
833 v4sf ci3{vmul(vtaui, vadd(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]))};
834 v4sf dr2{vsub(cr2, ci3)};
835 v4sf dr3{vadd(cr2, ci3)};
836 v4sf di2{vadd(ci2, cr3)};
837 v4sf di3{vsub(ci2, cr3)};
838 vcplxmul(dr2, di2, ld_ps1(wa1[i-2]), ld_ps1(wa1[i-1]));
839 ch[i - 1 + (k + l1)*ido] = dr2;
840 ch[i + (k + l1)*ido] = di2;
841 vcplxmul(dr3, di3, ld_ps1(wa2[i-2]), ld_ps1(wa2[i-1]));
842 ch[i - 1 + (k + 2*l1)*ido] = dr3;
843 ch[i + (k + 2*l1)*ido] = di3;
846 } /* radb3 */
848 NOINLINE void radf4_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc,
849 v4sf *RESTRICT ch, const float *const wa1)
851 const size_t l1ido{l1*ido};
853 const v4sf *RESTRICT cc_{cc}, *RESTRICT cc_end{cc + l1ido};
854 v4sf *RESTRICT ch_{ch};
855 while(cc != cc_end)
857 // this loop represents between 25% and 40% of total radf4_ps cost !
858 v4sf a0{cc[0]}, a1{cc[l1ido]};
859 v4sf a2{cc[2*l1ido]}, a3{cc[3*l1ido]};
860 v4sf tr1{vadd(a1, a3)};
861 v4sf tr2{vadd(a0, a2)};
862 ch[2*ido-1] = vsub(a0, a2);
863 ch[2*ido ] = vsub(a3, a1);
864 ch[0 ] = vadd(tr1, tr2);
865 ch[4*ido-1] = vsub(tr2, tr1);
866 cc += ido; ch += 4*ido;
868 cc = cc_;
869 ch = ch_;
871 if(ido < 2)
872 return;
873 if(ido != 2)
875 const auto wa2 = wa1 + ido;
876 const auto wa3 = wa2 + ido;
878 for(size_t k{0};k < l1ido;k += ido)
880 const v4sf *RESTRICT pc{cc + 1 + k};
881 for(size_t i{2};i < ido;i += 2, pc += 2)
883 const size_t ic{ido - i};
885 v4sf cr2{pc[1*l1ido+0]};
886 v4sf ci2{pc[1*l1ido+1]};
887 v4sf wr{ld_ps1(wa1[i - 2])};
888 v4sf wi{ld_ps1(wa1[i - 1])};
889 vcplxmulconj(cr2,ci2,wr,wi);
891 v4sf cr3{pc[2*l1ido+0]};
892 v4sf ci3{pc[2*l1ido+1]};
893 wr = ld_ps1(wa2[i-2]);
894 wi = ld_ps1(wa2[i-1]);
895 vcplxmulconj(cr3, ci3, wr, wi);
897 v4sf cr4{pc[3*l1ido]};
898 v4sf ci4{pc[3*l1ido+1]};
899 wr = ld_ps1(wa3[i-2]);
900 wi = ld_ps1(wa3[i-1]);
901 vcplxmulconj(cr4, ci4, wr, wi);
903 /* at this point, on SSE, five of "cr2 cr3 cr4 ci2 ci3 ci4" should be loaded in registers */
905 v4sf tr1{vadd(cr2,cr4)};
906 v4sf tr4{vsub(cr4,cr2)};
907 v4sf tr2{vadd(pc[0],cr3)};
908 v4sf tr3{vsub(pc[0],cr3)};
909 ch[i - 1 + 4*k ] = vadd(tr2,tr1);
910 ch[ic - 1 + 4*k + 3*ido] = vsub(tr2,tr1); // at this point tr1 and tr2 can be disposed
911 v4sf ti1{vadd(ci2,ci4)};
912 v4sf ti4{vsub(ci2,ci4)};
913 ch[i - 1 + 4*k + 2*ido] = vadd(tr3,ti4);
914 ch[ic - 1 + 4*k + 1*ido] = vsub(tr3,ti4); // dispose tr3, ti4
915 v4sf ti2{vadd(pc[1],ci3)};
916 v4sf ti3{vsub(pc[1],ci3)};
917 ch[i + 4*k ] = vadd(ti1, ti2);
918 ch[ic + 4*k + 3*ido] = vsub(ti1, ti2);
919 ch[i + 4*k + 2*ido] = vadd(tr4, ti3);
920 ch[ic + 4*k + 1*ido] = vsub(tr4, ti3);
923 if((ido&1) == 1)
924 return;
926 const v4sf minus_hsqt2{ld_ps1(al::numbers::sqrt2_v<float> * -0.5f)};
927 for(size_t k{0};k < l1ido;k += ido)
929 v4sf a{cc[ido-1 + k + l1ido]}, b{cc[ido-1 + k + 3*l1ido]};
930 v4sf c{cc[ido-1 + k]}, d{cc[ido-1 + k + 2*l1ido]};
931 v4sf ti1{vmul(minus_hsqt2, vadd(b, a))};
932 v4sf tr1{vmul(minus_hsqt2, vsub(b, a))};
933 ch[ido-1 + 4*k ] = vadd(c, tr1);
934 ch[ido-1 + 4*k + 2*ido] = vsub(c, tr1);
935 ch[ 4*k + 1*ido] = vsub(ti1, d);
936 ch[ 4*k + 3*ido] = vadd(ti1, d);
938 } /* radf4 */
941 NOINLINE void radb4_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc,
942 v4sf *RESTRICT ch, const float *const wa1)
944 const v4sf two{ld_ps1(2.0f)};
945 const size_t l1ido{l1*ido};
947 const v4sf *RESTRICT cc_{cc}, *RESTRICT ch_end{ch + l1ido};
948 v4sf *ch_{ch};
949 while(ch != ch_end)
951 v4sf a{cc[0]}, b{cc[4*ido-1]};
952 v4sf c{cc[2*ido]}, d{cc[2*ido-1]};
953 v4sf tr3{vmul(two,d)};
954 v4sf tr2{vadd(a,b)};
955 v4sf tr1{vsub(a,b)};
956 v4sf tr4{vmul(two,c)};
957 ch[0*l1ido] = vadd(tr2, tr3);
958 ch[2*l1ido] = vsub(tr2, tr3);
959 ch[1*l1ido] = vsub(tr1, tr4);
960 ch[3*l1ido] = vadd(tr1, tr4);
962 cc += 4*ido; ch += ido;
964 cc = cc_; ch = ch_;
966 if(ido < 2)
967 return;
968 if(ido != 2)
970 const auto wa2 = wa1 + ido;
971 const auto wa3 = wa2 + ido;
973 for(size_t k{0};k < l1ido;k += ido)
975 const v4sf *RESTRICT pc{cc - 1 + 4*k};
976 v4sf *RESTRICT ph{ch + k + 1};
977 for(size_t i{2};i < ido;i += 2)
979 v4sf tr1{vsub(pc[ i], pc[4*ido - i])};
980 v4sf tr2{vadd(pc[ i], pc[4*ido - i])};
981 v4sf ti4{vsub(pc[2*ido + i], pc[2*ido - i])};
982 v4sf tr3{vadd(pc[2*ido + i], pc[2*ido - i])};
983 ph[0] = vadd(tr2, tr3);
984 v4sf cr3{vsub(tr2, tr3)};
986 v4sf ti3{vsub(pc[2*ido + i + 1], pc[2*ido - i + 1])};
987 v4sf tr4{vadd(pc[2*ido + i + 1], pc[2*ido - i + 1])};
988 v4sf cr2{vsub(tr1, tr4)};
989 v4sf cr4{vadd(tr1, tr4)};
991 v4sf ti1{vadd(pc[i + 1], pc[4*ido - i + 1])};
992 v4sf ti2{vsub(pc[i + 1], pc[4*ido - i + 1])};
994 ph[1] = vadd(ti2, ti3); ph += l1ido;
995 v4sf ci3{vsub(ti2, ti3)};
996 v4sf ci2{vadd(ti1, ti4)};
997 v4sf ci4{vsub(ti1, ti4)};
998 vcplxmul(cr2, ci2, ld_ps1(wa1[i-2]), ld_ps1(wa1[i-1]));
999 ph[0] = cr2;
1000 ph[1] = ci2; ph += l1ido;
1001 vcplxmul(cr3, ci3, ld_ps1(wa2[i-2]), ld_ps1(wa2[i-1]));
1002 ph[0] = cr3;
1003 ph[1] = ci3; ph += l1ido;
1004 vcplxmul(cr4, ci4, ld_ps1(wa3[i-2]), ld_ps1(wa3[i-1]));
1005 ph[0] = cr4;
1006 ph[1] = ci4; ph = ph - 3*l1ido + 2;
1009 if((ido&1) == 1)
1010 return;
1012 const v4sf minus_sqrt2{ld_ps1(-1.414213562373095f)};
1013 for(size_t k{0};k < l1ido;k += ido)
1015 const size_t i0{4*k + ido};
1016 v4sf c{cc[i0-1]}, d{cc[i0 + 2*ido-1]};
1017 v4sf a{cc[i0+0]}, b{cc[i0 + 2*ido+0]};
1018 v4sf tr1{vsub(c,d)};
1019 v4sf tr2{vadd(c,d)};
1020 v4sf ti1{vadd(b,a)};
1021 v4sf ti2{vsub(b,a)};
1022 ch[ido-1 + k + 0*l1ido] = vadd(tr2,tr2);
1023 ch[ido-1 + k + 1*l1ido] = vmul(minus_sqrt2, vsub(ti1, tr1));
1024 ch[ido-1 + k + 2*l1ido] = vadd(ti2, ti2);
1025 ch[ido-1 + k + 3*l1ido] = vmul(minus_sqrt2, vadd(ti1, tr1));
1027 } /* radb4 */
1029 void radf5_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
1030 const float *const wa1)
1032 const v4sf tr11{ld_ps1(0.309016994374947f)};
1033 const v4sf ti11{ld_ps1(0.951056516295154f)};
1034 const v4sf tr12{ld_ps1(-0.809016994374947f)};
1035 const v4sf ti12{ld_ps1(0.587785252292473f)};
1037 auto cc_ref = [&cc,l1,ido](size_t a_1, size_t a_2, size_t a_3) noexcept -> auto&
1038 { return cc[(a_3*l1 + a_2)*ido + a_1]; };
1039 auto ch_ref = [&ch,ido](size_t a_1, size_t a_2, size_t a_3) noexcept -> auto&
1040 { return ch[(a_3*5 + a_2)*ido + a_1]; };
1042 /* Parameter adjustments */
1043 ch -= 1 + ido * 6;
1044 cc -= 1 + ido * (1 + l1);
1046 const auto wa2 = wa1 + ido;
1047 const auto wa3 = wa2 + ido;
1048 const auto wa4 = wa3 + ido;
1050 /* Function Body */
1051 for(size_t k{1};k <= l1;++k)
1053 v4sf cr2{vadd(cc_ref(1, k, 5), cc_ref(1, k, 2))};
1054 v4sf ci5{vsub(cc_ref(1, k, 5), cc_ref(1, k, 2))};
1055 v4sf cr3{vadd(cc_ref(1, k, 4), cc_ref(1, k, 3))};
1056 v4sf ci4{vsub(cc_ref(1, k, 4), cc_ref(1, k, 3))};
1057 ch_ref(1, 1, k) = vadd(cc_ref(1, k, 1), vadd(cr2, cr3));
1058 ch_ref(ido, 2, k) = vadd(cc_ref(1, k, 1), vmadd(tr11, cr2, vmul(tr12, cr3)));
1059 ch_ref(1, 3, k) = vmadd(ti11, ci5, vmul(ti12, ci4));
1060 ch_ref(ido, 4, k) = vadd(cc_ref(1, k, 1), vmadd(tr12, cr2, vmul(tr11, cr3)));
1061 ch_ref(1, 5, k) = vsub(vmul(ti12, ci5), vmul(ti11, ci4));
1062 //printf("pffft: radf5, k=%d ch_ref=%f, ci4=%f\n", k, ch_ref(1, 5, k), ci4);
1064 if(ido == 1)
1065 return;
1067 const size_t idp2{ido + 2};
1068 for(size_t k{1};k <= l1;++k)
1070 for(size_t i{3};i <= ido;i += 2)
1072 const size_t ic{idp2 - i};
1073 v4sf dr2{ld_ps1(wa1[i-3])};
1074 v4sf di2{ld_ps1(wa1[i-2])};
1075 v4sf dr3{ld_ps1(wa2[i-3])};
1076 v4sf di3{ld_ps1(wa2[i-2])};
1077 v4sf dr4{ld_ps1(wa3[i-3])};
1078 v4sf di4{ld_ps1(wa3[i-2])};
1079 v4sf dr5{ld_ps1(wa4[i-3])};
1080 v4sf di5{ld_ps1(wa4[i-2])};
1081 vcplxmulconj(dr2, di2, cc_ref(i-1, k, 2), cc_ref(i, k, 2));
1082 vcplxmulconj(dr3, di3, cc_ref(i-1, k, 3), cc_ref(i, k, 3));
1083 vcplxmulconj(dr4, di4, cc_ref(i-1, k, 4), cc_ref(i, k, 4));
1084 vcplxmulconj(dr5, di5, cc_ref(i-1, k, 5), cc_ref(i, k, 5));
1085 v4sf cr2{vadd(dr2, dr5)};
1086 v4sf ci5{vsub(dr5, dr2)};
1087 v4sf cr5{vsub(di2, di5)};
1088 v4sf ci2{vadd(di2, di5)};
1089 v4sf cr3{vadd(dr3, dr4)};
1090 v4sf ci4{vsub(dr4, dr3)};
1091 v4sf cr4{vsub(di3, di4)};
1092 v4sf ci3{vadd(di3, di4)};
1093 ch_ref(i - 1, 1, k) = vadd(cc_ref(i - 1, k, 1), vadd(cr2, cr3));
1094 ch_ref(i, 1, k) = vsub(cc_ref(i, k, 1), vadd(ci2, ci3));
1095 v4sf tr2{vadd(cc_ref(i - 1, k, 1), vmadd(tr11, cr2, vmul(tr12, cr3)))};
1096 v4sf ti2{vsub(cc_ref(i, k, 1), vmadd(tr11, ci2, vmul(tr12, ci3)))};
1097 v4sf tr3{vadd(cc_ref(i - 1, k, 1), vmadd(tr12, cr2, vmul(tr11, cr3)))};
1098 v4sf ti3{vsub(cc_ref(i, k, 1), vmadd(tr12, ci2, vmul(tr11, ci3)))};
1099 v4sf tr5{vmadd(ti11, cr5, vmul(ti12, cr4))};
1100 v4sf ti5{vmadd(ti11, ci5, vmul(ti12, ci4))};
1101 v4sf tr4{vsub(vmul(ti12, cr5), vmul(ti11, cr4))};
1102 v4sf ti4{vsub(vmul(ti12, ci5), vmul(ti11, ci4))};
1103 ch_ref(i - 1, 3, k) = vsub(tr2, tr5);
1104 ch_ref(ic - 1, 2, k) = vadd(tr2, tr5);
1105 ch_ref(i , 3, k) = vadd(ti5, ti2);
1106 ch_ref(ic , 2, k) = vsub(ti5, ti2);
1107 ch_ref(i - 1, 5, k) = vsub(tr3, tr4);
1108 ch_ref(ic - 1, 4, k) = vadd(tr3, tr4);
1109 ch_ref(i , 5, k) = vadd(ti4, ti3);
1110 ch_ref(ic , 4, k) = vsub(ti4, ti3);
1113 } /* radf5 */
1115 void radb5_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
1116 const float *const wa1)
1118 const v4sf tr11{ld_ps1(0.309016994374947f)};
1119 const v4sf ti11{ld_ps1(0.951056516295154f)};
1120 const v4sf tr12{ld_ps1(-0.809016994374947f)};
1121 const v4sf ti12{ld_ps1(0.587785252292473f)};
1123 auto cc_ref = [&cc,ido](size_t a_1, size_t a_2, size_t a_3) noexcept -> auto&
1124 { return cc[(a_3*5 + a_2)*ido + a_1]; };
1125 auto ch_ref = [&ch,ido,l1](size_t a_1, size_t a_2, size_t a_3) noexcept -> auto&
1126 { return ch[(a_3*l1 + a_2)*ido + a_1]; };
1128 /* Parameter adjustments */
1129 ch -= 1 + ido*(1 + l1);
1130 cc -= 1 + ido*6;
1132 const auto wa2 = wa1 + ido;
1133 const auto wa3 = wa2 + ido;
1134 const auto wa4 = wa3 + ido;
1136 /* Function Body */
1137 for(size_t k{1};k <= l1;++k)
1139 v4sf ti5{vadd(cc_ref( 1, 3, k), cc_ref(1, 3, k))};
1140 v4sf ti4{vadd(cc_ref( 1, 5, k), cc_ref(1, 5, k))};
1141 v4sf tr2{vadd(cc_ref(ido, 2, k), cc_ref(ido, 2, k))};
1142 v4sf tr3{vadd(cc_ref(ido, 4, k), cc_ref(ido, 4, k))};
1143 ch_ref(1, k, 1) = vadd(cc_ref(1, 1, k), vadd(tr2, tr3));
1144 v4sf cr2{vadd(cc_ref(1, 1, k), vmadd(tr11, tr2, vmul(tr12, tr3)))};
1145 v4sf cr3{vadd(cc_ref(1, 1, k), vmadd(tr12, tr2, vmul(tr11, tr3)))};
1146 v4sf ci5{vmadd(ti11, ti5, vmul(ti12, ti4))};
1147 v4sf ci4{vsub(vmul(ti12, ti5), vmul(ti11, ti4))};
1148 ch_ref(1, k, 2) = vsub(cr2, ci5);
1149 ch_ref(1, k, 3) = vsub(cr3, ci4);
1150 ch_ref(1, k, 4) = vadd(cr3, ci4);
1151 ch_ref(1, k, 5) = vadd(cr2, ci5);
1153 if(ido == 1)
1154 return;
1156 const size_t idp2{ido + 2};
1157 for(size_t k{1};k <= l1;++k)
1159 for(size_t i{3};i <= ido;i += 2)
1161 const size_t ic{idp2 - i};
1162 v4sf ti5{vadd(cc_ref(i , 3, k), cc_ref(ic , 2, k))};
1163 v4sf ti2{vsub(cc_ref(i , 3, k), cc_ref(ic , 2, k))};
1164 v4sf ti4{vadd(cc_ref(i , 5, k), cc_ref(ic , 4, k))};
1165 v4sf ti3{vsub(cc_ref(i , 5, k), cc_ref(ic , 4, k))};
1166 v4sf tr5{vsub(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k))};
1167 v4sf tr2{vadd(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k))};
1168 v4sf tr4{vsub(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k))};
1169 v4sf tr3{vadd(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k))};
1170 ch_ref(i - 1, k, 1) = vadd(cc_ref(i-1, 1, k), vadd(tr2, tr3));
1171 ch_ref(i , k, 1) = vadd(cc_ref(i , 1, k), vadd(ti2, ti3));
1172 v4sf cr2{vadd(cc_ref(i-1, 1, k), vmadd(tr11, tr2, vmul(tr12, tr3)))};
1173 v4sf ci2{vadd(cc_ref(i , 1, k), vmadd(tr11, ti2, vmul(tr12, ti3)))};
1174 v4sf cr3{vadd(cc_ref(i-1, 1, k), vmadd(tr12, tr2, vmul(tr11, tr3)))};
1175 v4sf ci3{vadd(cc_ref(i , 1, k), vmadd(tr12, ti2, vmul(tr11, ti3)))};
1176 v4sf cr5{vmadd(ti11, tr5, vmul(ti12, tr4))};
1177 v4sf ci5{vmadd(ti11, ti5, vmul(ti12, ti4))};
1178 v4sf cr4{vsub(vmul(ti12, tr5), vmul(ti11, tr4))};
1179 v4sf ci4{vsub(vmul(ti12, ti5), vmul(ti11, ti4))};
1180 v4sf dr3{vsub(cr3, ci4)};
1181 v4sf dr4{vadd(cr3, ci4)};
1182 v4sf di3{vadd(ci3, cr4)};
1183 v4sf di4{vsub(ci3, cr4)};
1184 v4sf dr5{vadd(cr2, ci5)};
1185 v4sf dr2{vsub(cr2, ci5)};
1186 v4sf di5{vsub(ci2, cr5)};
1187 v4sf di2{vadd(ci2, cr5)};
1188 vcplxmul(dr2, di2, ld_ps1(wa1[i-3]), ld_ps1(wa1[i-2]));
1189 vcplxmul(dr3, di3, ld_ps1(wa2[i-3]), ld_ps1(wa2[i-2]));
1190 vcplxmul(dr4, di4, ld_ps1(wa3[i-3]), ld_ps1(wa3[i-2]));
1191 vcplxmul(dr5, di5, ld_ps1(wa4[i-3]), ld_ps1(wa4[i-2]));
1193 ch_ref(i-1, k, 2) = dr2; ch_ref(i, k, 2) = di2;
1194 ch_ref(i-1, k, 3) = dr3; ch_ref(i, k, 3) = di3;
1195 ch_ref(i-1, k, 4) = dr4; ch_ref(i, k, 4) = di4;
1196 ch_ref(i-1, k, 5) = dr5; ch_ref(i, k, 5) = di5;
1199 } /* radb5 */
1201 NOINLINE v4sf *rfftf1_ps(const size_t n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
1202 const float *wa, const al::span<const uint,15> ifac)
1204 assert(work1 != work2);
1206 const v4sf *in{input_readonly};
1207 v4sf *out{in == work2 ? work1 : work2};
1208 const size_t nf{ifac[1]};
1209 size_t l2{n};
1210 size_t iw{n-1};
1211 size_t k1{1};
1212 while(true)
1214 const size_t kh{nf - k1};
1215 const size_t ip{ifac[kh + 2]};
1216 const size_t l1{l2 / ip};
1217 const size_t ido{n / l2};
1218 iw -= (ip - 1)*ido;
1219 switch(ip)
1221 case 5:
1222 radf5_ps(ido, l1, in, out, &wa[iw]);
1223 break;
1224 case 4:
1225 radf4_ps(ido, l1, in, out, &wa[iw]);
1226 break;
1227 case 3:
1228 radf3_ps(ido, l1, in, out, &wa[iw]);
1229 break;
1230 case 2:
1231 radf2_ps(ido, l1, in, out, &wa[iw]);
1232 break;
1233 default:
1234 assert(0);
1236 if(++k1 > nf)
1237 return out;
1239 l2 = l1;
1240 if(out == work2)
1242 out = work1;
1243 in = work2;
1245 else
1247 out = work2;
1248 in = work1;
1251 } /* rfftf1 */
1253 NOINLINE v4sf *rfftb1_ps(const size_t n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
1254 const float *wa, const al::span<const uint,15> ifac)
1256 assert(work1 != work2);
1258 const v4sf *in{input_readonly};
1259 v4sf *out{in == work2 ? work1 : work2};
1260 const size_t nf{ifac[1]};
1261 size_t l1{1};
1262 size_t iw{0};
1263 size_t k1{1};
1264 while(true)
1266 const size_t ip{ifac[k1 + 1]};
1267 const size_t l2{ip*l1};
1268 const size_t ido{n / l2};
1269 switch(ip)
1271 case 5:
1272 radb5_ps(ido, l1, in, out, &wa[iw]);
1273 break;
1274 case 4:
1275 radb4_ps(ido, l1, in, out, &wa[iw]);
1276 break;
1277 case 3:
1278 radb3_ps(ido, l1, in, out, &wa[iw]);
1279 break;
1280 case 2:
1281 radb2_ps(ido, l1, in, out, &wa[iw]);
1282 break;
1283 default:
1284 assert(0);
1286 if(++k1 > nf)
1287 return out;
1289 l1 = l2;
1290 iw += (ip - 1)*ido;
1292 if(out == work2)
1294 out = work1;
1295 in = work2;
1297 else
1299 out = work2;
1300 in = work1;
1305 v4sf *cfftf1_ps(const size_t n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
1306 const float *wa, const al::span<const uint,15> ifac, const float fsign)
1308 assert(work1 != work2);
1310 const v4sf *in{input_readonly};
1311 v4sf *out{in == work2 ? work1 : work2};
1312 const size_t nf{ifac[1]};
1313 size_t l1{1}, iw{0};
1314 size_t k1{2};
1315 while(true)
1317 const size_t ip{ifac[k1]};
1318 const size_t l2{ip*l1};
1319 const size_t ido{n / l2};
1320 const size_t idot{ido + ido};
1321 switch(ip)
1323 case 5:
1324 passf5_ps(idot, l1, in, out, &wa[iw], fsign);
1325 break;
1326 case 4:
1327 passf4_ps(idot, l1, in, out, &wa[iw], fsign);
1328 break;
1329 case 3:
1330 passf3_ps(idot, l1, in, out, &wa[iw], fsign);
1331 break;
1332 case 2:
1333 passf2_ps(idot, l1, in, out, &wa[iw], fsign);
1334 break;
1335 default:
1336 assert(0);
1338 if(++k1 > nf+1)
1339 return out;
1341 l1 = l2;
1342 iw += (ip - 1)*idot;
1343 if(out == work2)
1345 out = work1;
1346 in = work2;
1348 else
1350 out = work2;
1351 in = work1;
1357 uint decompose(const uint n, const al::span<uint,15> ifac, const al::span<const uint,4> ntryh)
1359 uint nl{n}, nf{0};
1360 for(const uint ntry : ntryh)
1362 while(nl != 1)
1364 const uint nq{nl / ntry};
1365 const uint nr{nl % ntry};
1366 if(nr != 0) break;
1368 ifac[2+nf++] = ntry;
1369 nl = nq;
1370 if(ntry == 2 && nf != 1)
1372 for(size_t i{2};i <= nf;++i)
1374 size_t ib{nf - i + 2};
1375 ifac[ib + 1] = ifac[ib];
1377 ifac[2] = 2;
1381 ifac[0] = n;
1382 ifac[1] = nf;
1383 return nf;
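
/* A worked example of the factorization above (an illustrative sketch, not
 * called anywhere): with the radix preference {4,2,3,5} used for real
 * transforms, n=120 yields the factors 2,4,3,5 -- the lone factor of 2 is
 * moved to the front by the reordering step inside the loop.
 */
[[maybe_unused]] inline bool example_decompose()
{
    std::array<uint,15> ifac{};
    static constexpr std::array ntryh{4u,2u,3u,5u};
    const uint nf{decompose(120u, ifac, ntryh)};
    return nf == 4 && ifac[2] == 2 && ifac[3] == 4 && ifac[4] == 3 && ifac[5] == 5;
}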
1386 void rffti1_ps(const uint n, float *wa, const al::span<uint,15> ifac)
1388 static constexpr std::array ntryh{4u,2u,3u,5u};
1390 const uint nf{decompose(n, ifac, ntryh)};
1391 const double argh{2.0*al::numbers::pi / n};
1392 size_t is{0};
1393 size_t nfm1{nf - 1};
1394 size_t l1{1};
1395 for(size_t k1{0};k1 < nfm1;++k1)
1397 const size_t ip{ifac[k1+2]};
1398 const size_t l2{l1*ip};
1399 const size_t ido{n / l2};
1400 const size_t ipm{ip - 1};
1401 size_t ld{0};
1402 for(size_t j{0};j < ipm;++j)
1404 size_t i{is};
1405 ld += l1;
1406 const double argld{static_cast<double>(ld)*argh};
1407 double fi{0.0};
1408 for(size_t ii{2};ii < ido;ii += 2)
1410 fi += 1.0;
1411 wa[i++] = static_cast<float>(std::cos(fi*argld));
1412 wa[i++] = static_cast<float>(std::sin(fi*argld));
1414 is += ido;
1416 l1 = l2;
1418 } /* rffti1 */
1420 void cffti1_ps(const uint n, float *wa, const al::span<uint,15> ifac)
1422 static constexpr std::array ntryh{5u,3u,4u,2u};
1424 const uint nf{decompose(n, ifac, ntryh)};
1425 const double argh{2.0*al::numbers::pi / n};
1426 size_t i{1};
1427 size_t l1{1};
1428 for(size_t k1{0};k1 < nf;++k1)
1430 const size_t ip{ifac[k1+2]};
1431 const size_t l2{l1*ip};
1432 const size_t ido{n / l2};
1433 const size_t idot{ido + ido + 2};
1434 const size_t ipm{ip - 1};
1435 size_t ld{0};
1436 for(size_t j{0};j < ipm;++j)
1438 size_t i1{i};
1439 wa[i-1] = 1;
1440 wa[i] = 0;
1441 ld += l1;
1442 const double argld{static_cast<double>(ld)*argh};
1443 double fi{0.0};
1444 for(size_t ii{3};ii < idot;ii += 2)
1446 fi += 1.0;
1447 wa[++i] = static_cast<float>(std::cos(fi*argld));
1448 wa[++i] = static_cast<float>(std::sin(fi*argld));
1450 if(ip > 5)
1452 wa[i1-1] = wa[i-1];
1453 wa[i1] = wa[i];
1456 l1 = l2;
1458 } /* cffti1 */
1460 } // namespace
1462 /* NOLINTNEXTLINE(clang-analyzer-optin.performance.Padding) */
1463 struct PFFFT_Setup {
1464 uint N{};
1465 uint Ncvec{}; /* nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL) */
1466 std::array<uint,15> ifac{};
1467 pffft_transform_t transform{};
1469 float *twiddle{}; /* N/4 elements */
1470 al::span<v4sf> e; /* N/4*3 elements */
1472 alignas(V4sfAlignment) std::byte end;
1475 PFFFTSetupPtr pffft_new_setup(unsigned int N, pffft_transform_t transform)
1477 assert(transform == PFFFT_REAL || transform == PFFFT_COMPLEX);
1478 assert(N > 0);
1479 /* unfortunately, the fft size must be a multiple of 16 for complex FFTs
1480 * and 32 for real FFTs -- a lot of stuff would need to be rewritten to
1481 * handle other cases (or maybe just switch to a scalar fft, I don't know..)
1483 if(transform == PFFFT_REAL)
1484 assert((N%(2*SimdSize*SimdSize)) == 0);
1485 else
1486 assert((N%(SimdSize*SimdSize)) == 0);
1488 const uint Ncvec{(transform == PFFFT_REAL ? N/2 : N) / SimdSize};
1490 const size_t storelen{std::max(offsetof(PFFFT_Setup, end) + 2_zu*Ncvec*sizeof(v4sf),
1491 sizeof(PFFFT_Setup))};
1492 auto storage = static_cast<gsl::owner<std::byte*>>(::operator new[](storelen, V4sfAlignVal));
1493 al::span extrastore{&storage[offsetof(PFFFT_Setup, end)], 2_zu*Ncvec*sizeof(v4sf)};
1495 PFFFTSetupPtr s{::new(storage) PFFFT_Setup{}};
1496 s->N = N;
1497 s->transform = transform;
1498 s->Ncvec = Ncvec;
1500 const size_t ecount{2_zu*Ncvec*(SimdSize-1)/SimdSize};
1501 s->e = {std::launder(reinterpret_cast<v4sf*>(extrastore.data())), ecount};
1502 s->twiddle = std::launder(reinterpret_cast<float*>(&extrastore[ecount*sizeof(v4sf)]));
1504 if constexpr(SimdSize > 1)
1506 auto e = std::vector<float>(s->e.size()*SimdSize, 0.0f);
1507 for(size_t k{0};k < s->Ncvec;++k)
1509 const size_t i{k / SimdSize};
1510 const size_t j{k % SimdSize};
1511 for(size_t m{0};m < SimdSize-1;++m)
1513 const double A{-2.0*al::numbers::pi*static_cast<double>((m+1)*k) / N};
1514 e[((i*3 + m)*2 + 0)*SimdSize + j] = static_cast<float>(std::cos(A));
1515 e[((i*3 + m)*2 + 1)*SimdSize + j] = static_cast<float>(std::sin(A));
1518 std::memcpy(s->e.data(), e.data(), e.size()*sizeof(float));
1520 if(transform == PFFFT_REAL)
1521 rffti1_ps(N/SimdSize, s->twiddle, s->ifac);
1522 else
1523 cffti1_ps(N/SimdSize, s->twiddle, s->ifac);
1525 /* check that N is decomposable with allowed prime factors */
1526 size_t m{1};
1527 for(size_t k{0};k < s->ifac[1];++k)
1528 m *= s->ifac[2+k];
1530 if(m != N/SimdSize)
1531 s = nullptr;
1533 return s;
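
/* Illustrative usage sketch (not part of the library API surface): with
 * 4-wide SIMD, a real transform needs N to be a multiple of
 * 2*SimdSize*SimdSize = 32 and a complex transform a multiple of
 * SimdSize*SimdSize = 16. For example, N=512 with PFFFT_REAL gives
 * Ncvec = (512/2)/4 = 64 complex SIMD vectors. The returned pointer is null
 * if N/SimdSize cannot be decomposed into factors of 2, 3 and 5.
 */
[[maybe_unused]] inline auto example_new_setup()
{
    auto setup = pffft_new_setup(512, PFFFT_REAL);
    /* On success, setup can be passed to the pffft transform functions. */
    return setup;
}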
1537 void pffft_destroy_setup(gsl::owner<PFFFT_Setup*> s) noexcept
1539 std::destroy_at(s);
1540 ::operator delete[](gsl::owner<void*>{s}, V4sfAlignVal);
1543 #if !defined(PFFFT_SIMD_DISABLE)
1545 namespace {
1547 /* [0 0 1 2 3 4 5 6 7 8] -> [0 8 7 6 5 4 3 2 1] */
1548 void reversed_copy(const size_t N, const v4sf *in, const int in_stride, v4sf *RESTRICT out)
1550 v4sf g0, g1;
1551 interleave2(in[0], in[1], g0, g1);
1552 in += in_stride;
1554 *--out = vswaphl(g0, g1); // [g0l, g0h], [g1l g1h] -> [g1l, g0h]
1555 for(size_t k{1};k < N;++k)
1557 v4sf h0, h1;
1558 interleave2(in[0], in[1], h0, h1);
1559 in += in_stride;
1560 *--out = vswaphl(g1, h0);
1561 *--out = vswaphl(h0, h1);
1562 g1 = h1;
1564 *--out = vswaphl(g1, g0);
1567 void unreversed_copy(const size_t N, const v4sf *in, v4sf *RESTRICT out, const int out_stride)
1569 v4sf g0{in[0]}, g1{g0};
1570 ++in;
1571 for(size_t k{1};k < N;++k)
1573 v4sf h0{*in++}; v4sf h1{*in++};
1574 g1 = vswaphl(g1, h0);
1575 h0 = vswaphl(h0, h1);
1576 uninterleave2(h0, g1, out[0], out[1]);
1577 out += out_stride;
1578 g1 = h1;
1580 v4sf h0{*in++}, h1{g0};
1581 g1 = vswaphl(g1, h0);
1582 h0 = vswaphl(h0, h1);
1583 uninterleave2(h0, g1, out[0], out[1]);
1586 void pffft_cplx_finalize(const size_t Ncvec, const v4sf *in, v4sf *RESTRICT out, const v4sf *e)
1588 assert(in != out);
1590 const size_t dk{Ncvec/SimdSize}; // number of 4x4 matrix blocks
1591 for(size_t k{0};k < dk;++k)
1593 v4sf r0{in[8*k+0]}, i0{in[8*k+1]};
1594 v4sf r1{in[8*k+2]}, i1{in[8*k+3]};
1595 v4sf r2{in[8*k+4]}, i2{in[8*k+5]};
1596 v4sf r3{in[8*k+6]}, i3{in[8*k+7]};
1597 vtranspose4(r0,r1,r2,r3);
1598 vtranspose4(i0,i1,i2,i3);
1599 vcplxmul(r1,i1,e[k*6+0],e[k*6+1]);
1600 vcplxmul(r2,i2,e[k*6+2],e[k*6+3]);
1601 vcplxmul(r3,i3,e[k*6+4],e[k*6+5]);
1603 v4sf sr0{vadd(r0,r2)}, dr0{vsub(r0, r2)};
1604 v4sf sr1{vadd(r1,r3)}, dr1{vsub(r1, r3)};
1605 v4sf si0{vadd(i0,i2)}, di0{vsub(i0, i2)};
1606 v4sf si1{vadd(i1,i3)}, di1{vsub(i1, i3)};
1608 /* transformation for each column is:
1610 * [1 1 1 1 0 0 0 0] [r0]
1611 * [1 0 -1 0 0 -1 0 1] [r1]
1612 * [1 -1 1 -1 0 0 0 0] [r2]
1613 * [1 0 -1 0 0 1 0 -1] [r3]
1614 * [0 0 0 0 1 1 1 1] * [i0]
1615 * [0 1 0 -1 1 0 -1 0] [i1]
1616 * [0 0 0 0 1 -1 1 -1] [i2]
1617 * [0 -1 0 1 1 0 -1 0] [i3]
1620 r0 = vadd(sr0, sr1); i0 = vadd(si0, si1);
1621 r1 = vadd(dr0, di1); i1 = vsub(di0, dr1);
1622 r2 = vsub(sr0, sr1); i2 = vsub(si0, si1);
1623 r3 = vsub(dr0, di1); i3 = vadd(di0, dr1);
1625 *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1;
1626 *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3;
1630 void pffft_cplx_preprocess(const size_t Ncvec, const v4sf *in, v4sf *RESTRICT out, const v4sf *e)
1632 assert(in != out);
1634 const size_t dk{Ncvec/SimdSize}; // number of 4x4 matrix blocks
1635 for(size_t k{0};k < dk;++k)
1637 v4sf r0{in[8*k+0]}, i0{in[8*k+1]};
1638 v4sf r1{in[8*k+2]}, i1{in[8*k+3]};
1639 v4sf r2{in[8*k+4]}, i2{in[8*k+5]};
1640 v4sf r3{in[8*k+6]}, i3{in[8*k+7]};
1642 v4sf sr0{vadd(r0,r2)}, dr0{vsub(r0, r2)};
1643 v4sf sr1{vadd(r1,r3)}, dr1{vsub(r1, r3)};
1644 v4sf si0{vadd(i0,i2)}, di0{vsub(i0, i2)};
1645 v4sf si1{vadd(i1,i3)}, di1{vsub(i1, i3)};
1647 r0 = vadd(sr0, sr1); i0 = vadd(si0, si1);
1648 r1 = vsub(dr0, di1); i1 = vadd(di0, dr1);
1649 r2 = vsub(sr0, sr1); i2 = vsub(si0, si1);
1650 r3 = vadd(dr0, di1); i3 = vsub(di0, dr1);
1652 vcplxmulconj(r1,i1,e[k*6+0],e[k*6+1]);
1653 vcplxmulconj(r2,i2,e[k*6+2],e[k*6+3]);
1654 vcplxmulconj(r3,i3,e[k*6+4],e[k*6+5]);
1656 vtranspose4(r0,r1,r2,r3);
1657 vtranspose4(i0,i1,i2,i3);
1659 *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1;
1660 *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3;
1665 force_inline void pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, const v4sf *in,
1666 const v4sf *e, v4sf *RESTRICT out)
1668 v4sf r0{*in0}, i0{*in1};
1669 v4sf r1{*in++}; v4sf i1{*in++};
1670 v4sf r2{*in++}; v4sf i2{*in++};
1671 v4sf r3{*in++}; v4sf i3{*in++};
1672 vtranspose4(r0,r1,r2,r3);
1673 vtranspose4(i0,i1,i2,i3);
1675 /* transformation for each column is:
1677 * [1 1 1 1 0 0 0 0] [r0]
1678 * [1 0 -1 0 0 -1 0 1] [r1]
1679 * [1 0 -1 0 0 1 0 -1] [r2]
1680 * [1 -1 1 -1 0 0 0 0] [r3]
1681 * [0 0 0 0 1 1 1 1] * [i0]
1682 * [0 -1 0 1 -1 0 1 0] [i1]
1683 * [0 -1 0 1 1 0 -1 0] [i2]
1684 * [0 0 0 0 -1 1 -1 1] [i3]
1687 //cerr << "matrix initial, before e , REAL:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
1688 //cerr << "matrix initial, before e, IMAG :\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";
1690 vcplxmul(r1,i1,e[0],e[1]);
1691 vcplxmul(r2,i2,e[2],e[3]);
1692 vcplxmul(r3,i3,e[4],e[5]);
1694 //cerr << "matrix initial, real part:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
1695 //cerr << "matrix initial, imag part:\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";
1697 v4sf sr0{vadd(r0,r2)}, dr0{vsub(r0,r2)};
1698 v4sf sr1{vadd(r1,r3)}, dr1{vsub(r3,r1)};
1699 v4sf si0{vadd(i0,i2)}, di0{vsub(i0,i2)};
1700 v4sf si1{vadd(i1,i3)}, di1{vsub(i3,i1)};
1702 r0 = vadd(sr0, sr1);
1703 r3 = vsub(sr0, sr1);
1704 i0 = vadd(si0, si1);
1705 i3 = vsub(si1, si0);
1706 r1 = vadd(dr0, di1);
1707 r2 = vsub(dr0, di1);
1708 i1 = vsub(dr1, di0);
1709 i2 = vadd(dr1, di0);
1711 *out++ = r0;
1712 *out++ = i0;
1713 *out++ = r1;
1714 *out++ = i1;
1715 *out++ = r2;
1716 *out++ = i2;
1717 *out++ = r3;
1718 *out++ = i3;
1721 NOINLINE void pffft_real_finalize(const size_t Ncvec, const v4sf *in, v4sf *RESTRICT out,
1722 const v4sf *e)
1724 static constexpr float s{al::numbers::sqrt2_v<float>/2.0f};
1726 assert(in != out);
1727 const size_t dk{Ncvec/SimdSize}; // number of 4x4 matrix blocks
1728 /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */
1730 const v4sf zero{vzero()};
1731 const auto cr = al::bit_cast<std::array<float,SimdSize>>(in[0]);
1732 const auto ci = al::bit_cast<std::array<float,SimdSize>>(in[Ncvec*2-1]);
1733 pffft_real_finalize_4x4(&zero, &zero, in+1, e, out);
1735 /* [cr0 cr1 cr2 cr3 ci0 ci1 ci2 ci3]
1737 * [Xr(1)] ] [1 1 1 1 0 0 0 0]
1738 * [Xr(N/4) ] [0 0 0 0 1 s 0 -s]
1739 * [Xr(N/2) ] [1 0 -1 0 0 0 0 0]
1740 * [Xr(3N/4)] [0 0 0 0 1 -s 0 s]
1741 * [Xi(1) ] [1 -1 1 -1 0 0 0 0]
1742 * [Xi(N/4) ] [0 0 0 0 0 -s -1 -s]
1743 * [Xi(N/2) ] [0 -1 0 1 0 0 0 0]
1744 * [Xi(3N/4)] [0 0 0 0 0 -s 1 -s]
1747 const float xr0{(cr[0]+cr[2]) + (cr[1]+cr[3])}; out[0] = vinsert0(out[0], xr0);
1748 const float xi0{(cr[0]+cr[2]) - (cr[1]+cr[3])}; out[1] = vinsert0(out[1], xi0);
1749 const float xr2{(cr[0]-cr[2])}; out[4] = vinsert0(out[4], xr2);
1750 const float xi2{(cr[3]-cr[1])}; out[5] = vinsert0(out[5], xi2);
1751 const float xr1{ ci[0] + s*(ci[1]-ci[3])}; out[2] = vinsert0(out[2], xr1);
1752 const float xi1{-ci[2] - s*(ci[1]+ci[3])}; out[3] = vinsert0(out[3], xi1);
1753 const float xr3{ ci[0] - s*(ci[1]-ci[3])}; out[6] = vinsert0(out[6], xr3);
1754 const float xi3{ ci[2] - s*(ci[1]+ci[3])}; out[7] = vinsert0(out[7], xi3);
1756 for(size_t k{1};k < dk;++k)
1757 pffft_real_finalize_4x4(&in[8*k-1], &in[8*k+0], in + 8*k+1, e + k*6, out + k*8);
1760 force_inline void pffft_real_preprocess_4x4(const v4sf *in, const v4sf *e, v4sf *RESTRICT out,
1761 const bool first)
1763 v4sf r0{in[0]}, i0{in[1]}, r1{in[2]}, i1{in[3]};
1764 v4sf r2{in[4]}, i2{in[5]}, r3{in[6]}, i3{in[7]};
1766 /* transformation for each column is:
1768 * [1 1 1 1 0 0 0 0] [r0]
1769 * [1 0 0 -1 0 -1 -1 0] [r1]
1770 * [1 -1 -1 1 0 0 0 0] [r2]
1771 * [1 0 0 -1 0 1 1 0] [r3]
1772 * [0 0 0 0 1 -1 1 -1] * [i0]
1773 * [0 -1 1 0 1 0 0 1] [i1]
1774 * [0 0 0 0 1 1 -1 -1] [i2]
1775 * [0 1 -1 0 1 0 0 1] [i3]
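    /* This is effectively the inverse of the per-column transform done in
     * pffft_real_finalize_4x4: a radix-4 style butterfly first, then the
     * conjugated twiddles (vcplxmulconj) and a transpose back to row order.
     */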
1778 v4sf sr0{vadd(r0,r3)}, dr0{vsub(r0,r3)};
1779 v4sf sr1{vadd(r1,r2)}, dr1{vsub(r1,r2)};
1780 v4sf si0{vadd(i0,i3)}, di0{vsub(i0,i3)};
1781 v4sf si1{vadd(i1,i2)}, di1{vsub(i1,i2)};
1783 r0 = vadd(sr0, sr1);
1784 r2 = vsub(sr0, sr1);
1785 r1 = vsub(dr0, si1);
1786 r3 = vadd(dr0, si1);
1787 i0 = vsub(di0, di1);
1788 i2 = vadd(di0, di1);
1789 i1 = vsub(si0, dr1);
1790 i3 = vadd(si0, dr1);
1792 vcplxmulconj(r1,i1,e[0],e[1]);
1793 vcplxmulconj(r2,i2,e[2],e[3]);
1794 vcplxmulconj(r3,i3,e[4],e[5]);
1796 vtranspose4(r0,r1,r2,r3);
1797 vtranspose4(i0,i1,i2,i3);
1799 if(!first)
1801 *out++ = r0;
1802 *out++ = i0;
1804 *out++ = r1;
1805 *out++ = i1;
1806 *out++ = r2;
1807 *out++ = i2;
1808 *out++ = r3;
1809 *out++ = i3;
1812 NOINLINE void pffft_real_preprocess(const size_t Ncvec, const v4sf *in, v4sf *RESTRICT out,
1813 const v4sf *e)
1815 static constexpr float sqrt2{al::numbers::sqrt2_v<float>};
1817 assert(in != out);
1818 const size_t dk{Ncvec/SimdSize}; // number of 4x4 matrix blocks
1819 /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */
1821 std::array<float,SimdSize> Xr{}, Xi{};
1822 for(size_t k{0};k < SimdSize;++k)
1824 Xr[k] = vextract0(in[2*k]);
1825 Xi[k] = vextract0(in[2*k + 1]);
1828 pffft_real_preprocess_4x4(in, e, out+1, true); // will write only 6 values
1830 /* [Xr0 Xr1 Xr2 Xr3 Xi0 Xi1 Xi2 Xi3]
1832 * [cr0] [1 0 2 0 1 0 0 0]
1833 * [cr1] [1 0 0 0 -1 0 -2 0]
1834 * [cr2] [1 0 -2 0 1 0 0 0]
1835 * [cr3] [1 0 0 0 -1 0 2 0]
1836 * [ci0] [0 2 0 2 0 0 0 0]
1837 * [ci1] [0 s 0 -s 0 -s 0 -s]
1838 * [ci2] [0 0 0 0 0 -2 0 2]
1839 * [ci3] [0 -s 0 s 0 -s 0 -s]
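    /* As in pffft_real_finalize, the first column is handled in scalar:
     * Xr[]/Xi[] hold lane 0 of the first 8 input vectors, and the matrix
     * above (with s = sqrt(2)) combines them into the cr/ci values below,
     * which go into out[0] and out[2*Ncvec-1] to match fftpack's packing of
     * the first and last coefficients.
     */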
1841 for(size_t k{1};k < dk;++k)
1842 pffft_real_preprocess_4x4(in+8*k, e + k*6, out-1+k*8, false);
1844 const float cr0{(Xr[0]+Xi[0]) + 2*Xr[2]};
1845 const float cr1{(Xr[0]-Xi[0]) - 2*Xi[2]};
1846 const float cr2{(Xr[0]+Xi[0]) - 2*Xr[2]};
1847 const float cr3{(Xr[0]-Xi[0]) + 2*Xi[2]};
1848 out[0] = vset4(cr0, cr1, cr2, cr3);
1849 const float ci0{ 2*(Xr[1]+Xr[3])};
1850 const float ci1{ sqrt2*(Xr[1]-Xr[3]) - sqrt2*(Xi[1]+Xi[3])};
1851 const float ci2{ 2*(Xi[3]-Xi[1])};
1852 const float ci3{-sqrt2*(Xr[1]-Xr[3]) - sqrt2*(Xi[1]+Xi[3])};
1853 out[2*Ncvec-1] = vset4(ci0, ci1, ci2, ci3);
1857 void pffft_transform_internal(const PFFFT_Setup *setup, const v4sf *vinput, v4sf *voutput,
1858 v4sf *scratch, const pffft_direction_t direction, const bool ordered)
1860 assert(scratch != nullptr);
1861 assert(voutput != scratch);
1863 const size_t Ncvec{setup->Ncvec};
1864 const bool nf_odd{(setup->ifac[1]&1) != 0};
1866 std::array buff{voutput, scratch};
1867 bool ib{nf_odd != ordered};
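    /* buff[0] is the caller's output buffer and buff[1] the scratch buffer;
     * ib selects which one the next stage writes into. rfftf1_ps/cfftf1_ps
     * return the buffer that ends up holding their result, so comparing that
     * against buff[1] keeps ib in sync with where the data actually is.
     */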
1868 if(direction == PFFFT_FORWARD)
1870 /* Swap the initial work buffer for forward FFTs, which helps avoid an
1871 * extra copy for output.
1873 ib = !ib;
1874 if(setup->transform == PFFFT_REAL)
1876 ib = (rfftf1_ps(Ncvec*2, vinput, buff[ib], buff[!ib], setup->twiddle, setup->ifac) == buff[1]);
1877 pffft_real_finalize(Ncvec, buff[ib], buff[!ib], setup->e.data());
1879 else
1881 v4sf *tmp{buff[ib]};
1882 for(size_t k=0; k < Ncvec; ++k)
1883 uninterleave2(vinput[k*2], vinput[k*2+1], tmp[k*2], tmp[k*2+1]);
1885 ib = (cfftf1_ps(Ncvec, buff[ib], buff[!ib], buff[ib], setup->twiddle, setup->ifac, -1.0f) == buff[1]);
1886 pffft_cplx_finalize(Ncvec, buff[ib], buff[!ib], setup->e.data());
1888 if(ordered)
1889 pffft_zreorder(setup, reinterpret_cast<float*>(buff[!ib]),
1890 reinterpret_cast<float*>(buff[ib]), PFFFT_FORWARD);
1891 else
1892 ib = !ib;
1894 else
1896 if(vinput == buff[ib])
1897 ib = !ib; // may happen when finput == foutput
1899 if(ordered)
1901 pffft_zreorder(setup, reinterpret_cast<const float*>(vinput),
1902 reinterpret_cast<float*>(buff[ib]), PFFFT_BACKWARD);
1903 vinput = buff[ib];
1904 ib = !ib;
1906 if(setup->transform == PFFFT_REAL)
1908 pffft_real_preprocess(Ncvec, vinput, buff[ib], setup->e.data());
1909 ib = (rfftb1_ps(Ncvec*2, buff[ib], buff[0], buff[1], setup->twiddle, setup->ifac) == buff[1]);
1911 else
1913 pffft_cplx_preprocess(Ncvec, vinput, buff[ib], setup->e.data());
1914 ib = (cfftf1_ps(Ncvec, buff[ib], buff[0], buff[1], setup->twiddle, setup->ifac, +1.0f) == buff[1]);
1915 for(size_t k{0};k < Ncvec;++k)
1916 interleave2(buff[ib][k*2], buff[ib][k*2+1], buff[ib][k*2], buff[ib][k*2+1]);
1920 if(buff[ib] != voutput)
1922 /* extra copy required -- this situation should only happen when finput == foutput */
1923 assert(vinput==voutput);
1924 for(size_t k{0};k < Ncvec;++k)
1926 v4sf a{buff[ib][2*k]}, b{buff[ib][2*k+1]};
1927 voutput[2*k] = a; voutput[2*k+1] = b;
1932 } // namespace
1934 void pffft_zreorder(const PFFFT_Setup *setup, const float *in, float *out,
1935 pffft_direction_t direction)
1937 assert(in != out);
1939 const size_t N{setup->N}, Ncvec{setup->Ncvec};
1940 const v4sf *vin{reinterpret_cast<const v4sf*>(in)};
1941 v4sf *RESTRICT vout{reinterpret_cast<v4sf*>(out)};
1942 if(setup->transform == PFFFT_REAL)
1944 const size_t dk{N/32};
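        /* dk is the number of 4x4 matrix blocks in the packed real spectrum:
         * each block spans 8 v4sf vectors (32 floats), so dk = N/(SimdSize*8)
         * = N/32 for 4-wide SIMD.
         */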
1945 if(direction == PFFFT_FORWARD)
1947 for(size_t k{0};k < dk;++k)
1949 interleave2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]);
1950 interleave2(vin[k*8 + 4], vin[k*8 + 5], vout[2*(2*dk + k) + 0], vout[2*(2*dk + k) + 1]);
1952 reversed_copy(dk, vin+2, 8, vout + N/SimdSize/2);
1953 reversed_copy(dk, vin+6, 8, vout + N/SimdSize);
1955 else
1957 for(size_t k{0};k < dk;++k)
1959 uninterleave2(vin[2*(0*dk + k) + 0], vin[2*(0*dk + k) + 1], vout[k*8 + 0], vout[k*8 + 1]);
1960 uninterleave2(vin[2*(2*dk + k) + 0], vin[2*(2*dk + k) + 1], vout[k*8 + 4], vout[k*8 + 5]);
1962 unreversed_copy(dk, vin + N/SimdSize/4, vout + N/SimdSize - 6, -8);
1963 unreversed_copy(dk, vin + 3_uz*N/SimdSize/4, vout + N/SimdSize - 2, -8);
1966 else
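        /* Complex transforms keep their z-domain data in an internal
         * "z-order": the split real/imaginary vector pair at internal index k
         * corresponds to the interleaved pair at canonical index
         * kk = (k/4) + (k%4)*(Ncvec/4). Forward converts internal to
         * canonical order; backward converts it back.
         */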
1968 if(direction == PFFFT_FORWARD)
1970 for(size_t k{0};k < Ncvec;++k)
1972 size_t kk{(k/4) + (k%4)*(Ncvec/4)};
1973 interleave2(vin[k*2], vin[k*2+1], vout[kk*2], vout[kk*2+1]);
1976 else
1978 for(size_t k{0};k < Ncvec;++k)
1980 size_t kk{(k/4) + (k%4)*(Ncvec/4)};
1981 uninterleave2(vin[kk*2], vin[kk*2+1], vout[k*2], vout[k*2+1]);
1987 void pffft_zconvolve_scale_accumulate(const PFFFT_Setup *s, const float *a, const float *b,
1988 float *ab, float scaling)
1990 const size_t Ncvec{s->Ncvec};
1991 const v4sf *RESTRICT va{reinterpret_cast<const v4sf*>(a)};
1992 const v4sf *RESTRICT vb{reinterpret_cast<const v4sf*>(b)};
1993 v4sf *RESTRICT vab{reinterpret_cast<v4sf*>(ab)};
1995 #ifdef __arm__
1996 __builtin_prefetch(va);
1997 __builtin_prefetch(vb);
1998 __builtin_prefetch(vab);
1999 __builtin_prefetch(va+2);
2000 __builtin_prefetch(vb+2);
2001 __builtin_prefetch(vab+2);
2002 __builtin_prefetch(va+4);
2003 __builtin_prefetch(vb+4);
2004 __builtin_prefetch(vab+4);
2005 __builtin_prefetch(va+6);
2006 __builtin_prefetch(vb+6);
2007 __builtin_prefetch(vab+6);
2008 #ifndef __clang__
2009 #define ZCONVOLVE_USING_INLINE_NEON_ASM
2010 #endif
2011 #endif
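    /* For real transforms, lane 0 of the first two vectors holds the purely
     * real DC and Nyquist bins rather than an ordinary complex pair. Save
     * those values so they can be patched back in after the vectorized
     * complex multiply-accumulate below.
     */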
2013 const float ar1{vextract0(va[0])};
2014 const float ai1{vextract0(va[1])};
2015 const float br1{vextract0(vb[0])};
2016 const float bi1{vextract0(vb[1])};
2017 const float abr1{vextract0(vab[0])};
2018 const float abi1{vextract0(vab[1])};
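    /* Note that the macro defined above is ZCONVOLVE_USING_INLINE_NEON_ASM
     * while the check below tests ZCONVOLVE_USING_INLINE_ASM, so the
     * inline-asm branch is never actually compiled; the intrinsic loop after
     * the #else is always used.
     */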
2020 #ifdef ZCONVOLVE_USING_INLINE_ASM
2021 /* Inline asm version, unfortunately miscompiled by clang 3.2, at least on
2022 * Ubuntu. So this will be restricted to GCC.
2024 * Does it still miscompile with Clang? Is it even needed with today's
2025 * optimizers?
2027 const float *a_{a}, *b_{b}; float *ab_{ab};
2028 size_t N{Ncvec};
2029 asm volatile("mov r8, %2 \n"
2030 "vdup.f32 q15, %4 \n"
2031 "1: \n"
2032 "pld [%0,#64] \n"
2033 "pld [%1,#64] \n"
2034 "pld [%2,#64] \n"
2035 "pld [%0,#96] \n"
2036 "pld [%1,#96] \n"
2037 "pld [%2,#96] \n"
2038 "vld1.f32 {q0,q1}, [%0,:128]! \n"
2039 "vld1.f32 {q4,q5}, [%1,:128]! \n"
2040 "vld1.f32 {q2,q3}, [%0,:128]! \n"
2041 "vld1.f32 {q6,q7}, [%1,:128]! \n"
2042 "vld1.f32 {q8,q9}, [r8,:128]! \n"
2044 "vmul.f32 q10, q0, q4 \n"
2045 "vmul.f32 q11, q0, q5 \n"
2046 "vmul.f32 q12, q2, q6 \n"
2047 "vmul.f32 q13, q2, q7 \n"
2048 "vmls.f32 q10, q1, q5 \n"
2049 "vmla.f32 q11, q1, q4 \n"
2050 "vld1.f32 {q0,q1}, [r8,:128]! \n"
2051 "vmls.f32 q12, q3, q7 \n"
2052 "vmla.f32 q13, q3, q6 \n"
2053 "vmla.f32 q8, q10, q15 \n"
2054 "vmla.f32 q9, q11, q15 \n"
2055 "vmla.f32 q0, q12, q15 \n"
2056 "vmla.f32 q1, q13, q15 \n"
2057 "vst1.f32 {q8,q9},[%2,:128]! \n"
2058 "vst1.f32 {q0,q1},[%2,:128]! \n"
2059 "subs %3, #2 \n"
2060 "bne 1b \n"
2061 : "+r"(a_), "+r"(b_), "+r"(ab_), "+r"(N) : "r"(scaling) : "r8", "q0","q1","q2","q3","q4","q5","q6","q7","q8","q9", "q10","q11","q12","q13","q15","memory");
2063 #else
2065     /* Default routine; works fine for non-ARM CPUs with current compilers. */
2066 const v4sf vscal{ld_ps1(scaling)};
2067 for(size_t i{0};i < Ncvec;i += 2)
2069 v4sf ar4{va[2*i+0]}, ai4{va[2*i+1]};
2070 v4sf br4{vb[2*i+0]}, bi4{vb[2*i+1]};
2071 vcplxmul(ar4, ai4, br4, bi4);
2072 vab[2*i+0] = vmadd(ar4, vscal, vab[2*i+0]);
2073 vab[2*i+1] = vmadd(ai4, vscal, vab[2*i+1]);
2074 ar4 = va[2*i+2]; ai4 = va[2*i+3];
2075 br4 = vb[2*i+2]; bi4 = vb[2*i+3];
2076 vcplxmul(ar4, ai4, br4, bi4);
2077 vab[2*i+2] = vmadd(ar4, vscal, vab[2*i+2]);
2078 vab[2*i+3] = vmadd(ai4, vscal, vab[2*i+3]);
2080 #endif
2082 if(s->transform == PFFFT_REAL)
2084 vab[0] = vinsert0(vab[0], abr1 + ar1*br1*scaling);
2085 vab[1] = vinsert0(vab[1], abi1 + ai1*bi1*scaling);
2089 void pffft_zconvolve_accumulate(const PFFFT_Setup *s, const float *a, const float *b, float *ab)
2091 const size_t Ncvec{s->Ncvec};
2092 const v4sf *RESTRICT va{reinterpret_cast<const v4sf*>(a)};
2093 const v4sf *RESTRICT vb{reinterpret_cast<const v4sf*>(b)};
2094 v4sf *RESTRICT vab{reinterpret_cast<v4sf*>(ab)};
2096 #ifdef __arm__
2097 __builtin_prefetch(va);
2098 __builtin_prefetch(vb);
2099 __builtin_prefetch(vab);
2100 __builtin_prefetch(va+2);
2101 __builtin_prefetch(vb+2);
2102 __builtin_prefetch(vab+2);
2103 __builtin_prefetch(va+4);
2104 __builtin_prefetch(vb+4);
2105 __builtin_prefetch(vab+4);
2106 __builtin_prefetch(va+6);
2107 __builtin_prefetch(vb+6);
2108 __builtin_prefetch(vab+6);
2109 #endif
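    /* As in the scaled variant above: for real transforms, lane 0 of the
     * first two vectors carries the purely real DC and Nyquist bins, saved
     * here and accumulated separately below.
     */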
2111 const float ar1{vextract0(va[0])};
2112 const float ai1{vextract0(va[1])};
2113 const float br1{vextract0(vb[0])};
2114 const float bi1{vextract0(vb[1])};
2115 const float abr1{vextract0(vab[0])};
2116 const float abi1{vextract0(vab[1])};
2118 /* No inline assembly for this version. I'm not familiar enough with NEON
2119 * assembly, and I don't know that it's needed with today's optimizers.
2121 for(size_t i{0};i < Ncvec;i += 2)
2123 v4sf ar4{va[2*i+0]}, ai4{va[2*i+1]};
2124 v4sf br4{vb[2*i+0]}, bi4{vb[2*i+1]};
2125 vcplxmul(ar4, ai4, br4, bi4);
2126 vab[2*i+0] = vadd(ar4, vab[2*i+0]);
2127 vab[2*i+1] = vadd(ai4, vab[2*i+1]);
2128 ar4 = va[2*i+2]; ai4 = va[2*i+3];
2129 br4 = vb[2*i+2]; bi4 = vb[2*i+3];
2130 vcplxmul(ar4, ai4, br4, bi4);
2131 vab[2*i+2] = vadd(ar4, vab[2*i+2]);
2132 vab[2*i+3] = vadd(ai4, vab[2*i+3]);
2135 if(s->transform == PFFFT_REAL)
2137 vab[0] = vinsert0(vab[0], abr1 + ar1*br1);
2138 vab[1] = vinsert0(vab[1], abi1 + ai1*bi1);
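/* pffft_transform leaves its result in the internal z-order (the layout the
 * pffft_zconvolve_* functions above expect), while pffft_transform_ordered
 * additionally reorders the result into the canonical interleaved layout at
 * some extra cost. Input, output, and work buffers must all be 16-byte
 * aligned (checked by the valigned asserts).
 */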
2143 void pffft_transform(const PFFFT_Setup *setup, const float *input, float *output, float *work,
2144 pffft_direction_t direction)
2146 assert(valigned(input) && valigned(output) && valigned(work));
2147 pffft_transform_internal(setup, reinterpret_cast<const v4sf*>(al::assume_aligned<16>(input)),
2148 reinterpret_cast<v4sf*>(al::assume_aligned<16>(output)),
2149 reinterpret_cast<v4sf*>(al::assume_aligned<16>(work)), direction, false);
2152 void pffft_transform_ordered(const PFFFT_Setup *setup, const float *input, float *output,
2153 float *work, pffft_direction_t direction)
2155 assert(valigned(input) && valigned(output) && valigned(work));
2156 pffft_transform_internal(setup, reinterpret_cast<const v4sf*>(al::assume_aligned<16>(input)),
2157 reinterpret_cast<v4sf*>(al::assume_aligned<16>(output)),
2158 reinterpret_cast<v4sf*>(al::assume_aligned<16>(work)), direction, true);
2161 #else // defined(PFFFT_SIMD_DISABLE)
2163 // Standard routines using scalar floats, without any SIMD.
2165 namespace {
2167 void pffft_transform_internal(const PFFFT_Setup *setup, const float *input, float *output,
2168 float *scratch, const pffft_direction_t direction, bool ordered)
2170 const size_t Ncvec{setup->Ncvec};
2171 const bool nf_odd{(setup->ifac[1]&1) != 0};
2173 assert(scratch != nullptr);
2175 /* z-domain data for complex transforms is already ordered without SIMD. */
2176 if(setup->transform == PFFFT_COMPLEX)
2177 ordered = false;
2179 float *buff[2]{output, scratch};
2180 bool ib{nf_odd != ordered};
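    /* Same double-buffer ping-pong as the SIMD path: buff[0] is the caller's
     * output, buff[1] the scratch buffer, and ib tracks which buffer
     * currently holds the data (the fft kernels return the buffer they wrote
     * last).
     */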
2181 if(direction == PFFFT_FORWARD)
2183 if(setup->transform == PFFFT_REAL)
2184 ib = (rfftf1_ps(Ncvec*2, input, buff[ib], buff[!ib], setup->twiddle, setup->ifac) == buff[1]);
2185 else
2186 ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], setup->twiddle, setup->ifac, -1.0f) == buff[1]);
2187 if(ordered)
2189 pffft_zreorder(setup, buff[ib], buff[!ib], PFFFT_FORWARD);
2190 ib = !ib;
2193 else
2195         if(input == buff[ib])
2196 ib = !ib; // may happen when finput == foutput
2198 if(ordered)
2200 pffft_zreorder(setup, input, buff[ib], PFFFT_BACKWARD);
2201 input = buff[ib];
2202 ib = !ib;
2204 if(setup->transform == PFFFT_REAL)
2205 ib = (rfftb1_ps(Ncvec*2, input, buff[ib], buff[!ib], setup->twiddle, setup->ifac) == buff[1]);
2206 else
2207 ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], setup->twiddle, setup->ifac, +1.0f) == buff[1]);
2209 if(buff[ib] != output)
2211         // extra copy required -- this situation should only happen when finput == foutput
2212 assert(input==output);
2213 for(size_t k{0};k < Ncvec;++k)
2215 float a{buff[ib][2*k]}, b{buff[ib][2*k+1]};
2216 output[2*k] = a; output[2*k+1] = b;
2221 } // namespace
2223 void pffft_zreorder(const PFFFT_Setup *setup, const float *in, float *RESTRICT out,
2224 pffft_direction_t direction)
2226 const size_t N{setup->N};
2227 if(setup->transform == PFFFT_COMPLEX)
2229 for(size_t k{0};k < 2*N;++k)
2230 out[k] = in[k];
2231 return;
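    /* For real transforms, fftpack packs the spectrum as
     *   [X(0).re, X(1).re, X(1).im, ..., X(N/2-1).im, X(N/2).re],
     * while the "ordered" layout puts the two purely real bins first:
     *   [X(0).re, X(N/2).re, X(1).re, X(1).im, ...].
     * The forward direction rotates the trailing Nyquist term into slot 1;
     * the backward direction undoes that rotation.
     */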
2233 else if(direction == PFFFT_FORWARD)
2235 float x_N{in[N-1]};
2236 for(size_t k{N-1};k > 1;--k)
2237 out[k] = in[k-1];
2238 out[0] = in[0];
2239 out[1] = x_N;
2241 else
2243 float x_N{in[1]};
2244 for(size_t k{1};k < N-1;++k)
2245 out[k] = in[k+1];
2246 out[0] = in[0];
2247 out[N-1] = x_N;
2251 void pffft_zconvolve_scale_accumulate(const PFFFT_Setup *s, const float *a, const float *b,
2252 float *ab, float scaling)
2254 size_t Ncvec{s->Ncvec};
2256 if(s->transform == PFFFT_REAL)
2258 // take care of the fftpack ordering
2259 ab[0] += a[0]*b[0]*scaling;
2260 ab[2*Ncvec-1] += a[2*Ncvec-1]*b[2*Ncvec-1]*scaling;
2261 ++ab; ++a; ++b; --Ncvec;
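        /* After bumping the pointers, the remaining data is a plain array of
         * interleaved complex pairs, so each output is a scaled complex
         * multiply-accumulate:
         *   (ar + i*ai)*(br + i*bi) = (ar*br - ai*bi) + i*(ar*bi + ai*br),
         * which is what vcplxmul computes in place below.
         */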
2263 for(size_t i{0};i < Ncvec;++i)
2265 float ar{a[2*i+0]}, ai{a[2*i+1]};
2266 const float br{b[2*i+0]}, bi{b[2*i+1]};
2267 vcplxmul(ar, ai, br, bi);
2268 ab[2*i+0] += ar*scaling;
2269 ab[2*i+1] += ai*scaling;
2273 void pffft_zconvolve_accumulate(const PFFFT_Setup *s, const float *a, const float *b, float *ab)
2275 size_t Ncvec{s->Ncvec};
2277 if(s->transform == PFFFT_REAL)
2279 // take care of the fftpack ordering
2280 ab[0] += a[0]*b[0];
2281 ab[2*Ncvec-1] += a[2*Ncvec-1]*b[2*Ncvec-1];
2282 ++ab; ++a; ++b; --Ncvec;
2284 for(size_t i{0};i < Ncvec;++i)
2286 float ar{a[2*i+0]}, ai{a[2*i+1]};
2287 const float br{b[2*i+0]}, bi{b[2*i+1]};
2288 vcplxmul(ar, ai, br, bi);
2289 ab[2*i+0] += ar;
2290 ab[2*i+1] += ai;
2295 void pffft_transform(const PFFFT_Setup *setup, const float *input, float *output, float *work,
2296 pffft_direction_t direction)
2298 pffft_transform_internal(setup, input, output, work, direction, false);
2301 void pffft_transform_ordered(const PFFFT_Setup *setup, const float *input, float *output,
2302 float *work, pffft_direction_t direction)
2304 pffft_transform_internal(setup, input, output, work, direction, true);
2307 #endif /* defined(PFFFT_SIMD_DISABLE) */
2308 /* NOLINTEND(cppcoreguidelines-pro-bounds-pointer-arithmetic) */