1 //$ nobt
3 /* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
4 * Copyright (c) 2023 Christopher Robinson
6 * Based on original fortran 77 code from FFTPACKv4 from NETLIB
7 * (http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber
8 * of NCAR, in 1985.
10 * As confirmed by the NCAR fftpack software curators, the following
11 * FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
12 * released under the same terms.
14 * FFTPACK license:
16 * http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
18 * Copyright (c) 2004 the University Corporation for Atmospheric
19 * Research ("UCAR"). All rights reserved. Developed by NCAR's
20 * Computational and Information Systems Laboratory, UCAR,
21 * www.cisl.ucar.edu.
23 * Redistribution and use of the Software in source and binary forms,
24 * with or without modification, is permitted provided that the
25 * following conditions are met:
27 * - Neither the names of NCAR's Computational and Information Systems
28 * Laboratory, the University Corporation for Atmospheric Research,
29 * nor the names of its sponsors or contributors may be used to
30 * endorse or promote products derived from this Software without
31 * specific prior written permission.
33 * - Redistributions of source code must retain the above copyright
34 * notices, this list of conditions, and the disclaimer below.
36 * - Redistributions in binary form must reproduce the above copyright
37 * notice, this list of conditions, and the disclaimer below in the
38 * documentation and/or other materials provided with the
39 * distribution.
41 * THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
42 * EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
43 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
44 * NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
45 * HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
46 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
47 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
48 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
49 * SOFTWARE.
52 * PFFFT : a Pretty Fast FFT.
54 * This file is largely based on the original FFTPACK implementation, modified
55 * in order to take advantage of SIMD instructions of modern CPUs.
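 *
 * Typical use of the public API (a rough sketch only: the entry points are
 * declared in pffft.h, pffft_transform_ordered is assumed to keep the
 * upstream PFFFT signature, and PFFFTSetupPtr is assumed to behave like a
 * std::unique_ptr over the setup object):
 *
 *   PFFFTSetupPtr setup{pffft_new_setup(1024, PFFFT_REAL)};
 *   pffft_transform_ordered(setup.get(), input, output, work, PFFFT_FORWARD);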
58 #include "pffft.h"
60 #include <algorithm>
61 #include <array>
62 #include <cassert>
63 #include <cmath>
64 #include <cstdint>
65 #include <cstdio>
66 #include <cstring>
67 #include <new>
68 #include <utility>
69 #include <vector>
71 #include "albit.h"
72 #include "almalloc.h"
73 #include "alnumbers.h"
74 #include "alnumeric.h"
75 #include "alspan.h"
76 #include "opthelpers.h"
79 using uint = unsigned int;
81 namespace {
83 #if defined(__GNUC__) || defined(_MSC_VER)
84 #define RESTRICT __restrict
85 #else
86 #define RESTRICT
87 #endif
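/* RESTRICT expands to the compiler's non-aliasing hint where available; the
 * FFT passes below apply it to their pointer parameters as an optimization
 * hint that input and output buffers do not overlap.
 */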
89 /* Vector support macros: the rest of the code is independent of
90 * SSE/Altivec/NEON -- adding support for other platforms with 4-element
91 * vectors should be limited to these macros
94 /* Define PFFFT_SIMD_DISABLE if you want to use scalar code instead of SIMD code */
95 //#define PFFFT_SIMD_DISABLE
97 #ifndef PFFFT_SIMD_DISABLE
99 * Altivec support macros
101 #if defined(__ppc__) || defined(__ppc64__) || defined(__powerpc__) || defined(__powerpc64__)
102 #include <altivec.h>
103 using v4sf = vector float;
104 constexpr uint SimdSize{4};
105 force_inline v4sf vzero() noexcept { return (vector float)vec_splat_u8(0); }
106 force_inline v4sf vmul(v4sf a, v4sf b) noexcept { return vec_madd(a, b, vzero()); }
107 force_inline v4sf vadd(v4sf a, v4sf b) noexcept { return vec_add(a, b); }
108 force_inline v4sf vmadd(v4sf a, v4sf b, v4sf c) noexcept { return vec_madd(a, b, c); }
109 force_inline v4sf vsub(v4sf a, v4sf b) noexcept { return vec_sub(a, b); }
110 force_inline v4sf ld_ps1(float a) noexcept { return vec_splats(a); }
112 force_inline v4sf vset4(float a, float b, float c, float d) noexcept
114 /* Is there a more efficient way to do this? */
115 alignas(16) std::array<float,4> vals{{a, b, c, d}};
116 return vec_ld(0, vals.data());
118 force_inline v4sf vinsert0(v4sf v, float a) noexcept { return vec_insert(a, v, 0); }
119 force_inline float vextract0(v4sf v) noexcept { return vec_extract(v, 0); }
121 force_inline void interleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
123 out1 = vec_mergeh(in1, in2);
124 out2 = vec_mergel(in1, in2);
126 force_inline void uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
128 out1 = vec_perm(in1, in2, (vector unsigned char){0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27});
129 out2 = vec_perm(in1, in2, (vector unsigned char){4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31});
132 force_inline void vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept
134 v4sf y0{vec_mergeh(x0, x2)};
135 v4sf y1{vec_mergel(x0, x2)};
136 v4sf y2{vec_mergeh(x1, x3)};
137 v4sf y3{vec_mergel(x1, x3)};
138 x0 = vec_mergeh(y0, y2);
139 x1 = vec_mergel(y0, y2);
140 x2 = vec_mergeh(y1, y3);
141 x3 = vec_mergel(y1, y3);
144 force_inline v4sf vswaphl(v4sf a, v4sf b) noexcept
145 { return vec_perm(a,b, (vector unsigned char){16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15}); }
148 * SSE1 support macros
150 #elif defined(__x86_64__) || defined(__SSE__) || defined(_M_X64) || \
151 (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
153 #include <xmmintrin.h>
154 using v4sf = __m128;
155 /* 4 floats per SIMD vector -- this is pretty much hardcoded in the preprocess/
156 * finalize functions anyway, so you will have to do some work if you want to
157 * enable AVX with its 256-bit vectors.
159 constexpr uint SimdSize{4};
160 force_inline v4sf vzero() noexcept { return _mm_setzero_ps(); }
161 force_inline v4sf vmul(v4sf a, v4sf b) noexcept { return _mm_mul_ps(a, b); }
162 force_inline v4sf vadd(v4sf a, v4sf b) noexcept { return _mm_add_ps(a, b); }
163 force_inline v4sf vmadd(v4sf a, v4sf b, v4sf c) noexcept { return _mm_add_ps(_mm_mul_ps(a,b), c); }
164 force_inline v4sf vsub(v4sf a, v4sf b) noexcept { return _mm_sub_ps(a, b); }
165 force_inline v4sf ld_ps1(float a) noexcept { return _mm_set1_ps(a); }
167 force_inline v4sf vset4(float a, float b, float c, float d) noexcept
168 { return _mm_setr_ps(a, b, c, d); }
169 force_inline v4sf vinsert0(const v4sf v, const float a) noexcept
170 { return _mm_move_ss(v, _mm_set_ss(a)); }
171 force_inline float vextract0(v4sf v) noexcept
172 { return _mm_cvtss_f32(v); }
174 force_inline void interleave2(const v4sf in1, const v4sf in2, v4sf &out1, v4sf &out2) noexcept
176 out1 = _mm_unpacklo_ps(in1, in2);
177 out2 = _mm_unpackhi_ps(in1, in2);
179 force_inline void uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
181 out1 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0));
182 out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1));
185 force_inline void vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept
186 { _MM_TRANSPOSE4_PS(x0, x1, x2, x3); }
188 force_inline v4sf vswaphl(v4sf a, v4sf b) noexcept
189 { return _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0)); }
192 * ARM NEON support macros
194 #elif defined(__ARM_NEON) || defined(__aarch64__) || defined(__arm64) || defined(_M_ARM64)
196 #include <arm_neon.h>
197 using v4sf = float32x4_t;
198 constexpr uint SimdSize{4};
199 force_inline v4sf vzero() noexcept { return vdupq_n_f32(0.0f); }
200 force_inline v4sf vmul(v4sf a, v4sf b) noexcept { return vmulq_f32(a, b); }
201 force_inline v4sf vadd(v4sf a, v4sf b) noexcept { return vaddq_f32(a, b); }
202 force_inline v4sf vmadd(v4sf a, v4sf b, v4sf c) noexcept { return vmlaq_f32(c, a, b); }
203 force_inline v4sf vsub(v4sf a, v4sf b) noexcept { return vsubq_f32(a, b); }
204 force_inline v4sf ld_ps1(float a) noexcept { return vdupq_n_f32(a); }
206 force_inline v4sf vset4(float a, float b, float c, float d) noexcept
208 float32x4_t ret{vmovq_n_f32(a)};
209 ret = vsetq_lane_f32(b, ret, 1);
210 ret = vsetq_lane_f32(c, ret, 2);
211 ret = vsetq_lane_f32(d, ret, 3);
212 return ret;
214 force_inline v4sf vinsert0(v4sf v, float a) noexcept
215 { return vsetq_lane_f32(a, v, 0); }
216 force_inline float vextract0(v4sf v) noexcept
217 { return vgetq_lane_f32(v, 0); }
219 force_inline void interleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
221 float32x4x2_t tmp{vzipq_f32(in1, in2)};
222 out1 = tmp.val[0];
223 out2 = tmp.val[1];
225 force_inline void uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
227 float32x4x2_t tmp{vuzpq_f32(in1, in2)};
228 out1 = tmp.val[0];
229 out2 = tmp.val[1];
232 force_inline void vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept
234 /* marginally faster version:
235 * asm("vtrn.32 %q0, %q1;\n"
236 * "vtrn.32 %q2, %q3\n"
237 * "vswp %f0, %e2\n"
238 * "vswp %f1, %e3"
239 * : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::);
241 float32x4x2_t t0_{vzipq_f32(x0, x2)};
242 float32x4x2_t t1_{vzipq_f32(x1, x3)};
243 float32x4x2_t u0_{vzipq_f32(t0_.val[0], t1_.val[0])};
244 float32x4x2_t u1_{vzipq_f32(t0_.val[1], t1_.val[1])};
245 x0 = u0_.val[0];
246 x1 = u0_.val[1];
247 x2 = u1_.val[0];
248 x3 = u1_.val[1];
251 force_inline v4sf vswaphl(v4sf a, v4sf b) noexcept
252 { return vcombine_f32(vget_low_f32(b), vget_high_f32(a)); }
255 * Generic GCC vector macros
257 #elif defined(__GNUC__)
259 using v4sf [[gnu::vector_size(16), gnu::aligned(16)]] = float;
260 constexpr uint SimdSize{4};
261 force_inline constexpr v4sf vzero() noexcept { return v4sf{0.0f, 0.0f, 0.0f, 0.0f}; }
262 force_inline constexpr v4sf vmul(v4sf a, v4sf b) noexcept { return a * b; }
263 force_inline constexpr v4sf vadd(v4sf a, v4sf b) noexcept { return a + b; }
264 force_inline constexpr v4sf vmadd(v4sf a, v4sf b, v4sf c) noexcept { return a*b + c; }
265 force_inline constexpr v4sf vsub(v4sf a, v4sf b) noexcept { return a - b; }
266 force_inline constexpr v4sf ld_ps1(float a) noexcept { return v4sf{a, a, a, a}; }
268 force_inline constexpr v4sf vset4(float a, float b, float c, float d) noexcept
269 { return v4sf{a, b, c, d}; }
270 force_inline constexpr v4sf vinsert0(v4sf v, float a) noexcept
271 { return v4sf{a, v[1], v[2], v[3]}; }
272 force_inline float vextract0(v4sf v) noexcept
273 { return v[0]; }
275 force_inline v4sf unpacklo(v4sf a, v4sf b) noexcept
276 { return v4sf{a[0], b[0], a[1], b[1]}; }
277 force_inline v4sf unpackhi(v4sf a, v4sf b) noexcept
278 { return v4sf{a[2], b[2], a[3], b[3]}; }
280 force_inline void interleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
282 out1 = unpacklo(in1, in2);
283 out2 = unpackhi(in1, in2);
285 force_inline void uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
287 out1 = v4sf{in1[0], in1[2], in2[0], in2[2]};
288 out2 = v4sf{in1[1], in1[3], in2[1], in2[3]};
291 force_inline void vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept
293 v4sf tmp0{unpacklo(x0, x1)};
294 v4sf tmp2{unpacklo(x2, x3)};
295 v4sf tmp1{unpackhi(x0, x1)};
296 v4sf tmp3{unpackhi(x2, x3)};
297 x0 = v4sf{tmp0[0], tmp0[1], tmp2[0], tmp2[1]};
298 x1 = v4sf{tmp0[2], tmp0[3], tmp2[2], tmp2[3]};
299 x2 = v4sf{tmp1[0], tmp1[1], tmp3[0], tmp3[1]};
300 x3 = v4sf{tmp1[2], tmp1[3], tmp3[2], tmp3[3]};
303 force_inline v4sf vswaphl(v4sf a, v4sf b) noexcept
304 { return v4sf{b[0], b[1], a[2], a[3]}; }
306 #else
308 #warning "building with simd disabled !\n";
309 #define PFFFT_SIMD_DISABLE // fallback to scalar code
310 #endif
312 #endif /* PFFFT_SIMD_DISABLE */
314 // fallback mode for situations where SIMD is not available, use scalar mode instead
315 #ifdef PFFFT_SIMD_DISABLE
316 using v4sf = float;
317 constexpr uint SimdSize{1};
318 force_inline constexpr v4sf vmul(v4sf a, v4sf b) noexcept { return a * b; }
319 force_inline constexpr v4sf vadd(v4sf a, v4sf b) noexcept { return a + b; }
320 force_inline constexpr v4sf vmadd(v4sf a, v4sf b, v4sf c) noexcept { return a*b + c; }
321 force_inline constexpr v4sf vsub(v4sf a, v4sf b) noexcept { return a - b; }
322 force_inline constexpr v4sf ld_ps1(float a) noexcept { return a; }
324 #else
326 [[maybe_unused]] inline
327 auto valigned(const float *ptr) noexcept -> bool
329 static constexpr uintptr_t alignmask{SimdSize*sizeof(float) - 1};
330 return (reinterpret_cast<uintptr_t>(ptr) & alignmask) == 0;
332 #endif
334 // shortcuts for complex multiplications
335 force_inline void vcplxmul(v4sf &ar, v4sf &ai, v4sf br, v4sf bi) noexcept
337 v4sf tmp{vmul(ar, bi)};
338 ar = vsub(vmul(ar, br), vmul(ai, bi));
339 ai = vmadd(ai, br, tmp);
341 force_inline void vcplxmulconj(v4sf &ar, v4sf &ai, v4sf br, v4sf bi) noexcept
343 v4sf tmp{vmul(ar, bi)};
344 ar = vmadd(ai, bi, vmul(ar, br));
345 ai = vsub(vmul(ai, br), tmp);
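/* Spelled out, these are the usual complex products applied per SIMD lane,
 * with the right-hand sides using the original value of ar:
 *   vcplxmul:     (ar,ai) *= (br,bi)  =>  ar = ar*br - ai*bi, ai = ai*br + ar*bi
 *   vcplxmulconj: (ar,ai) *= (br,-bi) =>  ar = ar*br + ai*bi, ai = ai*br - ar*bi
 */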
348 #if !defined(PFFFT_SIMD_DISABLE)
350 inline void assertv4(const al::span<const float,4> v_f [[maybe_unused]],
351 const float f0 [[maybe_unused]], const float f1 [[maybe_unused]],
352 const float f2 [[maybe_unused]], const float f3 [[maybe_unused]])
353 { assert(v_f[0] == f0 && v_f[1] == f1 && v_f[2] == f2 && v_f[3] == f3); }
355 template<typename T, T ...N>
356 constexpr auto make_float_array(std::integer_sequence<T,N...>)
357 { return std::array{static_cast<float>(N)...}; }
359 /* detect bugs with the vector support macros */
360 [[maybe_unused]] void validate_pffft_simd()
362 using float4 = std::array<float,4>;
363 static constexpr auto f = make_float_array(std::make_index_sequence<16>{});
365 v4sf a0_v{vset4(f[ 0], f[ 1], f[ 2], f[ 3])};
366 v4sf a1_v{vset4(f[ 4], f[ 5], f[ 6], f[ 7])};
367 v4sf a2_v{vset4(f[ 8], f[ 9], f[10], f[11])};
368 v4sf a3_v{vset4(f[12], f[13], f[14], f[15])};
369 v4sf u_v{};
371 auto t_v = vzero();
372 auto t_f = al::bit_cast<float4>(t_v);
373 printf("VZERO=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]);
374 assertv4(t_f, 0, 0, 0, 0);
376 t_v = vadd(a1_v, a2_v);
377 t_f = al::bit_cast<float4>(t_v);
378 printf("VADD(4:7,8:11)=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]);
379 assertv4(t_f, 12, 14, 16, 18);
381 t_v = vmul(a1_v, a2_v);
382 t_f = al::bit_cast<float4>(t_v);
383 printf("VMUL(4:7,8:11)=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]);
384 assertv4(t_f, 32, 45, 60, 77);
386 t_v = vmadd(a1_v, a2_v, a0_v);
387 t_f = al::bit_cast<float4>(t_v);
388 printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]);
389 assertv4(t_f, 32, 46, 62, 80);
391 interleave2(a1_v, a2_v, t_v, u_v);
392 t_f = al::bit_cast<float4>(t_v);
393 auto u_f = al::bit_cast<float4>(u_v);
394 printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n",
395 t_f[0], t_f[1], t_f[2], t_f[3], u_f[0], u_f[1], u_f[2], u_f[3]);
396 assertv4(t_f, 4, 8, 5, 9);
397 assertv4(u_f, 6, 10, 7, 11);
399 uninterleave2(a1_v, a2_v, t_v, u_v);
400 t_f = al::bit_cast<float4>(t_v);
401 u_f = al::bit_cast<float4>(u_v);
402 printf("UNINTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n",
403 t_f[0], t_f[1], t_f[2], t_f[3], u_f[0], u_f[1], u_f[2], u_f[3]);
404 assertv4(t_f, 4, 6, 8, 10);
405 assertv4(u_f, 5, 7, 9, 11);
407 t_v = ld_ps1(f[15]);
408 t_f = al::bit_cast<float4>(t_v);
409 printf("LD_PS1(15)=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]);
410 assertv4(t_f, 15, 15, 15, 15);
412 t_v = vswaphl(a1_v, a2_v);
413 t_f = al::bit_cast<float4>(t_v);
414 printf("VSWAPHL(4:7,8:11)=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]);
415 assertv4(t_f, 8, 9, 6, 7);
417 vtranspose4(a0_v, a1_v, a2_v, a3_v);
418 auto a0_f = al::bit_cast<float4>(a0_v);
419 auto a1_f = al::bit_cast<float4>(a1_v);
420 auto a2_f = al::bit_cast<float4>(a2_v);
421 auto a3_f = al::bit_cast<float4>(a3_v);
422 printf("VTRANSPOSE4(0:3,4:7,8:11,12:15)=[%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g]\n",
423 a0_f[0], a0_f[1], a0_f[2], a0_f[3], a1_f[0], a1_f[1], a1_f[2], a1_f[3],
424 a2_f[0], a2_f[1], a2_f[2], a2_f[3], a3_f[0], a3_f[1], a3_f[2], a3_f[3]);
425 assertv4(a0_f, 0, 4, 8, 12);
426 assertv4(a1_f, 1, 5, 9, 13);
427 assertv4(a2_f, 2, 6, 10, 14);
428 assertv4(a3_f, 3, 7, 11, 15);
430 #endif //!PFFFT_SIMD_DISABLE
432 /* SSE and co like 16-byte aligned pointers */
433 /* with a 64-byte alignment, we are even aligned on L2 cache lines... */
434 constexpr auto V4sfAlignment = size_t(64);
435 constexpr auto V4sfAlignVal = std::align_val_t(V4sfAlignment);
437 /* NOLINTBEGIN(cppcoreguidelines-pro-bounds-pointer-arithmetic)
438 * FIXME: Converting this from raw pointers to spans or something will probably
439 * need significant work to maintain performance, given non-sequential range-
440 * checked accesses and lack of 'restrict' to indicate non-aliased memory. At
441 * least, some tests should be done to check the impact of using range-checked
442 * spans here before blindly switching.
445 passf2 and passb2 have been merged here, fsign = -1 for passf2, +1 for passb2
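/* In radix-2 terms, each step below is the butterfly out0 = a + b and
 * out1 = (a - b)*w on interleaved (re,im) pairs, where w is the twiddle
 * factor from wa1 with its imaginary part scaled by fsign (so the forward
 * transform uses the conjugate); the ido <= 2 branch skips the twiddle
 * multiply entirely.
 */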
447 NOINLINE void passf2_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *RESTRICT ch,
448 const float *const wa1, const float fsign)
450 const size_t l1ido{l1*ido};
451 if(ido <= 2)
453 for(size_t k{0};k < l1ido;k += ido, ch += ido, cc += 2*ido)
455 ch[0] = vadd(cc[0], cc[ido+0]);
456 ch[l1ido] = vsub(cc[0], cc[ido+0]);
457 ch[1] = vadd(cc[1], cc[ido+1]);
458 ch[l1ido + 1] = vsub(cc[1], cc[ido+1]);
461 else
463 for(size_t k{0};k < l1ido;k += ido, ch += ido, cc += 2*ido)
465 for(size_t i{0};i < ido-1;i += 2)
467 v4sf tr2{vsub(cc[i+0], cc[i+ido+0])};
468 v4sf ti2{vsub(cc[i+1], cc[i+ido+1])};
469 v4sf wr{ld_ps1(wa1[i])}, wi{ld_ps1(wa1[i+1]*fsign)};
470 ch[i] = vadd(cc[i+0], cc[i+ido+0]);
471 ch[i+1] = vadd(cc[i+1], cc[i+ido+1]);
472 vcplxmul(tr2, ti2, wr, wi);
473 ch[i+l1ido] = tr2;
474 ch[i+l1ido+1] = ti2;
481 passf3 and passb3 have been merged here, fsign = -1 for passf3, +1 for passb3
483 NOINLINE void passf3_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *RESTRICT ch,
484 const float *const wa1, const float fsign)
486 assert(ido > 2);
488 const v4sf taur{ld_ps1(-0.5f)};
489 const v4sf taui{ld_ps1(0.866025403784439f*fsign)};
490 const size_t l1ido{l1*ido};
491 const auto wa2 = wa1 + ido;
492 for(size_t k{0};k < l1ido;k += ido, cc += 3*ido, ch +=ido)
494 for(size_t i{0};i < ido-1;i += 2)
496 v4sf tr2{vadd(cc[i+ido], cc[i+2*ido])};
497 v4sf cr2{vmadd(taur, tr2, cc[i])};
498 ch[i] = vadd(tr2, cc[i]);
499 v4sf ti2{vadd(cc[i+ido+1], cc[i+2*ido+1])};
500 v4sf ci2{vmadd(taur, ti2, cc[i+1])};
501 ch[i+1] = vadd(cc[i+1], ti2);
502 v4sf cr3{vmul(taui, vsub(cc[i+ido], cc[i+2*ido]))};
503 v4sf ci3{vmul(taui, vsub(cc[i+ido+1], cc[i+2*ido+1]))};
504 v4sf dr2{vsub(cr2, ci3)};
505 v4sf dr3{vadd(cr2, ci3)};
506 v4sf di2{vadd(ci2, cr3)};
507 v4sf di3{vsub(ci2, cr3)};
508 float wr1{wa1[i]}, wi1{fsign*wa1[i+1]}, wr2{wa2[i]}, wi2{fsign*wa2[i+1]};
509 vcplxmul(dr2, di2, ld_ps1(wr1), ld_ps1(wi1));
510 ch[i+l1ido] = dr2;
511 ch[i+l1ido + 1] = di2;
512 vcplxmul(dr3, di3, ld_ps1(wr2), ld_ps1(wi2));
513 ch[i+2*l1ido] = dr3;
514 ch[i+2*l1ido+1] = di3;
517 } /* passf3 */
519 NOINLINE void passf4_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *RESTRICT ch,
520 const float *const wa1, const float fsign)
522 /* fsign == -1 for forward transform and +1 for backward transform */
523 const v4sf vsign{ld_ps1(fsign)};
524 const size_t l1ido{l1*ido};
525 if(ido == 2)
527 for(size_t k{0};k < l1ido;k += ido, ch += ido, cc += 4*ido)
529 v4sf tr1{vsub(cc[0], cc[2*ido + 0])};
530 v4sf tr2{vadd(cc[0], cc[2*ido + 0])};
531 v4sf ti1{vsub(cc[1], cc[2*ido + 1])};
532 v4sf ti2{vadd(cc[1], cc[2*ido + 1])};
533 v4sf ti4{vmul(vsub(cc[1*ido + 0], cc[3*ido + 0]), vsign)};
534 v4sf tr4{vmul(vsub(cc[3*ido + 1], cc[1*ido + 1]), vsign)};
535 v4sf tr3{vadd(cc[ido + 0], cc[3*ido + 0])};
536 v4sf ti3{vadd(cc[ido + 1], cc[3*ido + 1])};
538 ch[0*l1ido + 0] = vadd(tr2, tr3);
539 ch[0*l1ido + 1] = vadd(ti2, ti3);
540 ch[1*l1ido + 0] = vadd(tr1, tr4);
541 ch[1*l1ido + 1] = vadd(ti1, ti4);
542 ch[2*l1ido + 0] = vsub(tr2, tr3);
543 ch[2*l1ido + 1] = vsub(ti2, ti3);
544 ch[3*l1ido + 0] = vsub(tr1, tr4);
545 ch[3*l1ido + 1] = vsub(ti1, ti4);
548 else
550 const auto wa2 = wa1 + ido;
551 const auto wa3 = wa2 + ido;
552 for(size_t k{0};k < l1ido;k += ido, ch+=ido, cc += 4*ido)
554 for(size_t i{0};i < ido-1;i+=2)
556 v4sf tr1{vsub(cc[i + 0], cc[i + 2*ido + 0])};
557 v4sf tr2{vadd(cc[i + 0], cc[i + 2*ido + 0])};
558 v4sf ti1{vsub(cc[i + 1], cc[i + 2*ido + 1])};
559 v4sf ti2{vadd(cc[i + 1], cc[i + 2*ido + 1])};
560 v4sf tr4{vmul(vsub(cc[i + 3*ido + 1], cc[i + 1*ido + 1]), vsign)};
561 v4sf ti4{vmul(vsub(cc[i + 1*ido + 0], cc[i + 3*ido + 0]), vsign)};
562 v4sf tr3{vadd(cc[i + ido + 0], cc[i + 3*ido + 0])};
563 v4sf ti3{vadd(cc[i + ido + 1], cc[i + 3*ido + 1])};
565 ch[i] = vadd(tr2, tr3);
566 v4sf cr3{vsub(tr2, tr3)};
567 ch[i + 1] = vadd(ti2, ti3);
568 v4sf ci3{vsub(ti2, ti3)};
570 v4sf cr2{vadd(tr1, tr4)};
571 v4sf cr4{vsub(tr1, tr4)};
572 v4sf ci2{vadd(ti1, ti4)};
573 v4sf ci4{vsub(ti1, ti4)};
574 float wr1{wa1[i]}, wi1{fsign*wa1[i+1]};
575 vcplxmul(cr2, ci2, ld_ps1(wr1), ld_ps1(wi1));
576 float wr2{wa2[i]}, wi2{fsign*wa2[i+1]};
577 ch[i + l1ido] = cr2;
578 ch[i + l1ido + 1] = ci2;
580 vcplxmul(cr3, ci3, ld_ps1(wr2), ld_ps1(wi2));
581 float wr3{wa3[i]}, wi3{fsign*wa3[i+1]};
582 ch[i + 2*l1ido] = cr3;
583 ch[i + 2*l1ido + 1] = ci3;
585 vcplxmul(cr4, ci4, ld_ps1(wr3), ld_ps1(wi3));
586 ch[i + 3*l1ido] = cr4;
587 ch[i + 3*l1ido + 1] = ci4;
591 } /* passf4 */
594 * passf5 and passb5 have been merged here, fsign = -1 for passf5, +1 for passb5
596 NOINLINE void passf5_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *RESTRICT ch,
597 const float *const wa1, const float fsign)
599 const v4sf tr11{ld_ps1(0.309016994374947f)};
600 const v4sf tr12{ld_ps1(-0.809016994374947f)};
601 const v4sf ti11{ld_ps1(0.951056516295154f*fsign)};
602 const v4sf ti12{ld_ps1(0.587785252292473f*fsign)};
604 auto cc_ref = [&cc,ido](size_t a_1, size_t a_2) noexcept -> auto&
605 { return cc[(a_2-1)*ido + a_1 + 1]; };
606 auto ch_ref = [&ch,ido,l1](size_t a_1, size_t a_3) noexcept -> auto&
607 { return ch[(a_3-1)*l1*ido + a_1 + 1]; };
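/* These helpers reproduce the Fortran-style two-dimensional indexing of the
 * original FFTPACK code (leading dimension ido, 1-based second index), so the
 * loop below can stay close to the reference implementation.
 */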
609 assert(ido > 2);
611 const auto wa2 = wa1 + ido;
612 const auto wa3 = wa2 + ido;
613 const auto wa4 = wa3 + ido;
614 for(size_t k{0};k < l1;++k, cc += 5*ido, ch += ido)
616 for(size_t i{0};i < ido-1;i += 2)
618 v4sf ti5{vsub(cc_ref(i , 2), cc_ref(i , 5))};
619 v4sf ti2{vadd(cc_ref(i , 2), cc_ref(i , 5))};
620 v4sf ti4{vsub(cc_ref(i , 3), cc_ref(i , 4))};
621 v4sf ti3{vadd(cc_ref(i , 3), cc_ref(i , 4))};
622 v4sf tr5{vsub(cc_ref(i-1, 2), cc_ref(i-1, 5))};
623 v4sf tr2{vadd(cc_ref(i-1, 2), cc_ref(i-1, 5))};
624 v4sf tr4{vsub(cc_ref(i-1, 3), cc_ref(i-1, 4))};
625 v4sf tr3{vadd(cc_ref(i-1, 3), cc_ref(i-1, 4))};
626 ch_ref(i-1, 1) = vadd(cc_ref(i-1, 1), vadd(tr2, tr3));
627 ch_ref(i , 1) = vadd(cc_ref(i , 1), vadd(ti2, ti3));
628 v4sf cr2{vadd(cc_ref(i-1, 1), vmadd(tr11, tr2, vmul(tr12, tr3)))};
629 v4sf ci2{vadd(cc_ref(i , 1), vmadd(tr11, ti2, vmul(tr12, ti3)))};
630 v4sf cr3{vadd(cc_ref(i-1, 1), vmadd(tr12, tr2, vmul(tr11, tr3)))};
631 v4sf ci3{vadd(cc_ref(i , 1), vmadd(tr12, ti2, vmul(tr11, ti3)))};
632 v4sf cr5{vmadd(ti11, tr5, vmul(ti12, tr4))};
633 v4sf ci5{vmadd(ti11, ti5, vmul(ti12, ti4))};
634 v4sf cr4{vsub(vmul(ti12, tr5), vmul(ti11, tr4))};
635 v4sf ci4{vsub(vmul(ti12, ti5), vmul(ti11, ti4))};
636 v4sf dr3{vsub(cr3, ci4)};
637 v4sf dr4{vadd(cr3, ci4)};
638 v4sf di3{vadd(ci3, cr4)};
639 v4sf di4{vsub(ci3, cr4)};
640 v4sf dr5{vadd(cr2, ci5)};
641 v4sf dr2{vsub(cr2, ci5)};
642 v4sf di5{vsub(ci2, cr5)};
643 v4sf di2{vadd(ci2, cr5)};
644 float wr1{wa1[i]}, wi1{fsign*wa1[i+1]}, wr2{wa2[i]}, wi2{fsign*wa2[i+1]};
645 float wr3{wa3[i]}, wi3{fsign*wa3[i+1]}, wr4{wa4[i]}, wi4{fsign*wa4[i+1]};
646 vcplxmul(dr2, di2, ld_ps1(wr1), ld_ps1(wi1));
647 ch_ref(i - 1, 2) = dr2;
648 ch_ref(i, 2) = di2;
649 vcplxmul(dr3, di3, ld_ps1(wr2), ld_ps1(wi2));
650 ch_ref(i - 1, 3) = dr3;
651 ch_ref(i, 3) = di3;
652 vcplxmul(dr4, di4, ld_ps1(wr3), ld_ps1(wi3));
653 ch_ref(i - 1, 4) = dr4;
654 ch_ref(i, 4) = di4;
655 vcplxmul(dr5, di5, ld_ps1(wr4), ld_ps1(wi4));
656 ch_ref(i - 1, 5) = dr5;
657 ch_ref(i, 5) = di5;
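/* The radf*_ps/radb*_ps functions below are the FFTPACK RADFx/RADBx radix
 * passes for the real-input transform; they work on FFTPACK's packed
 * half-complex spectrum order (f0r f1r f1i ... f(n)r) rather than on fully
 * interleaved complex data.
 */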
662 NOINLINE void radf2_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc,
663 v4sf *RESTRICT ch, const float *const wa1)
665 const size_t l1ido{l1*ido};
666 for(size_t k{0};k < l1ido;k += ido)
668 v4sf a{cc[k]}, b{cc[k + l1ido]};
669 ch[2*k] = vadd(a, b);
670 ch[2*(k+ido)-1] = vsub(a, b);
672 if(ido < 2)
673 return;
674 if(ido != 2)
676 for(size_t k{0};k < l1ido;k += ido)
678 for(size_t i{2};i < ido;i += 2)
680 v4sf tr2{cc[i - 1 + k + l1ido]}, ti2{cc[i + k + l1ido]};
681 v4sf br{cc[i - 1 + k]}, bi{cc[i + k]};
682 vcplxmulconj(tr2, ti2, ld_ps1(wa1[i - 2]), ld_ps1(wa1[i - 1]));
683 ch[i + 2*k] = vadd(bi, ti2);
684 ch[2*(k+ido) - i] = vsub(ti2, bi);
685 ch[i - 1 + 2*k] = vadd(br, tr2);
686 ch[2*(k+ido) - i -1] = vsub(br, tr2);
689 if((ido&1) == 1)
690 return;
692 const v4sf minus_one{ld_ps1(-1.0f)};
693 for(size_t k{0};k < l1ido;k += ido)
695 ch[2*k + ido] = vmul(minus_one, cc[ido-1 + k + l1ido]);
696 ch[2*k + ido-1] = cc[k + ido-1];
698 } /* radf2 */
701 NOINLINE void radb2_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *RESTRICT ch,
702 const float *const wa1)
704 const size_t l1ido{l1*ido};
705 for(size_t k{0};k < l1ido;k += ido)
707 v4sf a{cc[2*k]};
708 v4sf b{cc[2*(k+ido) - 1]};
709 ch[k] = vadd(a, b);
710 ch[k + l1ido] = vsub(a, b);
712 if(ido < 2)
713 return;
714 if(ido != 2)
716 for(size_t k{0};k < l1ido;k += ido)
718 for(size_t i{2};i < ido;i += 2)
720 v4sf a{cc[i-1 + 2*k]};
721 v4sf b{cc[2*(k + ido) - i - 1]};
722 v4sf c{cc[i+0 + 2*k]};
723 v4sf d{cc[2*(k + ido) - i + 0]};
724 ch[i-1 + k] = vadd(a, b);
725 v4sf tr2{vsub(a, b)};
726 ch[i+0 + k] = vsub(c, d);
727 v4sf ti2{vadd(c, d)};
728 vcplxmul(tr2, ti2, ld_ps1(wa1[i - 2]), ld_ps1(wa1[i - 1]));
729 ch[i-1 + k + l1ido] = tr2;
730 ch[i+0 + k + l1ido] = ti2;
733 if((ido&1) == 1)
734 return;
736 const v4sf minus_two{ld_ps1(-2.0f)};
737 for(size_t k{0};k < l1ido;k += ido)
739 v4sf a{cc[2*k + ido-1]};
740 v4sf b{cc[2*k + ido]};
741 ch[k + ido-1] = vadd(a,a);
742 ch[k + ido-1 + l1ido] = vmul(minus_two, b);
744 } /* radb2 */
746 void radf3_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
747 const float *const wa1)
749 const v4sf taur{ld_ps1(-0.5f)};
750 const v4sf taui{ld_ps1(0.866025403784439f)};
751 for(size_t k{0};k < l1;++k)
753 v4sf cr2{vadd(cc[(k + l1)*ido], cc[(k + 2*l1)*ido])};
754 ch[ (3*k )*ido] = vadd(cc[k*ido], cr2);
755 ch[ (3*k + 2)*ido] = vmul(taui, vsub(cc[(k + l1*2)*ido], cc[(k + l1)*ido]));
756 ch[ido-1 + (3*k + 1)*ido] = vmadd(taur, cr2, cc[k*ido]);
758 if(ido == 1)
759 return;
761 const auto wa2 = wa1 + ido;
762 for(size_t k{0};k < l1;++k)
764 for(size_t i{2};i < ido;i += 2)
766 const size_t ic{ido - i};
767 v4sf wr1{ld_ps1(wa1[i - 2])};
768 v4sf wi1{ld_ps1(wa1[i - 1])};
769 v4sf dr2{cc[i - 1 + (k + l1)*ido]};
770 v4sf di2{cc[i + (k + l1)*ido]};
771 vcplxmulconj(dr2, di2, wr1, wi1);
773 v4sf wr2{ld_ps1(wa2[i - 2])};
774 v4sf wi2{ld_ps1(wa2[i - 1])};
775 v4sf dr3{cc[i - 1 + (k + l1*2)*ido]};
776 v4sf di3{cc[i + (k + l1*2)*ido]};
777 vcplxmulconj(dr3, di3, wr2, wi2);
779 v4sf cr2{vadd(dr2, dr3)};
780 v4sf ci2{vadd(di2, di3)};
781 ch[i - 1 + 3*k*ido] = vadd(cc[i - 1 + k*ido], cr2);
782 ch[i + 3*k*ido] = vadd(cc[i + k*ido], ci2);
783 v4sf tr2{vmadd(taur, cr2, cc[i - 1 + k*ido])};
784 v4sf ti2{vmadd(taur, ci2, cc[i + k*ido])};
785 v4sf tr3{vmul(taui, vsub(di2, di3))};
786 v4sf ti3{vmul(taui, vsub(dr3, dr2))};
787 ch[i - 1 + (3*k + 2)*ido] = vadd(tr2, tr3);
788 ch[ic - 1 + (3*k + 1)*ido] = vsub(tr2, tr3);
789 ch[i + (3*k + 2)*ido] = vadd(ti2, ti3);
790 ch[ic + (3*k + 1)*ido] = vsub(ti3, ti2);
793 } /* radf3 */
796 void radb3_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
797 const float *const wa1)
799 static constexpr float taur{-0.5f};
800 static constexpr float taui{0.866025403784439f};
801 static constexpr float taui_2{taui*2.0f};
803 const v4sf vtaur{ld_ps1(taur)};
804 const v4sf vtaui_2{ld_ps1(taui_2)};
805 for(size_t k{0};k < l1;++k)
807 v4sf tr2 = cc[ido-1 + (3*k + 1)*ido];
808 tr2 = vadd(tr2,tr2);
809 v4sf cr2 = vmadd(vtaur, tr2, cc[3*k*ido]);
810 ch[k*ido] = vadd(cc[3*k*ido], tr2);
811 v4sf ci3 = vmul(vtaui_2, cc[(3*k + 2)*ido]);
812 ch[(k + l1)*ido] = vsub(cr2, ci3);
813 ch[(k + 2*l1)*ido] = vadd(cr2, ci3);
815 if(ido == 1)
816 return;
818 const auto wa2 = wa1 + ido;
819 const v4sf vtaui{ld_ps1(taui)};
820 for(size_t k{0};k < l1;++k)
822 for(size_t i{2};i < ido;i += 2)
824 const size_t ic{ido - i};
825 v4sf tr2{vadd(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido])};
826 v4sf cr2{vmadd(vtaur, tr2, cc[i - 1 + 3*k*ido])};
827 ch[i - 1 + k*ido] = vadd(cc[i - 1 + 3*k*ido], tr2);
828 v4sf ti2{vsub(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido])};
829 v4sf ci2{vmadd(vtaur, ti2, cc[i + 3*k*ido])};
830 ch[i + k*ido] = vadd(cc[i + 3*k*ido], ti2);
831 v4sf cr3{vmul(vtaui, vsub(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]))};
832 v4sf ci3{vmul(vtaui, vadd(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]))};
833 v4sf dr2{vsub(cr2, ci3)};
834 v4sf dr3{vadd(cr2, ci3)};
835 v4sf di2{vadd(ci2, cr3)};
836 v4sf di3{vsub(ci2, cr3)};
837 vcplxmul(dr2, di2, ld_ps1(wa1[i-2]), ld_ps1(wa1[i-1]));
838 ch[i - 1 + (k + l1)*ido] = dr2;
839 ch[i + (k + l1)*ido] = di2;
840 vcplxmul(dr3, di3, ld_ps1(wa2[i-2]), ld_ps1(wa2[i-1]));
841 ch[i - 1 + (k + 2*l1)*ido] = dr3;
842 ch[i + (k + 2*l1)*ido] = di3;
845 } /* radb3 */
847 NOINLINE void radf4_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc,
848 v4sf *RESTRICT ch, const float *const wa1)
850 const size_t l1ido{l1*ido};
852 const v4sf *RESTRICT cc_{cc}, *RESTRICT cc_end{cc + l1ido};
853 v4sf *RESTRICT ch_{ch};
854 while(cc != cc_end)
856 // this loop represents between 25% and 40% of total radf4_ps cost !
857 v4sf a0{cc[0]}, a1{cc[l1ido]};
858 v4sf a2{cc[2*l1ido]}, a3{cc[3*l1ido]};
859 v4sf tr1{vadd(a1, a3)};
860 v4sf tr2{vadd(a0, a2)};
861 ch[2*ido-1] = vsub(a0, a2);
862 ch[2*ido ] = vsub(a3, a1);
863 ch[0 ] = vadd(tr1, tr2);
864 ch[4*ido-1] = vsub(tr2, tr1);
865 cc += ido; ch += 4*ido;
867 cc = cc_;
868 ch = ch_;
870 if(ido < 2)
871 return;
872 if(ido != 2)
874 const auto wa2 = wa1 + ido;
875 const auto wa3 = wa2 + ido;
877 for(size_t k{0};k < l1ido;k += ido)
879 const v4sf *RESTRICT pc{cc + 1 + k};
880 for(size_t i{2};i < ido;i += 2, pc += 2)
882 const size_t ic{ido - i};
884 v4sf cr2{pc[1*l1ido+0]};
885 v4sf ci2{pc[1*l1ido+1]};
886 v4sf wr{ld_ps1(wa1[i - 2])};
887 v4sf wi{ld_ps1(wa1[i - 1])};
888 vcplxmulconj(cr2,ci2,wr,wi);
890 v4sf cr3{pc[2*l1ido+0]};
891 v4sf ci3{pc[2*l1ido+1]};
892 wr = ld_ps1(wa2[i-2]);
893 wi = ld_ps1(wa2[i-1]);
894 vcplxmulconj(cr3, ci3, wr, wi);
896 v4sf cr4{pc[3*l1ido]};
897 v4sf ci4{pc[3*l1ido+1]};
898 wr = ld_ps1(wa3[i-2]);
899 wi = ld_ps1(wa3[i-1]);
900 vcplxmulconj(cr4, ci4, wr, wi);
902 /* at this point, on SSE, five of "cr2 cr3 cr4 ci2 ci3 ci4" should be loaded in registers */
904 v4sf tr1{vadd(cr2,cr4)};
905 v4sf tr4{vsub(cr4,cr2)};
906 v4sf tr2{vadd(pc[0],cr3)};
907 v4sf tr3{vsub(pc[0],cr3)};
908 ch[i - 1 + 4*k ] = vadd(tr2,tr1);
909 ch[ic - 1 + 4*k + 3*ido] = vsub(tr2,tr1); // at this point tr1 and tr2 can be disposed
910 v4sf ti1{vadd(ci2,ci4)};
911 v4sf ti4{vsub(ci2,ci4)};
912 ch[i - 1 + 4*k + 2*ido] = vadd(tr3,ti4);
913 ch[ic - 1 + 4*k + 1*ido] = vsub(tr3,ti4); // dispose tr3, ti4
914 v4sf ti2{vadd(pc[1],ci3)};
915 v4sf ti3{vsub(pc[1],ci3)};
916 ch[i + 4*k ] = vadd(ti1, ti2);
917 ch[ic + 4*k + 3*ido] = vsub(ti1, ti2);
918 ch[i + 4*k + 2*ido] = vadd(tr4, ti3);
919 ch[ic + 4*k + 1*ido] = vsub(tr4, ti3);
922 if((ido&1) == 1)
923 return;
925 const v4sf minus_hsqt2{ld_ps1(al::numbers::sqrt2_v<float> * -0.5f)};
926 for(size_t k{0};k < l1ido;k += ido)
928 v4sf a{cc[ido-1 + k + l1ido]}, b{cc[ido-1 + k + 3*l1ido]};
929 v4sf c{cc[ido-1 + k]}, d{cc[ido-1 + k + 2*l1ido]};
930 v4sf ti1{vmul(minus_hsqt2, vadd(b, a))};
931 v4sf tr1{vmul(minus_hsqt2, vsub(b, a))};
932 ch[ido-1 + 4*k ] = vadd(c, tr1);
933 ch[ido-1 + 4*k + 2*ido] = vsub(c, tr1);
934 ch[ 4*k + 1*ido] = vsub(ti1, d);
935 ch[ 4*k + 3*ido] = vadd(ti1, d);
937 } /* radf4 */
940 NOINLINE void radb4_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc,
941 v4sf *RESTRICT ch, const float *const wa1)
943 const v4sf two{ld_ps1(2.0f)};
944 const size_t l1ido{l1*ido};
946 const v4sf *RESTRICT cc_{cc}, *RESTRICT ch_end{ch + l1ido};
947 v4sf *ch_{ch};
948 while(ch != ch_end)
950 v4sf a{cc[0]}, b{cc[4*ido-1]};
951 v4sf c{cc[2*ido]}, d{cc[2*ido-1]};
952 v4sf tr3{vmul(two,d)};
953 v4sf tr2{vadd(a,b)};
954 v4sf tr1{vsub(a,b)};
955 v4sf tr4{vmul(two,c)};
956 ch[0*l1ido] = vadd(tr2, tr3);
957 ch[2*l1ido] = vsub(tr2, tr3);
958 ch[1*l1ido] = vsub(tr1, tr4);
959 ch[3*l1ido] = vadd(tr1, tr4);
961 cc += 4*ido; ch += ido;
963 cc = cc_; ch = ch_;
965 if(ido < 2)
966 return;
967 if(ido != 2)
969 const auto wa2 = wa1 + ido;
970 const auto wa3 = wa2 + ido;
972 for(size_t k{0};k < l1ido;k += ido)
974 const v4sf *RESTRICT pc{cc - 1 + 4*k};
975 v4sf *RESTRICT ph{ch + k + 1};
976 for(size_t i{2};i < ido;i += 2)
978 v4sf tr1{vsub(pc[ i], pc[4*ido - i])};
979 v4sf tr2{vadd(pc[ i], pc[4*ido - i])};
980 v4sf ti4{vsub(pc[2*ido + i], pc[2*ido - i])};
981 v4sf tr3{vadd(pc[2*ido + i], pc[2*ido - i])};
982 ph[0] = vadd(tr2, tr3);
983 v4sf cr3{vsub(tr2, tr3)};
985 v4sf ti3{vsub(pc[2*ido + i + 1], pc[2*ido - i + 1])};
986 v4sf tr4{vadd(pc[2*ido + i + 1], pc[2*ido - i + 1])};
987 v4sf cr2{vsub(tr1, tr4)};
988 v4sf cr4{vadd(tr1, tr4)};
990 v4sf ti1{vadd(pc[i + 1], pc[4*ido - i + 1])};
991 v4sf ti2{vsub(pc[i + 1], pc[4*ido - i + 1])};
993 ph[1] = vadd(ti2, ti3); ph += l1ido;
994 v4sf ci3{vsub(ti2, ti3)};
995 v4sf ci2{vadd(ti1, ti4)};
996 v4sf ci4{vsub(ti1, ti4)};
997 vcplxmul(cr2, ci2, ld_ps1(wa1[i-2]), ld_ps1(wa1[i-1]));
998 ph[0] = cr2;
999 ph[1] = ci2; ph += l1ido;
1000 vcplxmul(cr3, ci3, ld_ps1(wa2[i-2]), ld_ps1(wa2[i-1]));
1001 ph[0] = cr3;
1002 ph[1] = ci3; ph += l1ido;
1003 vcplxmul(cr4, ci4, ld_ps1(wa3[i-2]), ld_ps1(wa3[i-1]));
1004 ph[0] = cr4;
1005 ph[1] = ci4; ph = ph - 3*l1ido + 2;
1008 if((ido&1) == 1)
1009 return;
1011 const v4sf minus_sqrt2{ld_ps1(-1.414213562373095f)};
1012 for(size_t k{0};k < l1ido;k += ido)
1014 const size_t i0{4*k + ido};
1015 v4sf c{cc[i0-1]}, d{cc[i0 + 2*ido-1]};
1016 v4sf a{cc[i0+0]}, b{cc[i0 + 2*ido+0]};
1017 v4sf tr1{vsub(c,d)};
1018 v4sf tr2{vadd(c,d)};
1019 v4sf ti1{vadd(b,a)};
1020 v4sf ti2{vsub(b,a)};
1021 ch[ido-1 + k + 0*l1ido] = vadd(tr2,tr2);
1022 ch[ido-1 + k + 1*l1ido] = vmul(minus_sqrt2, vsub(ti1, tr1));
1023 ch[ido-1 + k + 2*l1ido] = vadd(ti2, ti2);
1024 ch[ido-1 + k + 3*l1ido] = vmul(minus_sqrt2, vadd(ti1, tr1));
1026 } /* radb4 */
1028 void radf5_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
1029 const float *const wa1)
1031 const v4sf tr11{ld_ps1(0.309016994374947f)};
1032 const v4sf ti11{ld_ps1(0.951056516295154f)};
1033 const v4sf tr12{ld_ps1(-0.809016994374947f)};
1034 const v4sf ti12{ld_ps1(0.587785252292473f)};
1036 auto cc_ref = [&cc,l1,ido](size_t a_1, size_t a_2, size_t a_3) noexcept -> auto&
1037 { return cc[(a_3*l1 + a_2)*ido + a_1]; };
1038 auto ch_ref = [&ch,ido](size_t a_1, size_t a_2, size_t a_3) noexcept -> auto&
1039 { return ch[(a_3*5 + a_2)*ido + a_1]; };
1041 /* Parameter adjustments */
1042 ch -= 1 + ido * 6;
1043 cc -= 1 + ido * (1 + l1);
1045 const auto wa2 = wa1 + ido;
1046 const auto wa3 = wa2 + ido;
1047 const auto wa4 = wa3 + ido;
1049 /* Function Body */
1050 for(size_t k{1};k <= l1;++k)
1052 v4sf cr2{vadd(cc_ref(1, k, 5), cc_ref(1, k, 2))};
1053 v4sf ci5{vsub(cc_ref(1, k, 5), cc_ref(1, k, 2))};
1054 v4sf cr3{vadd(cc_ref(1, k, 4), cc_ref(1, k, 3))};
1055 v4sf ci4{vsub(cc_ref(1, k, 4), cc_ref(1, k, 3))};
1056 ch_ref(1, 1, k) = vadd(cc_ref(1, k, 1), vadd(cr2, cr3));
1057 ch_ref(ido, 2, k) = vadd(cc_ref(1, k, 1), vmadd(tr11, cr2, vmul(tr12, cr3)));
1058 ch_ref(1, 3, k) = vmadd(ti11, ci5, vmul(ti12, ci4));
1059 ch_ref(ido, 4, k) = vadd(cc_ref(1, k, 1), vmadd(tr12, cr2, vmul(tr11, cr3)));
1060 ch_ref(1, 5, k) = vsub(vmul(ti12, ci5), vmul(ti11, ci4));
1061 //printf("pffft: radf5, k=%d ch_ref=%f, ci4=%f\n", k, ch_ref(1, 5, k), ci4);
1063 if(ido == 1)
1064 return;
1066 const size_t idp2{ido + 2};
1067 for(size_t k{1};k <= l1;++k)
1069 for(size_t i{3};i <= ido;i += 2)
1071 const size_t ic{idp2 - i};
1072 v4sf dr2{ld_ps1(wa1[i-3])};
1073 v4sf di2{ld_ps1(wa1[i-2])};
1074 v4sf dr3{ld_ps1(wa2[i-3])};
1075 v4sf di3{ld_ps1(wa2[i-2])};
1076 v4sf dr4{ld_ps1(wa3[i-3])};
1077 v4sf di4{ld_ps1(wa3[i-2])};
1078 v4sf dr5{ld_ps1(wa4[i-3])};
1079 v4sf di5{ld_ps1(wa4[i-2])};
1080 vcplxmulconj(dr2, di2, cc_ref(i-1, k, 2), cc_ref(i, k, 2));
1081 vcplxmulconj(dr3, di3, cc_ref(i-1, k, 3), cc_ref(i, k, 3));
1082 vcplxmulconj(dr4, di4, cc_ref(i-1, k, 4), cc_ref(i, k, 4));
1083 vcplxmulconj(dr5, di5, cc_ref(i-1, k, 5), cc_ref(i, k, 5));
1084 v4sf cr2{vadd(dr2, dr5)};
1085 v4sf ci5{vsub(dr5, dr2)};
1086 v4sf cr5{vsub(di2, di5)};
1087 v4sf ci2{vadd(di2, di5)};
1088 v4sf cr3{vadd(dr3, dr4)};
1089 v4sf ci4{vsub(dr4, dr3)};
1090 v4sf cr4{vsub(di3, di4)};
1091 v4sf ci3{vadd(di3, di4)};
1092 ch_ref(i - 1, 1, k) = vadd(cc_ref(i - 1, k, 1), vadd(cr2, cr3));
1093 ch_ref(i, 1, k) = vsub(cc_ref(i, k, 1), vadd(ci2, ci3));
1094 v4sf tr2{vadd(cc_ref(i - 1, k, 1), vmadd(tr11, cr2, vmul(tr12, cr3)))};
1095 v4sf ti2{vsub(cc_ref(i, k, 1), vmadd(tr11, ci2, vmul(tr12, ci3)))};
1096 v4sf tr3{vadd(cc_ref(i - 1, k, 1), vmadd(tr12, cr2, vmul(tr11, cr3)))};
1097 v4sf ti3{vsub(cc_ref(i, k, 1), vmadd(tr12, ci2, vmul(tr11, ci3)))};
1098 v4sf tr5{vmadd(ti11, cr5, vmul(ti12, cr4))};
1099 v4sf ti5{vmadd(ti11, ci5, vmul(ti12, ci4))};
1100 v4sf tr4{vsub(vmul(ti12, cr5), vmul(ti11, cr4))};
1101 v4sf ti4{vsub(vmul(ti12, ci5), vmul(ti11, ci4))};
1102 ch_ref(i - 1, 3, k) = vsub(tr2, tr5);
1103 ch_ref(ic - 1, 2, k) = vadd(tr2, tr5);
1104 ch_ref(i , 3, k) = vadd(ti5, ti2);
1105 ch_ref(ic , 2, k) = vsub(ti5, ti2);
1106 ch_ref(i - 1, 5, k) = vsub(tr3, tr4);
1107 ch_ref(ic - 1, 4, k) = vadd(tr3, tr4);
1108 ch_ref(i , 5, k) = vadd(ti4, ti3);
1109 ch_ref(ic , 4, k) = vsub(ti4, ti3);
1112 } /* radf5 */
1114 void radb5_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
1115 const float *const wa1)
1117 const v4sf tr11{ld_ps1(0.309016994374947f)};
1118 const v4sf ti11{ld_ps1(0.951056516295154f)};
1119 const v4sf tr12{ld_ps1(-0.809016994374947f)};
1120 const v4sf ti12{ld_ps1(0.587785252292473f)};
1122 auto cc_ref = [&cc,ido](size_t a_1, size_t a_2, size_t a_3) noexcept -> auto&
1123 { return cc[(a_3*5 + a_2)*ido + a_1]; };
1124 auto ch_ref = [&ch,ido,l1](size_t a_1, size_t a_2, size_t a_3) noexcept -> auto&
1125 { return ch[(a_3*l1 + a_2)*ido + a_1]; };
1127 /* Parameter adjustments */
1128 ch -= 1 + ido*(1 + l1);
1129 cc -= 1 + ido*6;
1131 const auto wa2 = wa1 + ido;
1132 const auto wa3 = wa2 + ido;
1133 const auto wa4 = wa3 + ido;
1135 /* Function Body */
1136 for(size_t k{1};k <= l1;++k)
1138 v4sf ti5{vadd(cc_ref( 1, 3, k), cc_ref(1, 3, k))};
1139 v4sf ti4{vadd(cc_ref( 1, 5, k), cc_ref(1, 5, k))};
1140 v4sf tr2{vadd(cc_ref(ido, 2, k), cc_ref(ido, 2, k))};
1141 v4sf tr3{vadd(cc_ref(ido, 4, k), cc_ref(ido, 4, k))};
1142 ch_ref(1, k, 1) = vadd(cc_ref(1, 1, k), vadd(tr2, tr3));
1143 v4sf cr2{vadd(cc_ref(1, 1, k), vmadd(tr11, tr2, vmul(tr12, tr3)))};
1144 v4sf cr3{vadd(cc_ref(1, 1, k), vmadd(tr12, tr2, vmul(tr11, tr3)))};
1145 v4sf ci5{vmadd(ti11, ti5, vmul(ti12, ti4))};
1146 v4sf ci4{vsub(vmul(ti12, ti5), vmul(ti11, ti4))};
1147 ch_ref(1, k, 2) = vsub(cr2, ci5);
1148 ch_ref(1, k, 3) = vsub(cr3, ci4);
1149 ch_ref(1, k, 4) = vadd(cr3, ci4);
1150 ch_ref(1, k, 5) = vadd(cr2, ci5);
1152 if(ido == 1)
1153 return;
1155 const size_t idp2{ido + 2};
1156 for(size_t k{1};k <= l1;++k)
1158 for(size_t i{3};i <= ido;i += 2)
1160 const size_t ic{idp2 - i};
1161 v4sf ti5{vadd(cc_ref(i , 3, k), cc_ref(ic , 2, k))};
1162 v4sf ti2{vsub(cc_ref(i , 3, k), cc_ref(ic , 2, k))};
1163 v4sf ti4{vadd(cc_ref(i , 5, k), cc_ref(ic , 4, k))};
1164 v4sf ti3{vsub(cc_ref(i , 5, k), cc_ref(ic , 4, k))};
1165 v4sf tr5{vsub(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k))};
1166 v4sf tr2{vadd(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k))};
1167 v4sf tr4{vsub(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k))};
1168 v4sf tr3{vadd(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k))};
1169 ch_ref(i - 1, k, 1) = vadd(cc_ref(i-1, 1, k), vadd(tr2, tr3));
1170 ch_ref(i , k, 1) = vadd(cc_ref(i , 1, k), vadd(ti2, ti3));
1171 v4sf cr2{vadd(cc_ref(i-1, 1, k), vmadd(tr11, tr2, vmul(tr12, tr3)))};
1172 v4sf ci2{vadd(cc_ref(i , 1, k), vmadd(tr11, ti2, vmul(tr12, ti3)))};
1173 v4sf cr3{vadd(cc_ref(i-1, 1, k), vmadd(tr12, tr2, vmul(tr11, tr3)))};
1174 v4sf ci3{vadd(cc_ref(i , 1, k), vmadd(tr12, ti2, vmul(tr11, ti3)))};
1175 v4sf cr5{vmadd(ti11, tr5, vmul(ti12, tr4))};
1176 v4sf ci5{vmadd(ti11, ti5, vmul(ti12, ti4))};
1177 v4sf cr4{vsub(vmul(ti12, tr5), vmul(ti11, tr4))};
1178 v4sf ci4{vsub(vmul(ti12, ti5), vmul(ti11, ti4))};
1179 v4sf dr3{vsub(cr3, ci4)};
1180 v4sf dr4{vadd(cr3, ci4)};
1181 v4sf di3{vadd(ci3, cr4)};
1182 v4sf di4{vsub(ci3, cr4)};
1183 v4sf dr5{vadd(cr2, ci5)};
1184 v4sf dr2{vsub(cr2, ci5)};
1185 v4sf di5{vsub(ci2, cr5)};
1186 v4sf di2{vadd(ci2, cr5)};
1187 vcplxmul(dr2, di2, ld_ps1(wa1[i-3]), ld_ps1(wa1[i-2]));
1188 vcplxmul(dr3, di3, ld_ps1(wa2[i-3]), ld_ps1(wa2[i-2]));
1189 vcplxmul(dr4, di4, ld_ps1(wa3[i-3]), ld_ps1(wa3[i-2]));
1190 vcplxmul(dr5, di5, ld_ps1(wa4[i-3]), ld_ps1(wa4[i-2]));
1192 ch_ref(i-1, k, 2) = dr2; ch_ref(i, k, 2) = di2;
1193 ch_ref(i-1, k, 3) = dr3; ch_ref(i, k, 3) = di3;
1194 ch_ref(i-1, k, 4) = dr4; ch_ref(i, k, 4) = di4;
1195 ch_ref(i-1, k, 5) = dr5; ch_ref(i, k, 5) = di5;
1198 } /* radb5 */
1200 NOINLINE v4sf *rfftf1_ps(const size_t n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
1201 const float *wa, const al::span<const uint,15> ifac)
1203 assert(work1 != work2);
1205 const v4sf *in{input_readonly};
1206 v4sf *out{in == work2 ? work1 : work2};
1207 const size_t nf{ifac[1]};
1208 size_t l2{n};
1209 size_t iw{n-1};
1210 size_t k1{1};
1211 while(true)
1213 const size_t kh{nf - k1};
1214 const size_t ip{ifac[kh + 2]};
1215 const size_t l1{l2 / ip};
1216 const size_t ido{n / l2};
1217 iw -= (ip - 1)*ido;
1218 switch(ip)
1220 case 5:
1221 radf5_ps(ido, l1, in, out, &wa[iw]);
1222 break;
1223 case 4:
1224 radf4_ps(ido, l1, in, out, &wa[iw]);
1225 break;
1226 case 3:
1227 radf3_ps(ido, l1, in, out, &wa[iw]);
1228 break;
1229 case 2:
1230 radf2_ps(ido, l1, in, out, &wa[iw]);
1231 break;
1232 default:
1233 assert(0);
1235 if(++k1 > nf)
1236 return out;
1238 l2 = l1;
1239 if(out == work2)
1241 out = work1;
1242 in = work2;
1244 else
1246 out = work2;
1247 in = work1;
1250 } /* rfftf1 */
1252 NOINLINE v4sf *rfftb1_ps(const size_t n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
1253 const float *wa, const al::span<const uint,15> ifac)
1255 assert(work1 != work2);
1257 const v4sf *in{input_readonly};
1258 v4sf *out{in == work2 ? work1 : work2};
1259 const size_t nf{ifac[1]};
1260 size_t l1{1};
1261 size_t iw{0};
1262 size_t k1{1};
1263 while(true)
1265 const size_t ip{ifac[k1 + 1]};
1266 const size_t l2{ip*l1};
1267 const size_t ido{n / l2};
1268 switch(ip)
1270 case 5:
1271 radb5_ps(ido, l1, in, out, &wa[iw]);
1272 break;
1273 case 4:
1274 radb4_ps(ido, l1, in, out, &wa[iw]);
1275 break;
1276 case 3:
1277 radb3_ps(ido, l1, in, out, &wa[iw]);
1278 break;
1279 case 2:
1280 radb2_ps(ido, l1, in, out, &wa[iw]);
1281 break;
1282 default:
1283 assert(0);
1285 if(++k1 > nf)
1286 return out;
1288 l1 = l2;
1289 iw += (ip - 1)*ido;
1291 if(out == work2)
1293 out = work1;
1294 in = work2;
1296 else
1298 out = work2;
1299 in = work1;
1304 v4sf *cfftf1_ps(const size_t n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
1305 const float *wa, const al::span<const uint,15> ifac, const float fsign)
1307 assert(work1 != work2);
1309 const v4sf *in{input_readonly};
1310 v4sf *out{in == work2 ? work1 : work2};
1311 const size_t nf{ifac[1]};
1312 size_t l1{1}, iw{0};
1313 size_t k1{2};
1314 while(true)
1316 const size_t ip{ifac[k1]};
1317 const size_t l2{ip*l1};
1318 const size_t ido{n / l2};
1319 const size_t idot{ido + ido};
1320 switch(ip)
1322 case 5:
1323 passf5_ps(idot, l1, in, out, &wa[iw], fsign);
1324 break;
1325 case 4:
1326 passf4_ps(idot, l1, in, out, &wa[iw], fsign);
1327 break;
1328 case 3:
1329 passf3_ps(idot, l1, in, out, &wa[iw], fsign);
1330 break;
1331 case 2:
1332 passf2_ps(idot, l1, in, out, &wa[iw], fsign);
1333 break;
1334 default:
1335 assert(0);
1337 if(++k1 > nf+1)
1338 return out;
1340 l1 = l2;
1341 iw += (ip - 1)*idot;
1342 if(out == work2)
1344 out = work1;
1345 in = work2;
1347 else
1349 out = work2;
1350 in = work1;
1356 uint decompose(const uint n, const al::span<uint,15> ifac, const al::span<const uint,4> ntryh)
1358 uint nl{n}, nf{0};
1359 for(const uint ntry : ntryh)
1361 while(nl != 1)
1363 const uint nq{nl / ntry};
1364 const uint nr{nl % ntry};
1365 if(nr != 0) break;
1367 ifac[2+nf++] = ntry;
1368 nl = nq;
1369 if(ntry == 2 && nf != 1)
1371 for(size_t i{2};i <= nf;++i)
1373 size_t ib{nf - i + 2};
1374 ifac[ib + 1] = ifac[ib];
1376 ifac[2] = 2;
1380 ifac[0] = n;
1381 ifac[1] = nf;
1382 return nf;
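/* Example: with ntryh = {4,2,3,5}, decompose(48, ...) factors 48 as 4*4*3 and
 * leaves ifac = {48, 3, 4, 4, 3, ...}. A factor of 2, when present, is moved
 * to the front of the factor list by the loop above.
 */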
1385 void rffti1_ps(const uint n, float *wa, const al::span<uint,15> ifac)
1387 static constexpr std::array ntryh{4u,2u,3u,5u};
1389 const uint nf{decompose(n, ifac, ntryh)};
1390 const double argh{2.0*al::numbers::pi / n};
1391 size_t is{0};
1392 size_t nfm1{nf - 1};
1393 size_t l1{1};
1394 for(size_t k1{0};k1 < nfm1;++k1)
1396 const size_t ip{ifac[k1+2]};
1397 const size_t l2{l1*ip};
1398 const size_t ido{n / l2};
1399 const size_t ipm{ip - 1};
1400 size_t ld{0};
1401 for(size_t j{0};j < ipm;++j)
1403 size_t i{is};
1404 ld += l1;
1405 const double argld{static_cast<double>(ld)*argh};
1406 double fi{0.0};
1407 for(size_t ii{2};ii < ido;ii += 2)
1409 fi += 1.0;
1410 wa[i++] = static_cast<float>(std::cos(fi*argld));
1411 wa[i++] = static_cast<float>(std::sin(fi*argld));
1413 is += ido;
1415 l1 = l2;
1417 } /* rffti1 */
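/* Both rffti1_ps above and cffti1_ps below tabulate the per-stage twiddle
 * factors as interleaved cosine/sine pairs: for each stage, wa[] receives
 * cos(m*argld) and sin(m*argld) for m = 1, 2, ..., with argld = 2*pi*ld/n, in
 * the layout the corresponding FFTPACK passes expect.
 */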
1419 void cffti1_ps(const uint n, float *wa, const al::span<uint,15> ifac)
1421 static constexpr std::array ntryh{5u,3u,4u,2u};
1423 const uint nf{decompose(n, ifac, ntryh)};
1424 const double argh{2.0*al::numbers::pi / n};
1425 size_t i{1};
1426 size_t l1{1};
1427 for(size_t k1{0};k1 < nf;++k1)
1429 const size_t ip{ifac[k1+2]};
1430 const size_t l2{l1*ip};
1431 const size_t ido{n / l2};
1432 const size_t idot{ido + ido + 2};
1433 const size_t ipm{ip - 1};
1434 size_t ld{0};
1435 for(size_t j{0};j < ipm;++j)
1437 size_t i1{i};
1438 wa[i-1] = 1;
1439 wa[i] = 0;
1440 ld += l1;
1441 const double argld{static_cast<double>(ld)*argh};
1442 double fi{0.0};
1443 for(size_t ii{3};ii < idot;ii += 2)
1445 fi += 1.0;
1446 wa[++i] = static_cast<float>(std::cos(fi*argld));
1447 wa[++i] = static_cast<float>(std::sin(fi*argld));
1449 if(ip > 5)
1451 wa[i1-1] = wa[i-1];
1452 wa[i1] = wa[i];
1455 l1 = l2;
1457 } /* cffti1 */
1459 } // namespace
1461 /* NOLINTNEXTLINE(clang-analyzer-optin.performance.Padding) */
1462 struct PFFFT_Setup {
1463 uint N{};
1464 uint Ncvec{}; /* nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL) */
1465 std::array<uint,15> ifac{};
1466 pffft_transform_t transform{};
1468 float *twiddle{}; /* N/4 elements */
1469 al::span<v4sf> e; /* N/4*3 elements */
1471 alignas(V4sfAlignment) std::byte end;
1474 PFFFTSetupPtr pffft_new_setup(unsigned int N, pffft_transform_t transform)
1476 assert(transform == PFFFT_REAL || transform == PFFFT_COMPLEX);
1477 assert(N > 0);
1478 /* unfortunately, the fft size must be a multiple of 16 for complex FFTs
1479 * and 32 for real FFTs -- a lot of stuff would need to be rewritten to
1480 * handle other cases (or maybe just switch to a scalar fft, I don't know..)
1482 if(transform == PFFFT_REAL)
1483 assert((N%(2*SimdSize*SimdSize)) == 0);
1484 else
1485 assert((N%(SimdSize*SimdSize)) == 0);
1487 const uint Ncvec{(transform == PFFFT_REAL ? N/2 : N) / SimdSize};
1489 const size_t storelen{std::max(offsetof(PFFFT_Setup, end) + 2_zu*Ncvec*sizeof(v4sf),
1490 sizeof(PFFFT_Setup))};
1491 auto storage = static_cast<gsl::owner<std::byte*>>(::operator new[](storelen, V4sfAlignVal));
1492 al::span extrastore{&storage[offsetof(PFFFT_Setup, end)], 2_zu*Ncvec*sizeof(v4sf)};
1494 PFFFTSetupPtr s{::new(storage) PFFFT_Setup{}};
1495 s->N = N;
1496 s->transform = transform;
1497 s->Ncvec = Ncvec;
1499 const size_t ecount{2_zu*Ncvec*(SimdSize-1)/SimdSize};
1500 s->e = {std::launder(reinterpret_cast<v4sf*>(extrastore.data())), ecount};
1501 s->twiddle = std::launder(reinterpret_cast<float*>(&extrastore[ecount*sizeof(v4sf)]));
1503 if constexpr(SimdSize > 1)
1505 auto e = std::vector<float>(s->e.size()*SimdSize, 0.0f);
1506 for(size_t k{0};k < s->Ncvec;++k)
1508 const size_t i{k / SimdSize};
1509 const size_t j{k % SimdSize};
1510 for(size_t m{0};m < SimdSize-1;++m)
1512 const double A{-2.0*al::numbers::pi*static_cast<double>((m+1)*k) / N};
1513 e[((i*3 + m)*2 + 0)*SimdSize + j] = static_cast<float>(std::cos(A));
1514 e[((i*3 + m)*2 + 1)*SimdSize + j] = static_cast<float>(std::sin(A));
1517 std::memcpy(s->e.data(), e.data(), e.size()*sizeof(float));
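/* The loop above fills s->e with the twiddle factors exp(-2*pi*i*(m+1)*k/N)
 * for m = 0..2, stored as cos/sin pairs and de-interleaved so that SIMD lane
 * j of block i holds the values for index k = i*SimdSize + j.
 */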
1519 if(transform == PFFFT_REAL)
1520 rffti1_ps(N/SimdSize, s->twiddle, s->ifac);
1521 else
1522 cffti1_ps(N/SimdSize, s->twiddle, s->ifac);
1524 /* check that N is decomposable with allowed prime factors */
1525 size_t m{1};
1526 for(size_t k{0};k < s->ifac[1];++k)
1527 m *= s->ifac[2+k];
1529 if(m != N/SimdSize)
1530 s = nullptr;
1532 return s;
1536 void pffft_destroy_setup(gsl::owner<PFFFT_Setup*> s) noexcept
1538 std::destroy_at(s);
1539 ::operator delete[](gsl::owner<void*>{s}, V4sfAlignVal);
1542 #if !defined(PFFFT_SIMD_DISABLE)
1544 namespace {
1546 /* [0 0 1 2 3 4 5 6 7 8] -> [0 8 7 6 5 4 3 2 1] */
1547 void reversed_copy(const size_t N, const v4sf *in, const int in_stride, v4sf *RESTRICT out)
1549 v4sf g0, g1;
1550 interleave2(in[0], in[1], g0, g1);
1551 in += in_stride;
1553 *--out = vswaphl(g0, g1); // [g0l, g0h], [g1l g1h] -> [g1l, g0h]
1554 for(size_t k{1};k < N;++k)
1556 v4sf h0, h1;
1557 interleave2(in[0], in[1], h0, h1);
1558 in += in_stride;
1559 *--out = vswaphl(g1, h0);
1560 *--out = vswaphl(h0, h1);
1561 g1 = h1;
1563 *--out = vswaphl(g1, g0);
1566 void unreversed_copy(const size_t N, const v4sf *in, v4sf *RESTRICT out, const int out_stride)
1568 v4sf g0{in[0]}, g1{g0};
1569 ++in;
1570 for(size_t k{1};k < N;++k)
1572 v4sf h0{*in++}; v4sf h1{*in++};
1573 g1 = vswaphl(g1, h0);
1574 h0 = vswaphl(h0, h1);
1575 uninterleave2(h0, g1, out[0], out[1]);
1576 out += out_stride;
1577 g1 = h1;
1579 v4sf h0{*in++}, h1{g0};
1580 g1 = vswaphl(g1, h0);
1581 h0 = vswaphl(h0, h1);
1582 uninterleave2(h0, g1, out[0], out[1]);
1585 void pffft_cplx_finalize(const size_t Ncvec, const v4sf *in, v4sf *RESTRICT out, const v4sf *e)
1587 assert(in != out);
1589 const size_t dk{Ncvec/SimdSize}; // number of 4x4 matrix blocks
1590 for(size_t k{0};k < dk;++k)
1592 v4sf r0{in[8*k+0]}, i0{in[8*k+1]};
1593 v4sf r1{in[8*k+2]}, i1{in[8*k+3]};
1594 v4sf r2{in[8*k+4]}, i2{in[8*k+5]};
1595 v4sf r3{in[8*k+6]}, i3{in[8*k+7]};
1596 vtranspose4(r0,r1,r2,r3);
1597 vtranspose4(i0,i1,i2,i3);
1598 vcplxmul(r1,i1,e[k*6+0],e[k*6+1]);
1599 vcplxmul(r2,i2,e[k*6+2],e[k*6+3]);
1600 vcplxmul(r3,i3,e[k*6+4],e[k*6+5]);
1602 v4sf sr0{vadd(r0,r2)}, dr0{vsub(r0, r2)};
1603 v4sf sr1{vadd(r1,r3)}, dr1{vsub(r1, r3)};
1604 v4sf si0{vadd(i0,i2)}, di0{vsub(i0, i2)};
1605 v4sf si1{vadd(i1,i3)}, di1{vsub(i1, i3)};
1607 /* transformation for each column is:
1609 * [1 1 1 1 0 0 0 0] [r0]
1610 * [1 0 -1 0 0 -1 0 1] [r1]
1611 * [1 -1 1 -1 0 0 0 0] [r2]
1612 * [1 0 -1 0 0 1 0 -1] [r3]
1613 * [0 0 0 0 1 1 1 1] * [i0]
1614 * [0 1 0 -1 1 0 -1 0] [i1]
1615 * [0 0 0 0 1 -1 1 -1] [i2]
1616 * [0 -1 0 1 1 0 -1 0] [i3]
1619 r0 = vadd(sr0, sr1); i0 = vadd(si0, si1);
1620 r1 = vadd(dr0, di1); i1 = vsub(di0, dr1);
1621 r2 = vsub(sr0, sr1); i2 = vsub(si0, si1);
1622 r3 = vsub(dr0, di1); i3 = vadd(di0, dr1);
1624 *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1;
1625 *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3;
1629 void pffft_cplx_preprocess(const size_t Ncvec, const v4sf *in, v4sf *RESTRICT out, const v4sf *e)
1631 assert(in != out);
1633 const size_t dk{Ncvec/SimdSize}; // number of 4x4 matrix blocks
1634 for(size_t k{0};k < dk;++k)
1636 v4sf r0{in[8*k+0]}, i0{in[8*k+1]};
1637 v4sf r1{in[8*k+2]}, i1{in[8*k+3]};
1638 v4sf r2{in[8*k+4]}, i2{in[8*k+5]};
1639 v4sf r3{in[8*k+6]}, i3{in[8*k+7]};
1641 v4sf sr0{vadd(r0,r2)}, dr0{vsub(r0, r2)};
1642 v4sf sr1{vadd(r1,r3)}, dr1{vsub(r1, r3)};
1643 v4sf si0{vadd(i0,i2)}, di0{vsub(i0, i2)};
1644 v4sf si1{vadd(i1,i3)}, di1{vsub(i1, i3)};
1646 r0 = vadd(sr0, sr1); i0 = vadd(si0, si1);
1647 r1 = vsub(dr0, di1); i1 = vadd(di0, dr1);
1648 r2 = vsub(sr0, sr1); i2 = vsub(si0, si1);
1649 r3 = vadd(dr0, di1); i3 = vsub(di0, dr1);
1651 vcplxmulconj(r1,i1,e[k*6+0],e[k*6+1]);
1652 vcplxmulconj(r2,i2,e[k*6+2],e[k*6+3]);
1653 vcplxmulconj(r3,i3,e[k*6+4],e[k*6+5]);
1655 vtranspose4(r0,r1,r2,r3);
1656 vtranspose4(i0,i1,i2,i3);
1658 *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1;
1659 *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3;
1664 force_inline void pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, const v4sf *in,
1665 const v4sf *e, v4sf *RESTRICT out)
1667 v4sf r0{*in0}, i0{*in1};
1668 v4sf r1{*in++}; v4sf i1{*in++};
1669 v4sf r2{*in++}; v4sf i2{*in++};
1670 v4sf r3{*in++}; v4sf i3{*in++};
1671 vtranspose4(r0,r1,r2,r3);
1672 vtranspose4(i0,i1,i2,i3);
1674 /* transformation for each column is:
1676 * [1 1 1 1 0 0 0 0] [r0]
1677 * [1 0 -1 0 0 -1 0 1] [r1]
1678 * [1 0 -1 0 0 1 0 -1] [r2]
1679 * [1 -1 1 -1 0 0 0 0] [r3]
1680 * [0 0 0 0 1 1 1 1] * [i0]
1681 * [0 -1 0 1 -1 0 1 0] [i1]
1682 * [0 -1 0 1 1 0 -1 0] [i2]
1683 * [0 0 0 0 -1 1 -1 1] [i3]
1686 //cerr << "matrix initial, before e , REAL:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
1687 //cerr << "matrix initial, before e, IMAG :\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";
1689 vcplxmul(r1,i1,e[0],e[1]);
1690 vcplxmul(r2,i2,e[2],e[3]);
1691 vcplxmul(r3,i3,e[4],e[5]);
1693 //cerr << "matrix initial, real part:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
1694 //cerr << "matrix initial, imag part:\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";
1696 v4sf sr0{vadd(r0,r2)}, dr0{vsub(r0,r2)};
1697 v4sf sr1{vadd(r1,r3)}, dr1{vsub(r3,r1)};
1698 v4sf si0{vadd(i0,i2)}, di0{vsub(i0,i2)};
1699 v4sf si1{vadd(i1,i3)}, di1{vsub(i3,i1)};
1701 r0 = vadd(sr0, sr1);
1702 r3 = vsub(sr0, sr1);
1703 i0 = vadd(si0, si1);
1704 i3 = vsub(si1, si0);
1705 r1 = vadd(dr0, di1);
1706 r2 = vsub(dr0, di1);
1707 i1 = vsub(dr1, di0);
1708 i2 = vadd(dr1, di0);
1710 *out++ = r0;
1711 *out++ = i0;
1712 *out++ = r1;
1713 *out++ = i1;
1714 *out++ = r2;
1715 *out++ = i2;
1716 *out++ = r3;
1717 *out++ = i3;
1720 NOINLINE void pffft_real_finalize(const size_t Ncvec, const v4sf *in, v4sf *RESTRICT out,
1721 const v4sf *e)
1723 static constexpr float s{al::numbers::sqrt2_v<float>/2.0f};
1725 assert(in != out);
1726 const size_t dk{Ncvec/SimdSize}; // number of 4x4 matrix blocks
1727 /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */
1729 const v4sf zero{vzero()};
1730 const auto cr = al::bit_cast<std::array<float,SimdSize>>(in[0]);
1731 const auto ci = al::bit_cast<std::array<float,SimdSize>>(in[Ncvec*2-1]);
1732 pffft_real_finalize_4x4(&zero, &zero, in+1, e, out);
1734 /* [cr0 cr1 cr2 cr3 ci0 ci1 ci2 ci3]
1736 * [Xr(1)] ] [1 1 1 1 0 0 0 0]
1737 * [Xr(N/4) ] [0 0 0 0 1 s 0 -s]
1738 * [Xr(N/2) ] [1 0 -1 0 0 0 0 0]
1739 * [Xr(3N/4)] [0 0 0 0 1 -s 0 s]
1740 * [Xi(1) ] [1 -1 1 -1 0 0 0 0]
1741 * [Xi(N/4) ] [0 0 0 0 0 -s -1 -s]
1742 * [Xi(N/2) ] [0 -1 0 1 0 0 0 0]
1743 * [Xi(3N/4)] [0 0 0 0 0 -s 1 -s]
1746 const float xr0{(cr[0]+cr[2]) + (cr[1]+cr[3])}; out[0] = vinsert0(out[0], xr0);
1747 const float xi0{(cr[0]+cr[2]) - (cr[1]+cr[3])}; out[1] = vinsert0(out[1], xi0);
1748 const float xr2{(cr[0]-cr[2])}; out[4] = vinsert0(out[4], xr2);
1749 const float xi2{(cr[3]-cr[1])}; out[5] = vinsert0(out[5], xi2);
1750 const float xr1{ ci[0] + s*(ci[1]-ci[3])}; out[2] = vinsert0(out[2], xr1);
1751 const float xi1{-ci[2] - s*(ci[1]+ci[3])}; out[3] = vinsert0(out[3], xi1);
1752 const float xr3{ ci[0] - s*(ci[1]-ci[3])}; out[6] = vinsert0(out[6], xr3);
1753 const float xi3{ ci[2] - s*(ci[1]+ci[3])}; out[7] = vinsert0(out[7], xi3);
1755 for(size_t k{1};k < dk;++k)
1756 pffft_real_finalize_4x4(&in[8*k-1], &in[8*k+0], in + 8*k+1, e + k*6, out + k*8);
1759 force_inline void pffft_real_preprocess_4x4(const v4sf *in, const v4sf *e, v4sf *RESTRICT out,
1760 const bool first)
1762 v4sf r0{in[0]}, i0{in[1]}, r1{in[2]}, i1{in[3]};
1763 v4sf r2{in[4]}, i2{in[5]}, r3{in[6]}, i3{in[7]};
1765 /* transformation for each column is:
1767 * [1 1 1 1 0 0 0 0] [r0]
1768 * [1 0 0 -1 0 -1 -1 0] [r1]
1769 * [1 -1 -1 1 0 0 0 0] [r2]
1770 * [1 0 0 -1 0 1 1 0] [r3]
1771 * [0 0 0 0 1 -1 1 -1] * [i0]
1772 * [0 -1 1 0 1 0 0 1] [i1]
1773 * [0 0 0 0 1 1 -1 -1] [i2]
1774 * [0 1 -1 0 1 0 0 1] [i3]
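 *
 * This is the inverse-direction counterpart of pffft_real_finalize_4x4():
 * the vcplxmulconj() calls below apply the conjugated twiddles, undoing the
 * vcplxmul() of the forward pass.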
1777 v4sf sr0{vadd(r0,r3)}, dr0{vsub(r0,r3)};
1778 v4sf sr1{vadd(r1,r2)}, dr1{vsub(r1,r2)};
1779 v4sf si0{vadd(i0,i3)}, di0{vsub(i0,i3)};
1780 v4sf si1{vadd(i1,i2)}, di1{vsub(i1,i2)};
1782 r0 = vadd(sr0, sr1);
1783 r2 = vsub(sr0, sr1);
1784 r1 = vsub(dr0, si1);
1785 r3 = vadd(dr0, si1);
1786 i0 = vsub(di0, di1);
1787 i2 = vadd(di0, di1);
1788 i1 = vsub(si0, dr1);
1789 i3 = vadd(si0, dr1);
1791 vcplxmulconj(r1,i1,e[0],e[1]);
1792 vcplxmulconj(r2,i2,e[2],e[3]);
1793 vcplxmulconj(r3,i3,e[4],e[5]);
1795 vtranspose4(r0,r1,r2,r3);
1796 vtranspose4(i0,i1,i2,i3);
1798 if(!first)
1800 *out++ = r0;
1801 *out++ = i0;
1803 *out++ = r1;
1804 *out++ = i1;
1805 *out++ = r2;
1806 *out++ = i2;
1807 *out++ = r3;
1808 *out++ = i3;
1811 NOINLINE void pffft_real_preprocess(const size_t Ncvec, const v4sf *in, v4sf *RESTRICT out,
1812 const v4sf *e)
1814 static constexpr float sqrt2{al::numbers::sqrt2_v<float>};
1816 assert(in != out);
1817 const size_t dk{Ncvec/SimdSize}; // number of 4x4 matrix blocks
1818 /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */
1820 std::array<float,SimdSize> Xr{}, Xi{};
1821 for(size_t k{0};k < SimdSize;++k)
1823 Xr[k] = vextract0(in[2*k]);
1824 Xi[k] = vextract0(in[2*k + 1]);
1827 pffft_real_preprocess_4x4(in, e, out+1, true); // first==true: writes only 6 of the 8 vectors
1829 /* [Xr0 Xr1 Xr2 Xr3 Xi0 Xi1 Xi2 Xi3]
1831 * [cr0] [1 0 2 0 1 0 0 0]
1832 * [cr1] [1 0 0 0 -1 0 -2 0]
1833 * [cr2] [1 0 -2 0 1 0 0 0]
1834 * [cr3] [1 0 0 0 -1 0 2 0]
1835 * [ci0] [0 2 0 2 0 0 0 0]
1836 * [ci1] [0 s 0 -s 0 -s 0 -s]
1837 * [ci2] [0 0 0 0 0 -2 0 2]
1838 * [ci3] [0 -s 0 s 0 -s 0 -s]
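 *
 * In this matrix s stands for sqrt(2) (the sqrt2 constant above), not the
 * sqrt(2)/2 used in pffft_real_finalize().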
1840 for(size_t k{1};k < dk;++k)
1841 pffft_real_preprocess_4x4(in+8*k, e + k*6, out-1+k*8, false);
1843 const float cr0{(Xr[0]+Xi[0]) + 2*Xr[2]};
1844 const float cr1{(Xr[0]-Xi[0]) - 2*Xi[2]};
1845 const float cr2{(Xr[0]+Xi[0]) - 2*Xr[2]};
1846 const float cr3{(Xr[0]-Xi[0]) + 2*Xi[2]};
1847 out[0] = vset4(cr0, cr1, cr2, cr3);
1848 const float ci0{ 2*(Xr[1]+Xr[3])};
1849 const float ci1{ sqrt2*(Xr[1]-Xr[3]) - sqrt2*(Xi[1]+Xi[3])};
1850 const float ci2{ 2*(Xi[3]-Xi[1])};
1851 const float ci3{-sqrt2*(Xr[1]-Xr[3]) - sqrt2*(Xi[1]+Xi[3])};
1852 out[2*Ncvec-1] = vset4(ci0, ci1, ci2, ci3);
1856 void pffft_transform_internal(const PFFFT_Setup *setup, const v4sf *vinput, v4sf *voutput,
1857 v4sf *scratch, const pffft_direction_t direction, const bool ordered)
1859 assert(scratch != nullptr);
1860 assert(voutput != scratch);
1862 const size_t Ncvec{setup->Ncvec};
1863 const bool nf_odd{(setup->ifac[1]&1) != 0};
1865 std::array buff{voutput, scratch};
1866 bool ib{nf_odd != ordered};
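/* ib selects which of buff[] (the output or scratch buffer) should receive
 * the next stage's result; the fftpack calls below report which buffer they
 * actually finished in, and ib is refreshed from the "== buff[1]" checks.
 */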
1867 if(direction == PFFFT_FORWARD)
1869 /* Swap the initial work buffer for forward FFTs, which helps avoid an
1870 * extra copy for output.
1872 ib = !ib;
1873 if(setup->transform == PFFFT_REAL)
1875 ib = (rfftf1_ps(Ncvec*2, vinput, buff[ib], buff[!ib], setup->twiddle, setup->ifac) == buff[1]);
1876 pffft_real_finalize(Ncvec, buff[ib], buff[!ib], setup->e.data());
1878 else
1880 v4sf *tmp{buff[ib]};
1881 for(size_t k=0; k < Ncvec; ++k)
1882 uninterleave2(vinput[k*2], vinput[k*2+1], tmp[k*2], tmp[k*2+1]);
1884 ib = (cfftf1_ps(Ncvec, buff[ib], buff[!ib], buff[ib], setup->twiddle, setup->ifac, -1.0f) == buff[1]);
1885 pffft_cplx_finalize(Ncvec, buff[ib], buff[!ib], setup->e.data());
1887 if(ordered)
1888 pffft_zreorder(setup, reinterpret_cast<float*>(buff[!ib]),
1889 reinterpret_cast<float*>(buff[ib]), PFFFT_FORWARD);
1890 else
1891 ib = !ib;
1893 else
1895 if(vinput == buff[ib])
1896 ib = !ib; // may happen when finput == foutput
1898 if(ordered)
1900 pffft_zreorder(setup, reinterpret_cast<const float*>(vinput),
1901 reinterpret_cast<float*>(buff[ib]), PFFFT_BACKWARD);
1902 vinput = buff[ib];
1903 ib = !ib;
1905 if(setup->transform == PFFFT_REAL)
1907 pffft_real_preprocess(Ncvec, vinput, buff[ib], setup->e.data());
1908 ib = (rfftb1_ps(Ncvec*2, buff[ib], buff[0], buff[1], setup->twiddle, setup->ifac) == buff[1]);
1910 else
1912 pffft_cplx_preprocess(Ncvec, vinput, buff[ib], setup->e.data());
1913 ib = (cfftf1_ps(Ncvec, buff[ib], buff[0], buff[1], setup->twiddle, setup->ifac, +1.0f) == buff[1]);
1914 for(size_t k{0};k < Ncvec;++k)
1915 interleave2(buff[ib][k*2], buff[ib][k*2+1], buff[ib][k*2], buff[ib][k*2+1]);
1919 if(buff[ib] != voutput)
1921 /* extra copy required -- this situation should only happen when finput == foutput */
1922 assert(vinput==voutput);
1923 for(size_t k{0};k < Ncvec;++k)
1925 v4sf a{buff[ib][2*k]}, b{buff[ib][2*k+1]};
1926 voutput[2*k] = a; voutput[2*k+1] = b;
1931 } // namespace
1933 void pffft_zreorder(const PFFFT_Setup *setup, const float *in, float *out,
1934 pffft_direction_t direction)
1936 assert(in != out);
1938 const size_t N{setup->N}, Ncvec{setup->Ncvec};
1939 const v4sf *vin{reinterpret_cast<const v4sf*>(in)};
1940 v4sf *RESTRICT vout{reinterpret_cast<v4sf*>(out)};
1941 if(setup->transform == PFFFT_REAL)
1943 const size_t dk{N/32};
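/* For real transforms Ncvec is N/8 (N/2 complex bins across 4-wide vectors),
 * so N/32 equals Ncvec/SimdSize: one iteration per 4x4 block, matching the
 * finalize/preprocess loops above.
 */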
1944 if(direction == PFFFT_FORWARD)
1946 for(size_t k{0};k < dk;++k)
1948 interleave2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]);
1949 interleave2(vin[k*8 + 4], vin[k*8 + 5], vout[2*(2*dk + k) + 0], vout[2*(2*dk + k) + 1]);
1951 reversed_copy(dk, vin+2, 8, vout + N/SimdSize/2);
1952 reversed_copy(dk, vin+6, 8, vout + N/SimdSize);
1954 else
1956 for(size_t k{0};k < dk;++k)
1958 uninterleave2(vin[2*(0*dk + k) + 0], vin[2*(0*dk + k) + 1], vout[k*8 + 0], vout[k*8 + 1]);
1959 uninterleave2(vin[2*(2*dk + k) + 0], vin[2*(2*dk + k) + 1], vout[k*8 + 4], vout[k*8 + 5]);
1961 unreversed_copy(dk, vin + N/SimdSize/4, vout + N/SimdSize - 6, -8);
1962 unreversed_copy(dk, vin + 3_uz*N/SimdSize/4, vout + N/SimdSize - 2, -8);
1965 else
1967 if(direction == PFFFT_FORWARD)
1969 for(size_t k{0};k < Ncvec;++k)
1971 size_t kk{(k/4) + (k%4)*(Ncvec/4)};
1972 interleave2(vin[k*2], vin[k*2+1], vout[kk*2], vout[kk*2+1]);
1975 else
1977 for(size_t k{0};k < Ncvec;++k)
1979 size_t kk{(k/4) + (k%4)*(Ncvec/4)};
1980 uninterleave2(vin[kk*2], vin[kk*2+1], vout[k*2], vout[k*2+1]);
1986 void pffft_zconvolve_scale_accumulate(const PFFFT_Setup *s, const float *a, const float *b,
1987 float *ab, float scaling)
1989 const size_t Ncvec{s->Ncvec};
1990 const v4sf *RESTRICT va{reinterpret_cast<const v4sf*>(a)};
1991 const v4sf *RESTRICT vb{reinterpret_cast<const v4sf*>(b)};
1992 v4sf *RESTRICT vab{reinterpret_cast<v4sf*>(ab)};
1994 #ifdef __arm__
1995 __builtin_prefetch(va);
1996 __builtin_prefetch(vb);
1997 __builtin_prefetch(vab);
1998 __builtin_prefetch(va+2);
1999 __builtin_prefetch(vb+2);
2000 __builtin_prefetch(vab+2);
2001 __builtin_prefetch(va+4);
2002 __builtin_prefetch(vb+4);
2003 __builtin_prefetch(vab+4);
2004 __builtin_prefetch(va+6);
2005 __builtin_prefetch(vb+6);
2006 __builtin_prefetch(vab+6);
2007 #ifndef __clang__
2008 #define ZCONVOLVE_USING_INLINE_NEON_ASM
2009 #endif
2010 #endif
2012 const float ar1{vextract0(va[0])};
2013 const float ai1{vextract0(va[1])};
2014 const float br1{vextract0(vb[0])};
2015 const float bi1{vextract0(vb[1])};
2016 const float abr1{vextract0(vab[0])};
2017 const float abi1{vextract0(vab[1])};
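/* Lane 0 of the first two vectors holds the real-only DC and Nyquist bins.
 * They're saved here because the vectorized loop below treats them as an
 * ordinary complex pair, and are patched back afterwards for PFFFT_REAL.
 */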
2019 #ifdef ZCONVOLVE_USING_INLINE_ASM
2020 /* Inline asm version, unfortunately miscompiled by clang 3.2, at least on
2021 * Ubuntu. So this will be restricted to GCC.
2023 * Does it still miscompile with Clang? Is it even needed with today's
2024 * optimizers?
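 *
 * Also note that the macro tested here (ZCONVOLVE_USING_INLINE_ASM) differs
 * from the ZCONVOLVE_USING_INLINE_NEON_ASM defined above, so as written this
 * asm path appears to be compiled out.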
2026 const float *a_{a}, *b_{b}; float *ab_{ab};
2027 size_t N{Ncvec};
2028 asm volatile("mov r8, %2 \n"
2029 "vdup.f32 q15, %4 \n"
2030 "1: \n"
2031 "pld [%0,#64] \n"
2032 "pld [%1,#64] \n"
2033 "pld [%2,#64] \n"
2034 "pld [%0,#96] \n"
2035 "pld [%1,#96] \n"
2036 "pld [%2,#96] \n"
2037 "vld1.f32 {q0,q1}, [%0,:128]! \n"
2038 "vld1.f32 {q4,q5}, [%1,:128]! \n"
2039 "vld1.f32 {q2,q3}, [%0,:128]! \n"
2040 "vld1.f32 {q6,q7}, [%1,:128]! \n"
2041 "vld1.f32 {q8,q9}, [r8,:128]! \n"
2043 "vmul.f32 q10, q0, q4 \n"
2044 "vmul.f32 q11, q0, q5 \n"
2045 "vmul.f32 q12, q2, q6 \n"
2046 "vmul.f32 q13, q2, q7 \n"
2047 "vmls.f32 q10, q1, q5 \n"
2048 "vmla.f32 q11, q1, q4 \n"
2049 "vld1.f32 {q0,q1}, [r8,:128]! \n"
2050 "vmls.f32 q12, q3, q7 \n"
2051 "vmla.f32 q13, q3, q6 \n"
2052 "vmla.f32 q8, q10, q15 \n"
2053 "vmla.f32 q9, q11, q15 \n"
2054 "vmla.f32 q0, q12, q15 \n"
2055 "vmla.f32 q1, q13, q15 \n"
2056 "vst1.f32 {q8,q9},[%2,:128]! \n"
2057 "vst1.f32 {q0,q1},[%2,:128]! \n"
2058 "subs %3, #2 \n"
2059 "bne 1b \n"
2060 : "+r"(a_), "+r"(b_), "+r"(ab_), "+r"(N) : "r"(scaling) : "r8", "q0","q1","q2","q3","q4","q5","q6","q7","q8","q9", "q10","q11","q12","q13","q15","memory");
2062 #else
2064 /* Default routine, works fine for non-ARM CPUs with current compilers. */
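/* Per complex bin this computes, in scalar terms (assuming vcplxmul() is the
 * usual in-place complex multiply):
 *   ab_re += (a_re*b_re - a_im*b_im) * scaling;
 *   ab_im += (a_re*b_im + a_im*b_re) * scaling;
 * with four bins carried per pair of v4sf registers.
 */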
2065 const v4sf vscal{ld_ps1(scaling)};
2066 for(size_t i{0};i < Ncvec;i += 2)
2068 v4sf ar4{va[2*i+0]}, ai4{va[2*i+1]};
2069 v4sf br4{vb[2*i+0]}, bi4{vb[2*i+1]};
2070 vcplxmul(ar4, ai4, br4, bi4);
2071 vab[2*i+0] = vmadd(ar4, vscal, vab[2*i+0]);
2072 vab[2*i+1] = vmadd(ai4, vscal, vab[2*i+1]);
2073 ar4 = va[2*i+2]; ai4 = va[2*i+3];
2074 br4 = vb[2*i+2]; bi4 = vb[2*i+3];
2075 vcplxmul(ar4, ai4, br4, bi4);
2076 vab[2*i+2] = vmadd(ar4, vscal, vab[2*i+2]);
2077 vab[2*i+3] = vmadd(ai4, vscal, vab[2*i+3]);
2079 #endif
2081 if(s->transform == PFFFT_REAL)
2083 vab[0] = vinsert0(vab[0], abr1 + ar1*br1*scaling);
2084 vab[1] = vinsert0(vab[1], abi1 + ai1*bi1*scaling);
2088 void pffft_zconvolve_accumulate(const PFFFT_Setup *s, const float *a, const float *b, float *ab)
2090 const size_t Ncvec{s->Ncvec};
2091 const v4sf *RESTRICT va{reinterpret_cast<const v4sf*>(a)};
2092 const v4sf *RESTRICT vb{reinterpret_cast<const v4sf*>(b)};
2093 v4sf *RESTRICT vab{reinterpret_cast<v4sf*>(ab)};
2095 #ifdef __arm__
2096 __builtin_prefetch(va);
2097 __builtin_prefetch(vb);
2098 __builtin_prefetch(vab);
2099 __builtin_prefetch(va+2);
2100 __builtin_prefetch(vb+2);
2101 __builtin_prefetch(vab+2);
2102 __builtin_prefetch(va+4);
2103 __builtin_prefetch(vb+4);
2104 __builtin_prefetch(vab+4);
2105 __builtin_prefetch(va+6);
2106 __builtin_prefetch(vb+6);
2107 __builtin_prefetch(vab+6);
2108 #endif
2110 const float ar1{vextract0(va[0])};
2111 const float ai1{vextract0(va[1])};
2112 const float br1{vextract0(vb[0])};
2113 const float bi1{vextract0(vb[1])};
2114 const float abr1{vextract0(vab[0])};
2115 const float abi1{vextract0(vab[1])};
2117 /* No inline assembly for this version. I'm not familiar enough with NEON
2118 * assembly, and I don't know that it's needed with today's optimizers.
2120 for(size_t i{0};i < Ncvec;i += 2)
2122 v4sf ar4{va[2*i+0]}, ai4{va[2*i+1]};
2123 v4sf br4{vb[2*i+0]}, bi4{vb[2*i+1]};
2124 vcplxmul(ar4, ai4, br4, bi4);
2125 vab[2*i+0] = vadd(ar4, vab[2*i+0]);
2126 vab[2*i+1] = vadd(ai4, vab[2*i+1]);
2127 ar4 = va[2*i+2]; ai4 = va[2*i+3];
2128 br4 = vb[2*i+2]; bi4 = vb[2*i+3];
2129 vcplxmul(ar4, ai4, br4, bi4);
2130 vab[2*i+2] = vadd(ar4, vab[2*i+2]);
2131 vab[2*i+3] = vadd(ai4, vab[2*i+3]);
2134 if(s->transform == PFFFT_REAL)
2136 vab[0] = vinsert0(vab[0], abr1 + ar1*br1);
2137 vab[1] = vinsert0(vab[1], abi1 + ai1*bi1);
2142 void pffft_transform(const PFFFT_Setup *setup, const float *input, float *output, float *work,
2143 pffft_direction_t direction)
2145 assert(valigned(input) && valigned(output) && valigned(work));
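/* All three buffers must be SIMD-aligned (16 bytes for 4-float vectors);
 * valigned() asserts this and assume_aligned<16> lets the compiler rely on
 * it.
 */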
2146 pffft_transform_internal(setup, reinterpret_cast<const v4sf*>(al::assume_aligned<16>(input)),
2147 reinterpret_cast<v4sf*>(al::assume_aligned<16>(output)),
2148 reinterpret_cast<v4sf*>(al::assume_aligned<16>(work)), direction, false);
2151 void pffft_transform_ordered(const PFFFT_Setup *setup, const float *input, float *output,
2152 float *work, pffft_direction_t direction)
2154 assert(valigned(input) && valigned(output) && valigned(work));
2155 pffft_transform_internal(setup, reinterpret_cast<const v4sf*>(al::assume_aligned<16>(input)),
2156 reinterpret_cast<v4sf*>(al::assume_aligned<16>(output)),
2157 reinterpret_cast<v4sf*>(al::assume_aligned<16>(work)), direction, true);
2160 #else // defined(PFFFT_SIMD_DISABLE)
2162 // Standard routines using scalar floats, without SIMD.
2164 namespace {
2166 void pffft_transform_internal(const PFFFT_Setup *setup, const float *input, float *output,
2167 float *scratch, const pffft_direction_t direction, bool ordered)
2169 const size_t Ncvec{setup->Ncvec};
2170 const bool nf_odd{(setup->ifac[1]&1) != 0};
2172 assert(scratch != nullptr);
2174 /* z-domain data for complex transforms is already ordered without SIMD. */
2175 if(setup->transform == PFFFT_COMPLEX)
2176 ordered = false;
2178 float *buff[2]{output, scratch};
2179 bool ib{nf_odd != ordered};
2180 if(direction == PFFFT_FORWARD)
2182 if(setup->transform == PFFFT_REAL)
2183 ib = (rfftf1_ps(Ncvec*2, input, buff[ib], buff[!ib], setup->twiddle, setup->ifac) == buff[1]);
2184 else
2185 ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], setup->twiddle, setup->ifac, -1.0f) == buff[1]);
2186 if(ordered)
2188 pffft_zreorder(setup, buff[ib], buff[!ib], PFFFT_FORWARD);
2189 ib = !ib;
2192 else
2194 if(input == buff[ib])
2195 ib = !ib; // may happen when finput == foutput
2197 if(ordered)
2199 pffft_zreorder(setup, input, buff[ib], PFFFT_BACKWARD);
2200 input = buff[ib];
2201 ib = !ib;
2203 if(setup->transform == PFFFT_REAL)
2204 ib = (rfftb1_ps(Ncvec*2, input, buff[ib], buff[!ib], setup->twiddle, setup->ifac) == buff[1]);
2205 else
2206 ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], setup->twiddle, setup->ifac, +1.0f) == buff[1]);
2208 if(buff[ib] != output)
2210 // extra copy required -- this situation should only happen when finput == foutput
2211 assert(input==output);
2212 for(size_t k{0};k < Ncvec;++k)
2214 float a{buff[ib][2*k]}, b{buff[ib][2*k+1]};
2215 output[2*k] = a; output[2*k+1] = b;
2220 } // namespace
2222 void pffft_zreorder(const PFFFT_Setup *setup, const float *in, float *RESTRICT out,
2223 pffft_direction_t direction)
2225 const size_t N{setup->N};
2226 if(setup->transform == PFFFT_COMPLEX)
2228 for(size_t k{0};k < 2*N;++k)
2229 out[k] = in[k];
2230 return;
2232 else if(direction == PFFFT_FORWARD)
2234 float x_N{in[N-1]};
2235 for(size_t k{N-1};k > 1;--k)
2236 out[k] = in[k-1];
2237 out[0] = in[0];
2238 out[1] = x_N;
2240 else
2242 float x_N{in[1]};
2243 for(size_t k{1};k < N-1;++k)
2244 out[k] = in[k+1];
2245 out[0] = in[0];
2246 out[N-1] = x_N;
2250 void pffft_zconvolve_scale_accumulate(const PFFFT_Setup *s, const float *a, const float *b,
2251 float *ab, float scaling)
2253 size_t Ncvec{s->Ncvec};
2255 if(s->transform == PFFFT_REAL)
2257 // take care of the fftpack ordering
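// (bin 0 and bin N/2 are real-only, stored at ab[0] and ab[2*Ncvec-1])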
2258 ab[0] += a[0]*b[0]*scaling;
2259 ab[2*Ncvec-1] += a[2*Ncvec-1]*b[2*Ncvec-1]*scaling;
2260 ++ab; ++a; ++b; --Ncvec;
2262 for(size_t i{0};i < Ncvec;++i)
2264 float ar{a[2*i+0]}, ai{a[2*i+1]};
2265 const float br{b[2*i+0]}, bi{b[2*i+1]};
2266 vcplxmul(ar, ai, br, bi);
2267 ab[2*i+0] += ar*scaling;
2268 ab[2*i+1] += ai*scaling;
2272 void pffft_zconvolve_accumulate(const PFFFT_Setup *s, const float *a, const float *b, float *ab)
2274 size_t Ncvec{s->Ncvec};
2276 if(s->transform == PFFFT_REAL)
2278 // take care of the fftpack ordering
2279 ab[0] += a[0]*b[0];
2280 ab[2*Ncvec-1] += a[2*Ncvec-1]*b[2*Ncvec-1];
2281 ++ab; ++a; ++b; --Ncvec;
2283 for(size_t i{0};i < Ncvec;++i)
2285 float ar{a[2*i+0]}, ai{a[2*i+1]};
2286 const float br{b[2*i+0]}, bi{b[2*i+1]};
2287 vcplxmul(ar, ai, br, bi);
2288 ab[2*i+0] += ar;
2289 ab[2*i+1] += ai;
2294 void pffft_transform(const PFFFT_Setup *setup, const float *input, float *output, float *work,
2295 pffft_direction_t direction)
2297 pffft_transform_internal(setup, input, output, work, direction, false);
2300 void pffft_transform_ordered(const PFFFT_Setup *setup, const float *input, float *output,
2301 float *work, pffft_direction_t direction)
2303 pffft_transform_internal(setup, input, output, work, direction, true);
2306 #endif /* defined(PFFFT_SIMD_DISABLE) */
2307 /* NOLINTEND(cppcoreguidelines-pro-bounds-pointer-arithmetic) */
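/* Usage sketch (illustrative only; assumes the usual pffft setup API from
 * pffft.h, i.e. pffft_new_setup()/pffft_destroy_setup(), and hypothetical
 * buffer names):
 *
 *   PFFFT_Setup *s{pffft_new_setup(1024, PFFFT_REAL)};
 *   // in, out and work are 16-byte aligned float buffers holding at least
 *   // N floats for PFFFT_REAL (2*N for PFFFT_COMPLEX).
 *   pffft_transform_ordered(s, in, out, work, PFFFT_FORWARD);
 *   pffft_destroy_setup(s);
 */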