1 //$ nobt
3 /* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
4 * Copyright (c) 2023 Christopher Robinson
6 * Based on original fortran 77 code from FFTPACKv4 from NETLIB
7 * (http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber
8 * of NCAR, in 1985.
10 * As confirmed by the NCAR fftpack software curators, the following
11 * FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
12 * released under the same terms.
14 * FFTPACK license:
16 * http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
18 * Copyright (c) 2004 the University Corporation for Atmospheric
19 * Research ("UCAR"). All rights reserved. Developed by NCAR's
20 * Computational and Information Systems Laboratory, UCAR,
21 * www.cisl.ucar.edu.
23 * Redistribution and use of the Software in source and binary forms,
24 * with or without modification, is permitted provided that the
25 * following conditions are met:
27 * - Neither the names of NCAR's Computational and Information Systems
28 * Laboratory, the University Corporation for Atmospheric Research,
29 * nor the names of its sponsors or contributors may be used to
30 * endorse or promote products derived from this Software without
31 * specific prior written permission.
33 * - Redistributions of source code must retain the above copyright
34 * notices, this list of conditions, and the disclaimer below.
36 * - Redistributions in binary form must reproduce the above copyright
37 * notice, this list of conditions, and the disclaimer below in the
38 * documentation and/or other materials provided with the
39 * distribution.
41 * THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
42 * EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
43 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
44 * NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
45 * HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
46 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
47 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
48 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
49 * SOFTWARE.
52 * PFFFT : a Pretty Fast FFT.
54 * This file is largely based on the original FFTPACK implementation, modified
55 * in order to take advantage of SIMD instructions of modern CPUs.
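 *
 * Typical use of the public API (a rough sketch only: the entry points are
 * declared in pffft.h, pffft_transform_ordered is assumed to keep the
 * upstream PFFFT signature, and PFFFTSetupPtr is assumed to behave like a
 * std::unique_ptr over the setup object):
 *
 *   PFFFTSetupPtr setup{pffft_new_setup(1024, PFFFT_REAL)};
 *   pffft_transform_ordered(setup.get(), input, output, work, PFFFT_FORWARD);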
58 #include "pffft.h"
60 #include <algorithm>
61 #include <array>
62 #include <cassert>
63 #include <cmath>
64 #include <cstdint>
65 #include <cstdio>
66 #include <cstring>
67 #include <new>
68 #include <utility>
69 #include <vector>
71 #include "albit.h"
72 #include "almalloc.h"
73 #include "alnumbers.h"
74 #include "alnumeric.h"
75 #include "alspan.h"
76 #include "opthelpers.h"
79 using uint = unsigned int;
81 namespace {
83 #if defined(__GNUC__) || defined(_MSC_VER)
84 #define RESTRICT __restrict
85 #else
86 #define RESTRICT
87 #endif
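/* RESTRICT expands to the compiler's non-aliasing hint where available; the
 * FFT passes below apply it to their pointer parameters as an optimization
 * hint that input and output buffers do not overlap.
 */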
89 /* Vector support macros: the rest of the code is independent of
90 * SSE/Altivec/NEON -- adding support for other platforms with 4-element
91 * vectors should be limited to these macros
94 /* Define PFFFT_SIMD_DISABLE if you want to use scalar code instead of SIMD code */
95 //#define PFFFT_SIMD_DISABLE
97 #ifndef PFFFT_SIMD_DISABLE
99 * Altivec support macros
101 #if defined(__ppc__) || defined(__ppc64__) || defined(__powerpc__) || defined(__powerpc64__)
102 #include <altivec.h>
103 using v4sf = vector float;
104 constexpr uint SimdSize{4};
105 force_inline v4sf vzero() noexcept { return (vector float)vec_splat_u8(0); }
106 force_inline v4sf vmul(v4sf a, v4sf b) noexcept { return vec_madd(a, b, vzero()); }
107 force_inline v4sf vadd(v4sf a, v4sf b) noexcept { return vec_add(a, b); }
108 force_inline v4sf vmadd(v4sf a, v4sf b, v4sf c) noexcept { return vec_madd(a, b, c); }
109 force_inline v4sf vsub(v4sf a, v4sf b) noexcept { return vec_sub(a, b); }
110 force_inline v4sf ld_ps1(float a) noexcept { return vec_splats(a); }
112 force_inline v4sf vset4(float a, float b, float c, float d) noexcept
114 /* Is there a more efficient way to do this? */
115 alignas(16) std::array<float,4> vals{{a, b, c, d}};
116 return vec_ld(0, vals.data());
118 force_inline v4sf vinsert0(v4sf v, float a) noexcept { return vec_insert(a, v, 0); }
119 force_inline float vextract0(v4sf v) noexcept { return vec_extract(v, 0); }
121 force_inline void interleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
123 out1 = vec_mergeh(in1, in2);
124 out2 = vec_mergel(in1, in2);
126 force_inline void uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
128 out1 = vec_perm(in1, in2, (vector unsigned char){0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27});
129 out2 = vec_perm(in1, in2, (vector unsigned char){4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31});
132 force_inline void vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept
134 v4sf y0{vec_mergeh(x0, x2)};
135 v4sf y1{vec_mergel(x0, x2)};
136 v4sf y2{vec_mergeh(x1, x3)};
137 v4sf y3{vec_mergel(x1, x3)};
138 x0 = vec_mergeh(y0, y2);
139 x1 = vec_mergel(y0, y2);
140 x2 = vec_mergeh(y1, y3);
141 x3 = vec_mergel(y1, y3);
144 force_inline v4sf vswaphl(v4sf a, v4sf b) noexcept
145 { return vec_perm(a,b, (vector unsigned char){16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15}); }
148 * SSE1 support macros
150 #elif defined(__x86_64__) || defined(__SSE__) || defined(_M_X64) || \
151 (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
153 #include <xmmintrin.h>
154 using v4sf = __m128;
155 /* 4 floats per SIMD vector -- this is pretty much hardcoded in the preprocess/
156 * finalize functions anyway, so you will have to do some work if you want to
157 * enable AVX with its 256-bit vectors.
159 constexpr uint SimdSize{4};
160 force_inline v4sf vzero() noexcept { return _mm_setzero_ps(); }
161 force_inline v4sf vmul(v4sf a, v4sf b) noexcept { return _mm_mul_ps(a, b); }
162 force_inline v4sf vadd(v4sf a, v4sf b) noexcept { return _mm_add_ps(a, b); }
163 force_inline v4sf vmadd(v4sf a, v4sf b, v4sf c) noexcept { return _mm_add_ps(_mm_mul_ps(a,b), c); }
164 force_inline v4sf vsub(v4sf a, v4sf b) noexcept { return _mm_sub_ps(a, b); }
165 force_inline v4sf ld_ps1(float a) noexcept { return _mm_set1_ps(a); }
167 force_inline v4sf vset4(float a, float b, float c, float d) noexcept
168 { return _mm_setr_ps(a, b, c, d); }
169 force_inline v4sf vinsert0(const v4sf v, const float a) noexcept
170 { return _mm_move_ss(v, _mm_set_ss(a)); }
171 force_inline float vextract0(v4sf v) noexcept
172 { return _mm_cvtss_f32(v); }
174 force_inline void interleave2(const v4sf in1, const v4sf in2, v4sf &out1, v4sf &out2) noexcept
176 out1 = _mm_unpacklo_ps(in1, in2);
177 out2 = _mm_unpackhi_ps(in1, in2);
179 force_inline void uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
181 out1 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0));
182 out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1));
185 force_inline void vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept
186 { _MM_TRANSPOSE4_PS(x0, x1, x2, x3); }
188 force_inline v4sf vswaphl(v4sf a, v4sf b) noexcept
189 { return _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0)); }
192 * ARM NEON support macros
194 #elif defined(__ARM_NEON) || defined(__aarch64__) || defined(__arm64) || defined(_M_ARM64)
196 #include <arm_neon.h>
197 using v4sf = float32x4_t;
198 constexpr uint SimdSize{4};
199 force_inline v4sf vzero() noexcept { return vdupq_n_f32(0.0f); }
200 force_inline v4sf vmul(v4sf a, v4sf b) noexcept { return vmulq_f32(a, b); }
201 force_inline v4sf vadd(v4sf a, v4sf b) noexcept { return vaddq_f32(a, b); }
202 force_inline v4sf vmadd(v4sf a, v4sf b, v4sf c) noexcept { return vmlaq_f32(c, a, b); }
203 force_inline v4sf vsub(v4sf a, v4sf b) noexcept { return vsubq_f32(a, b); }
204 force_inline v4sf ld_ps1(float a) noexcept { return vdupq_n_f32(a); }
206 force_inline v4sf vset4(float a, float b, float c, float d) noexcept
208 float32x4_t ret{vmovq_n_f32(a)};
209 ret = vsetq_lane_f32(b, ret, 1);
210 ret = vsetq_lane_f32(c, ret, 2);
211 ret = vsetq_lane_f32(d, ret, 3);
212 return ret;
214 force_inline v4sf vinsert0(v4sf v, float a) noexcept
215 { return vsetq_lane_f32(a, v, 0); }
216 force_inline float vextract0(v4sf v) noexcept
217 { return vgetq_lane_f32(v, 0); }
219 force_inline void interleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
221 float32x4x2_t tmp{vzipq_f32(in1, in2)};
222 out1 = tmp.val[0];
223 out2 = tmp.val[1];
225 force_inline void uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
227 float32x4x2_t tmp{vuzpq_f32(in1, in2)};
228 out1 = tmp.val[0];
229 out2 = tmp.val[1];
232 force_inline void vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept
234 /* marginally faster version:
235 * asm("vtrn.32 %q0, %q1;\n"
236 * "vtrn.32 %q2, %q3\n"
237 * "vswp %f0, %e2\n"
238 * "vswp %f1, %e3"
239 * : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::);
241 float32x4x2_t t0_{vzipq_f32(x0, x2)};
242 float32x4x2_t t1_{vzipq_f32(x1, x3)};
243 float32x4x2_t u0_{vzipq_f32(t0_.val[0], t1_.val[0])};
244 float32x4x2_t u1_{vzipq_f32(t0_.val[1], t1_.val[1])};
245 x0 = u0_.val[0];
246 x1 = u0_.val[1];
247 x2 = u1_.val[0];
248 x3 = u1_.val[1];
251 force_inline v4sf vswaphl(v4sf a, v4sf b) noexcept
252 { return vcombine_f32(vget_low_f32(b), vget_high_f32(a)); }
255 * Generic GCC vector macros
257 #elif defined(__GNUC__)
259 using v4sf [[gnu::vector_size(16), gnu::aligned(16)]] = float;
260 constexpr uint SimdSize{4};
261 force_inline constexpr v4sf vzero() noexcept { return v4sf{0.0f, 0.0f, 0.0f, 0.0f}; }
262 force_inline constexpr v4sf vmul(v4sf a, v4sf b) noexcept { return a * b; }
263 force_inline constexpr v4sf vadd(v4sf a, v4sf b) noexcept { return a + b; }
264 force_inline constexpr v4sf vmadd(v4sf a, v4sf b, v4sf c) noexcept { return a*b + c; }
265 force_inline constexpr v4sf vsub(v4sf a, v4sf b) noexcept { return a - b; }
266 force_inline constexpr v4sf ld_ps1(float a) noexcept { return v4sf{a, a, a, a}; }
268 force_inline constexpr v4sf vset4(float a, float b, float c, float d) noexcept
269 { return v4sf{a, b, c, d}; }
270 force_inline constexpr v4sf vinsert0(v4sf v, float a) noexcept
271 { return v4sf{a, v[1], v[2], v[3]}; }
272 force_inline float vextract0(v4sf v) noexcept
273 { return v[0]; }
275 force_inline v4sf unpacklo(v4sf a, v4sf b) noexcept
276 { return v4sf{a[0], b[0], a[1], b[1]}; }
277 force_inline v4sf unpackhi(v4sf a, v4sf b) noexcept
278 { return v4sf{a[2], b[2], a[3], b[3]}; }
280 force_inline void interleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
282 out1 = unpacklo(in1, in2);
283 out2 = unpackhi(in1, in2);
285 force_inline void uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
287 out1 = v4sf{in1[0], in1[2], in2[0], in2[2]};
288 out2 = v4sf{in1[1], in1[3], in2[1], in2[3]};
291 force_inline void vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept
293 v4sf tmp0{unpacklo(x0, x1)};
294 v4sf tmp2{unpacklo(x2, x3)};
295 v4sf tmp1{unpackhi(x0, x1)};
296 v4sf tmp3{unpackhi(x2, x3)};
297 x0 = v4sf{tmp0[0], tmp0[1], tmp2[0], tmp2[1]};
298 x1 = v4sf{tmp0[2], tmp0[3], tmp2[2], tmp2[3]};
299 x2 = v4sf{tmp1[0], tmp1[1], tmp3[0], tmp3[1]};
300 x3 = v4sf{tmp1[2], tmp1[3], tmp3[2], tmp3[3]};
303 force_inline v4sf vswaphl(v4sf a, v4sf b) noexcept
304 { return v4sf{b[0], b[1], a[2], a[3]}; }
306 #else
308 #warning "building with simd disabled !\n";
309 #define PFFFT_SIMD_DISABLE // fallback to scalar code
310 #endif
312 #endif /* PFFFT_SIMD_DISABLE */
314 // fallback mode for situations where SIMD is not available, use scalar mode instead
315 #ifdef PFFFT_SIMD_DISABLE
316 using v4sf = float;
317 constexpr uint SimdSize{1};
318 force_inline constexpr v4sf vmul(v4sf a, v4sf b) noexcept { return a * b; }
319 force_inline constexpr v4sf vadd(v4sf a, v4sf b) noexcept { return a + b; }
320 force_inline constexpr v4sf vmadd(v4sf a, v4sf b, v4sf c) noexcept { return a*b + c; }
321 force_inline constexpr v4sf vsub(v4sf a, v4sf b) noexcept { return a - b; }
322 force_inline constexpr v4sf ld_ps1(float a) noexcept { return a; }
324 #else
326 [[maybe_unused]] inline
327 auto valigned(const float *ptr) noexcept -> bool
329 static constexpr uintptr_t alignmask{SimdSize*sizeof(float) - 1};
330 return (reinterpret_cast<uintptr_t>(ptr) & alignmask) == 0;
332 #endif
334 // shortcuts for complex multiplications
335 force_inline void vcplxmul(v4sf &ar, v4sf &ai, v4sf br, v4sf bi) noexcept
337 v4sf tmp{vmul(ar, bi)};
338 ar = vsub(vmul(ar, br), vmul(ai, bi));
339 ai = vmadd(ai, br, tmp);
341 force_inline void vcplxmulconj(v4sf &ar, v4sf &ai, v4sf br, v4sf bi) noexcept
343 v4sf tmp{vmul(ar, bi)};
344 ar = vmadd(ai, bi, vmul(ar, br));
345 ai = vsub(vmul(ai, br), tmp);
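/* Spelled out, these are the usual complex products applied per SIMD lane,
 * with the right-hand sides using the original value of ar:
 *   vcplxmul:     (ar,ai) *= (br,bi)  =>  ar = ar*br - ai*bi, ai = ai*br + ar*bi
 *   vcplxmulconj: (ar,ai) *= (br,-bi) =>  ar = ar*br + ai*bi, ai = ai*br - ar*bi
 */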
348 #if !defined(PFFFT_SIMD_DISABLE)
350 inline void assertv4(const al::span<const float,4> v_f [[maybe_unused]],
351 const float f0 [[maybe_unused]], const float f1 [[maybe_unused]],
352 const float f2 [[maybe_unused]], const float f3 [[maybe_unused]])
353 { assert(v_f[0] == f0 && v_f[1] == f1 && v_f[2] == f2 && v_f[3] == f3); }
355 template<typename T, T ...N>
356 constexpr auto make_float_array(std::integer_sequence<T,N...>)
357 { return std::array{static_cast<float>(N)...}; }
359 /* detect bugs with the vector support macros */
360 [[maybe_unused]] void validate_pffft_simd()
362 using float4 = std::array<float,4>;
363 static constexpr auto f = make_float_array(std::make_index_sequence<16>{});
365 v4sf a0_v{vset4(f[ 0], f[ 1], f[ 2], f[ 3])};
366 v4sf a1_v{vset4(f[ 4], f[ 5], f[ 6], f[ 7])};
367 v4sf a2_v{vset4(f[ 8], f[ 9], f[10], f[11])};
368 v4sf a3_v{vset4(f[12], f[13], f[14], f[15])};
369 v4sf u_v{};
371 auto t_v = vzero();
372 auto t_f = al::bit_cast<float4>(t_v);
373 printf("VZERO=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]);
374 assertv4(t_f, 0, 0, 0, 0);
376 t_v = vadd(a1_v, a2_v);
377 t_f = al::bit_cast<float4>(t_v);
378 printf("VADD(4:7,8:11)=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]);
379 assertv4(t_f, 12, 14, 16, 18);
381 t_v = vmul(a1_v, a2_v);
382 t_f = al::bit_cast<float4>(t_v);
383 printf("VMUL(4:7,8:11)=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]);
384 assertv4(t_f, 32, 45, 60, 77);
386 t_v = vmadd(a1_v, a2_v, a0_v);
387 t_f = al::bit_cast<float4>(t_v);
388 printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]);
389 assertv4(t_f, 32, 46, 62, 80);
391 interleave2(a1_v, a2_v, t_v, u_v);
392 t_f = al::bit_cast<float4>(t_v);
393 auto u_f = al::bit_cast<float4>(u_v);
394 printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n",
395 t_f[0], t_f[1], t_f[2], t_f[3], u_f[0], u_f[1], u_f[2], u_f[3]);
396 assertv4(t_f, 4, 8, 5, 9);
397 assertv4(u_f, 6, 10, 7, 11);
399 uninterleave2(a1_v, a2_v, t_v, u_v);
400 t_f = al::bit_cast<float4>(t_v);
401 u_f = al::bit_cast<float4>(u_v);
402 printf("UNINTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n",
403 t_f[0], t_f[1], t_f[2], t_f[3], u_f[0], u_f[1], u_f[2], u_f[3]);
404 assertv4(t_f, 4, 6, 8, 10);
405 assertv4(u_f, 5, 7, 9, 11);
407 t_v = ld_ps1(f[15]);
408 t_f = al::bit_cast<float4>(t_v);
409 printf("LD_PS1(15)=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]);
410 assertv4(t_f, 15, 15, 15, 15);
412 t_v = vswaphl(a1_v, a2_v);
413 t_f = al::bit_cast<float4>(t_v);
414 printf("VSWAPHL(4:7,8:11)=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]);
415 assertv4(t_f, 8, 9, 6, 7);
417 vtranspose4(a0_v, a1_v, a2_v, a3_v);
418 auto a0_f = al::bit_cast<float4>(a0_v);
419 auto a1_f = al::bit_cast<float4>(a1_v);
420 auto a2_f = al::bit_cast<float4>(a2_v);
421 auto a3_f = al::bit_cast<float4>(a3_v);
422 printf("VTRANSPOSE4(0:3,4:7,8:11,12:15)=[%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g]\n",
423 a0_f[0], a0_f[1], a0_f[2], a0_f[3], a1_f[0], a1_f[1], a1_f[2], a1_f[3],
424 a2_f[0], a2_f[1], a2_f[2], a2_f[3], a3_f[0], a3_f[1], a3_f[2], a3_f[3]);
425 assertv4(a0_f, 0, 4, 8, 12);
426 assertv4(a1_f, 1, 5, 9, 13);
427 assertv4(a2_f, 2, 6, 10, 14);
428 assertv4(a3_f, 3, 7, 11, 15);
430 #endif //!PFFFT_SIMD_DISABLE
432 /* SSE and co like 16-byte aligned pointers */
433 /* with a 64-byte alignment, we are even aligned on L2 cache lines... */
434 constexpr auto V4sfAlignment = size_t(64);
435 constexpr auto V4sfAlignVal = std::align_val_t(V4sfAlignment);
437 /* NOLINTBEGIN(cppcoreguidelines-pro-bounds-pointer-arithmetic)
438 * FIXME: Converting this from raw pointers to spans or something will probably
439 * need significant work to maintain performance, given non-sequential range-
440 * checked accesses and lack of 'restrict' to indicate non-aliased memory. At
441 * least, some tests should be done to check the impact of using range-checked
442 * spans here before blindly switching.
445 passf2 and passb2 have been merged here, fsign = -1 for passf2, +1 for passb2
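/* In radix-2 terms, each step below is the butterfly out0 = a + b and
 * out1 = (a - b)*w on interleaved (re,im) pairs, where w is the twiddle
 * factor from wa1 with its imaginary part scaled by fsign (so the forward
 * transform uses the conjugate); the ido <= 2 branch skips the twiddle
 * multiply entirely.
 */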
447 NOINLINE void passf2_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *RESTRICT ch,
448 const float *const wa1, const float fsign)
450 const size_t l1ido{l1*ido};
451 if(ido <= 2)
453 for(size_t k{0};k < l1ido;k += ido, ch += ido, cc += 2*ido)
455 ch[0] = vadd(cc[0], cc[ido+0]);
456 ch[l1ido] = vsub(cc[0], cc[ido+0]);
457 ch[1] = vadd(cc[1], cc[ido+1]);
458 ch[l1ido + 1] = vsub(cc[1], cc[ido+1]);
461 else
463 for(size_t k{0};k < l1ido;k += ido, ch += ido, cc += 2*ido)
465 for(size_t i{0};i < ido-1;i += 2)
467 v4sf tr2{vsub(cc[i+0], cc[i+ido+0])};
468 v4sf ti2{vsub(cc[i+1], cc[i+ido+1])};
469 v4sf wr{ld_ps1(wa1[i])}, wi{ld_ps1(wa1[i+1]*fsign)};
470 ch[i] = vadd(cc[i+0], cc[i+ido+0]);
471 ch[i+1] = vadd(cc[i+1], cc[i+ido+1]);
472 vcplxmul(tr2, ti2, wr, wi);
473 ch[i+l1ido] = tr2;
474 ch[i+l1ido+1] = ti2;
481 passf3 and passb3 have been merged here, fsign = -1 for passf3, +1 for passb3
483 NOINLINE void passf3_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *RESTRICT ch,
484 const float *const wa1, const float fsign)
486 assert(ido > 2);
488 const v4sf taur{ld_ps1(-0.5f)};
489 const v4sf taui{ld_ps1(0.866025403784439f*fsign)};
490 const size_t l1ido{l1*ido};
491 const auto wa2 = wa1 + ido;
492 for(size_t k{0};k < l1ido;k += ido, cc += 3*ido, ch +=ido)
494 for(size_t i{0};i < ido-1;i += 2)
496 v4sf tr2{vadd(cc[i+ido], cc[i+2*ido])};
497 v4sf cr2{vmadd(taur, tr2, cc[i])};
498 ch[i] = vadd(tr2, cc[i]);
499 v4sf ti2{vadd(cc[i+ido+1], cc[i+2*ido+1])};
500 v4sf ci2{vmadd(taur, ti2, cc[i+1])};
501 ch[i+1] = vadd(cc[i+1], ti2);
502 v4sf cr3{vmul(taui, vsub(cc[i+ido], cc[i+2*ido]))};
503 v4sf ci3{vmul(taui, vsub(cc[i+ido+1], cc[i+2*ido+1]))};
504 v4sf dr2{vsub(cr2, ci3)};
505 v4sf dr3{vadd(cr2, ci3)};
506 v4sf di2{vadd(ci2, cr3)};
507 v4sf di3{vsub(ci2, cr3)};
508 float wr1{wa1[i]}, wi1{fsign*wa1[i+1]}, wr2{wa2[i]}, wi2{fsign*wa2[i+1]};
509 vcplxmul(dr2, di2, ld_ps1(wr1), ld_ps1(wi1));
510 ch[i+l1ido] = dr2;
511 ch[i+l1ido + 1] = di2;
512 vcplxmul(dr3, di3, ld_ps1(wr2), ld_ps1(wi2));
513 ch[i+2*l1ido] = dr3;
514 ch[i+2*l1ido+1] = di3;
517 } /* passf3 */
519 NOINLINE void passf4_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *RESTRICT ch,
520 const float *const wa1, const float fsign)
522 /* fsign == -1 for forward transform and +1 for backward transform */
523 const v4sf vsign{ld_ps1(fsign)};
524 const size_t l1ido{l1*ido};
525 if(ido == 2)
527 for(size_t k{0};k < l1ido;k += ido, ch += ido, cc += 4*ido)
529 v4sf tr1{vsub(cc[0], cc[2*ido + 0])};
530 v4sf tr2{vadd(cc[0], cc[2*ido + 0])};
531 v4sf ti1{vsub(cc[1], cc[2*ido + 1])};
532 v4sf ti2{vadd(cc[1], cc[2*ido + 1])};
533 v4sf ti4{vmul(vsub(cc[1*ido + 0], cc[3*ido + 0]), vsign)};
534 v4sf tr4{vmul(vsub(cc[3*ido + 1], cc[1*ido + 1]), vsign)};
535 v4sf tr3{vadd(cc[ido + 0], cc[3*ido + 0])};
536 v4sf ti3{vadd(cc[ido + 1], cc[3*ido + 1])};
538 ch[0*l1ido + 0] = vadd(tr2, tr3);
539 ch[0*l1ido + 1] = vadd(ti2, ti3);
540 ch[1*l1ido + 0] = vadd(tr1, tr4);
541 ch[1*l1ido + 1] = vadd(ti1, ti4);
542 ch[2*l1ido + 0] = vsub(tr2, tr3);
543 ch[2*l1ido + 1] = vsub(ti2, ti3);
544 ch[3*l1ido + 0] = vsub(tr1, tr4);
545 ch[3*l1ido + 1] = vsub(ti1, ti4);
548 else
550 const auto wa2 = wa1 + ido;
551 const auto wa3 = wa2 + ido;
552 for(size_t k{0};k < l1ido;k += ido, ch+=ido, cc += 4*ido)
554 for(size_t i{0};i < ido-1;i+=2)
556 v4sf tr1{vsub(cc[i + 0], cc[i + 2*ido + 0])};
557 v4sf tr2{vadd(cc[i + 0], cc[i + 2*ido + 0])};
558 v4sf ti1{vsub(cc[i + 1], cc[i + 2*ido + 1])};
559 v4sf ti2{vadd(cc[i + 1], cc[i + 2*ido + 1])};
560 v4sf tr4{vmul(vsub(cc[i + 3*ido + 1], cc[i + 1*ido + 1]), vsign)};
561 v4sf ti4{vmul(vsub(cc[i + 1*ido + 0], cc[i + 3*ido + 0]), vsign)};
562 v4sf tr3{vadd(cc[i + ido + 0], cc[i + 3*ido + 0])};
563 v4sf ti3{vadd(cc[i + ido + 1], cc[i + 3*ido + 1])};
565 ch[i] = vadd(tr2, tr3);
566 v4sf cr3{vsub(tr2, tr3)};
567 ch[i + 1] = vadd(ti2, ti3);
568 v4sf ci3{vsub(ti2, ti3)};
570 v4sf cr2{vadd(tr1, tr4)};
571 v4sf cr4{vsub(tr1, tr4)};
572 v4sf ci2{vadd(ti1, ti4)};
573 v4sf ci4{vsub(ti1, ti4)};
574 float wr1{wa1[i]}, wi1{fsign*wa1[i+1]};
575 vcplxmul(cr2, ci2, ld_ps1(wr1), ld_ps1(wi1));
576 float wr2{wa2[i]}, wi2{fsign*wa2[i+1]};
577 ch[i + l1ido] = cr2;
578 ch[i + l1ido + 1] = ci2;
580 vcplxmul(cr3, ci3, ld_ps1(wr2), ld_ps1(wi2));
581 float wr3{wa3[i]}, wi3{fsign*wa3[i+1]};
582 ch[i + 2*l1ido] = cr3;
583 ch[i + 2*l1ido + 1] = ci3;
585 vcplxmul(cr4, ci4, ld_ps1(wr3), ld_ps1(wi3));
586 ch[i + 3*l1ido] = cr4;
587 ch[i + 3*l1ido + 1] = ci4;
591 } /* passf4 */
594 * passf5 and passb5 have been merged here, fsign = -1 for passf5, +1 for passb5
596 NOINLINE void passf5_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *RESTRICT ch,
597 const float *const wa1, const float fsign)
599 const v4sf tr11{ld_ps1(0.309016994374947f)};
600 const v4sf tr12{ld_ps1(-0.809016994374947f)};
601 const v4sf ti11{ld_ps1(0.951056516295154f*fsign)};
602 const v4sf ti12{ld_ps1(0.587785252292473f*fsign)};
604 auto cc_ref = [&cc,ido](size_t a_1, size_t a_2) noexcept -> auto&
605 { return cc[(a_2-1)*ido + a_1 + 1]; };
606 auto ch_ref = [&ch,ido,l1](size_t a_1, size_t a_3) noexcept -> auto&
607 { return ch[(a_3-1)*l1*ido + a_1 + 1]; };
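/* These helpers reproduce the Fortran-style two-dimensional indexing of the
 * original FFTPACK code (leading dimension ido, 1-based second index), so the
 * loop below can stay close to the reference implementation.
 */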
609 assert(ido > 2);
611 const auto wa2 = wa1 + ido;
612 const auto wa3 = wa2 + ido;
613 const auto wa4 = wa3 + ido;
614 for(size_t k{0};k < l1;++k, cc += 5*ido, ch += ido)
616 for(size_t i{0};i < ido-1;i += 2)
618 v4sf ti5{vsub(cc_ref(i , 2), cc_ref(i , 5))};
619 v4sf ti2{vadd(cc_ref(i , 2), cc_ref(i , 5))};
620 v4sf ti4{vsub(cc_ref(i , 3), cc_ref(i , 4))};
621 v4sf ti3{vadd(cc_ref(i , 3), cc_ref(i , 4))};
622 v4sf tr5{vsub(cc_ref(i-1, 2), cc_ref(i-1, 5))};
623 v4sf tr2{vadd(cc_ref(i-1, 2), cc_ref(i-1, 5))};
624 v4sf tr4{vsub(cc_ref(i-1, 3), cc_ref(i-1, 4))};
625 v4sf tr3{vadd(cc_ref(i-1, 3), cc_ref(i-1, 4))};
626 ch_ref(i-1, 1) = vadd(cc_ref(i-1, 1), vadd(tr2, tr3));
627 ch_ref(i , 1) = vadd(cc_ref(i , 1), vadd(ti2, ti3));
628 v4sf cr2{vadd(cc_ref(i-1, 1), vmadd(tr11, tr2, vmul(tr12, tr3)))};
629 v4sf ci2{vadd(cc_ref(i , 1), vmadd(tr11, ti2, vmul(tr12, ti3)))};
630 v4sf cr3{vadd(cc_ref(i-1, 1), vmadd(tr12, tr2, vmul(tr11, tr3)))};
631 v4sf ci3{vadd(cc_ref(i , 1), vmadd(tr12, ti2, vmul(tr11, ti3)))};
632 v4sf cr5{vmadd(ti11, tr5, vmul(ti12, tr4))};
633 v4sf ci5{vmadd(ti11, ti5, vmul(ti12, ti4))};
634 v4sf cr4{vsub(vmul(ti12, tr5), vmul(ti11, tr4))};
635 v4sf ci4{vsub(vmul(ti12, ti5), vmul(ti11, ti4))};
636 v4sf dr3{vsub(cr3, ci4)};
637 v4sf dr4{vadd(cr3, ci4)};
638 v4sf di3{vadd(ci3, cr4)};
639 v4sf di4{vsub(ci3, cr4)};
640 v4sf dr5{vadd(cr2, ci5)};
641 v4sf dr2{vsub(cr2, ci5)};
642 v4sf di5{vsub(ci2, cr5)};
643 v4sf di2{vadd(ci2, cr5)};
644 float wr1{wa1[i]}, wi1{fsign*wa1[i+1]}, wr2{wa2[i]}, wi2{fsign*wa2[i+1]};
645 float wr3{wa3[i]}, wi3{fsign*wa3[i+1]}, wr4{wa4[i]}, wi4{fsign*wa4[i+1]};
646 vcplxmul(dr2, di2, ld_ps1(wr1), ld_ps1(wi1));
647 ch_ref(i - 1, 2) = dr2;
648 ch_ref(i, 2) = di2;
649 vcplxmul(dr3, di3, ld_ps1(wr2), ld_ps1(wi2));
650 ch_ref(i - 1, 3) = dr3;
651 ch_ref(i, 3) = di3;
652 vcplxmul(dr4, di4, ld_ps1(wr3), ld_ps1(wi3));
653 ch_ref(i - 1, 4) = dr4;
654 ch_ref(i, 4) = di4;
655 vcplxmul(dr5, di5, ld_ps1(wr4), ld_ps1(wi4));
656 ch_ref(i - 1, 5) = dr5;
657 ch_ref(i, 5) = di5;
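/* The radf*_ps/radb*_ps functions below are the FFTPACK RADFx/RADBx radix
 * passes for the real-input transform; they work on FFTPACK's packed
 * half-complex spectrum order (f0r f1r f1i ... f(n)r) rather than on fully
 * interleaved complex data.
 */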
662 NOINLINE void radf2_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc,
663 v4sf *RESTRICT ch, const float *const wa1)
665 const size_t l1ido{l1*ido};
666 for(size_t k{0};k < l1ido;k += ido)
668 v4sf a{cc[k]}, b{cc[k + l1ido]};
669 ch[2*k] = vadd(a, b);
670 ch[2*(k+ido)-1] = vsub(a, b);
672 if(ido < 2)
673 return;
674 if(ido != 2)
676 for(size_t k{0};k < l1ido;k += ido)
678 for(size_t i{2};i < ido;i += 2)
680 v4sf tr2{cc[i - 1 + k + l1ido]}, ti2{cc[i + k + l1ido]};
681 v4sf br{cc[i - 1 + k]}, bi{cc[i + k]};
682 vcplxmulconj(tr2, ti2, ld_ps1(wa1[i - 2]), ld_ps1(wa1[i - 1]));
683 ch[i + 2*k] = vadd(bi, ti2);
684 ch[2*(k+ido) - i] = vsub(ti2, bi);
685 ch[i - 1 + 2*k] = vadd(br, tr2);
686 ch[2*(k+ido) - i -1] = vsub(br, tr2);
689 if((ido&1) == 1)
690 return;
692 const v4sf minus_one{ld_ps1(-1.0f)};
693 for(size_t k{0};k < l1ido;k += ido)
695 ch[2*k + ido] = vmul(minus_one, cc[ido-1 + k + l1ido]);
696 ch[2*k + ido-1] = cc[k + ido-1];
698 } /* radf2 */
701 NOINLINE void radb2_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *RESTRICT ch,
702 const float *const wa1)
704 const size_t l1ido{l1*ido};
705 for(size_t k{0};k < l1ido;k += ido)
707 v4sf a{cc[2*k]};
708 v4sf b{cc[2*(k+ido) - 1]};
709 ch[k] = vadd(a, b);
710 ch[k + l1ido] = vsub(a, b);
712 if(ido < 2)
713 return;
714 if(ido != 2)
716 for(size_t k{0};k < l1ido;k += ido)
718 for(size_t i{2};i < ido;i += 2)
720 v4sf a{cc[i-1 + 2*k]};
721 v4sf b{cc[2*(k + ido) - i - 1]};
722 v4sf c{cc[i+0 + 2*k]};
723 v4sf d{cc[2*(k + ido) - i + 0]};
724 ch[i-1 + k] = vadd(a, b);
725 v4sf tr2{vsub(a, b)};
726 ch[i+0 + k] = vsub(c, d);
727 v4sf ti2{vadd(c, d)};
728 vcplxmul(tr2, ti2, ld_ps1(wa1[i - 2]), ld_ps1(wa1[i - 1]));
729 ch[i-1 + k + l1ido] = tr2;
730 ch[i+0 + k + l1ido] = ti2;
733 if((ido&1) == 1)
734 return;
736 const v4sf minus_two{ld_ps1(-2.0f)};
737 for(size_t k{0};k < l1ido;k += ido)
739 v4sf a{cc[2*k + ido-1]};
740 v4sf b{cc[2*k + ido]};
741 ch[k + ido-1] = vadd(a,a);
742 ch[k + ido-1 + l1ido] = vmul(minus_two, b);
744 } /* radb2 */
746 void radf3_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
747 const float *const wa1)
749 const v4sf taur{ld_ps1(-0.5f)};
750 const v4sf taui{ld_ps1(0.866025403784439f)};
751 for(size_t k{0};k < l1;++k)
753 v4sf cr2{vadd(cc[(k + l1)*ido], cc[(k + 2*l1)*ido])};
754 ch[ (3*k )*ido] = vadd(cc[k*ido], cr2);
755 ch[ (3*k + 2)*ido] = vmul(taui, vsub(cc[(k + l1*2)*ido], cc[(k + l1)*ido]));
756 ch[ido-1 + (3*k + 1)*ido] = vmadd(taur, cr2, cc[k*ido]);
758 if(ido == 1)
759 return;
761 const auto wa2 = wa1 + ido;
762 for(size_t k{0};k < l1;++k)
764 for(size_t i{2};i < ido;i += 2)
766 const size_t ic{ido - i};
767 v4sf wr1{ld_ps1(wa1[i - 2])};
768 v4sf wi1{ld_ps1(wa1[i - 1])};
769 v4sf dr2{cc[i - 1 + (k + l1)*ido]};
770 v4sf di2{cc[i + (k + l1)*ido]};
771 vcplxmulconj(dr2, di2, wr1, wi1);
773 v4sf wr2{ld_ps1(wa2[i - 2])};
774 v4sf wi2{ld_ps1(wa2[i - 1])};
775 v4sf dr3{cc[i - 1 + (k + l1*2)*ido]};
776 v4sf di3{cc[i + (k + l1*2)*ido]};
777 vcplxmulconj(dr3, di3, wr2, wi2);
779 v4sf cr2{vadd(dr2, dr3)};
780 v4sf ci2{vadd(di2, di3)};
781 ch[i - 1 + 3*k*ido] = vadd(cc[i - 1 + k*ido], cr2);
782 ch[i + 3*k*ido] = vadd(cc[i + k*ido], ci2);
783 v4sf tr2{vmadd(taur, cr2, cc[i - 1 + k*ido])};
784 v4sf ti2{vmadd(taur, ci2, cc[i + k*ido])};
785 v4sf tr3{vmul(taui, vsub(di2, di3))};
786 v4sf ti3{vmul(taui, vsub(dr3, dr2))};
787 ch[i - 1 + (3*k + 2)*ido] = vadd(tr2, tr3);
788 ch[ic - 1 + (3*k + 1)*ido] = vsub(tr2, tr3);
789 ch[i + (3*k + 2)*ido] = vadd(ti2, ti3);
790 ch[ic + (3*k + 1)*ido] = vsub(ti3, ti2);
793 } /* radf3 */
796 void radb3_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
797 const float *const wa1)
799 static constexpr float taur{-0.5f};
800 static constexpr float taui{0.866025403784439f};
801 static constexpr float taui_2{taui*2.0f};
803 const v4sf vtaur{ld_ps1(taur)};
804 const v4sf vtaui_2{ld_ps1(taui_2)};
805 for(size_t k{0};k < l1;++k)
807 v4sf tr2 = cc[ido-1 + (3*k + 1)*ido];
808 tr2 = vadd(tr2,tr2);
809 v4sf cr2 = vmadd(vtaur, tr2, cc[3*k*ido]);
810 ch[k*ido] = vadd(cc[3*k*ido], tr2);
811 v4sf ci3 = vmul(vtaui_2, cc[(3*k + 2)*ido]);
812 ch[(k + l1)*ido] = vsub(cr2, ci3);
813 ch[(k + 2*l1)*ido] = vadd(cr2, ci3);
815 if(ido == 1)
816 return;
818 const auto wa2 = wa1 + ido;
819 const v4sf vtaui{ld_ps1(taui)};
820 for(size_t k{0};k < l1;++k)
822 for(size_t i{2};i < ido;i += 2)
824 const size_t ic{ido - i};
825 v4sf tr2{vadd(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido])};
826 v4sf cr2{vmadd(vtaur, tr2, cc[i - 1 + 3*k*ido])};
827 ch[i - 1 + k*ido] = vadd(cc[i - 1 + 3*k*ido], tr2);
828 v4sf ti2{vsub(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido])};
829 v4sf ci2{vmadd(vtaur, ti2, cc[i + 3*k*ido])};
830 ch[i + k*ido] = vadd(cc[i + 3*k*ido], ti2);
831 v4sf cr3{vmul(vtaui, vsub(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]))};
832 v4sf ci3{vmul(vtaui, vadd(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]))};
833 v4sf dr2{vsub(cr2, ci3)};
834 v4sf dr3{vadd(cr2, ci3)};
835 v4sf di2{vadd(ci2, cr3)};
836 v4sf di3{vsub(ci2, cr3)};
837 vcplxmul(dr2, di2, ld_ps1(wa1[i-2]), ld_ps1(wa1[i-1]));
838 ch[i - 1 + (k + l1)*ido] = dr2;
839 ch[i + (k + l1)*ido] = di2;
840 vcplxmul(dr3, di3, ld_ps1(wa2[i-2]), ld_ps1(wa2[i-1]));
841 ch[i - 1 + (k + 2*l1)*ido] = dr3;
842 ch[i + (k + 2*l1)*ido] = di3;
845 } /* radb3 */
847 NOINLINE void radf4_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc,
848 v4sf *RESTRICT ch, const float *const wa1)
850 const size_t l1ido{l1*ido};
852 const v4sf *RESTRICT cc_{cc}, *RESTRICT cc_end{cc + l1ido};
853 v4sf *RESTRICT ch_{ch};
854 while(cc != cc_end)
856 // this loop represents between 25% and 40% of total radf4_ps cost !
857 v4sf a0{cc[0]}, a1{cc[l1ido]};
858 v4sf a2{cc[2*l1ido]}, a3{cc[3*l1ido]};
859 v4sf tr1{vadd(a1, a3)};
860 v4sf tr2{vadd(a0, a2)};
861 ch[2*ido-1] = vsub(a0, a2);
862 ch[2*ido ] = vsub(a3, a1);
863 ch[0 ] = vadd(tr1, tr2);
864 ch[4*ido-1] = vsub(tr2, tr1);
865 cc += ido; ch += 4*ido;
867 cc = cc_;
868 ch = ch_;
870 if(ido < 2)
871 return;
872 if(ido != 2)
874 const auto wa2 = wa1 + ido;
875 const auto wa3 = wa2 + ido;
877 for(size_t k{0};k < l1ido;k += ido)
879 const v4sf *RESTRICT pc{cc + 1 + k};
880 for(size_t i{2};i < ido;i += 2, pc += 2)
882 const size_t ic{ido - i};
884 v4sf cr2{pc[1*l1ido+0]};
885 v4sf ci2{pc[1*l1ido+1]};
886 v4sf wr{ld_ps1(wa1[i - 2])};
887 v4sf wi{ld_ps1(wa1[i - 1])};
888 vcplxmulconj(cr2,ci2,wr,wi);
890 v4sf cr3{pc[2*l1ido+0]};
891 v4sf ci3{pc[2*l1ido+1]};
892 wr = ld_ps1(wa2[i-2]);
893 wi = ld_ps1(wa2[i-1]);
894 vcplxmulconj(cr3, ci3, wr, wi);
896 v4sf cr4{pc[3*l1ido]};
897 v4sf ci4{pc[3*l1ido+1]};
898 wr = ld_ps1(wa3[i-2]);
899 wi = ld_ps1(wa3[i-1]);
900 vcplxmulconj(cr4, ci4, wr, wi);
902 /* at this point, on SSE, five of "cr2 cr3 cr4 ci2 ci3 ci4" should be loaded in registers */
904 v4sf tr1{vadd(cr2,cr4)};
905 v4sf tr4{vsub(cr4,cr2)};
906 v4sf tr2{vadd(pc[0],cr3)};
907 v4sf tr3{vsub(pc[0],cr3)};
908 ch[i - 1 + 4*k ] = vadd(tr2,tr1);
909 ch[ic - 1 + 4*k + 3*ido] = vsub(tr2,tr1); // at this point tr1 and tr2 can be disposed
910 v4sf ti1{vadd(ci2,ci4)};
911 v4sf ti4{vsub(ci2,ci4)};
912 ch[i - 1 + 4*k + 2*ido] = vadd(tr3,ti4);
913 ch[ic - 1 + 4*k + 1*ido] = vsub(tr3,ti4); // dispose tr3, ti4
914 v4sf ti2{vadd(pc[1],ci3)};
915 v4sf ti3{vsub(pc[1],ci3)};
916 ch[i + 4*k ] = vadd(ti1, ti2);
917 ch[ic + 4*k + 3*ido] = vsub(ti1, ti2);
918 ch[i + 4*k + 2*ido] = vadd(tr4, ti3);
919 ch[ic + 4*k + 1*ido] = vsub(tr4, ti3);
922 if((ido&1) == 1)
923 return;
925 const v4sf minus_hsqt2{ld_ps1(al::numbers::sqrt2_v<float> * -0.5f)};
926 for(size_t k{0};k < l1ido;k += ido)
928 v4sf a{cc[ido-1 + k + l1ido]}, b{cc[ido-1 + k + 3*l1ido]};
929 v4sf c{cc[ido-1 + k]}, d{cc[ido-1 + k + 2*l1ido]};
930 v4sf ti1{vmul(minus_hsqt2, vadd(b, a))};
931 v4sf tr1{vmul(minus_hsqt2, vsub(b, a))};
932 ch[ido-1 + 4*k ] = vadd(c, tr1);
933 ch[ido-1 + 4*k + 2*ido] = vsub(c, tr1);
934 ch[ 4*k + 1*ido] = vsub(ti1, d);
935 ch[ 4*k + 3*ido] = vadd(ti1, d);
937 } /* radf4 */
940 NOINLINE void radb4_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc,
941 v4sf *RESTRICT ch, const float *const wa1)
943 const v4sf two{ld_ps1(2.0f)};
944 const size_t l1ido{l1*ido};
946 const v4sf *RESTRICT cc_{cc}, *RESTRICT ch_end{ch + l1ido};
947 v4sf *ch_{ch};
948 while(ch != ch_end)
950 v4sf a{cc[0]}, b{cc[4*ido-1]};
951 v4sf c{cc[2*ido]}, d{cc[2*ido-1]};
952 v4sf tr3{vmul(two,d)};
953 v4sf tr2{vadd(a,b)};
954 v4sf tr1{vsub(a,b)};
955 v4sf tr4{vmul(two,c)};
956 ch[0*l1ido] = vadd(tr2, tr3);
957 ch[2*l1ido] = vsub(tr2, tr3);
958 ch[1*l1ido] = vsub(tr1, tr4);
959 ch[3*l1ido] = vadd(tr1, tr4);
961 cc += 4*ido; ch += ido;
963 cc = cc_; ch = ch_;
965 if(ido < 2)
966 return;
967 if(ido != 2)
969 const auto wa2 = wa1 + ido;
970 const auto wa3 = wa2 + ido;
972 for(size_t k{0};k < l1ido;k += ido)
974 const v4sf *RESTRICT pc{cc - 1 + 4*k};
975 v4sf *RESTRICT ph{ch + k + 1};
976 for(size_t i{2};i < ido;i += 2)
978 v4sf tr1{vsub(pc[ i], pc[4*ido - i])};
979 v4sf tr2{vadd(pc[ i], pc[4*ido - i])};
980 v4sf ti4{vsub(pc[2*ido + i], pc[2*ido - i])};
981 v4sf tr3{vadd(pc[2*ido + i], pc[2*ido - i])};
982 ph[0] = vadd(tr2, tr3);
983 v4sf cr3{vsub(tr2, tr3)};
985 v4sf ti3{vsub(pc[2*ido + i + 1], pc[2*ido - i + 1])};
986 v4sf tr4{vadd(pc[2*ido + i + 1], pc[2*ido - i + 1])};
987 v4sf cr2{vsub(tr1, tr4)};
988 v4sf cr4{vadd(tr1, tr4)};
990 v4sf ti1{vadd(pc[i + 1], pc[4*ido - i + 1])};
991 v4sf ti2{vsub(pc[i + 1], pc[4*ido - i + 1])};
993 ph[1] = vadd(ti2, ti3); ph += l1ido;
994 v4sf ci3{vsub(ti2, ti3)};
995 v4sf ci2{vadd(ti1, ti4)};
996 v4sf ci4{vsub(ti1, ti4)};
997 vcplxmul(cr2, ci2, ld_ps1(wa1[i-2]), ld_ps1(wa1[i-1]));
998 ph[0] = cr2;
999 ph[1] = ci2; ph += l1ido;
1000 vcplxmul(cr3, ci3, ld_ps1(wa2[i-2]), ld_ps1(wa2[i-1]));
1001 ph[0] = cr3;
1002 ph[1] = ci3; ph += l1ido;
1003 vcplxmul(cr4, ci4, ld_ps1(wa3[i-2]), ld_ps1(wa3[i-1]));
1004 ph[0] = cr4;
1005 ph[1] = ci4; ph = ph - 3*l1ido + 2;
1008 if((ido&1) == 1)
1009 return;
1011 const v4sf minus_sqrt2{ld_ps1(-1.414213562373095f)};
1012 for(size_t k{0};k < l1ido;k += ido)
1014 const size_t i0{4*k + ido};
1015 v4sf c{cc[i0-1]}, d{cc[i0 + 2*ido-1]};
1016 v4sf a{cc[i0+0]}, b{cc[i0 + 2*ido+0]};
1017 v4sf tr1{vsub(c,d)};
1018 v4sf tr2{vadd(c,d)};
1019 v4sf ti1{vadd(b,a)};
1020 v4sf ti2{vsub(b,a)};
1021 ch[ido-1 + k + 0*l1ido] = vadd(tr2,tr2);
1022 ch[ido-1 + k + 1*l1ido] = vmul(minus_sqrt2, vsub(ti1, tr1));
1023 ch[ido-1 + k + 2*l1ido] = vadd(ti2, ti2);
1024 ch[ido-1 + k + 3*l1ido] = vmul(minus_sqrt2, vadd(ti1, tr1));
1026 } /* radb4 */
1028 void radf5_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
1029 const float *const wa1)
1031 const v4sf tr11{ld_ps1(0.309016994374947f)};
1032 const v4sf ti11{ld_ps1(0.951056516295154f)};
1033 const v4sf tr12{ld_ps1(-0.809016994374947f)};
1034 const v4sf ti12{ld_ps1(0.587785252292473f)};
1036 auto cc_ref = [&cc,l1,ido](size_t a_1, size_t a_2, size_t a_3) noexcept -> auto&
1037 { return cc[(a_3*l1 + a_2)*ido + a_1]; };
1038 auto ch_ref = [&ch,ido](size_t a_1, size_t a_2, size_t a_3) noexcept -> auto&
1039 { return ch[(a_3*5 + a_2)*ido + a_1]; };
1041 /* Parameter adjustments */
1042 ch -= 1 + ido * 6;
1043 cc -= 1 + ido * (1 + l1);
1045 const auto wa2 = wa1 + ido;
1046 const auto wa3 = wa2 + ido;
1047 const auto wa4 = wa3 + ido;
1049 /* Function Body */
1050 for(size_t k{1};k <= l1;++k)
1052 v4sf cr2{vadd(cc_ref(1, k, 5), cc_ref(1, k, 2))};
1053 v4sf ci5{vsub(cc_ref(1, k, 5), cc_ref(1, k, 2))};
1054 v4sf cr3{vadd(cc_ref(1, k, 4), cc_ref(1, k, 3))};
1055 v4sf ci4{vsub(cc_ref(1, k, 4), cc_ref(1, k, 3))};
1056 ch_ref(1, 1, k) = vadd(cc_ref(1, k, 1), vadd(cr2, cr3));
1057 ch_ref(ido, 2, k) = vadd(cc_ref(1, k, 1), vmadd(tr11, cr2, vmul(tr12, cr3)));
1058 ch_ref(1, 3, k) = vmadd(ti11, ci5, vmul(ti12, ci4));
1059 ch_ref(ido, 4, k) = vadd(cc_ref(1, k, 1), vmadd(tr12, cr2, vmul(tr11, cr3)));
1060 ch_ref(1, 5, k) = vsub(vmul(ti12, ci5), vmul(ti11, ci4));
1061 //printf("pffft: radf5, k=%d ch_ref=%f, ci4=%f\n", k, ch_ref(1, 5, k), ci4);
1063 if(ido == 1)
1064 return;
1066 const size_t idp2{ido + 2};
1067 for(size_t k{1};k <= l1;++k)
1069 for(size_t i{3};i <= ido;i += 2)
1071 const size_t ic{idp2 - i};
1072 v4sf dr2{ld_ps1(wa1[i-3])};
1073 v4sf di2{ld_ps1(wa1[i-2])};
1074 v4sf dr3{ld_ps1(wa2[i-3])};
1075 v4sf di3{ld_ps1(wa2[i-2])};
1076 v4sf dr4{ld_ps1(wa3[i-3])};
1077 v4sf di4{ld_ps1(wa3[i-2])};
1078 v4sf dr5{ld_ps1(wa4[i-3])};
1079 v4sf di5{ld_ps1(wa4[i-2])};
1080 vcplxmulconj(dr2, di2, cc_ref(i-1, k, 2), cc_ref(i, k, 2));
1081 vcplxmulconj(dr3, di3, cc_ref(i-1, k, 3), cc_ref(i, k, 3));
1082 vcplxmulconj(dr4, di4, cc_ref(i-1, k, 4), cc_ref(i, k, 4));
1083 vcplxmulconj(dr5, di5, cc_ref(i-1, k, 5), cc_ref(i, k, 5));
1084 v4sf cr2{vadd(dr2, dr5)};
1085 v4sf ci5{vsub(dr5, dr2)};
1086 v4sf cr5{vsub(di2, di5)};
1087 v4sf ci2{vadd(di2, di5)};
1088 v4sf cr3{vadd(dr3, dr4)};
1089 v4sf ci4{vsub(dr4, dr3)};
1090 v4sf cr4{vsub(di3, di4)};
1091 v4sf ci3{vadd(di3, di4)};
1092 ch_ref(i - 1, 1, k) = vadd(cc_ref(i - 1, k, 1), vadd(cr2, cr3));
1093 ch_ref(i, 1, k) = vsub(cc_ref(i, k, 1), vadd(ci2, ci3));
1094 v4sf tr2{vadd(cc_ref(i - 1, k, 1), vmadd(tr11, cr2, vmul(tr12, cr3)))};
1095 v4sf ti2{vsub(cc_ref(i, k, 1), vmadd(tr11, ci2, vmul(tr12, ci3)))};
1096 v4sf tr3{vadd(cc_ref(i - 1, k, 1), vmadd(tr12, cr2, vmul(tr11, cr3)))};
1097 v4sf ti3{vsub(cc_ref(i, k, 1), vmadd(tr12, ci2, vmul(tr11, ci3)))};
1098 v4sf tr5{vmadd(ti11, cr5, vmul(ti12, cr4))};
1099 v4sf ti5{vmadd(ti11, ci5, vmul(ti12, ci4))};
1100 v4sf tr4{vsub(vmul(ti12, cr5), vmul(ti11, cr4))};
1101 v4sf ti4{vsub(vmul(ti12, ci5), vmul(ti11, ci4))};
1102 ch_ref(i - 1, 3, k) = vsub(tr2, tr5);
1103 ch_ref(ic - 1, 2, k) = vadd(tr2, tr5);
1104 ch_ref(i , 3, k) = vadd(ti5, ti2);
1105 ch_ref(ic , 2, k) = vsub(ti5, ti2);
1106 ch_ref(i - 1, 5, k) = vsub(tr3, tr4);
1107 ch_ref(ic - 1, 4, k) = vadd(tr3, tr4);
1108 ch_ref(i , 5, k) = vadd(ti4, ti3);
1109 ch_ref(ic , 4, k) = vsub(ti4, ti3);
1112 } /* radf5 */
1114 void radb5_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
1115 const float *const wa1)
1117 const v4sf tr11{ld_ps1(0.309016994374947f)};
1118 const v4sf ti11{ld_ps1(0.951056516295154f)};
1119 const v4sf tr12{ld_ps1(-0.809016994374947f)};
1120 const v4sf ti12{ld_ps1(0.587785252292473f)};
1122 auto cc_ref = [&cc,ido](size_t a_1, size_t a_2, size_t a_3) noexcept -> auto&
1123 { return cc[(a_3*5 + a_2)*ido + a_1]; };
1124 auto ch_ref = [&ch,ido,l1](size_t a_1, size_t a_2, size_t a_3) noexcept -> auto&
1125 { return ch[(a_3*l1 + a_2)*ido + a_1]; };
1127 /* Parameter adjustments */
1128 ch -= 1 + ido*(1 + l1);
1129 cc -= 1 + ido*6;
1131 const auto wa2 = wa1 + ido;
1132 const auto wa3 = wa2 + ido;
1133 const auto wa4 = wa3 + ido;
1135 /* Function Body */
1136 for(size_t k{1};k <= l1;++k)
1138 v4sf ti5{vadd(cc_ref( 1, 3, k), cc_ref(1, 3, k))};
1139 v4sf ti4{vadd(cc_ref( 1, 5, k), cc_ref(1, 5, k))};
1140 v4sf tr2{vadd(cc_ref(ido, 2, k), cc_ref(ido, 2, k))};
1141 v4sf tr3{vadd(cc_ref(ido, 4, k), cc_ref(ido, 4, k))};
1142 ch_ref(1, k, 1) = vadd(cc_ref(1, 1, k), vadd(tr2, tr3));
1143 v4sf cr2{vadd(cc_ref(1, 1, k), vmadd(tr11, tr2, vmul(tr12, tr3)))};
1144 v4sf cr3{vadd(cc_ref(1, 1, k), vmadd(tr12, tr2, vmul(tr11, tr3)))};
1145 v4sf ci5{vmadd(ti11, ti5, vmul(ti12, ti4))};
1146 v4sf ci4{vsub(vmul(ti12, ti5), vmul(ti11, ti4))};
1147 ch_ref(1, k, 2) = vsub(cr2, ci5);
1148 ch_ref(1, k, 3) = vsub(cr3, ci4);
1149 ch_ref(1, k, 4) = vadd(cr3, ci4);
1150 ch_ref(1, k, 5) = vadd(cr2, ci5);
1152 if(ido == 1)
1153 return;
1155 const size_t idp2{ido + 2};
1156 for(size_t k{1};k <= l1;++k)
1158 for(size_t i{3};i <= ido;i += 2)
1160 const size_t ic{idp2 - i};
1161 v4sf ti5{vadd(cc_ref(i , 3, k), cc_ref(ic , 2, k))};
1162 v4sf ti2{vsub(cc_ref(i , 3, k), cc_ref(ic , 2, k))};
1163 v4sf ti4{vadd(cc_ref(i , 5, k), cc_ref(ic , 4, k))};
1164 v4sf ti3{vsub(cc_ref(i , 5, k), cc_ref(ic , 4, k))};
1165 v4sf tr5{vsub(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k))};
1166 v4sf tr2{vadd(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k))};
1167 v4sf tr4{vsub(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k))};
1168 v4sf tr3{vadd(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k))};
1169 ch_ref(i - 1, k, 1) = vadd(cc_ref(i-1, 1, k), vadd(tr2, tr3));
1170 ch_ref(i , k, 1) = vadd(cc_ref(i , 1, k), vadd(ti2, ti3));
1171 v4sf cr2{vadd(cc_ref(i-1, 1, k), vmadd(tr11, tr2, vmul(tr12, tr3)))};
1172 v4sf ci2{vadd(cc_ref(i , 1, k), vmadd(tr11, ti2, vmul(tr12, ti3)))};
1173 v4sf cr3{vadd(cc_ref(i-1, 1, k), vmadd(tr12, tr2, vmul(tr11, tr3)))};
1174 v4sf ci3{vadd(cc_ref(i , 1, k), vmadd(tr12, ti2, vmul(tr11, ti3)))};
1175 v4sf cr5{vmadd(ti11, tr5, vmul(ti12, tr4))};
1176 v4sf ci5{vmadd(ti11, ti5, vmul(ti12, ti4))};
1177 v4sf cr4{vsub(vmul(ti12, tr5), vmul(ti11, tr4))};
1178 v4sf ci4{vsub(vmul(ti12, ti5), vmul(ti11, ti4))};
1179 v4sf dr3{vsub(cr3, ci4)};
1180 v4sf dr4{vadd(cr3, ci4)};
1181 v4sf di3{vadd(ci3, cr4)};
1182 v4sf di4{vsub(ci3, cr4)};
1183 v4sf dr5{vadd(cr2, ci5)};
1184 v4sf dr2{vsub(cr2, ci5)};
1185 v4sf di5{vsub(ci2, cr5)};
1186 v4sf di2{vadd(ci2, cr5)};
1187 vcplxmul(dr2, di2, ld_ps1(wa1[i-3]), ld_ps1(wa1[i-2]));
1188 vcplxmul(dr3, di3, ld_ps1(wa2[i-3]), ld_ps1(wa2[i-2]));
1189 vcplxmul(dr4, di4, ld_ps1(wa3[i-3]), ld_ps1(wa3[i-2]));
1190 vcplxmul(dr5, di5, ld_ps1(wa4[i-3]), ld_ps1(wa4[i-2]));
1192 ch_ref(i-1, k, 2) = dr2; ch_ref(i, k, 2) = di2;
1193 ch_ref(i-1, k, 3) = dr3; ch_ref(i, k, 3) = di3;
1194 ch_ref(i-1, k, 4) = dr4; ch_ref(i, k, 4) = di4;
1195 ch_ref(i-1, k, 5) = dr5; ch_ref(i, k, 5) = di5;
1198 } /* radb5 */
1200 NOINLINE v4sf *rfftf1_ps(const size_t n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
1201 const float *wa, const al::span<const uint,15> ifac)
1203 assert(work1 != work2);
1205 const v4sf *in{input_readonly};
1206 v4sf *out{in == work2 ? work1 : work2};
1207 const size_t nf{ifac[1]};
1208 size_t l2{n};
1209 size_t iw{n-1};
1210 size_t k1{1};
1211 while(true)
1213 const size_t kh{nf - k1};
1214 const size_t ip{ifac[kh + 2]};
1215 const size_t l1{l2 / ip};
1216 const size_t ido{n / l2};
1217 iw -= (ip - 1)*ido;
1218 switch(ip)
1220 case 5:
1221 radf5_ps(ido, l1, in, out, &wa[iw]);
1222 break;
1223 case 4:
1224 radf4_ps(ido, l1, in, out, &wa[iw]);
1225 break;
1226 case 3:
1227 radf3_ps(ido, l1, in, out, &wa[iw]);
1228 break;
1229 case 2:
1230 radf2_ps(ido, l1, in, out, &wa[iw]);
1231 break;
1232 default:
1233 assert(0);
1235 if(++k1 > nf)
1236 return out;
1238 l2 = l1;
1239 if(out == work2)
1241 out = work1;
1242 in = work2;
1244 else
1246 out = work2;
1247 in = work1;
1250 } /* rfftf1 */
1252 NOINLINE v4sf *rfftb1_ps(const size_t n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
1253 const float *wa, const al::span<const uint,15> ifac)
1255 assert(work1 != work2);
1257 const v4sf *in{input_readonly};
1258 v4sf *out{in == work2 ? work1 : work2};
1259 const size_t nf{ifac[1]};
1260 size_t l1{1};
1261 size_t iw{0};
1262 size_t k1{1};
1263 while(true)
1265 const size_t ip{ifac[k1 + 1]};
1266 const size_t l2{ip*l1};
1267 const size_t ido{n / l2};
1268 switch(ip)
1270 case 5:
1271 radb5_ps(ido, l1, in, out, &wa[iw]);
1272 break;
1273 case 4:
1274 radb4_ps(ido, l1, in, out, &wa[iw]);
1275 break;
1276 case 3:
1277 radb3_ps(ido, l1, in, out, &wa[iw]);
1278 break;
1279 case 2:
1280 radb2_ps(ido, l1, in, out, &wa[iw]);
1281 break;
1282 default:
1283 assert(0);
1285 if(++k1 > nf)
1286 return out;
1288 l1 = l2;
1289 iw += (ip - 1)*ido;
1291 if(out == work2)
1293 out = work1;
1294 in = work2;
1296 else
1298 out = work2;
1299 in = work1;
1304 v4sf *cfftf1_ps(const size_t n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
1305 const float *wa, const al::span<const uint,15> ifac, const float fsign)
1307 assert(work1 != work2);
1309 const v4sf *in{input_readonly};
1310 v4sf *out{in == work2 ? work1 : work2};
1311 const size_t nf{ifac[1]};
1312 size_t l1{1}, iw{0};
1313 size_t k1{2};
1314 while(true)
1316 const size_t ip{ifac[k1]};
1317 const size_t l2{ip*l1};
1318 const size_t ido{n / l2};
1319 const size_t idot{ido + ido};
1320 switch(ip)
1322 case 5:
1323 passf5_ps(idot, l1, in, out, &wa[iw], fsign);
1324 break;
1325 case 4:
1326 passf4_ps(idot, l1, in, out, &wa[iw], fsign);
1327 break;
1328 case 3:
1329 passf3_ps(idot, l1, in, out, &wa[iw], fsign);
1330 break;
1331 case 2:
1332 passf2_ps(idot, l1, in, out, &wa[iw], fsign);
1333 break;
1334 default:
1335 assert(0);
1337 if(++k1 > nf+1)
1338 return out;
1340 l1 = l2;
1341 iw += (ip - 1)*idot;
1342 if(out == work2)
1344 out = work1;
1345 in = work2;
1347 else
1349 out = work2;
1350 in = work1;
1356 uint decompose(const uint n, const al::span<uint,15> ifac, const al::span<const uint,4> ntryh)
1358 uint nl{n}, nf{0};
1359 for(const uint ntry : ntryh)
1361 while(nl != 1)
1363 const uint nq{nl / ntry};
1364 const uint nr{nl % ntry};
1365 if(nr != 0) break;
1367 ifac[2+nf++] = ntry;
1368 nl = nq;
1369 if(ntry == 2 && nf != 1)
1371 for(size_t i{2};i <= nf;++i)
1373 size_t ib{nf - i + 2};
1374 ifac[ib + 1] = ifac[ib];
1376 ifac[2] = 2;
1380 ifac[0] = n;
1381 ifac[1] = nf;
1382 return nf;
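/* Example: with ntryh = {4,2,3,5}, decompose(48, ...) factors 48 as 4*4*3 and
 * leaves ifac = {48, 3, 4, 4, 3, ...}. A factor of 2, when present, is moved
 * to the front of the factor list by the loop above.
 */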
1385 void rffti1_ps(const uint n, float *wa, const al::span<uint,15> ifac)
1387 static constexpr std::array ntryh{4u,2u,3u,5u};
1389 const uint nf{decompose(n, ifac, ntryh)};
1390 const double argh{2.0*al::numbers::pi / n};
1391 size_t is{0};
1392 size_t nfm1{nf - 1};
1393 size_t l1{1};
1394 for(size_t k1{0};k1 < nfm1;++k1)
1396 const size_t ip{ifac[k1+2]};
1397 const size_t l2{l1*ip};
1398 const size_t ido{n / l2};
1399 const size_t ipm{ip - 1};
1400 size_t ld{0};
1401 for(size_t j{0};j < ipm;++j)
1403 size_t i{is};
1404 ld += l1;
1405 const double argld{static_cast<double>(ld)*argh};
1406 double fi{0.0};
1407 for(size_t ii{2};ii < ido;ii += 2)
1409 fi += 1.0;
1410 wa[i++] = static_cast<float>(std::cos(fi*argld));
1411 wa[i++] = static_cast<float>(std::sin(fi*argld));
1413 is += ido;
1415 l1 = l2;
1417 } /* rffti1 */
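/* Both rffti1_ps above and cffti1_ps below tabulate the per-stage twiddle
 * factors as interleaved cosine/sine pairs: for each stage, wa[] receives
 * cos(m*argld) and sin(m*argld) for m = 1, 2, ..., with argld = 2*pi*ld/n, in
 * the layout the corresponding FFTPACK passes expect.
 */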
1419 void cffti1_ps(const uint n, float *wa, const al::span<uint,15> ifac)
1421 static constexpr std::array ntryh{5u,3u,4u,2u};
1423 const uint nf{decompose(n, ifac, ntryh)};
1424 const double argh{2.0*al::numbers::pi / n};
1425 size_t i{1};
1426 size_t l1{1};
1427 for(size_t k1{0};k1 < nf;++k1)
1429 const size_t ip{ifac[k1+2]};
1430 const size_t l2{l1*ip};
1431 const size_t ido{n / l2};
1432 const size_t idot{ido + ido + 2};
1433 const size_t ipm{ip - 1};
1434 size_t ld{0};
1435 for(size_t j{0};j < ipm;++j)
1437 size_t i1{i};
1438 wa[i-1] = 1;
1439 wa[i] = 0;
1440 ld += l1;
1441 const double argld{static_cast<double>(ld)*argh};
1442 double fi{0.0};
1443 for(size_t ii{3};ii < idot;ii += 2)
1445 fi += 1.0;
1446 wa[++i] = static_cast<float>(std::cos(fi*argld));
1447 wa[++i] = static_cast<float>(std::sin(fi*argld));
1449 if(ip > 5)
1451 wa[i1-1] = wa[i-1];
1452 wa[i1] = wa[i];
1455 l1 = l2;
1457 } /* cffti1 */
1459 } // namespace
1461 /* NOLINTNEXTLINE(clang-analyzer-optin.performance.Padding) */
1462 struct PFFFT_Setup {
1463 uint N{};
1464 uint Ncvec{}; /* nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL) */
1465 std::array<uint,15> ifac{};
1466 pffft_transform_t transform{};
1468 float *twiddle{}; /* N/4 elements */
1469 al::span<v4sf> e; /* N/4*3 elements */
1471 alignas(V4sfAlignment) std::byte end;
1474 PFFFTSetupPtr pffft_new_setup(unsigned int N, pffft_transform_t transform)
1476 assert(transform == PFFFT_REAL || transform == PFFFT_COMPLEX);
1477 assert(N > 0);
1478 /* unfortunately, the fft size must be a multiple of 16 for complex FFTs
1479 * and 32 for real FFTs -- a lot of stuff would need to be rewritten to
1480 * handle other cases (or maybe just switch to a scalar fft, I don't know..)
1482 if(transform == PFFFT_REAL)
1483 assert((N%(2*SimdSize*SimdSize)) == 0);
1484 else
1485 assert((N%(SimdSize*SimdSize)) == 0);
1487 const uint Ncvec{(transform == PFFFT_REAL ? N/2 : N) / SimdSize};
1489 const size_t storelen{std::max(offsetof(PFFFT_Setup, end) + 2_zu*Ncvec*sizeof(v4sf),
1490 sizeof(PFFFT_Setup))};
1491 auto storage = static_cast<gsl::owner<std::byte*>>(::operator new[](storelen, V4sfAlignVal));
1492 al::span extrastore{&storage[offsetof(PFFFT_Setup, end)], 2_zu*Ncvec*sizeof(v4sf)};
1494 PFFFTSetupPtr s{::new(storage) PFFFT_Setup{}};
1495 s->N = N;
1496 s->transform = transform;
1497 s->Ncvec = Ncvec;
1499 const size_t ecount{2_zu*Ncvec*(SimdSize-1)/SimdSize};
1500 s->e = {std::launder(reinterpret_cast<v4sf*>(extrastore.data())), ecount};
1501 s->twiddle = std::launder(reinterpret_cast<float*>(&extrastore[ecount*sizeof(v4sf)]));
1503 if constexpr(SimdSize > 1)
1505 auto e = std::vector<float>(s->e.size()*SimdSize, 0.0f);
1506 for(size_t k{0};k < s->Ncvec;++k)
1508 const size_t i{k / SimdSize};
1509 const size_t j{k % SimdSize};
1510 for(size_t m{0};m < SimdSize-1;++m)
1512 const double A{-2.0*al::numbers::pi*static_cast<double>((m+1)*k) / N};
1513 e[((i*3 + m)*2 + 0)*SimdSize + j] = static_cast<float>(std::cos(A));
1514 e[((i*3 + m)*2 + 1)*SimdSize + j] = static_cast<float>(std::sin(A));
1517 std::memcpy(s->e.data(), e.data(), e.size()*sizeof(float));
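/* The loop above fills s->e with the twiddle factors exp(-2*pi*i*(m+1)*k/N)
 * for m = 0..2, stored as cos/sin pairs and de-interleaved so that SIMD lane
 * j of block i holds the values for index k = i*SimdSize + j.
 */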
1519 if(transform == PFFFT_REAL)
1520 rffti1_ps(N/SimdSize, s->twiddle, s->ifac);
1521 else
1522 cffti1_ps(N/SimdSize, s->twiddle, s->ifac);
1524 /* check that N is decomposable with allowed prime factors */
1525 size_t m{1};
1526 for(size_t k{0};k < s->ifac[1];++k)
1527 m *= s->ifac[2+k];
1529 if(m != N/SimdSize)
1530 s = nullptr;
1532 return s;
1536 void pffft_destroy_setup(gsl::owner<PFFFT_Setup*> s) noexcept
1538 std::destroy_at(s);
1539 ::operator delete[](gsl::owner<void*>{s}, V4sfAlignVal);
1542 #if !defined(PFFFT_SIMD_DISABLE)
1544 namespace {
1546 /* [0 0 1 2 3 4 5 6 7 8] -> [0 8 7 6 5 4 3 2 1] */
1547 void reversed_copy(const size_t N, const v4sf *in, const int in_stride, v4sf *RESTRICT out)
1549 v4sf g0, g1;
1550 interleave2(in[0], in[1], g0, g1);
1551 in += in_stride;
1553 *--out = vswaphl(g0, g1); // [g0l, g0h], [g1l g1h] -> [g1l, g0h]
1554 for(size_t k{1};k < N;++k)
1556 v4sf h0, h1;
1557 interleave2(in[0], in[1], h0, h1);
1558 in += in_stride;
1559 *--out = vswaphl(g1, h0);
1560 *--out = vswaphl(h0, h1);
1561 g1 = h1;
1563 *--out = vswaphl(g1, g0);
1566 void unreversed_copy(const size_t N, const v4sf *in, v4sf *RESTRICT out, const int out_stride)
1568 v4sf g0{in[0]}, g1{g0};
1569 ++in;
1570 for(size_t k{1};k < N;++k)
1572 v4sf h0{*in++}; v4sf h1{*in++};
1573 g1 = vswaphl(g1, h0);
1574 h0 = vswaphl(h0, h1);
1575 uninterleave2(h0, g1, out[0], out[1]);
1576 out += out_stride;
1577 g1 = h1;
1579 v4sf h0{*in++}, h1{g0};
1580 g1 = vswaphl(g1, h0);
1581 h0 = vswaphl(h0, h1);
1582 uninterleave2(h0, g1, out[0], out[1]);
1585 void pffft_cplx_finalize(const size_t Ncvec, const v4sf *in, v4sf *RESTRICT out, const v4sf *e)
1587 assert(in != out);
1589 const size_t dk{Ncvec/SimdSize}; // number of 4x4 matrix blocks
1590 for(size_t k{0};k < dk;++k)
1592 v4sf r0{in[8*k+0]}, i0{in[8*k+1]};
1593 v4sf r1{in[8*k+2]}, i1{in[8*k+3]};
1594 v4sf r2{in[8*k+4]}, i2{in[8*k+5]};
1595 v4sf r3{in[8*k+6]}, i3{in[8*k+7]};
1596 vtranspose4(r0,r1,r2,r3);
1597 vtranspose4(i0,i1,i2,i3);
1598 vcplxmul(r1,i1,e[k*6+0],e[k*6+1]);
1599 vcplxmul(r2,i2,e[k*6+2],e[k*6+3]);
1600 vcplxmul(r3,i3,e[k*6+4],e[k*6+5]);
1602 v4sf sr0{vadd(r0,r2)}, dr0{vsub(r0, r2)};
1603 v4sf sr1{vadd(r1,r3)}, dr1{vsub(r1, r3)};
1604 v4sf si0{vadd(i0,i2)}, di0{vsub(i0, i2)};
1605 v4sf si1{vadd(i1,i3)}, di1{vsub(i1, i3)};
1607 /* transformation for each column is:
1609 * [1 1 1 1 0 0 0 0] [r0]
1610 * [1 0 -1 0 0 -1 0 1] [r1]
1611 * [1 -1 1 -1 0 0 0 0] [r2]
1612 * [1 0 -1 0 0 1 0 -1] [r3]
1613 * [0 0 0 0 1 1 1 1] * [i0]
1614 * [0 1 0 -1 1 0 -1 0] [i1]
1615 * [0 0 0 0 1 -1 1 -1] [i2]
1616 * [0 -1 0 1 1 0 -1 0] [i3]
1619 r0 = vadd(sr0, sr1); i0 = vadd(si0, si1);
1620 r1 = vadd(dr0, di1); i1 = vsub(di0, dr1);
1621 r2 = vsub(sr0, sr1); i2 = vsub(si0, si1);
1622 r3 = vsub(dr0, di1); i3 = vadd(di0, dr1);
1624 *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1;
1625 *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3;
1629 void pffft_cplx_preprocess(const size_t Ncvec, const v4sf *in, v4sf *RESTRICT out, const v4sf *e)
1631 assert(in != out);
1633 const size_t dk{Ncvec/SimdSize}; // number of 4x4 matrix blocks
1634 for(size_t k{0};k < dk;++k)
1636 v4sf r0{in[8*k+0]}, i0{in[8*k+1]};
1637 v4sf r1{in[8*k+2]}, i1{in[8*k+3]};
1638 v4sf r2{in[8*k+4]}, i2{in[8*k+5]};
1639 v4sf r3{in[8*k+6]}, i3{in[8*k+7]};
1641 v4sf sr0{vadd(r0,r2)}, dr0{vsub(r0, r2)};
1642 v4sf sr1{vadd(r1,r3)}, dr1{vsub(r1, r3)};
1643 v4sf si0{vadd(i0,i2)}, di0{vsub(i0, i2)};
1644 v4sf si1{vadd(i1,i3)}, di1{vsub(i1, i3)};
1646 r0 = vadd(sr0, sr1); i0 = vadd(si0, si1);
1647 r1 = vsub(dr0, di1); i1 = vadd(di0, dr1);
1648 r2 = vsub(sr0, sr1); i2 = vsub(si0, si1);
1649 r3 = vadd(dr0, di1); i3 = vsub(di0, dr1);
1651 vcplxmulconj(r1,i1,e[k*6+0],e[k*6+1]);
1652 vcplxmulconj(r2,i2,e[k*6+2],e[k*6+3]);
1653 vcplxmulconj(r3,i3,e[k*6+4],e[k*6+5]);
1655 vtranspose4(r0,r1,r2,r3);
1656 vtranspose4(i0,i1,i2,i3);
1658 *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1;
1659 *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3;
1664 force_inline void pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, const v4sf *in,
1665 const v4sf *e, v4sf *RESTRICT out)
1667 v4sf r0{*in0}, i0{*in1};
1668 v4sf r1{*in++}; v4sf i1{*in++};
1669 v4sf r2{*in++}; v4sf i2{*in++};
1670 v4sf r3{*in++}; v4sf i3{*in++};
1671 vtranspose4(r0,r1,r2,r3);
1672 vtranspose4(i0,i1,i2,i3);
1674 /* transformation for each column is:
1676 * [1 1 1 1 0 0 0 0] [r0]
1677 * [1 0 -1 0 0 -1 0 1] [r1]
1678 * [1 0 -1 0 0 1 0 -1] [r2]
1679 * [1 -1 1 -1 0 0 0 0] [r3]
1680 * [0 0 0 0 1 1 1 1] * [i0]
1681 * [0 -1 0 1 -1 0 1 0] [i1]
1682 * [0 -1 0 1 1 0 -1 0] [i2]
1683 * [0 0 0 0 -1 1 -1 1] [i3]
1686 //cerr << "matrix initial, before e , REAL:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
1687 //cerr << "matrix initial, before e, IMAG :\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";
1689 vcplxmul(r1,i1,e[0],e[1]);
1690 vcplxmul(r2,i2,e[2],e[3]);
1691 vcplxmul(r3,i3,e[4],e[5]);
1693 //cerr << "matrix initial, real part:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
1694 //cerr << "matrix initial, imag part:\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";
1696 v4sf sr0{vadd(r0,r2)}, dr0{vsub(r0,r2)};
1697 v4sf sr1{vadd(r1,r3)}, dr1{vsub(r3,r1)};
1698 v4sf si0{vadd(i0,i2)}, di0{vsub(i0,i2)};
1699 v4sf si1{vadd(i1,i3)}, di1{vsub(i3,i1)};
1701 r0 = vadd(sr0, sr1);
1702 r3 = vsub(sr0, sr1);
1703 i0 = vadd(si0, si1);
1704 i3 = vsub(si1, si0);
1705 r1 = vadd(dr0, di1);
1706 r2 = vsub(dr0, di1);
1707 i1 = vsub(dr1, di0);
1708 i2 = vadd(dr1, di0);
1710 *out++ = r0;
1711 *out++ = i0;
1712 *out++ = r1;
1713 *out++ = i1;
1714 *out++ = r2;
1715 *out++ = i2;
1716 *out++ = r3;
1717 *out++ = i3;
1720 NOINLINE void pffft_real_finalize(const size_t Ncvec, const v4sf *in, v4sf *RESTRICT out,
1721 const v4sf *e)
1723 static constexpr float s{al::numbers::sqrt2_v<float>/2.0f};
1725 assert(in != out);
1726 const size_t dk{Ncvec/SimdSize}; // number of 4x4 matrix blocks
1727 /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */
1729 const v4sf zero{vzero()};
1730 const auto cr = al::bit_cast<std::array<float,SimdSize>>(in[0]);
1731 const auto ci = al::bit_cast<std::array<float,SimdSize>>(in[Ncvec*2-1]);
1732 pffft_real_finalize_4x4(&zero, &zero, in+1, e, out);
1734 /* [cr0 cr1 cr2 cr3 ci0 ci1 ci2 ci3]
1736 * [Xr(1)] ] [1 1 1 1 0 0 0 0]
1737 * [Xr(N/4) ] [0 0 0 0 1 s 0 -s]
1738 * [Xr(N/2) ] [1 0 -1 0 0 0 0 0]
1739 * [Xr(3N/4)] [0 0 0 0 1 -s 0 s]
1740 * [Xi(1) ] [1 -1 1 -1 0 0 0 0]
1741 * [Xi(N/4) ] [0 0 0 0 0 -s -1 -s]
1742 * [Xi(N/2) ] [0 -1 0 1 0 0 0 0]
1743 * [Xi(3N/4)] [0 0 0 0 0 -s 1 -s]
1746 const float xr0{(cr[0]+cr[2]) + (cr[1]+cr[3])}; out[0] = vinsert0(out[0], xr0);
1747 const float xi0{(cr[0]+cr[2]) - (cr[1]+cr[3])}; out[1] = vinsert0(out[1], xi0);
1748 const float xr2{(cr[0]-cr[2])}; out[4] = vinsert0(out[4], xr2);
1749 const float xi2{(cr[3]-cr[1])}; out[5] = vinsert0(out[5], xi2);
1750 const float xr1{ ci[0] + s*(ci[1]-ci[3])}; out[2] = vinsert0(out[2], xr1);
1751 const float xi1{-ci[2] - s*(ci[1]+ci[3])}; out[3] = vinsert0(out[3], xi1);
1752 const float xr3{ ci[0] - s*(ci[1]-ci[3])}; out[6] = vinsert0(out[6], xr3);
1753 const float xi3{ ci[2] - s*(ci[1]+ci[3])}; out[7] = vinsert0(out[7], xi3);
1755 for(size_t k{1};k < dk;++k)
1756 pffft_real_finalize_4x4(&in[8*k-1], &in[8*k+0], in + 8*k+1, e + k*6, out + k*8);
1759 force_inline void pffft_real_preprocess_4x4(const v4sf *in, const v4sf *e, v4sf *RESTRICT out,
1760 const bool first)
1762 v4sf r0{in[0]}, i0{in[1]}, r1{in[2]}, i1{in[3]};
1763 v4sf r2{in[4]}, i2{in[5]}, r3{in[6]}, i3{in[7]};
1765 /* transformation for each column is:
1767 * [1 1 1 1 0 0 0 0] [r0]
1768 * [1 0 0 -1 0 -1 -1 0] [r1]
1769 * [1 -1 -1 1 0 0 0 0] [r2]
1770 * [1 0 0 -1 0 1 1 0] [r3]
1771 * [0 0 0 0 1 -1 1 -1] * [i0]
1772 * [0 -1 1 0 1 0 0 1] [i1]
1773 * [0 0 0 0 1 1 -1 -1] [i2]
1774 * [0 1 -1 0 1 0 0 1] [i3]
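 *
 * This is the inverse-direction counterpart of pffft_real_finalize_4x4():
 * the vcplxmulconj() calls below apply the conjugated twiddles, undoing the
 * vcplxmul() of the forward pass.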
1777 v4sf sr0{vadd(r0,r3)}, dr0{vsub(r0,r3)};
1778 v4sf sr1{vadd(r1,r2)}, dr1{vsub(r1,r2)};
1779 v4sf si0{vadd(i0,i3)}, di0{vsub(i0,i3)};
1780 v4sf si1{vadd(i1,i2)}, di1{vsub(i1,i2)};
1782 r0 = vadd(sr0, sr1);
1783 r2 = vsub(sr0, sr1);
1784 r1 = vsub(dr0, si1);
1785 r3 = vadd(dr0, si1);
1786 i0 = vsub(di0, di1);
1787 i2 = vadd(di0, di1);
1788 i1 = vsub(si0, dr1);
1789 i3 = vadd(si0, dr1);
1791 vcplxmulconj(r1,i1,e[0],e[1]);
1792 vcplxmulconj(r2,i2,e[2],e[3]);
1793 vcplxmulconj(r3,i3,e[4],e[5]);
1795 vtranspose4(r0,r1,r2,r3);
1796 vtranspose4(i0,i1,i2,i3);
1798 if(!first)
1800 *out++ = r0;
1801 *out++ = i0;
1803 *out++ = r1;
1804 *out++ = i1;
1805 *out++ = r2;
1806 *out++ = i2;
1807 *out++ = r3;
1808 *out++ = i3;
1811 NOINLINE void pffft_real_preprocess(const size_t Ncvec, const v4sf *in, v4sf *RESTRICT out,
1812 const v4sf *e)
1814 static constexpr float sqrt2{al::numbers::sqrt2_v<float>};
1816 assert(in != out);
1817 const size_t dk{Ncvec/SimdSize}; // number of 4x4 matrix blocks
1818 /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */
1820 std::array<float,SimdSize> Xr{}, Xi{};
1821 for(size_t k{0};k < SimdSize;++k)
1823 Xr[k] = vextract0(in[2*k]);
1824 Xi[k] = vextract0(in[2*k + 1]);
1827 pffft_real_preprocess_4x4(in, e, out+1, true); // first==true: writes only 6 of the 8 vectors
1829 /* [Xr0 Xr1 Xr2 Xr3 Xi0 Xi1 Xi2 Xi3]
1831 * [cr0] [1 0 2 0 1 0 0 0]
1832 * [cr1] [1 0 0 0 -1 0 -2 0]
1833 * [cr2] [1 0 -2 0 1 0 0 0]
1834 * [cr3] [1 0 0 0 -1 0 2 0]
1835 * [ci0] [0 2 0 2 0 0 0 0]
1836 * [ci1] [0 s 0 -s 0 -s 0 -s]
1837 * [ci2] [0 0 0 0 0 -2 0 2]
1838 * [ci3] [0 -s 0 s 0 -s 0 -s]
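 *
 * In this matrix s stands for sqrt(2) (the sqrt2 constant above), not the
 * sqrt(2)/2 used in pffft_real_finalize().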
1840 for(size_t k{1};k < dk;++k)
1841 pffft_real_preprocess_4x4(in+8*k, e + k*6, out-1+k*8, false);
1843 const float cr0{(Xr[0]+Xi[0]) + 2*Xr[2]};
1844 const float cr1{(Xr[0]-Xi[0]) - 2*Xi[2]};
1845 const float cr2{(Xr[0]+Xi[0]) - 2*Xr[2]};
1846 const float cr3{(Xr[0]-Xi[0]) + 2*Xi[2]};
1847 out[0] = vset4(cr0, cr1, cr2, cr3);
1848 const float ci0{ 2*(Xr[1]+Xr[3])};
1849 const float ci1{ sqrt2*(Xr[1]-Xr[3]) - sqrt2*(Xi[1]+Xi[3])};
1850 const float ci2{ 2*(Xi[3]-Xi[1])};
1851 const float ci3{-sqrt2*(Xr[1]-Xr[3]) - sqrt2*(Xi[1]+Xi[3])};
1852 out[2*Ncvec-1] = vset4(ci0, ci1, ci2, ci3);
1856 void pffft_transform_internal(const PFFFT_Setup *setup, const v4sf *vinput, v4sf *voutput,
1857 v4sf *scratch, const pffft_direction_t direction, const bool ordered)
1859 assert(scratch != nullptr);
1860 assert(voutput != scratch);
1862 const size_t Ncvec{setup->Ncvec};
1863 const bool nf_odd{(setup->ifac[1]&1) != 0};
1865 std::array buff{voutput, scratch};
1866 bool ib{nf_odd != ordered};
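/* ib selects which of buff[] (the output or scratch buffer) should receive
 * the next stage's result; the fftpack calls below report which buffer they
 * actually finished in, and ib is refreshed from the "== buff[1]" checks.
 */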
1867 if(direction == PFFFT_FORWARD)
1869 /* Swap the initial work buffer for forward FFTs, which helps avoid an
1870 * extra copy for output.
1872 ib = !ib;
1873 if(setup->transform == PFFFT_REAL)
1875 ib = (rfftf1_ps(Ncvec*2, vinput, buff[ib], buff[!ib], setup->twiddle, setup->ifac) == buff[1]);
1876 pffft_real_finalize(Ncvec, buff[ib], buff[!ib], setup->e.data());
1878 else
1880 v4sf *tmp{buff[ib]};
1881 for(size_t k=0; k < Ncvec; ++k)
1882 uninterleave2(vinput[k*2], vinput[k*2+1], tmp[k*2], tmp[k*2+1]);
1884 ib = (cfftf1_ps(Ncvec, buff[ib], buff[!ib], buff[ib], setup->twiddle, setup->ifac, -1.0f) == buff[1]);
1885 pffft_cplx_finalize(Ncvec, buff[ib], buff[!ib], setup->e.data());
1887 if(ordered)
1888 pffft_zreorder(setup, reinterpret_cast<float*>(buff[!ib]),
1889 reinterpret_cast<float*>(buff[ib]), PFFFT_FORWARD);
1890 else
1891 ib = !ib;
1893 else
1895 if(vinput == buff[ib])
1896 ib = !ib; // may happen when finput == foutput
1898 if(ordered)
1900 pffft_zreorder(setup, reinterpret_cast<const float*>(vinput),
1901 reinterpret_cast<float*>(buff[ib]), PFFFT_BACKWARD);
1902 vinput = buff[ib];
1903 ib = !ib;
1905 if(setup->transform == PFFFT_REAL)
1907 pffft_real_preprocess(Ncvec, vinput, buff[ib], setup->e.data());
1908 ib = (rfftb1_ps(Ncvec*2, buff[ib], buff[0], buff[1], setup->twiddle, setup->ifac) == buff[1]);
1910 else
1912 pffft_cplx_preprocess(Ncvec, vinput, buff[ib], setup->e.data());
1913 ib = (cfftf1_ps(Ncvec, buff[ib], buff[0], buff[1], setup->twiddle, setup->ifac, +1.0f) == buff[1]);
1914 for(size_t k{0};k < Ncvec;++k)
1915 interleave2(buff[ib][k*2], buff[ib][k*2+1], buff[ib][k*2], buff[ib][k*2+1]);
1919 if(buff[ib] != voutput)
1921 /* extra copy required -- this situation should only happen when finput == foutput */
1922 assert(vinput==voutput);
1923 for(size_t k{0};k < Ncvec;++k)
1925 v4sf a{buff[ib][2*k]}, b{buff[ib][2*k+1]};
1926 voutput[2*k] = a; voutput[2*k+1] = b;
1931 } // namespace
1933 void pffft_zreorder(const PFFFT_Setup *setup, const float *in, float *out,
1934 pffft_direction_t direction)
1936 assert(in != out);
1938 const size_t N{setup->N}, Ncvec{setup->Ncvec};
1939 const v4sf *vin{reinterpret_cast<const v4sf*>(in)};
1940 v4sf *RESTRICT vout{reinterpret_cast<v4sf*>(out)};
1941 if(setup->transform == PFFFT_REAL)
1943 const size_t dk{N/32};
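/* For real transforms Ncvec is N/8 (N/2 complex bins across 4-wide vectors),
 * so N/32 equals Ncvec/SimdSize: one iteration per 4x4 block, matching the
 * finalize/preprocess loops above.
 */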
1944 if(direction == PFFFT_FORWARD)
1946 for(size_t k{0};k < dk;++k)
1948 interleave2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]);
1949 interleave2(vin[k*8 + 4], vin[k*8 + 5], vout[2*(2*dk + k) + 0], vout[2*(2*dk + k) + 1]);
1951 reversed_copy(dk, vin+2, 8, vout + N/SimdSize/2);
1952 reversed_copy(dk, vin+6, 8, vout + N/SimdSize);
1954 else
1956 for(size_t k{0};k < dk;++k)
1958 uninterleave2(vin[2*(0*dk + k) + 0], vin[2*(0*dk + k) + 1], vout[k*8 + 0], vout[k*8 + 1]);
1959 uninterleave2(vin[2*(2*dk + k) + 0], vin[2*(2*dk + k) + 1], vout[k*8 + 4], vout[k*8 + 5]);
1961 unreversed_copy(dk, vin + N/SimdSize/4, vout + N/SimdSize - 6, -8);
1962 unreversed_copy(dk, vin + 3_uz*N/SimdSize/4, vout + N/SimdSize - 2, -8);
1965 else
1967 if(direction == PFFFT_FORWARD)
1969 for(size_t k{0};k < Ncvec;++k)
1971 size_t kk{(k/4) + (k%4)*(Ncvec/4)};
1972 interleave2(vin[k*2], vin[k*2+1], vout[kk*2], vout[kk*2+1]);
1975 else
1977 for(size_t k{0};k < Ncvec;++k)
1979 size_t kk{(k/4) + (k%4)*(Ncvec/4)};
1980 uninterleave2(vin[kk*2], vin[kk*2+1], vout[k*2], vout[k*2+1]);
1986 void pffft_zconvolve_scale_accumulate(const PFFFT_Setup *s, const float *a, const float *b,
1987 float *ab, float scaling)
1989 const size_t Ncvec{s->Ncvec};
1990 const v4sf *RESTRICT va{reinterpret_cast<const v4sf*>(a)};
1991 const v4sf *RESTRICT vb{reinterpret_cast<const v4sf*>(b)};
1992 v4sf *RESTRICT vab{reinterpret_cast<v4sf*>(ab)};
1994 #ifdef __arm__
1995 __builtin_prefetch(va);
1996 __builtin_prefetch(vb);
1997 __builtin_prefetch(vab);
1998 __builtin_prefetch(va+2);
1999 __builtin_prefetch(vb+2);
2000 __builtin_prefetch(vab+2);
2001 __builtin_prefetch(va+4);
2002 __builtin_prefetch(vb+4);
2003 __builtin_prefetch(vab+4);
2004 __builtin_prefetch(va+6);
2005 __builtin_prefetch(vb+6);
2006 __builtin_prefetch(vab+6);
2007 #ifndef __clang__
2008 #define ZCONVOLVE_USING_INLINE_NEON_ASM
2009 #endif
2010 #endif
2012 const float ar1{vextract0(va[0])};
2013 const float ai1{vextract0(va[1])};
2014 const float br1{vextract0(vb[0])};
2015 const float bi1{vextract0(vb[1])};
2016 const float abr1{vextract0(vab[0])};
2017 const float abi1{vextract0(vab[1])};
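/* Lane 0 of the first two vectors holds the real-only DC and Nyquist bins.
 * They're saved here because the vectorized loop below treats them as an
 * ordinary complex pair, and are patched back afterwards for PFFFT_REAL.
 */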
2019 #ifdef ZCONVOLVE_USING_INLINE_ASM
2020 /* Inline asm version, unfortunately miscompiled by clang 3.2, at least on
2021 * Ubuntu. So this will be restricted to GCC.
2023 * Does it still miscompile with Clang? Is it even needed with today's
2024 * optimizers?
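 *
 * Also note that the macro tested here (ZCONVOLVE_USING_INLINE_ASM) differs
 * from the ZCONVOLVE_USING_INLINE_NEON_ASM defined above, so as written this
 * asm path appears to be compiled out.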
2026 const float *a_{a}, *b_{b}; float *ab_{ab};
2027 size_t N{Ncvec};
2028 asm volatile("mov r8, %2 \n"
2029 "vdup.f32 q15, %4 \n"
2030 "1: \n"
2031 "pld [%0,#64] \n"
2032 "pld [%1,#64] \n"
2033 "pld [%2,#64] \n"
2034 "pld [%0,#96] \n"
2035 "pld [%1,#96] \n"
2036 "pld [%2,#96] \n"
2037 "vld1.f32 {q0,q1}, [%0,:128]! \n"
2038 "vld1.f32 {q4,q5}, [%1,:128]! \n"
2039 "vld1.f32 {q2,q3}, [%0,:128]! \n"
2040 "vld1.f32 {q6,q7}, [%1,:128]! \n"
2041 "vld1.f32 {q8,q9}, [r8,:128]! \n"
2043 "vmul.f32 q10, q0, q4 \n"
2044 "vmul.f32 q11, q0, q5 \n"
2045 "vmul.f32 q12, q2, q6 \n"
2046 "vmul.f32 q13, q2, q7 \n"
2047 "vmls.f32 q10, q1, q5 \n"
2048 "vmla.f32 q11, q1, q4 \n"
2049 "vld1.f32 {q0,q1}, [r8,:128]! \n"
2050 "vmls.f32 q12, q3, q7 \n"
2051 "vmla.f32 q13, q3, q6 \n"
2052 "vmla.f32 q8, q10, q15 \n"
2053 "vmla.f32 q9, q11, q15 \n"
2054 "vmla.f32 q0, q12, q15 \n"
2055 "vmla.f32 q1, q13, q15 \n"
2056 "vst1.f32 {q8,q9},[%2,:128]! \n"
2057 "vst1.f32 {q0,q1},[%2,:128]! \n"
2058 "subs %3, #2 \n"
2059 "bne 1b \n"
2060 : "+r"(a_), "+r"(b_), "+r"(ab_), "+r"(N) : "r"(scaling) : "r8", "q0","q1","q2","q3","q4","q5","q6","q7","q8","q9", "q10","q11","q12","q13","q15","memory");
2062 #else
2064 /* Default routine, works fine for non-ARM CPUs with current compilers. */
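/* Per complex bin this computes, in scalar terms (assuming vcplxmul() is the
 * usual in-place complex multiply):
 *   ab_re += (a_re*b_re - a_im*b_im) * scaling;
 *   ab_im += (a_re*b_im + a_im*b_re) * scaling;
 * with four bins carried per pair of v4sf registers.
 */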
2065 const v4sf vscal{ld_ps1(scaling)};
2066 for(size_t i{0};i < Ncvec;i += 2)
2068 v4sf ar4{va[2*i+0]}, ai4{va[2*i+1]};
2069 v4sf br4{vb[2*i+0]}, bi4{vb[2*i+1]};
2070 vcplxmul(ar4, ai4, br4, bi4);
2071 vab[2*i+0] = vmadd(ar4, vscal, vab[2*i+0]);
2072 vab[2*i+1] = vmadd(ai4, vscal, vab[2*i+1]);
2073 ar4 = va[2*i+2]; ai4 = va[2*i+3];
2074 br4 = vb[2*i+2]; bi4 = vb[2*i+3];
2075 vcplxmul(ar4, ai4, br4, bi4);
2076 vab[2*i+2] = vmadd(ar4, vscal, vab[2*i+2]);
2077 vab[2*i+3] = vmadd(ai4, vscal, vab[2*i+3]);
2079 #endif
2081 if(s->transform == PFFFT_REAL)
2083 vab[0] = vinsert0(vab[0], abr1 + ar1*br1*scaling);
2084 vab[1] = vinsert0(vab[1], abi1 + ai1*bi1*scaling);
2088 void pffft_zconvolve_accumulate(const PFFFT_Setup *s, const float *a, const float *b, float *ab)
2090 const size_t Ncvec{s->Ncvec};
2091 const v4sf *RESTRICT va{reinterpret_cast<const v4sf*>(a)};
2092 const v4sf *RESTRICT vb{reinterpret_cast<const v4sf*>(b)};
2093 v4sf *RESTRICT vab{reinterpret_cast<v4sf*>(ab)};
2095 #ifdef __arm__
2096 __builtin_prefetch(va);
2097 __builtin_prefetch(vb);
2098 __builtin_prefetch(vab);
2099 __builtin_prefetch(va+2);
2100 __builtin_prefetch(vb+2);
2101 __builtin_prefetch(vab+2);
2102 __builtin_prefetch(va+4);
2103 __builtin_prefetch(vb+4);
2104 __builtin_prefetch(vab+4);
2105 __builtin_prefetch(va+6);
2106 __builtin_prefetch(vb+6);
2107 __builtin_prefetch(vab+6);
2108 #endif
2110 const float ar1{vextract0(va[0])};
2111 const float ai1{vextract0(va[1])};
2112 const float br1{vextract0(vb[0])};
2113 const float bi1{vextract0(vb[1])};
2114 const float abr1{vextract0(vab[0])};
2115 const float abi1{vextract0(vab[1])};
2117 /* No inline assembly for this version. I'm not familiar enough with NEON
2118 * assembly, and I don't know that it's needed with today's optimizers.
2120 for(size_t i{0};i < Ncvec;i += 2)
2122 v4sf ar4{va[2*i+0]}, ai4{va[2*i+1]};
2123 v4sf br4{vb[2*i+0]}, bi4{vb[2*i+1]};
2124 vcplxmul(ar4, ai4, br4, bi4);
2125 vab[2*i+0] = vadd(ar4, vab[2*i+0]);
2126 vab[2*i+1] = vadd(ai4, vab[2*i+1]);
2127 ar4 = va[2*i+2]; ai4 = va[2*i+3];
2128 br4 = vb[2*i+2]; bi4 = vb[2*i+3];
2129 vcplxmul(ar4, ai4, br4, bi4);
2130 vab[2*i+2] = vadd(ar4, vab[2*i+2]);
2131 vab[2*i+3] = vadd(ai4, vab[2*i+3]);
2134 if(s->transform == PFFFT_REAL)
2136 vab[0] = vinsert0(vab[0], abr1 + ar1*br1);
2137 vab[1] = vinsert0(vab[1], abi1 + ai1*bi1);
2142 void pffft_transform(const PFFFT_Setup *setup, const float *input, float *output, float *work,
2143 pffft_direction_t direction)
2145 assert(valigned(input) && valigned(output) && valigned(work));
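/* All three buffers must be SIMD-aligned (16 bytes for 4-float vectors);
 * valigned() asserts this and assume_aligned<16> lets the compiler rely on
 * it.
 */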
2146 pffft_transform_internal(setup, reinterpret_cast<const v4sf*>(al::assume_aligned<16>(input)),
2147 reinterpret_cast<v4sf*>(al::assume_aligned<16>(output)),
2148 reinterpret_cast<v4sf*>(al::assume_aligned<16>(work)), direction, false);
2151 void pffft_transform_ordered(const PFFFT_Setup *setup, const float *input, float *output,
2152 float *work, pffft_direction_t direction)
2154 assert(valigned(input) && valigned(output) && valigned(work));
2155 pffft_transform_internal(setup, reinterpret_cast<const v4sf*>(al::assume_aligned<16>(input)),
2156 reinterpret_cast<v4sf*>(al::assume_aligned<16>(output)),
2157 reinterpret_cast<v4sf*>(al::assume_aligned<16>(work)), direction, true);
2160 #else // defined(PFFFT_SIMD_DISABLE)
2162 // Standard routines using scalar floats, without SIMD.
2164 namespace {
2166 void pffft_transform_internal(const PFFFT_Setup *setup, const float *input, float *output,
2167 float *scratch, const pffft_direction_t direction, bool ordered)
2169 const size_t Ncvec{setup->Ncvec};
2170 const bool nf_odd{(setup->ifac[1]&1) != 0};
2172 assert(scratch != nullptr);
2174 /* z-domain data for complex transforms is already ordered without SIMD. */
2175 if(setup->transform == PFFFT_COMPLEX)
2176 ordered = false;
2178 float *buff[2]{output, scratch};
2179 bool ib{nf_odd != ordered};
2180 if(direction == PFFFT_FORWARD)
2182 if(setup->transform == PFFFT_REAL)
2183 ib = (rfftf1_ps(Ncvec*2, input, buff[ib], buff[!ib], setup->twiddle, setup->ifac) == buff[1]);
2184 else
2185 ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], setup->twiddle, setup->ifac, -1.0f) == buff[1]);
2186 if(ordered)
2188 pffft_zreorder(setup, buff[ib], buff[!ib], PFFFT_FORWARD);
2189 ib = !ib;
2192 else
2194 if(input == buff[ib])
2195 ib = !ib; // may happen when finput == foutput
2197 if(ordered)
2199 pffft_zreorder(setup, input, buff[ib], PFFFT_BACKWARD);
2200 input = buff[ib];
2201 ib = !ib;
2203 if(setup->transform == PFFFT_REAL)
2204 ib = (rfftb1_ps(Ncvec*2, input, buff[ib], buff[!ib], setup->twiddle, setup->ifac) == buff[1]);
2205 else
2206 ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], setup->twiddle, setup->ifac, +1.0f) == buff[1]);
2208 if(buff[ib] != output)
2210 // extra copy required -- this situation should only happen when finput == foutput
2211 assert(input==output);
2212 for(size_t k{0};k < Ncvec;++k)
2214 float a{buff[ib][2*k]}, b{buff[ib][2*k+1]};
2215 output[2*k] = a; output[2*k+1] = b;
2220 } // namespace
2222 void pffft_zreorder(const PFFFT_Setup *setup, const float *in, float *RESTRICT out,
2223 pffft_direction_t direction)
2225 const size_t N{setup->N};
2226 if(setup->transform == PFFFT_COMPLEX)
2228 for(size_t k{0};k < 2*N;++k)
2229 out[k] = in[k];
2230 return;
2232 else if(direction == PFFFT_FORWARD)
2234 float x_N{in[N-1]};
2235 for(size_t k{N-1};k > 1;--k)
2236 out[k] = in[k-1];
2237 out[0] = in[0];
2238 out[1] = x_N;
2240 else
2242 float x_N{in[1]};
2243 for(size_t k{1};k < N-1;++k)
2244 out[k] = in[k+1];
2245 out[0] = in[0];
2246 out[N-1] = x_N;
2250 void pffft_zconvolve_scale_accumulate(const PFFFT_Setup *s, const float *a, const float *b,
2251 float *ab, float scaling)
2253 size_t Ncvec{s->Ncvec};
2255 if(s->transform == PFFFT_REAL)
2257 // take care of the fftpack ordering
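// (bin 0 and bin N/2 are real-only, stored at ab[0] and ab[2*Ncvec-1])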
2258 ab[0] += a[0]*b[0]*scaling;
2259 ab[2*Ncvec-1] += a[2*Ncvec-1]*b[2*Ncvec-1]*scaling;
2260 ++ab; ++a; ++b; --Ncvec;
2262 for(size_t i{0};i < Ncvec;++i)
2264 float ar{a[2*i+0]}, ai{a[2*i+1]};
2265 const float br{b[2*i+0]}, bi{b[2*i+1]};
2266 vcplxmul(ar, ai, br, bi);
2267 ab[2*i+0] += ar*scaling;
2268 ab[2*i+1] += ai*scaling;
2272 void pffft_zconvolve_accumulate(const PFFFT_Setup *s, const float *a, const float *b, float *ab)
2274 size_t Ncvec{s->Ncvec};
2276 if(s->transform == PFFFT_REAL)
2278 // take care of the fftpack ordering
2279 ab[0] += a[0]*b[0];
2280 ab[2*Ncvec-1] += a[2*Ncvec-1]*b[2*Ncvec-1];
2281 ++ab; ++a; ++b; --Ncvec;
2283 for(size_t i{0};i < Ncvec;++i)
2285 float ar{a[2*i+0]}, ai{a[2*i+1]};
2286 const float br{b[2*i+0]}, bi{b[2*i+1]};
2287 vcplxmul(ar, ai, br, bi);
2288 ab[2*i+0] += ar;
2289 ab[2*i+1] += ai;
2294 void pffft_transform(const PFFFT_Setup *setup, const float *input, float *output, float *work,
2295 pffft_direction_t direction)
2297 pffft_transform_internal(setup, input, output, work, direction, false);
2300 void pffft_transform_ordered(const PFFFT_Setup *setup, const float *input, float *output,
2301 float *work, pffft_direction_t direction)
2303 pffft_transform_internal(setup, input, output, work, direction, true);
2306 #endif /* defined(PFFFT_SIMD_DISABLE) */
2307 /* NOLINTEND(cppcoreguidelines-pro-bounds-pointer-arithmetic) */
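/* Usage sketch (illustrative only; assumes the usual pffft setup API from
 * pffft.h, i.e. pffft_new_setup()/pffft_destroy_setup(), and hypothetical
 * buffer names):
 *
 *   PFFFT_Setup *s{pffft_new_setup(1024, PFFFT_REAL)};
 *   // in, out and work are 16-byte aligned float buffers holding at least
 *   // N floats for PFFFT_REAL (2*N for PFFFT_COMPLEX).
 *   pffft_transform_ordered(s, in, out, work, PFFFT_FORWARD);
 *   pffft_destroy_setup(s);
 */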