/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
 * Copyright (c) 2023 Christopher Robinson
 *
 * Based on original fortran 77 code from FFTPACKv4 from NETLIB
 * (http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber
 *
 * As confirmed by the NCAR fftpack software curators, the following
 * FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
 * released under the same terms.
 *
 * http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
 *
 * Copyright (c) 2004 the University Corporation for Atmospheric
 * Research ("UCAR"). All rights reserved. Developed by NCAR's
 * Computational and Information Systems Laboratory, UCAR,
 * www.cisl.ucar.edu.
 *
 * Redistribution and use of the Software in source and binary forms,
 * with or without modification, is permitted provided that the
 * following conditions are met:
 *
 * - Neither the names of NCAR's Computational and Information Systems
 *   Laboratory, the University Corporation for Atmospheric Research,
 *   nor the names of its sponsors or contributors may be used to
 *   endorse or promote products derived from this Software without
 *   specific prior written permission.
 *
 * - Redistributions of source code must retain the above copyright
 *   notices, this list of conditions, and the disclaimer below.
 *
 * - Redistributions in binary form must reproduce the above copyright
 *   notice, this list of conditions, and the disclaimer below in the
 *   documentation and/or other materials provided with the
 *   distribution.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
 * SOFTWARE.
 */
/* PFFFT : a Pretty Fast FFT.
 *
 * This file is largely based on the original FFTPACK implementation, modified
 * in order to take advantage of SIMD instructions of modern CPUs.
 */
#include "pffft.h"

#include <algorithm>
#include <array>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <memory>
#include <new>
#include <utility>
#include <vector>

#include "albit.h"
#include "almalloc.h"
#include "alnumbers.h"
#include "alnumeric.h"
#include "alspan.h"
#include "opthelpers.h"
using uint = unsigned int;

#if defined(__GNUC__) || defined(_MSC_VER)
#define RESTRICT __restrict
#else
#define RESTRICT
#endif
/* Vector support macros: the rest of the code is independent of
 * SSE/Altivec/NEON -- adding support for other platforms with 4-element
 * vectors should be limited to these macros.
 */
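/* Each SIMD branch below provides the same small toolkit, which is all a port
 * to another 4-wide vector ISA needs to supply:
 *   a v4sf vector type and SimdSize, plus vzero, vmul, vadd, vmadd, vsub,
 *   ld_ps1, vset4, vinsert0, vextract0, interleave2, uninterleave2,
 *   vtranspose4 and vswaphl.
 * validate_pffft_simd() further below can be used to sanity-check a new port.
 */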
/* Define PFFFT_SIMD_DISABLE if you want to use scalar code instead of SIMD code */
//#define PFFFT_SIMD_DISABLE
#ifndef PFFFT_SIMD_DISABLE

/*
 * Altivec support macros
 */
#if defined(__ppc__) || defined(__ppc64__) || defined(__powerpc__) || defined(__powerpc64__)
#include <altivec.h>

using v4sf = vector float;
constexpr uint SimdSize{4};
force_inline v4sf vzero() noexcept { return (vector float)vec_splat_u8(0); }
force_inline v4sf vmul(v4sf a, v4sf b) noexcept { return vec_madd(a, b, vzero()); }
force_inline v4sf vadd(v4sf a, v4sf b) noexcept { return vec_add(a, b); }
force_inline v4sf vmadd(v4sf a, v4sf b, v4sf c) noexcept { return vec_madd(a, b, c); }
force_inline v4sf vsub(v4sf a, v4sf b) noexcept { return vec_sub(a, b); }
force_inline v4sf ld_ps1(float a) noexcept { return vec_splats(a); }

force_inline v4sf vset4(float a, float b, float c, float d) noexcept
{
    /* Is there a more efficient way to do this? */
    alignas(16) std::array<float,4> vals{{a, b, c, d}};
    return vec_ld(0, vals.data());
}
force_inline v4sf vinsert0(v4sf v, float a) noexcept { return vec_insert(a, v, 0); }
force_inline float vextract0(v4sf v) noexcept { return vec_extract(v, 0); }

force_inline void interleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
{
    out1 = vec_mergeh(in1, in2);
    out2 = vec_mergel(in1, in2);
}
force_inline void uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
{
    out1 = vec_perm(in1, in2, (vector unsigned char){0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27});
    out2 = vec_perm(in1, in2, (vector unsigned char){4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31});
}

force_inline void vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept
{
    v4sf y0{vec_mergeh(x0, x2)};
    v4sf y1{vec_mergel(x0, x2)};
    v4sf y2{vec_mergeh(x1, x3)};
    v4sf y3{vec_mergel(x1, x3)};
    x0 = vec_mergeh(y0, y2);
    x1 = vec_mergel(y0, y2);
    x2 = vec_mergeh(y1, y3);
    x3 = vec_mergel(y1, y3);
}

force_inline v4sf vswaphl(v4sf a, v4sf b) noexcept
{ return vec_perm(a,b, (vector unsigned char){16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15}); }
/*
 * SSE1 support macros
 */
#elif defined(__x86_64__) || defined(__SSE__) || defined(_M_X64) || \
    (defined(_M_IX86_FP) && _M_IX86_FP >= 1)

#include <xmmintrin.h>
using v4sf = __m128;
/* 4 floats by simd vector -- this is pretty much hardcoded in the preprocess/
 * finalize functions anyway so you will have to work if you want to enable AVX
 * with its 256-bit vectors.
 */
constexpr uint SimdSize{4};
force_inline v4sf vzero() noexcept { return _mm_setzero_ps(); }
force_inline v4sf vmul(v4sf a, v4sf b) noexcept { return _mm_mul_ps(a, b); }
force_inline v4sf vadd(v4sf a, v4sf b) noexcept { return _mm_add_ps(a, b); }
force_inline v4sf vmadd(v4sf a, v4sf b, v4sf c) noexcept { return _mm_add_ps(_mm_mul_ps(a,b), c); }
force_inline v4sf vsub(v4sf a, v4sf b) noexcept { return _mm_sub_ps(a, b); }
force_inline v4sf ld_ps1(float a) noexcept { return _mm_set1_ps(a); }

force_inline v4sf vset4(float a, float b, float c, float d) noexcept
{ return _mm_setr_ps(a, b, c, d); }
force_inline v4sf vinsert0(const v4sf v, const float a) noexcept
{ return _mm_move_ss(v, _mm_set_ss(a)); }
force_inline float vextract0(v4sf v) noexcept
{ return _mm_cvtss_f32(v); }

force_inline void interleave2(const v4sf in1, const v4sf in2, v4sf &out1, v4sf &out2) noexcept
{
    out1 = _mm_unpacklo_ps(in1, in2);
    out2 = _mm_unpackhi_ps(in1, in2);
}
force_inline void uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
{
    out1 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0));
    out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1));
}

force_inline void vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept
{ _MM_TRANSPOSE4_PS(x0, x1, x2, x3); }

force_inline v4sf vswaphl(v4sf a, v4sf b) noexcept
{ return _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0)); }
/*
 * ARM NEON support macros
 */
#elif defined(__ARM_NEON) || defined(__aarch64__) || defined(__arm64) || defined(_M_ARM64)

#include <arm_neon.h>
using v4sf = float32x4_t;
constexpr uint SimdSize{4};
force_inline v4sf vzero() noexcept { return vdupq_n_f32(0.0f); }
force_inline v4sf vmul(v4sf a, v4sf b) noexcept { return vmulq_f32(a, b); }
force_inline v4sf vadd(v4sf a, v4sf b) noexcept { return vaddq_f32(a, b); }
force_inline v4sf vmadd(v4sf a, v4sf b, v4sf c) noexcept { return vmlaq_f32(c, a, b); }
force_inline v4sf vsub(v4sf a, v4sf b) noexcept { return vsubq_f32(a, b); }
force_inline v4sf ld_ps1(float a) noexcept { return vdupq_n_f32(a); }

force_inline v4sf vset4(float a, float b, float c, float d) noexcept
{
    float32x4_t ret{vmovq_n_f32(a)};
    ret = vsetq_lane_f32(b, ret, 1);
    ret = vsetq_lane_f32(c, ret, 2);
    ret = vsetq_lane_f32(d, ret, 3);
    return ret;
}
force_inline v4sf vinsert0(v4sf v, float a) noexcept
{ return vsetq_lane_f32(a, v, 0); }
force_inline float vextract0(v4sf v) noexcept
{ return vgetq_lane_f32(v, 0); }

force_inline void interleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
{
    float32x4x2_t tmp{vzipq_f32(in1, in2)};
    out1 = tmp.val[0];
    out2 = tmp.val[1];
}
force_inline void uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
{
    float32x4x2_t tmp{vuzpq_f32(in1, in2)};
    out1 = tmp.val[0];
    out2 = tmp.val[1];
}

force_inline void vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept
{
    /* marginally faster version:
     * asm("vtrn.32 %q0, %q1;\n"
     *     "vtrn.32 %q2, %q3\n"
     *     ...
     *     : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::);
     */
    float32x4x2_t t0_{vzipq_f32(x0, x2)};
    float32x4x2_t t1_{vzipq_f32(x1, x3)};
    float32x4x2_t u0_{vzipq_f32(t0_.val[0], t1_.val[0])};
    float32x4x2_t u1_{vzipq_f32(t0_.val[1], t1_.val[1])};
    x0 = u0_.val[0];
    x1 = u0_.val[1];
    x2 = u1_.val[0];
    x3 = u1_.val[1];
}

force_inline v4sf vswaphl(v4sf a, v4sf b) noexcept
{ return vcombine_f32(vget_low_f32(b), vget_high_f32(a)); }
/*
 * Generic GCC vector macros
 */
#elif defined(__GNUC__)

using v4sf [[gnu::vector_size(16), gnu::aligned(16)]] = float;
constexpr uint SimdSize{4};
force_inline constexpr v4sf vzero() noexcept { return v4sf{0.0f, 0.0f, 0.0f, 0.0f}; }
force_inline constexpr v4sf vmul(v4sf a, v4sf b) noexcept { return a * b; }
force_inline constexpr v4sf vadd(v4sf a, v4sf b) noexcept { return a + b; }
force_inline constexpr v4sf vmadd(v4sf a, v4sf b, v4sf c) noexcept { return a*b + c; }
force_inline constexpr v4sf vsub(v4sf a, v4sf b) noexcept { return a - b; }
force_inline constexpr v4sf ld_ps1(float a) noexcept { return v4sf{a, a, a, a}; }

force_inline constexpr v4sf vset4(float a, float b, float c, float d) noexcept
{ return v4sf{a, b, c, d}; }
force_inline constexpr v4sf vinsert0(v4sf v, float a) noexcept
{ return v4sf{a, v[1], v[2], v[3]}; }
force_inline float vextract0(v4sf v) noexcept
{ return v[0]; }

force_inline v4sf unpacklo(v4sf a, v4sf b) noexcept
{ return v4sf{a[0], b[0], a[1], b[1]}; }
force_inline v4sf unpackhi(v4sf a, v4sf b) noexcept
{ return v4sf{a[2], b[2], a[3], b[3]}; }

force_inline void interleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
{
    out1 = unpacklo(in1, in2);
    out2 = unpackhi(in1, in2);
}
force_inline void uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
{
    out1 = v4sf{in1[0], in1[2], in2[0], in2[2]};
    out2 = v4sf{in1[1], in1[3], in2[1], in2[3]};
}

force_inline void vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept
{
    v4sf tmp0{unpacklo(x0, x1)};
    v4sf tmp2{unpacklo(x2, x3)};
    v4sf tmp1{unpackhi(x0, x1)};
    v4sf tmp3{unpackhi(x2, x3)};
    x0 = v4sf{tmp0[0], tmp0[1], tmp2[0], tmp2[1]};
    x1 = v4sf{tmp0[2], tmp0[3], tmp2[2], tmp2[3]};
    x2 = v4sf{tmp1[0], tmp1[1], tmp3[0], tmp3[1]};
    x3 = v4sf{tmp1[2], tmp1[3], tmp3[2], tmp3[3]};
}

force_inline v4sf vswaphl(v4sf a, v4sf b) noexcept
{ return v4sf{b[0], b[1], a[2], a[3]}; }
#else

#warning "building with simd disabled !\n";
#define PFFFT_SIMD_DISABLE // fallback to scalar code
#endif

#endif /* PFFFT_SIMD_DISABLE */
// fallback mode for situations where SIMD is not available, use scalar mode instead
#ifdef PFFFT_SIMD_DISABLE
using v4sf = float;
constexpr uint SimdSize{1};
force_inline constexpr v4sf vmul(v4sf a, v4sf b) noexcept { return a * b; }
force_inline constexpr v4sf vadd(v4sf a, v4sf b) noexcept { return a + b; }
force_inline constexpr v4sf vmadd(v4sf a, v4sf b, v4sf c) noexcept { return a*b + c; }
force_inline constexpr v4sf vsub(v4sf a, v4sf b) noexcept { return a - b; }
force_inline constexpr v4sf ld_ps1(float a) noexcept { return a; }
#endif

[[maybe_unused]] inline
auto valigned(const float *ptr) noexcept -> bool
{
    static constexpr uintptr_t alignmask{SimdSize*sizeof(float) - 1};
    return (reinterpret_cast<uintptr_t>(ptr) & alignmask) == 0;
}
// shortcuts for complex multiplications
force_inline void vcplxmul(v4sf &ar, v4sf &ai, v4sf br, v4sf bi) noexcept
{
    v4sf tmp{vmul(ar, bi)};
    ar = vsub(vmul(ar, br), vmul(ai, bi));
    ai = vmadd(ai, br, tmp);
}
force_inline void vcplxmulconj(v4sf &ar, v4sf &ai, v4sf br, v4sf bi) noexcept
{
    v4sf tmp{vmul(ar, bi)};
    ar = vmadd(ai, bi, vmul(ar, br));
    ai = vsub(vmul(ai, br), tmp);
}
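/* Element-wise, the helpers above compute
 *   vcplxmul:     (ar + i*ai) *= (br + i*bi)
 *   vcplxmulconj: (ar + i*ai) *= (br - i*bi)
 * with tmp carrying the pre-update ar through the second assignment.
 */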
#if !defined(PFFFT_SIMD_DISABLE)

inline void assertv4(const al::span<const float,4> v_f [[maybe_unused]],
    const float f0 [[maybe_unused]], const float f1 [[maybe_unused]],
    const float f2 [[maybe_unused]], const float f3 [[maybe_unused]])
{ assert(v_f[0] == f0 && v_f[1] == f1 && v_f[2] == f2 && v_f[3] == f3); }

template<typename T, T ...N>
constexpr auto make_float_array(std::integer_sequence<T,N...>)
{ return std::array{static_cast<float>(N)...}; }
/* detect bugs with the vector support macros */
[[maybe_unused]] void validate_pffft_simd()
{
    using float4 = std::array<float,4>;
    static constexpr auto f = make_float_array(std::make_index_sequence<16>{});

    v4sf a0_v{vset4(f[ 0], f[ 1], f[ 2], f[ 3])};
    v4sf a1_v{vset4(f[ 4], f[ 5], f[ 6], f[ 7])};
    v4sf a2_v{vset4(f[ 8], f[ 9], f[10], f[11])};
    v4sf a3_v{vset4(f[12], f[13], f[14], f[15])};
    v4sf t_v{vzero()}, u_v{};

    auto t_f = al::bit_cast<float4>(t_v);
    printf("VZERO=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]);
    assertv4(t_f, 0, 0, 0, 0);

    t_v = vadd(a1_v, a2_v);
    t_f = al::bit_cast<float4>(t_v);
    printf("VADD(4:7,8:11)=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]);
    assertv4(t_f, 12, 14, 16, 18);

    t_v = vmul(a1_v, a2_v);
    t_f = al::bit_cast<float4>(t_v);
    printf("VMUL(4:7,8:11)=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]);
    assertv4(t_f, 32, 45, 60, 77);

    t_v = vmadd(a1_v, a2_v, a0_v);
    t_f = al::bit_cast<float4>(t_v);
    printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]);
    assertv4(t_f, 32, 46, 62, 80);

    interleave2(a1_v, a2_v, t_v, u_v);
    t_f = al::bit_cast<float4>(t_v);
    auto u_f = al::bit_cast<float4>(u_v);
    printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n",
        t_f[0], t_f[1], t_f[2], t_f[3], u_f[0], u_f[1], u_f[2], u_f[3]);
    assertv4(t_f, 4, 8, 5, 9);
    assertv4(u_f, 6, 10, 7, 11);

    uninterleave2(a1_v, a2_v, t_v, u_v);
    t_f = al::bit_cast<float4>(t_v);
    u_f = al::bit_cast<float4>(u_v);
    printf("UNINTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n",
        t_f[0], t_f[1], t_f[2], t_f[3], u_f[0], u_f[1], u_f[2], u_f[3]);
    assertv4(t_f, 4, 6, 8, 10);
    assertv4(u_f, 5, 7, 9, 11);

    t_v = ld_ps1(f[15]);
    t_f = al::bit_cast<float4>(t_v);
    printf("LD_PS1(15)=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]);
    assertv4(t_f, 15, 15, 15, 15);

    t_v = vswaphl(a1_v, a2_v);
    t_f = al::bit_cast<float4>(t_v);
    printf("VSWAPHL(4:7,8:11)=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]);
    assertv4(t_f, 8, 9, 6, 7);

    vtranspose4(a0_v, a1_v, a2_v, a3_v);
    auto a0_f = al::bit_cast<float4>(a0_v);
    auto a1_f = al::bit_cast<float4>(a1_v);
    auto a2_f = al::bit_cast<float4>(a2_v);
    auto a3_f = al::bit_cast<float4>(a3_v);
    printf("VTRANSPOSE4(0:3,4:7,8:11,12:15)=[%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g]\n",
        a0_f[0], a0_f[1], a0_f[2], a0_f[3], a1_f[0], a1_f[1], a1_f[2], a1_f[3],
        a2_f[0], a2_f[1], a2_f[2], a2_f[3], a3_f[0], a3_f[1], a3_f[2], a3_f[3]);
    assertv4(a0_f, 0, 4, 8, 12);
    assertv4(a1_f, 1, 5, 9, 13);
    assertv4(a2_f, 2, 6, 10, 14);
    assertv4(a3_f, 3, 7, 11, 15);
}
#endif //!PFFFT_SIMD_DISABLE

/* SSE and co like 16-bytes aligned pointers */
/* with a 64-byte alignment, we are even aligned on L2 cache lines... */
constexpr auto V4sfAlignment = size_t(64);
constexpr auto V4sfAlignVal = std::align_val_t(V4sfAlignment);

/* NOLINTBEGIN(cppcoreguidelines-pro-bounds-pointer-arithmetic)
 * FIXME: Converting this from raw pointers to spans or something will probably
 * need significant work to maintain performance, given non-sequential range-
 * checked accesses and lack of 'restrict' to indicate non-aliased memory. At
 * least, some tests should be done to check the impact of using range-checked
 * spans here before blindly switching.
 */
/*
 * passf2 and passb2 have been merged here, fsign = -1 for passf2, +1 for passb2
 */
NOINLINE void passf2_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *RESTRICT ch,
    const float *const wa1, const float fsign)
{
    const size_t l1ido{l1*ido};
    if(ido <= 2)
    {
        for(size_t k{0};k < l1ido;k += ido, ch += ido, cc += 2*ido)
        {
            ch[0]         = vadd(cc[0], cc[ido+0]);
            ch[l1ido]     = vsub(cc[0], cc[ido+0]);
            ch[1]         = vadd(cc[1], cc[ido+1]);
            ch[l1ido + 1] = vsub(cc[1], cc[ido+1]);
        }
    }
    else
    {
        for(size_t k{0};k < l1ido;k += ido, ch += ido, cc += 2*ido)
        {
            for(size_t i{0};i < ido-1;i += 2)
            {
                v4sf tr2{vsub(cc[i+0], cc[i+ido+0])};
                v4sf ti2{vsub(cc[i+1], cc[i+ido+1])};
                v4sf wr{ld_ps1(wa1[i])}, wi{ld_ps1(wa1[i+1]*fsign)};
                ch[i]   = vadd(cc[i+0], cc[i+ido+0]);
                ch[i+1] = vadd(cc[i+1], cc[i+ido+1]);
                vcplxmul(tr2, ti2, wr, wi);
                ch[i+l1ido]   = tr2;
                ch[i+l1ido+1] = ti2;
            }
        }
    }
}
/*
 * passf3 and passb3 have been merged here, fsign = -1 for passf3, +1 for passb3
 */
NOINLINE void passf3_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *RESTRICT ch,
    const float *const wa1, const float fsign)
{
    const v4sf taur{ld_ps1(-0.5f)};
    const v4sf taui{ld_ps1(0.866025403784439f*fsign)};
    const size_t l1ido{l1*ido};
    const auto wa2 = wa1 + ido;
    for(size_t k{0};k < l1ido;k += ido, cc += 3*ido, ch += ido)
    {
        for(size_t i{0};i < ido-1;i += 2)
        {
            v4sf tr2{vadd(cc[i+ido], cc[i+2*ido])};
            v4sf cr2{vmadd(taur, tr2, cc[i])};
            ch[i]   = vadd(tr2, cc[i]);
            v4sf ti2{vadd(cc[i+ido+1], cc[i+2*ido+1])};
            v4sf ci2{vmadd(taur, ti2, cc[i+1])};
            ch[i+1] = vadd(cc[i+1], ti2);
            v4sf cr3{vmul(taui, vsub(cc[i+ido], cc[i+2*ido]))};
            v4sf ci3{vmul(taui, vsub(cc[i+ido+1], cc[i+2*ido+1]))};
            v4sf dr2{vsub(cr2, ci3)};
            v4sf dr3{vadd(cr2, ci3)};
            v4sf di2{vadd(ci2, cr3)};
            v4sf di3{vsub(ci2, cr3)};
            float wr1{wa1[i]}, wi1{fsign*wa1[i+1]}, wr2{wa2[i]}, wi2{fsign*wa2[i+1]};
            vcplxmul(dr2, di2, ld_ps1(wr1), ld_ps1(wi1));
            ch[i+l1ido]     = dr2;
            ch[i+l1ido + 1] = di2;
            vcplxmul(dr3, di3, ld_ps1(wr2), ld_ps1(wi2));
            ch[i+2*l1ido]   = dr3;
            ch[i+2*l1ido+1] = di3;
        }
    }
}
NOINLINE void passf4_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *RESTRICT ch,
    const float *const wa1, const float fsign)
{
    /* fsign == -1 for forward transform and +1 for backward transform */
    const v4sf vsign{ld_ps1(fsign)};
    const size_t l1ido{l1*ido};
    if(ido == 2)
    {
        for(size_t k{0};k < l1ido;k += ido, ch += ido, cc += 4*ido)
        {
            v4sf tr1{vsub(cc[0], cc[2*ido + 0])};
            v4sf tr2{vadd(cc[0], cc[2*ido + 0])};
            v4sf ti1{vsub(cc[1], cc[2*ido + 1])};
            v4sf ti2{vadd(cc[1], cc[2*ido + 1])};
            v4sf ti4{vmul(vsub(cc[1*ido + 0], cc[3*ido + 0]), vsign)};
            v4sf tr4{vmul(vsub(cc[3*ido + 1], cc[1*ido + 1]), vsign)};
            v4sf tr3{vadd(cc[ido + 0], cc[3*ido + 0])};
            v4sf ti3{vadd(cc[ido + 1], cc[3*ido + 1])};

            ch[0*l1ido + 0] = vadd(tr2, tr3);
            ch[0*l1ido + 1] = vadd(ti2, ti3);
            ch[1*l1ido + 0] = vadd(tr1, tr4);
            ch[1*l1ido + 1] = vadd(ti1, ti4);
            ch[2*l1ido + 0] = vsub(tr2, tr3);
            ch[2*l1ido + 1] = vsub(ti2, ti3);
            ch[3*l1ido + 0] = vsub(tr1, tr4);
            ch[3*l1ido + 1] = vsub(ti1, ti4);
        }
    }
    else
    {
        const auto wa2 = wa1 + ido;
        const auto wa3 = wa2 + ido;

        for(size_t k{0};k < l1ido;k += ido, ch += ido, cc += 4*ido)
        {
            for(size_t i{0};i < ido-1;i += 2)
            {
                v4sf tr1{vsub(cc[i + 0], cc[i + 2*ido + 0])};
                v4sf tr2{vadd(cc[i + 0], cc[i + 2*ido + 0])};
                v4sf ti1{vsub(cc[i + 1], cc[i + 2*ido + 1])};
                v4sf ti2{vadd(cc[i + 1], cc[i + 2*ido + 1])};
                v4sf tr4{vmul(vsub(cc[i + 3*ido + 1], cc[i + 1*ido + 1]), vsign)};
                v4sf ti4{vmul(vsub(cc[i + 1*ido + 0], cc[i + 3*ido + 0]), vsign)};
                v4sf tr3{vadd(cc[i + ido + 0], cc[i + 3*ido + 0])};
                v4sf ti3{vadd(cc[i + ido + 1], cc[i + 3*ido + 1])};

                ch[i] = vadd(tr2, tr3);
                v4sf cr3{vsub(tr2, tr3)};
                ch[i + 1] = vadd(ti2, ti3);
                v4sf ci3{vsub(ti2, ti3)};

                v4sf cr2{vadd(tr1, tr4)};
                v4sf cr4{vsub(tr1, tr4)};
                v4sf ci2{vadd(ti1, ti4)};
                v4sf ci4{vsub(ti1, ti4)};
                float wr1{wa1[i]}, wi1{fsign*wa1[i+1]};
                vcplxmul(cr2, ci2, ld_ps1(wr1), ld_ps1(wi1));
                float wr2{wa2[i]}, wi2{fsign*wa2[i+1]};
                ch[i + l1ido]     = cr2;
                ch[i + l1ido + 1] = ci2;

                vcplxmul(cr3, ci3, ld_ps1(wr2), ld_ps1(wi2));
                float wr3{wa3[i]}, wi3{fsign*wa3[i+1]};
                ch[i + 2*l1ido]     = cr3;
                ch[i + 2*l1ido + 1] = ci3;

                vcplxmul(cr4, ci4, ld_ps1(wr3), ld_ps1(wi3));
                ch[i + 3*l1ido]     = cr4;
                ch[i + 3*l1ido + 1] = ci4;
            }
        }
    }
}
/*
 * passf5 and passb5 have been merged here, fsign = -1 for passf5, +1 for passb5
 */
NOINLINE void passf5_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *RESTRICT ch,
    const float *const wa1, const float fsign)
{
    const v4sf tr11{ld_ps1(0.309016994374947f)};
    const v4sf tr12{ld_ps1(-0.809016994374947f)};
    const v4sf ti11{ld_ps1(0.951056516295154f*fsign)};
    const v4sf ti12{ld_ps1(0.587785252292473f*fsign)};

    auto cc_ref = [&cc,ido](size_t a_1, size_t a_2) noexcept -> auto&
    { return cc[(a_2-1)*ido + a_1 + 1]; };
    auto ch_ref = [&ch,ido,l1](size_t a_1, size_t a_3) noexcept -> auto&
    { return ch[(a_3-1)*l1*ido + a_1 + 1]; };

    const auto wa2 = wa1 + ido;
    const auto wa3 = wa2 + ido;
    const auto wa4 = wa3 + ido;
    for(size_t k{0};k < l1;++k, cc += 5*ido, ch += ido)
    {
        for(size_t i{0};i < ido-1;i += 2)
        {
            v4sf ti5{vsub(cc_ref(i  , 2), cc_ref(i  , 5))};
            v4sf ti2{vadd(cc_ref(i  , 2), cc_ref(i  , 5))};
            v4sf ti4{vsub(cc_ref(i  , 3), cc_ref(i  , 4))};
            v4sf ti3{vadd(cc_ref(i  , 3), cc_ref(i  , 4))};
            v4sf tr5{vsub(cc_ref(i-1, 2), cc_ref(i-1, 5))};
            v4sf tr2{vadd(cc_ref(i-1, 2), cc_ref(i-1, 5))};
            v4sf tr4{vsub(cc_ref(i-1, 3), cc_ref(i-1, 4))};
            v4sf tr3{vadd(cc_ref(i-1, 3), cc_ref(i-1, 4))};
            ch_ref(i-1, 1) = vadd(cc_ref(i-1, 1), vadd(tr2, tr3));
            ch_ref(i  , 1) = vadd(cc_ref(i  , 1), vadd(ti2, ti3));
            v4sf cr2{vadd(cc_ref(i-1, 1), vmadd(tr11, tr2, vmul(tr12, tr3)))};
            v4sf ci2{vadd(cc_ref(i  , 1), vmadd(tr11, ti2, vmul(tr12, ti3)))};
            v4sf cr3{vadd(cc_ref(i-1, 1), vmadd(tr12, tr2, vmul(tr11, tr3)))};
            v4sf ci3{vadd(cc_ref(i  , 1), vmadd(tr12, ti2, vmul(tr11, ti3)))};
            v4sf cr5{vmadd(ti11, tr5, vmul(ti12, tr4))};
            v4sf ci5{vmadd(ti11, ti5, vmul(ti12, ti4))};
            v4sf cr4{vsub(vmul(ti12, tr5), vmul(ti11, tr4))};
            v4sf ci4{vsub(vmul(ti12, ti5), vmul(ti11, ti4))};
            v4sf dr3{vsub(cr3, ci4)};
            v4sf dr4{vadd(cr3, ci4)};
            v4sf di3{vadd(ci3, cr4)};
            v4sf di4{vsub(ci3, cr4)};
            v4sf dr5{vadd(cr2, ci5)};
            v4sf dr2{vsub(cr2, ci5)};
            v4sf di5{vsub(ci2, cr5)};
            v4sf di2{vadd(ci2, cr5)};
            float wr1{wa1[i]}, wi1{fsign*wa1[i+1]}, wr2{wa2[i]}, wi2{fsign*wa2[i+1]};
            float wr3{wa3[i]}, wi3{fsign*wa3[i+1]}, wr4{wa4[i]}, wi4{fsign*wa4[i+1]};
            vcplxmul(dr2, di2, ld_ps1(wr1), ld_ps1(wi1));
            ch_ref(i - 1, 2) = dr2;
            ch_ref(i,     2) = di2;
            vcplxmul(dr3, di3, ld_ps1(wr2), ld_ps1(wi2));
            ch_ref(i - 1, 3) = dr3;
            ch_ref(i,     3) = di3;
            vcplxmul(dr4, di4, ld_ps1(wr3), ld_ps1(wi3));
            ch_ref(i - 1, 4) = dr4;
            ch_ref(i,     4) = di4;
            vcplxmul(dr5, di5, ld_ps1(wr4), ld_ps1(wi4));
            ch_ref(i - 1, 5) = dr5;
            ch_ref(i,     5) = di5;
        }
    }
}
NOINLINE void radf2_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc,
    v4sf *RESTRICT ch, const float *const wa1)
{
    const size_t l1ido{l1*ido};
    for(size_t k{0};k < l1ido;k += ido)
    {
        v4sf a{cc[k]}, b{cc[k + l1ido]};
        ch[2*k] = vadd(a, b);
        ch[2*(k+ido)-1] = vsub(a, b);
    }
    if(ido < 2)
        return;
    if(ido != 2)
    {
        for(size_t k{0};k < l1ido;k += ido)
        {
            for(size_t i{2};i < ido;i += 2)
            {
                v4sf tr2{cc[i - 1 + k + l1ido]}, ti2{cc[i + k + l1ido]};
                v4sf br{cc[i - 1 + k]}, bi{cc[i + k]};
                vcplxmulconj(tr2, ti2, ld_ps1(wa1[i - 2]), ld_ps1(wa1[i - 1]));
                ch[i + 2*k] = vadd(bi, ti2);
                ch[2*(k+ido) - i] = vsub(ti2, bi);
                ch[i - 1 + 2*k] = vadd(br, tr2);
                ch[2*(k+ido) - i -1] = vsub(br, tr2);
            }
        }
        if((ido&1) == 1)
            return;
    }
    const v4sf minus_one{ld_ps1(-1.0f)};
    for(size_t k{0};k < l1ido;k += ido)
    {
        ch[2*k + ido] = vmul(minus_one, cc[ido-1 + k + l1ido]);
        ch[2*k + ido-1] = cc[k + ido-1];
    }
}
NOINLINE void radb2_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *RESTRICT ch,
    const float *const wa1)
{
    const size_t l1ido{l1*ido};
    for(size_t k{0};k < l1ido;k += ido)
    {
        v4sf a{cc[2*k]};
        v4sf b{cc[2*(k+ido) - 1]};
        ch[k] = vadd(a, b);
        ch[k + l1ido] = vsub(a, b);
    }
    if(ido < 2)
        return;
    if(ido != 2)
    {
        for(size_t k{0};k < l1ido;k += ido)
        {
            for(size_t i{2};i < ido;i += 2)
            {
                v4sf a{cc[i-1 + 2*k]};
                v4sf b{cc[2*(k + ido) - i - 1]};
                v4sf c{cc[i+0 + 2*k]};
                v4sf d{cc[2*(k + ido) - i + 0]};
                ch[i-1 + k] = vadd(a, b);
                v4sf tr2{vsub(a, b)};
                ch[i+0 + k] = vsub(c, d);
                v4sf ti2{vadd(c, d)};
                vcplxmul(tr2, ti2, ld_ps1(wa1[i - 2]), ld_ps1(wa1[i - 1]));
                ch[i-1 + k + l1ido] = tr2;
                ch[i+0 + k + l1ido] = ti2;
            }
        }
        if((ido&1) == 1)
            return;
    }
    const v4sf minus_two{ld_ps1(-2.0f)};
    for(size_t k{0};k < l1ido;k += ido)
    {
        v4sf a{cc[2*k + ido-1]};
        v4sf b{cc[2*k + ido]};
        ch[k + ido-1] = vadd(a,a);
        ch[k + ido-1 + l1ido] = vmul(minus_two, b);
    }
}
void radf3_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
    const float *const wa1)
{
    const v4sf taur{ld_ps1(-0.5f)};
    const v4sf taui{ld_ps1(0.866025403784439f)};
    for(size_t k{0};k < l1;++k)
    {
        v4sf cr2{vadd(cc[(k + l1)*ido], cc[(k + 2*l1)*ido])};
        ch[        (3*k    )*ido] = vadd(cc[k*ido], cr2);
        ch[        (3*k + 2)*ido] = vmul(taui, vsub(cc[(k + l1*2)*ido], cc[(k + l1)*ido]));
        ch[ido-1 + (3*k + 1)*ido] = vmadd(taur, cr2, cc[k*ido]);
    }
    if(ido == 1)
        return;

    const auto wa2 = wa1 + ido;
    for(size_t k{0};k < l1;++k)
    {
        for(size_t i{2};i < ido;i += 2)
        {
            const size_t ic{ido - i};
            v4sf wr1{ld_ps1(wa1[i - 2])};
            v4sf wi1{ld_ps1(wa1[i - 1])};
            v4sf dr2{cc[i - 1 + (k + l1)*ido]};
            v4sf di2{cc[i     + (k + l1)*ido]};
            vcplxmulconj(dr2, di2, wr1, wi1);

            v4sf wr2{ld_ps1(wa2[i - 2])};
            v4sf wi2{ld_ps1(wa2[i - 1])};
            v4sf dr3{cc[i - 1 + (k + l1*2)*ido]};
            v4sf di3{cc[i     + (k + l1*2)*ido]};
            vcplxmulconj(dr3, di3, wr2, wi2);

            v4sf cr2{vadd(dr2, dr3)};
            v4sf ci2{vadd(di2, di3)};
            ch[i - 1 + 3*k*ido] = vadd(cc[i - 1 + k*ido], cr2);
            ch[i     + 3*k*ido] = vadd(cc[i     + k*ido], ci2);
            v4sf tr2{vmadd(taur, cr2, cc[i - 1 + k*ido])};
            v4sf ti2{vmadd(taur, ci2, cc[i     + k*ido])};
            v4sf tr3{vmul(taui, vsub(di2, di3))};
            v4sf ti3{vmul(taui, vsub(dr3, dr2))};
            ch[i  - 1 + (3*k + 2)*ido] = vadd(tr2, tr3);
            ch[ic - 1 + (3*k + 1)*ido] = vsub(tr2, tr3);
            ch[i      + (3*k + 2)*ido] = vadd(ti2, ti3);
            ch[ic     + (3*k + 1)*ido] = vsub(ti3, ti2);
        }
    }
}
void radb3_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
    const float *const wa1)
{
    static constexpr float taur{-0.5f};
    static constexpr float taui{0.866025403784439f};
    static constexpr float taui_2{taui*2.0f};

    const v4sf vtaur{ld_ps1(taur)};
    const v4sf vtaui_2{ld_ps1(taui_2)};
    for(size_t k{0};k < l1;++k)
    {
        v4sf tr2 = cc[ido-1 + (3*k + 1)*ido];
        tr2 = vadd(tr2, tr2);
        v4sf cr2 = vmadd(vtaur, tr2, cc[3*k*ido]);
        ch[k*ido] = vadd(cc[3*k*ido], tr2);
        v4sf ci3 = vmul(vtaui_2, cc[(3*k + 2)*ido]);
        ch[(k + l1)*ido] = vsub(cr2, ci3);
        ch[(k + 2*l1)*ido] = vadd(cr2, ci3);
    }
    if(ido == 1)
        return;

    const auto wa2 = wa1 + ido;
    const v4sf vtaui{ld_ps1(taui)};
    for(size_t k{0};k < l1;++k)
    {
        for(size_t i{2};i < ido;i += 2)
        {
            const size_t ic{ido - i};
            v4sf tr2{vadd(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido])};
            v4sf cr2{vmadd(vtaur, tr2, cc[i - 1 + 3*k*ido])};
            ch[i - 1 + k*ido] = vadd(cc[i - 1 + 3*k*ido], tr2);
            v4sf ti2{vsub(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido])};
            v4sf ci2{vmadd(vtaur, ti2, cc[i + 3*k*ido])};
            ch[i + k*ido] = vadd(cc[i + 3*k*ido], ti2);
            v4sf cr3{vmul(vtaui, vsub(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]))};
            v4sf ci3{vmul(vtaui, vadd(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]))};
            v4sf dr2{vsub(cr2, ci3)};
            v4sf dr3{vadd(cr2, ci3)};
            v4sf di2{vadd(ci2, cr3)};
            v4sf di3{vsub(ci2, cr3)};
            vcplxmul(dr2, di2, ld_ps1(wa1[i-2]), ld_ps1(wa1[i-1]));
            ch[i - 1 + (k + l1)*ido] = dr2;
            ch[i + (k + l1)*ido] = di2;
            vcplxmul(dr3, di3, ld_ps1(wa2[i-2]), ld_ps1(wa2[i-1]));
            ch[i - 1 + (k + 2*l1)*ido] = dr3;
            ch[i + (k + 2*l1)*ido] = di3;
        }
    }
}
NOINLINE void radf4_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc,
    v4sf *RESTRICT ch, const float *const wa1)
{
    const size_t l1ido{l1*ido};
    {
        const v4sf *RESTRICT cc_{cc}, *RESTRICT cc_end{cc + l1ido};
        v4sf *RESTRICT ch_{ch};
        while(cc < cc_end)
        {
            // this loop represents between 25% and 40% of total radf4_ps cost !
            v4sf a0{cc[0]}, a1{cc[l1ido]};
            v4sf a2{cc[2*l1ido]}, a3{cc[3*l1ido]};
            v4sf tr1{vadd(a1, a3)};
            v4sf tr2{vadd(a0, a2)};
            ch[2*ido-1] = vsub(a0, a2);
            ch[2*ido  ] = vsub(a3, a1);
            ch[0      ] = vadd(tr1, tr2);
            ch[4*ido-1] = vsub(tr2, tr1);
            cc += ido; ch += 4*ido;
        }
        cc = cc_;
        ch = ch_;
    }
    if(ido < 2)
        return;
    if(ido != 2)
    {
        const auto wa2 = wa1 + ido;
        const auto wa3 = wa2 + ido;

        for(size_t k{0};k < l1ido;k += ido)
        {
            const v4sf *RESTRICT pc{cc + 1 + k};
            for(size_t i{2};i < ido;i += 2, pc += 2)
            {
                const size_t ic{ido - i};

                v4sf cr2{pc[1*l1ido+0]};
                v4sf ci2{pc[1*l1ido+1]};
                v4sf wr{ld_ps1(wa1[i - 2])};
                v4sf wi{ld_ps1(wa1[i - 1])};
                vcplxmulconj(cr2,ci2,wr,wi);

                v4sf cr3{pc[2*l1ido+0]};
                v4sf ci3{pc[2*l1ido+1]};
                wr = ld_ps1(wa2[i-2]);
                wi = ld_ps1(wa2[i-1]);
                vcplxmulconj(cr3, ci3, wr, wi);

                v4sf cr4{pc[3*l1ido]};
                v4sf ci4{pc[3*l1ido+1]};
                wr = ld_ps1(wa3[i-2]);
                wi = ld_ps1(wa3[i-1]);
                vcplxmulconj(cr4, ci4, wr, wi);

                /* at this point, on SSE, five of "cr2 cr3 cr4 ci2 ci3 ci4" should be loaded in registers */

                v4sf tr1{vadd(cr2,cr4)};
                v4sf tr4{vsub(cr4,cr2)};
                v4sf tr2{vadd(pc[0],cr3)};
                v4sf tr3{vsub(pc[0],cr3)};
                ch[i  - 1 + 4*k        ] = vadd(tr2,tr1);
                ch[ic - 1 + 4*k + 3*ido] = vsub(tr2,tr1); // at this point tr1 and tr2 can be disposed
                v4sf ti1{vadd(ci2,ci4)};
                v4sf ti4{vsub(ci2,ci4)};
                ch[i  - 1 + 4*k + 2*ido] = vadd(tr3,ti4);
                ch[ic - 1 + 4*k + 1*ido] = vsub(tr3,ti4); // dispose tr3, ti4
                v4sf ti2{vadd(pc[1],ci3)};
                v4sf ti3{vsub(pc[1],ci3)};
                ch[i  + 4*k        ] = vadd(ti1, ti2);
                ch[ic + 4*k + 3*ido] = vsub(ti1, ti2);
                ch[i  + 4*k + 2*ido] = vadd(tr4, ti3);
                ch[ic + 4*k + 1*ido] = vsub(tr4, ti3);
            }
        }
        if((ido&1) == 1)
            return;
    }
    const v4sf minus_hsqt2{ld_ps1(al::numbers::sqrt2_v<float> * -0.5f)};
    for(size_t k{0};k < l1ido;k += ido)
    {
        v4sf a{cc[ido-1 + k + l1ido]}, b{cc[ido-1 + k + 3*l1ido]};
        v4sf c{cc[ido-1 + k]}, d{cc[ido-1 + k + 2*l1ido]};
        v4sf ti1{vmul(minus_hsqt2, vadd(b, a))};
        v4sf tr1{vmul(minus_hsqt2, vsub(b, a))};
        ch[ido-1 + 4*k        ] = vadd(c, tr1);
        ch[ido-1 + 4*k + 2*ido] = vsub(c, tr1);
        ch[        4*k + 1*ido] = vsub(ti1, d);
        ch[        4*k + 3*ido] = vadd(ti1, d);
    }
}
NOINLINE void radb4_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc,
    v4sf *RESTRICT ch, const float *const wa1)
{
    const v4sf two{ld_ps1(2.0f)};
    const size_t l1ido{l1*ido};
    {
        const v4sf *RESTRICT cc_{cc}, *RESTRICT ch_end{ch + l1ido};
        v4sf *RESTRICT ch_{ch};
        while(ch < ch_end)
        {
            v4sf a{cc[0]}, b{cc[4*ido-1]};
            v4sf c{cc[2*ido]}, d{cc[2*ido-1]};
            v4sf tr3{vmul(two,d)};
            v4sf tr2{vadd(a,b)};
            v4sf tr1{vsub(a,b)};
            v4sf tr4{vmul(two,c)};
            ch[0*l1ido] = vadd(tr2, tr3);
            ch[2*l1ido] = vsub(tr2, tr3);
            ch[1*l1ido] = vsub(tr1, tr4);
            ch[3*l1ido] = vadd(tr1, tr4);

            cc += 4*ido; ch += ido;
        }
        cc = cc_;
        ch = ch_;
    }
    if(ido < 2)
        return;
    if(ido != 2)
    {
        const auto wa2 = wa1 + ido;
        const auto wa3 = wa2 + ido;

        for(size_t k{0};k < l1ido;k += ido)
        {
            const v4sf *RESTRICT pc{cc - 1 + 4*k};
            v4sf *RESTRICT ph{ch + k + 1};
            for(size_t i{2};i < ido;i += 2)
            {
                v4sf tr1{vsub(pc[        i], pc[4*ido - i])};
                v4sf tr2{vadd(pc[        i], pc[4*ido - i])};
                v4sf ti4{vsub(pc[2*ido + i], pc[2*ido - i])};
                v4sf tr3{vadd(pc[2*ido + i], pc[2*ido - i])};
                ph[0] = vadd(tr2, tr3);
                v4sf cr3{vsub(tr2, tr3)};

                v4sf ti3{vsub(pc[2*ido + i + 1], pc[2*ido - i + 1])};
                v4sf tr4{vadd(pc[2*ido + i + 1], pc[2*ido - i + 1])};
                v4sf cr2{vsub(tr1, tr4)};
                v4sf cr4{vadd(tr1, tr4)};

                v4sf ti1{vadd(pc[i + 1], pc[4*ido - i + 1])};
                v4sf ti2{vsub(pc[i + 1], pc[4*ido - i + 1])};

                ph[1] = vadd(ti2, ti3); ph += l1ido;
                v4sf ci3{vsub(ti2, ti3)};
                v4sf ci2{vadd(ti1, ti4)};
                v4sf ci4{vsub(ti1, ti4)};
                vcplxmul(cr2, ci2, ld_ps1(wa1[i-2]), ld_ps1(wa1[i-1]));
                ph[0] = cr2;
                ph[1] = ci2; ph += l1ido;
                vcplxmul(cr3, ci3, ld_ps1(wa2[i-2]), ld_ps1(wa2[i-1]));
                ph[0] = cr3;
                ph[1] = ci3; ph += l1ido;
                vcplxmul(cr4, ci4, ld_ps1(wa3[i-2]), ld_ps1(wa3[i-1]));
                ph[0] = cr4;
                ph[1] = ci4; ph = ph - 3*l1ido + 2;
            }
        }
        if((ido&1) == 1)
            return;
    }
    const v4sf minus_sqrt2{ld_ps1(-1.414213562373095f)};
    for(size_t k{0};k < l1ido;k += ido)
    {
        const size_t i0{4*k + ido};
        v4sf c{cc[i0-1]}, d{cc[i0 + 2*ido-1]};
        v4sf a{cc[i0+0]}, b{cc[i0 + 2*ido+0]};
        v4sf tr1{vsub(c,d)};
        v4sf tr2{vadd(c,d)};
        v4sf ti1{vadd(b,a)};
        v4sf ti2{vsub(b,a)};
        ch[ido-1 + k + 0*l1ido] = vadd(tr2,tr2);
        ch[ido-1 + k + 1*l1ido] = vmul(minus_sqrt2, vsub(ti1, tr1));
        ch[ido-1 + k + 2*l1ido] = vadd(ti2, ti2);
        ch[ido-1 + k + 3*l1ido] = vmul(minus_sqrt2, vadd(ti1, tr1));
    }
}
void radf5_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
    const float *const wa1)
{
    const v4sf tr11{ld_ps1(0.309016994374947f)};
    const v4sf ti11{ld_ps1(0.951056516295154f)};
    const v4sf tr12{ld_ps1(-0.809016994374947f)};
    const v4sf ti12{ld_ps1(0.587785252292473f)};

    auto cc_ref = [&cc,l1,ido](size_t a_1, size_t a_2, size_t a_3) noexcept -> auto&
    { return cc[(a_3*l1 + a_2)*ido + a_1]; };
    auto ch_ref = [&ch,ido](size_t a_1, size_t a_2, size_t a_3) noexcept -> auto&
    { return ch[(a_3*5 + a_2)*ido + a_1]; };

    /* Parameter adjustments */
    ch -= 1 + ido*6;
    cc -= 1 + ido * (1 + l1);

    const auto wa2 = wa1 + ido;
    const auto wa3 = wa2 + ido;
    const auto wa4 = wa3 + ido;

    for(size_t k{1};k <= l1;++k)
    {
        v4sf cr2{vadd(cc_ref(1, k, 5), cc_ref(1, k, 2))};
        v4sf ci5{vsub(cc_ref(1, k, 5), cc_ref(1, k, 2))};
        v4sf cr3{vadd(cc_ref(1, k, 4), cc_ref(1, k, 3))};
        v4sf ci4{vsub(cc_ref(1, k, 4), cc_ref(1, k, 3))};
        ch_ref(1, 1, k) = vadd(cc_ref(1, k, 1), vadd(cr2, cr3));
        ch_ref(ido, 2, k) = vadd(cc_ref(1, k, 1), vmadd(tr11, cr2, vmul(tr12, cr3)));
        ch_ref(1, 3, k) = vmadd(ti11, ci5, vmul(ti12, ci4));
        ch_ref(ido, 4, k) = vadd(cc_ref(1, k, 1), vmadd(tr12, cr2, vmul(tr11, cr3)));
        ch_ref(1, 5, k) = vsub(vmul(ti12, ci5), vmul(ti11, ci4));
        //printf("pffft: radf5, k=%d ch_ref=%f, ci4=%f\n", k, ch_ref(1, 5, k), ci4);
    }
    if(ido == 1)
        return;

    const size_t idp2{ido + 2};
    for(size_t k{1};k <= l1;++k)
    {
        for(size_t i{3};i <= ido;i += 2)
        {
            const size_t ic{idp2 - i};
            v4sf dr2{ld_ps1(wa1[i-3])};
            v4sf di2{ld_ps1(wa1[i-2])};
            v4sf dr3{ld_ps1(wa2[i-3])};
            v4sf di3{ld_ps1(wa2[i-2])};
            v4sf dr4{ld_ps1(wa3[i-3])};
            v4sf di4{ld_ps1(wa3[i-2])};
            v4sf dr5{ld_ps1(wa4[i-3])};
            v4sf di5{ld_ps1(wa4[i-2])};
            vcplxmulconj(dr2, di2, cc_ref(i-1, k, 2), cc_ref(i, k, 2));
            vcplxmulconj(dr3, di3, cc_ref(i-1, k, 3), cc_ref(i, k, 3));
            vcplxmulconj(dr4, di4, cc_ref(i-1, k, 4), cc_ref(i, k, 4));
            vcplxmulconj(dr5, di5, cc_ref(i-1, k, 5), cc_ref(i, k, 5));
            v4sf cr2{vadd(dr2, dr5)};
            v4sf ci5{vsub(dr5, dr2)};
            v4sf cr5{vsub(di2, di5)};
            v4sf ci2{vadd(di2, di5)};
            v4sf cr3{vadd(dr3, dr4)};
            v4sf ci4{vsub(dr4, dr3)};
            v4sf cr4{vsub(di3, di4)};
            v4sf ci3{vadd(di3, di4)};
            ch_ref(i - 1, 1, k) = vadd(cc_ref(i - 1, k, 1), vadd(cr2, cr3));
            ch_ref(i, 1, k) = vsub(cc_ref(i, k, 1), vadd(ci2, ci3));
            v4sf tr2{vadd(cc_ref(i - 1, k, 1), vmadd(tr11, cr2, vmul(tr12, cr3)))};
            v4sf ti2{vsub(cc_ref(i, k, 1), vmadd(tr11, ci2, vmul(tr12, ci3)))};
            v4sf tr3{vadd(cc_ref(i - 1, k, 1), vmadd(tr12, cr2, vmul(tr11, cr3)))};
            v4sf ti3{vsub(cc_ref(i, k, 1), vmadd(tr12, ci2, vmul(tr11, ci3)))};
            v4sf tr5{vmadd(ti11, cr5, vmul(ti12, cr4))};
            v4sf ti5{vmadd(ti11, ci5, vmul(ti12, ci4))};
            v4sf tr4{vsub(vmul(ti12, cr5), vmul(ti11, cr4))};
            v4sf ti4{vsub(vmul(ti12, ci5), vmul(ti11, ci4))};
            ch_ref(i - 1, 3, k) = vsub(tr2, tr5);
            ch_ref(ic - 1, 2, k) = vadd(tr2, tr5);
            ch_ref(i, 3, k) = vadd(ti5, ti2);
            ch_ref(ic, 2, k) = vsub(ti5, ti2);
            ch_ref(i - 1, 5, k) = vsub(tr3, tr4);
            ch_ref(ic - 1, 4, k) = vadd(tr3, tr4);
            ch_ref(i, 5, k) = vadd(ti4, ti3);
            ch_ref(ic, 4, k) = vsub(ti4, ti3);
        }
    }
}
void radb5_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
    const float *const wa1)
{
    const v4sf tr11{ld_ps1(0.309016994374947f)};
    const v4sf ti11{ld_ps1(0.951056516295154f)};
    const v4sf tr12{ld_ps1(-0.809016994374947f)};
    const v4sf ti12{ld_ps1(0.587785252292473f)};

    auto cc_ref = [&cc,ido](size_t a_1, size_t a_2, size_t a_3) noexcept -> auto&
    { return cc[(a_3*5 + a_2)*ido + a_1]; };
    auto ch_ref = [&ch,ido,l1](size_t a_1, size_t a_2, size_t a_3) noexcept -> auto&
    { return ch[(a_3*l1 + a_2)*ido + a_1]; };

    /* Parameter adjustments */
    ch -= 1 + ido*(1 + l1);
    cc -= 1 + ido*6;

    const auto wa2 = wa1 + ido;
    const auto wa3 = wa2 + ido;
    const auto wa4 = wa3 + ido;

    for(size_t k{1};k <= l1;++k)
    {
        v4sf ti5{vadd(cc_ref(  1, 3, k), cc_ref(1, 3, k))};
        v4sf ti4{vadd(cc_ref(  1, 5, k), cc_ref(1, 5, k))};
        v4sf tr2{vadd(cc_ref(ido, 2, k), cc_ref(ido, 2, k))};
        v4sf tr3{vadd(cc_ref(ido, 4, k), cc_ref(ido, 4, k))};
        ch_ref(1, k, 1) = vadd(cc_ref(1, 1, k), vadd(tr2, tr3));
        v4sf cr2{vadd(cc_ref(1, 1, k), vmadd(tr11, tr2, vmul(tr12, tr3)))};
        v4sf cr3{vadd(cc_ref(1, 1, k), vmadd(tr12, tr2, vmul(tr11, tr3)))};
        v4sf ci5{vmadd(ti11, ti5, vmul(ti12, ti4))};
        v4sf ci4{vsub(vmul(ti12, ti5), vmul(ti11, ti4))};
        ch_ref(1, k, 2) = vsub(cr2, ci5);
        ch_ref(1, k, 3) = vsub(cr3, ci4);
        ch_ref(1, k, 4) = vadd(cr3, ci4);
        ch_ref(1, k, 5) = vadd(cr2, ci5);
    }
    if(ido == 1)
        return;

    const size_t idp2{ido + 2};
    for(size_t k{1};k <= l1;++k)
    {
        for(size_t i{3};i <= ido;i += 2)
        {
            const size_t ic{idp2 - i};
            v4sf ti5{vadd(cc_ref(i  , 3, k), cc_ref(ic  , 2, k))};
            v4sf ti2{vsub(cc_ref(i  , 3, k), cc_ref(ic  , 2, k))};
            v4sf ti4{vadd(cc_ref(i  , 5, k), cc_ref(ic  , 4, k))};
            v4sf ti3{vsub(cc_ref(i  , 5, k), cc_ref(ic  , 4, k))};
            v4sf tr5{vsub(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k))};
            v4sf tr2{vadd(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k))};
            v4sf tr4{vsub(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k))};
            v4sf tr3{vadd(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k))};
            ch_ref(i - 1, k, 1) = vadd(cc_ref(i-1, 1, k), vadd(tr2, tr3));
            ch_ref(i, k, 1) = vadd(cc_ref(i, 1, k), vadd(ti2, ti3));
            v4sf cr2{vadd(cc_ref(i-1, 1, k), vmadd(tr11, tr2, vmul(tr12, tr3)))};
            v4sf ci2{vadd(cc_ref(i  , 1, k), vmadd(tr11, ti2, vmul(tr12, ti3)))};
            v4sf cr3{vadd(cc_ref(i-1, 1, k), vmadd(tr12, tr2, vmul(tr11, tr3)))};
            v4sf ci3{vadd(cc_ref(i  , 1, k), vmadd(tr12, ti2, vmul(tr11, ti3)))};
            v4sf cr5{vmadd(ti11, tr5, vmul(ti12, tr4))};
            v4sf ci5{vmadd(ti11, ti5, vmul(ti12, ti4))};
            v4sf cr4{vsub(vmul(ti12, tr5), vmul(ti11, tr4))};
            v4sf ci4{vsub(vmul(ti12, ti5), vmul(ti11, ti4))};
            v4sf dr3{vsub(cr3, ci4)};
            v4sf dr4{vadd(cr3, ci4)};
            v4sf di3{vadd(ci3, cr4)};
            v4sf di4{vsub(ci3, cr4)};
            v4sf dr5{vadd(cr2, ci5)};
            v4sf dr2{vsub(cr2, ci5)};
            v4sf di5{vsub(ci2, cr5)};
            v4sf di2{vadd(ci2, cr5)};
            vcplxmul(dr2, di2, ld_ps1(wa1[i-3]), ld_ps1(wa1[i-2]));
            vcplxmul(dr3, di3, ld_ps1(wa2[i-3]), ld_ps1(wa2[i-2]));
            vcplxmul(dr4, di4, ld_ps1(wa3[i-3]), ld_ps1(wa3[i-2]));
            vcplxmul(dr5, di5, ld_ps1(wa4[i-3]), ld_ps1(wa4[i-2]));

            ch_ref(i-1, k, 2) = dr2; ch_ref(i, k, 2) = di2;
            ch_ref(i-1, k, 3) = dr3; ch_ref(i, k, 3) = di3;
            ch_ref(i-1, k, 4) = dr4; ch_ref(i, k, 4) = di4;
            ch_ref(i-1, k, 5) = dr5; ch_ref(i, k, 5) = di5;
        }
    }
}
*rfftf1_ps(const size_t n
, const v4sf
*input_readonly
, v4sf
*work1
, v4sf
*work2
,
1201 const float *wa
, const al::span
<const uint
,15> ifac
)
1203 assert(work1
!= work2
);
1205 const v4sf
*in
{input_readonly
};
1206 v4sf
*out
{in
== work2
? work1
: work2
};
1207 const size_t nf
{ifac
[1]};
1213 const size_t kh
{nf
- k1
};
1214 const size_t ip
{ifac
[kh
+ 2]};
1215 const size_t l1
{l2
/ ip
};
1216 const size_t ido
{n
/ l2
};
1221 radf5_ps(ido
, l1
, in
, out
, &wa
[iw
]);
1224 radf4_ps(ido
, l1
, in
, out
, &wa
[iw
]);
1227 radf3_ps(ido
, l1
, in
, out
, &wa
[iw
]);
1230 radf2_ps(ido
, l1
, in
, out
, &wa
[iw
]);
1252 NOINLINE v4sf
*rfftb1_ps(const size_t n
, const v4sf
*input_readonly
, v4sf
*work1
, v4sf
*work2
,
1253 const float *wa
, const al::span
<const uint
,15> ifac
)
1255 assert(work1
!= work2
);
1257 const v4sf
*in
{input_readonly
};
1258 v4sf
*out
{in
== work2
? work1
: work2
};
1259 const size_t nf
{ifac
[1]};
1265 const size_t ip
{ifac
[k1
+ 1]};
1266 const size_t l2
{ip
*l1
};
1267 const size_t ido
{n
/ l2
};
1271 radb5_ps(ido
, l1
, in
, out
, &wa
[iw
]);
1274 radb4_ps(ido
, l1
, in
, out
, &wa
[iw
]);
1277 radb3_ps(ido
, l1
, in
, out
, &wa
[iw
]);
1280 radb2_ps(ido
, l1
, in
, out
, &wa
[iw
]);
v4sf *cfftf1_ps(const size_t n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
    const float *wa, const al::span<const uint,15> ifac, const float fsign)
{
    assert(work1 != work2);

    const v4sf *in{input_readonly};
    v4sf *out{in == work2 ? work1 : work2};
    const size_t nf{ifac[1]};
    size_t l1{1}, iw{0};
    for(size_t k1{2};k1 <= nf+1;++k1)
    {
        const size_t ip{ifac[k1]};
        const size_t l2{ip*l1};
        const size_t ido{n / l2};
        const size_t idot{ido + ido};
        if(ip == 5)
            passf5_ps(idot, l1, in, out, &wa[iw], fsign);
        else if(ip == 4)
            passf4_ps(idot, l1, in, out, &wa[iw], fsign);
        else if(ip == 3)
            passf3_ps(idot, l1, in, out, &wa[iw], fsign);
        else
            passf2_ps(idot, l1, in, out, &wa[iw], fsign);
        l1 = l2;
        iw += (ip - 1)*idot;
        /* ping-pong between the two work buffers */
        in = out;
        out = (out == work1) ? work2 : work1;
    }
    return const_cast<v4sf*>(in); /* this is in fact the output .. */
}
uint decompose(const uint n, const al::span<uint,15> ifac, const al::span<const uint,4> ntryh)
{
    uint nl{n}, nf{0};
    for(const uint ntry : ntryh)
    {
        while(nl != 1)
        {
            const uint nq{nl / ntry};
            const uint nr{nl % ntry};
            if(nr != 0)
                break;

            ifac[2+nf++] = ntry;
            nl = nq;
            if(ntry == 2 && nf != 1)
            {
                for(size_t i{2};i <= nf;++i)
                {
                    size_t ib{nf - i + 2};
                    ifac[ib + 1] = ifac[ib];
                }
                ifac[2] = 2;
            }
        }
    }
    ifac[0] = n;
    ifac[1] = nf;
    return nf;
}
void rffti1_ps(const uint n, float *wa, const al::span<uint,15> ifac)
{
    static constexpr std::array ntryh{4u,2u,3u,5u};

    const uint nf{decompose(n, ifac, ntryh)};
    const double argh{2.0*al::numbers::pi / n};
    size_t is{0};
    size_t nfm1{nf - 1};
    size_t l1{1};
    for(size_t k1{0};k1 < nfm1;++k1)
    {
        const size_t ip{ifac[k1+2]};
        const size_t l2{l1*ip};
        const size_t ido{n / l2};
        const size_t ipm{ip - 1};
        size_t ld{0};
        for(size_t j{0};j < ipm;++j)
        {
            size_t i{is};
            ld += l1;
            const double argld{static_cast<double>(ld)*argh};
            double fi{0.0};
            for(size_t ii{2};ii < ido;ii += 2)
            {
                fi += 1.0;
                wa[i++] = static_cast<float>(std::cos(fi*argld));
                wa[i++] = static_cast<float>(std::sin(fi*argld));
            }
            is += ido;
        }
        l1 = l2;
    }
}
void cffti1_ps(const uint n, float *wa, const al::span<uint,15> ifac)
{
    static constexpr std::array ntryh{5u,3u,4u,2u};

    const uint nf{decompose(n, ifac, ntryh)};
    const double argh{2.0*al::numbers::pi / n};
    size_t i{1};
    size_t l1{1};
    for(size_t k1{0};k1 < nf;++k1)
    {
        const size_t ip{ifac[k1+2]};
        const size_t l2{l1*ip};
        const size_t ido{n / l2};
        const size_t idot{ido + ido + 2};
        const size_t ipm{ip - 1};
        size_t ld{0};
        for(size_t j{0};j < ipm;++j)
        {
            wa[i-1] = 1.0f;
            wa[i] = 0.0f;
            ld += l1;
            const double argld{static_cast<double>(ld)*argh};
            double fi{0.0};
            for(size_t ii{3};ii < idot;ii += 2)
            {
                fi += 1.0;
                wa[++i] = static_cast<float>(std::cos(fi*argld));
                wa[++i] = static_cast<float>(std::sin(fi*argld));
            }
        }
        l1 = l2;
    }
}
/* NOLINTNEXTLINE(clang-analyzer-optin.performance.Padding) */
struct PFFFT_Setup {
    uint N{};
    uint Ncvec{}; /* nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL) */
    std::array<uint,15> ifac{};
    pffft_transform_t transform{};

    float *twiddle{}; /* N/4 elements */
    al::span<v4sf> e; /* N/4*3 elements */

    alignas(V4sfAlignment) std::byte end;
};
PFFFTSetupPtr pffft_new_setup(unsigned int N, pffft_transform_t transform)
{
    assert(transform == PFFFT_REAL || transform == PFFFT_COMPLEX);

    /* unfortunately, the fft size must be a multiple of 16 for complex FFTs
     * and 32 for real FFTs -- a lot of stuff would need to be rewritten to
     * handle other cases (or maybe just switch to a scalar fft, I don't know..)
     */
    if(transform == PFFFT_REAL)
        assert((N%(2*SimdSize*SimdSize)) == 0);
    else
        assert((N%(SimdSize*SimdSize)) == 0);

    const uint Ncvec{(transform == PFFFT_REAL ? N/2 : N) / SimdSize};

    const size_t storelen{std::max(offsetof(PFFFT_Setup, end) + 2_zu*Ncvec*sizeof(v4sf),
        sizeof(PFFFT_Setup))};
    auto storage = static_cast<gsl::owner<std::byte*>>(::operator new[](storelen, V4sfAlignVal));
    al::span extrastore{&storage[offsetof(PFFFT_Setup, end)], 2_zu*Ncvec*sizeof(v4sf)};

    PFFFTSetupPtr s{::new(storage) PFFFT_Setup{}};
    s->transform = transform;
    s->N = N;
    s->Ncvec = Ncvec;

    const size_t ecount{2_zu*Ncvec*(SimdSize-1)/SimdSize};
    s->e = {std::launder(reinterpret_cast<v4sf*>(extrastore.data())), ecount};
    s->twiddle = std::launder(reinterpret_cast<float*>(&extrastore[ecount*sizeof(v4sf)]));

    if constexpr(SimdSize > 1)
    {
        auto e = std::vector<float>(s->e.size()*SimdSize, 0.0f);
        for(size_t k{0};k < s->Ncvec;++k)
        {
            const size_t i{k / SimdSize};
            const size_t j{k % SimdSize};
            for(size_t m{0};m < SimdSize-1;++m)
            {
                const double A{-2.0*al::numbers::pi*static_cast<double>((m+1)*k) / N};
                e[((i*3 + m)*2 + 0)*SimdSize + j] = static_cast<float>(std::cos(A));
                e[((i*3 + m)*2 + 1)*SimdSize + j] = static_cast<float>(std::sin(A));
            }
        }
        std::memcpy(s->e.data(), e.data(), e.size()*sizeof(float));
    }
    if(transform == PFFFT_REAL)
        rffti1_ps(N/SimdSize, s->twiddle, s->ifac);
    else
        cffti1_ps(N/SimdSize, s->twiddle, s->ifac);

    /* check that N is decomposable with allowed prime factors */
    uint m{1};
    for(size_t k{0};k < s->ifac[1];++k)
        m *= s->ifac[2+k];
    if(m != N/SimdSize)
        s = nullptr;

    return s;
}
void pffft_destroy_setup(gsl::owner<PFFFT_Setup*> s) noexcept
{
    std::destroy_at(s);
    ::operator delete[](gsl::owner<void*>{s}, V4sfAlignVal);
}
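/* Illustrative usage of the setup lifetime (PFFFTSetupPtr and the public
 * declarations are assumed to come from pffft.h):
 *
 *   auto setup = pffft_new_setup(1024, PFFFT_REAL);
 *   if(!setup) { } // N wasn't decomposable into the supported 2/3/4/5 factors
 *   ... run forward/backward transforms with the setup ...
 *   // the owning pointer is expected to release its storage via pffft_destroy_setup()
 */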
#if !defined(PFFFT_SIMD_DISABLE)

/* [0 0 1 2 3 4 5 6 7 8] -> [0 8 7 6 5 4 3 2 1] */
void reversed_copy(const size_t N, const v4sf *in, const int in_stride, v4sf *RESTRICT out)
{
    v4sf g0, g1;
    interleave2(in[0], in[1], g0, g1);
    in += in_stride;

    *--out = vswaphl(g0, g1); // [g0l, g0h], [g1l g1h] -> [g1l, g0h]
    for(size_t k{1};k < N;++k)
    {
        v4sf h0, h1;
        interleave2(in[0], in[1], h0, h1);
        in += in_stride;
        *--out = vswaphl(g1, h0);
        *--out = vswaphl(h0, h1);
        g1 = h1;
    }
    *--out = vswaphl(g1, g0);
}
void unreversed_copy(const size_t N, const v4sf *in, v4sf *RESTRICT out, const int out_stride)
{
    v4sf g0{in[0]}, g1{g0};
    ++in;
    for(size_t k{1};k < N;++k)
    {
        v4sf h0{*in++}; v4sf h1{*in++};
        g1 = vswaphl(g1, h0);
        h0 = vswaphl(h0, h1);
        uninterleave2(h0, g1, out[0], out[1]);
        out += out_stride;
        g1 = h1;
    }
    v4sf h0{*in++}, h1{g0};
    g1 = vswaphl(g1, h0);
    h0 = vswaphl(h0, h1);
    uninterleave2(h0, g1, out[0], out[1]);
}
void pffft_cplx_finalize(const size_t Ncvec, const v4sf *in, v4sf *RESTRICT out, const v4sf *e)
{
    assert(in != out);

    const size_t dk{Ncvec/SimdSize}; // number of 4x4 matrix blocks
    for(size_t k{0};k < dk;++k)
    {
        v4sf r0{in[8*k+0]}, i0{in[8*k+1]};
        v4sf r1{in[8*k+2]}, i1{in[8*k+3]};
        v4sf r2{in[8*k+4]}, i2{in[8*k+5]};
        v4sf r3{in[8*k+6]}, i3{in[8*k+7]};
        vtranspose4(r0,r1,r2,r3);
        vtranspose4(i0,i1,i2,i3);
        vcplxmul(r1,i1,e[k*6+0],e[k*6+1]);
        vcplxmul(r2,i2,e[k*6+2],e[k*6+3]);
        vcplxmul(r3,i3,e[k*6+4],e[k*6+5]);

        v4sf sr0{vadd(r0,r2)}, dr0{vsub(r0, r2)};
        v4sf sr1{vadd(r1,r3)}, dr1{vsub(r1, r3)};
        v4sf si0{vadd(i0,i2)}, di0{vsub(i0, i2)};
        v4sf si1{vadd(i1,i3)}, di1{vsub(i1, i3)};

        /* transformation for each column is:
         *
         * [1  1  1  1  0  0  0  0]   [r0]
         * [1  0 -1  0  0 -1  0  1]   [r1]
         * [1 -1  1 -1  0  0  0  0]   [r2]
         * [1  0 -1  0  0  1  0 -1]   [r3]
         * [0  0  0  0  1  1  1  1] * [i0]
         * [0  1  0 -1  1  0 -1  0]   [i1]
         * [0  0  0  0  1 -1  1 -1]   [i2]
         * [0 -1  0  1  1  0 -1  0]   [i3]
         */

        r0 = vadd(sr0, sr1); i0 = vadd(si0, si1);
        r1 = vadd(dr0, di1); i1 = vsub(di0, dr1);
        r2 = vsub(sr0, sr1); i2 = vsub(si0, si1);
        r3 = vsub(dr0, di1); i3 = vadd(di0, dr1);

        *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1;
        *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3;
    }
}
void pffft_cplx_preprocess(const size_t Ncvec, const v4sf *in, v4sf *RESTRICT out, const v4sf *e)
{
    assert(in != out);

    const size_t dk{Ncvec/SimdSize}; // number of 4x4 matrix blocks
    for(size_t k{0};k < dk;++k)
    {
        v4sf r0{in[8*k+0]}, i0{in[8*k+1]};
        v4sf r1{in[8*k+2]}, i1{in[8*k+3]};
        v4sf r2{in[8*k+4]}, i2{in[8*k+5]};
        v4sf r3{in[8*k+6]}, i3{in[8*k+7]};

        v4sf sr0{vadd(r0,r2)}, dr0{vsub(r0, r2)};
        v4sf sr1{vadd(r1,r3)}, dr1{vsub(r1, r3)};
        v4sf si0{vadd(i0,i2)}, di0{vsub(i0, i2)};
        v4sf si1{vadd(i1,i3)}, di1{vsub(i1, i3)};

        r0 = vadd(sr0, sr1); i0 = vadd(si0, si1);
        r1 = vsub(dr0, di1); i1 = vadd(di0, dr1);
        r2 = vsub(sr0, sr1); i2 = vsub(si0, si1);
        r3 = vadd(dr0, di1); i3 = vsub(di0, dr1);

        vcplxmulconj(r1,i1,e[k*6+0],e[k*6+1]);
        vcplxmulconj(r2,i2,e[k*6+2],e[k*6+3]);
        vcplxmulconj(r3,i3,e[k*6+4],e[k*6+5]);

        vtranspose4(r0,r1,r2,r3);
        vtranspose4(i0,i1,i2,i3);

        *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1;
        *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3;
    }
}
force_inline void pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, const v4sf *in,
    const v4sf *e, v4sf *RESTRICT out)
{
    v4sf r0{*in0}, i0{*in1};
    v4sf r1{*in++}; v4sf i1{*in++};
    v4sf r2{*in++}; v4sf i2{*in++};
    v4sf r3{*in++}; v4sf i3{*in++};
    vtranspose4(r0,r1,r2,r3);
    vtranspose4(i0,i1,i2,i3);

    /* transformation for each column is:
     *
     * [1  1  1  1  0  0  0  0]   [r0]
     * [1  0 -1  0  0 -1  0  1]   [r1]
     * [1  0 -1  0  0  1  0 -1]   [r2]
     * [1 -1  1 -1  0  0  0  0]   [r3]
     * [0  0  0  0  1  1  1  1] * [i0]
     * [0 -1  0  1 -1  0  1  0]   [i1]
     * [0 -1  0  1  1  0 -1  0]   [i2]
     * [0  0  0  0 -1  1 -1  1]   [i3]
     */

    //cerr << "matrix initial, before e , REAL:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
    //cerr << "matrix initial, before e, IMAG :\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";

    vcplxmul(r1,i1,e[0],e[1]);
    vcplxmul(r2,i2,e[2],e[3]);
    vcplxmul(r3,i3,e[4],e[5]);

    //cerr << "matrix initial, real part:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
    //cerr << "matrix initial, imag part:\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";

    v4sf sr0{vadd(r0,r2)}, dr0{vsub(r0,r2)};
    v4sf sr1{vadd(r1,r3)}, dr1{vsub(r3,r1)};
    v4sf si0{vadd(i0,i2)}, di0{vsub(i0,i2)};
    v4sf si1{vadd(i1,i3)}, di1{vsub(i3,i1)};

    r0 = vadd(sr0, sr1);
    r3 = vsub(sr0, sr1);
    i0 = vadd(si0, si1);
    i3 = vsub(si1, si0);
    r1 = vadd(dr0, di1);
    r2 = vsub(dr0, di1);
    i1 = vsub(dr1, di0);
    i2 = vadd(dr1, di0);

    *out++ = r0;
    *out++ = i0;
    *out++ = r1;
    *out++ = i1;
    *out++ = r2;
    *out++ = i2;
    *out++ = r3;
    *out++ = i3;
}
NOINLINE void pffft_real_finalize(const size_t Ncvec, const v4sf *in, v4sf *RESTRICT out,
    const v4sf *e)
{
    static constexpr float s{al::numbers::sqrt2_v<float>/2.0f};

    assert(in != out);

    const size_t dk{Ncvec/SimdSize}; // number of 4x4 matrix blocks
    /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */

    const v4sf zero{vzero()};
    const auto cr = al::bit_cast<std::array<float,SimdSize>>(in[0]);
    const auto ci = al::bit_cast<std::array<float,SimdSize>>(in[Ncvec*2-1]);
    pffft_real_finalize_4x4(&zero, &zero, in+1, e, out);

    /* [cr0 cr1 cr2 cr3 ci0 ci1 ci2 ci3]
     *
     * [Xr(1)]  ] [1   1   1   1   0   0   0   0]
     * [Xr(N/4) ] [0   0   0   0   1   s   0  -s]
     * [Xr(N/2) ] [1   0  -1   0   0   0   0   0]
     * [Xr(3N/4)] [0   0   0   0   1  -s   0   s]
     * [Xi(1)   ] [1  -1   1  -1   0   0   0   0]
     * [Xi(N/4) ] [0   0   0   0   0  -s  -1  -s]
     * [Xi(N/2) ] [0  -1   0   1   0   0   0   0]
     * [Xi(3N/4)] [0   0   0   0   0  -s   1  -s]
     */

    const float xr0{(cr[0]+cr[2]) + (cr[1]+cr[3])}; out[0] = vinsert0(out[0], xr0);
    const float xi0{(cr[0]+cr[2]) - (cr[1]+cr[3])}; out[1] = vinsert0(out[1], xi0);
    const float xr2{(cr[0]-cr[2])};                 out[4] = vinsert0(out[4], xr2);
    const float xi2{(cr[3]-cr[1])};                 out[5] = vinsert0(out[5], xi2);
    const float xr1{ ci[0] + s*(ci[1]-ci[3])};      out[2] = vinsert0(out[2], xr1);
    const float xi1{-ci[2] - s*(ci[1]+ci[3])};      out[3] = vinsert0(out[3], xi1);
    const float xr3{ ci[0] - s*(ci[1]-ci[3])};      out[6] = vinsert0(out[6], xr3);
    const float xi3{ ci[2] - s*(ci[1]+ci[3])};      out[7] = vinsert0(out[7], xi3);

    for(size_t k{1};k < dk;++k)
        pffft_real_finalize_4x4(&in[8*k-1], &in[8*k+0], in + 8*k+1, e + k*6, out + k*8);
}
force_inline void pffft_real_preprocess_4x4(const v4sf *in, const v4sf *e, v4sf *RESTRICT out,
    const bool first)
{
    v4sf r0{in[0]}, i0{in[1]}, r1{in[2]}, i1{in[3]};
    v4sf r2{in[4]}, i2{in[5]}, r3{in[6]}, i3{in[7]};

    /* transformation for each column is:
     *
     * [1   1   1   1   0   0   0   0]   [r0]
     * [1   0   0  -1   0  -1  -1   0]   [r1]
     * [1  -1  -1   1   0   0   0   0]   [r2]
     * [1   0   0  -1   0   1   1   0]   [r3]
     * [0   0   0   0   1  -1   1  -1] * [i0]
     * [0  -1   1   0   1   0   0   1]   [i1]
     * [0   0   0   0   1   1  -1  -1]   [i2]
     * [0   1  -1   0   1   0   0   1]   [i3]
     */

    v4sf sr0{vadd(r0,r3)}, dr0{vsub(r0,r3)};
    v4sf sr1{vadd(r1,r2)}, dr1{vsub(r1,r2)};
    v4sf si0{vadd(i0,i3)}, di0{vsub(i0,i3)};
    v4sf si1{vadd(i1,i2)}, di1{vsub(i1,i2)};

    r0 = vadd(sr0, sr1);
    r2 = vsub(sr0, sr1);
    r1 = vsub(dr0, si1);
    r3 = vadd(dr0, si1);
    i0 = vsub(di0, di1);
    i2 = vadd(di0, di1);
    i1 = vsub(si0, dr1);
    i3 = vadd(si0, dr1);

    vcplxmulconj(r1,i1,e[0],e[1]);
    vcplxmulconj(r2,i2,e[2],e[3]);
    vcplxmulconj(r3,i3,e[4],e[5]);

    vtranspose4(r0,r1,r2,r3);
    vtranspose4(i0,i1,i2,i3);

    if(!first)
    {
        *out++ = r0;
        *out++ = i0;
    }
    *out++ = r1;
    *out++ = i1;
    *out++ = r2;
    *out++ = i2;
    *out++ = r3;
    *out++ = i3;
}
void pffft_real_preprocess(const size_t Ncvec
, const v4sf
*in
, v4sf
*RESTRICT out
,
1814 static constexpr float sqrt2
{al::numbers::sqrt2_v
<float>};
1817 const size_t dk
{Ncvec
/SimdSize
}; // number of 4x4 matrix blocks
1818 /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */
1820 std::array
<float,SimdSize
> Xr
{}, Xi
{};
1821 for(size_t k
{0};k
< SimdSize
;++k
)
1823 Xr
[k
] = vextract0(in
[2*k
]);
1824 Xi
[k
] = vextract0(in
[2*k
+ 1]);
1827 pffft_real_preprocess_4x4(in
, e
, out
+1, true); // will write only 6 values
1829 /* [Xr0 Xr1 Xr2 Xr3 Xi0 Xi1 Xi2 Xi3]
1831 * [cr0] [1 0 2 0 1 0 0 0]
1832 * [cr1] [1 0 0 0 -1 0 -2 0]
1833 * [cr2] [1 0 -2 0 1 0 0 0]
1834 * [cr3] [1 0 0 0 -1 0 2 0]
1835 * [ci0] [0 2 0 2 0 0 0 0]
1836 * [ci1] [0 s 0 -s 0 -s 0 -s]
1837 * [ci2] [0 0 0 0 0 -2 0 2]
1838 * [ci3] [0 -s 0 s 0 -s 0 -s]
1840 for(size_t k
{1};k
< dk
;++k
)
1841 pffft_real_preprocess_4x4(in
+8*k
, e
+ k
*6, out
-1+k
*8, false);
1843 const float cr0
{(Xr
[0]+Xi
[0]) + 2*Xr
[2]};
1844 const float cr1
{(Xr
[0]-Xi
[0]) - 2*Xi
[2]};
1845 const float cr2
{(Xr
[0]+Xi
[0]) - 2*Xr
[2]};
1846 const float cr3
{(Xr
[0]-Xi
[0]) + 2*Xi
[2]};
1847 out
[0] = vset4(cr0
, cr1
, cr2
, cr3
);
1848 const float ci0
{ 2*(Xr
[1]+Xr
[3])};
1849 const float ci1
{ sqrt2
*(Xr
[1]-Xr
[3]) - sqrt2
*(Xi
[1]+Xi
[3])};
1850 const float ci2
{ 2*(Xi
[3]-Xi
[1])};
1851 const float ci3
{-sqrt2
*(Xr
[1]-Xr
[3]) - sqrt2
*(Xi
[1]+Xi
[3])};
1852 out
[2*Ncvec
-1] = vset4(ci0
, ci1
, ci2
, ci3
);
void pffft_transform_internal(const PFFFT_Setup *setup, const v4sf *vinput, v4sf *voutput,
    v4sf *scratch, const pffft_direction_t direction, const bool ordered)
{
    assert(scratch != nullptr);
    assert(voutput != scratch);

    const size_t Ncvec{setup->Ncvec};
    const bool nf_odd{(setup->ifac[1]&1) != 0};

    std::array buff{voutput, scratch};
    bool ib{nf_odd != ordered};
    if(direction == PFFFT_FORWARD)
    {
        /* Swap the initial work buffer for forward FFTs, which helps avoid an
         * extra copy for output.
         */
        ib = !ib;
        if(setup->transform == PFFFT_REAL)
        {
            ib = (rfftf1_ps(Ncvec*2, vinput, buff[ib], buff[!ib], setup->twiddle,
                setup->ifac) == buff[1]);
            pffft_real_finalize(Ncvec, buff[ib], buff[!ib], setup->e.data());
        }
        else
        {
            v4sf *tmp{buff[ib]};
            for(size_t k=0; k < Ncvec; ++k)
                uninterleave2(vinput[k*2], vinput[k*2+1], tmp[k*2], tmp[k*2+1]);

            ib = (cfftf1_ps(Ncvec, buff[ib], buff[!ib], buff[ib], setup->twiddle, setup->ifac,
                -1.0f) == buff[1]);
            pffft_cplx_finalize(Ncvec, buff[ib], buff[!ib], setup->e.data());
        }
        if(ordered)
            pffft_zreorder(setup, reinterpret_cast<float*>(buff[!ib]),
                reinterpret_cast<float*>(buff[ib]), PFFFT_FORWARD);
        else
            ib = !ib;
    }
    else
    {
        if(vinput == buff[ib])
            ib = !ib; // may happen when finput == foutput

        if(ordered)
        {
            pffft_zreorder(setup, reinterpret_cast<const float*>(vinput),
                reinterpret_cast<float*>(buff[ib]), PFFFT_BACKWARD);
            vinput = buff[ib];
            ib = !ib;
        }
        if(setup->transform == PFFFT_REAL)
        {
            pffft_real_preprocess(Ncvec, vinput, buff[ib], setup->e.data());
            ib = (rfftb1_ps(Ncvec*2, buff[ib], buff[0], buff[1], setup->twiddle,
                setup->ifac) == buff[1]);
        }
        else
        {
            pffft_cplx_preprocess(Ncvec, vinput, buff[ib], setup->e.data());
            ib = (cfftf1_ps(Ncvec, buff[ib], buff[0], buff[1], setup->twiddle, setup->ifac,
                +1.0f) == buff[1]);
            for(size_t k{0};k < Ncvec;++k)
                interleave2(buff[ib][k*2], buff[ib][k*2+1], buff[ib][k*2], buff[ib][k*2+1]);
        }
    }

    if(buff[ib] != voutput)
    {
        /* extra copy required -- this situation should only happen when finput == foutput */
        assert(vinput == voutput);
        for(size_t k{0};k < Ncvec;++k)
        {
            v4sf a{buff[ib][2*k]}, b{buff[ib][2*k+1]};
            voutput[2*k] = a; voutput[2*k+1] = b;
        }
    }
}
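
/* A note on the double-buffering above: buff[0] is the caller's output and
 * buff[1] is the scratch buffer. Each FFT pass returns a pointer to whichever
 * of the two buffers it left its result in, and `ib` tracks that, so in the
 * common case the final pass lands directly in the output and the trailing
 * copy loop is skipped.
 */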
void pffft_zreorder(const PFFFT_Setup *setup, const float *in, float *out,
    pffft_direction_t direction)
{
    const size_t N{setup->N}, Ncvec{setup->Ncvec};
    const v4sf *vin{reinterpret_cast<const v4sf*>(in)};
    v4sf *RESTRICT vout{reinterpret_cast<v4sf*>(out)};
    if(setup->transform == PFFFT_REAL)
    {
        const size_t dk{N/32};
        if(direction == PFFFT_FORWARD)
        {
            for(size_t k{0};k < dk;++k)
            {
                interleave2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0],
                    vout[2*(0*dk + k) + 1]);
                interleave2(vin[k*8 + 4], vin[k*8 + 5], vout[2*(2*dk + k) + 0],
                    vout[2*(2*dk + k) + 1]);
            }
            reversed_copy(dk, vin+2, 8, vout + N/SimdSize/2);
            reversed_copy(dk, vin+6, 8, vout + N/SimdSize);
        }
        else
        {
            for(size_t k{0};k < dk;++k)
            {
                uninterleave2(vin[2*(0*dk + k) + 0], vin[2*(0*dk + k) + 1], vout[k*8 + 0],
                    vout[k*8 + 1]);
                uninterleave2(vin[2*(2*dk + k) + 0], vin[2*(2*dk + k) + 1], vout[k*8 + 4],
                    vout[k*8 + 5]);
            }
            unreversed_copy(dk, vin + N/SimdSize/4, vout + N/SimdSize - 6, -8);
            unreversed_copy(dk, vin + 3_uz*N/SimdSize/4, vout + N/SimdSize - 2, -8);
        }
    }
    else
    {
        if(direction == PFFFT_FORWARD)
        {
            for(size_t k{0};k < Ncvec;++k)
            {
                size_t kk{(k/4) + (k%4)*(Ncvec/4)};
                interleave2(vin[k*2], vin[k*2+1], vout[kk*2], vout[kk*2+1]);
            }
        }
        else
        {
            for(size_t k{0};k < Ncvec;++k)
            {
                size_t kk{(k/4) + (k%4)*(Ncvec/4)};
                uninterleave2(vin[kk*2], vin[kk*2+1], vout[k*2], vout[k*2+1]);
            }
        }
    }
}
void pffft_zconvolve_scale_accumulate(const PFFFT_Setup *s, const float *a, const float *b,
    float *ab, float scaling)
{
    const size_t Ncvec{s->Ncvec};
    const v4sf *RESTRICT va{reinterpret_cast<const v4sf*>(a)};
    const v4sf *RESTRICT vb{reinterpret_cast<const v4sf*>(b)};
    v4sf *RESTRICT vab{reinterpret_cast<v4sf*>(ab)};

#ifdef __arm__
    __builtin_prefetch(va);
    __builtin_prefetch(vb);
    __builtin_prefetch(vab);
    __builtin_prefetch(va+2);
    __builtin_prefetch(vb+2);
    __builtin_prefetch(vab+2);
    __builtin_prefetch(va+4);
    __builtin_prefetch(vb+4);
    __builtin_prefetch(vab+4);
    __builtin_prefetch(va+6);
    __builtin_prefetch(vb+6);
    __builtin_prefetch(vab+6);
#ifndef __clang__
#define ZCONVOLVE_USING_INLINE_NEON_ASM
#endif
#endif

    const float ar1{vextract0(va[0])};
    const float ai1{vextract0(va[1])};
    const float br1{vextract0(vb[0])};
    const float bi1{vextract0(vb[1])};
    const float abr1{vextract0(vab[0])};
    const float abi1{vextract0(vab[1])};

#ifdef ZCONVOLVE_USING_INLINE_ASM
    /* Inline asm version, unfortunately miscompiled by clang 3.2, at least on
     * Ubuntu. So this will be restricted to GCC.
     *
     * Does it still miscompile with Clang? Is it even needed with today's
     * optimizers?
     */
    const float *a_{a}, *b_{b}; float *ab_{ab};
    size_t N{Ncvec};
    asm volatile("mov         r8, %2                  \n"
                 "vdup.f32    q15, %4                 \n"
                 "1:                                  \n"
                 "pld         [%0,#64]                \n"
                 "pld         [%1,#64]                \n"
                 "pld         [%2,#64]                \n"
                 "pld         [%2,#96]                \n"
                 "vld1.f32    {q0,q1},   [%0,:128]!   \n"
                 "vld1.f32    {q4,q5},   [%1,:128]!   \n"
                 "vld1.f32    {q2,q3},   [%0,:128]!   \n"
                 "vld1.f32    {q6,q7},   [%1,:128]!   \n"
                 "vld1.f32    {q8,q9},   [r8,:128]!   \n"
                 "vmul.f32    q10, q0, q4             \n"
                 "vmul.f32    q11, q0, q5             \n"
                 "vmul.f32    q12, q2, q6             \n"
                 "vmul.f32    q13, q2, q7             \n"
                 "vmls.f32    q10, q1, q5             \n"
                 "vmla.f32    q11, q1, q4             \n"
                 "vld1.f32    {q0,q1},   [r8,:128]!   \n"
                 "vmls.f32    q12, q3, q7             \n"
                 "vmla.f32    q13, q3, q6             \n"
                 "vmla.f32    q8, q10, q15            \n"
                 "vmla.f32    q9, q11, q15            \n"
                 "vmla.f32    q0, q12, q15            \n"
                 "vmla.f32    q1, q13, q15            \n"
                 "vst1.f32    {q8,q9},   [%2,:128]!   \n"
                 "vst1.f32    {q0,q1},   [%2,:128]!   \n"
                 "subs        %3, #2                  \n"
                 "bne         1b                      \n"
        : "+r"(a_), "+r"(b_), "+r"(ab_), "+r"(N) : "r"(scaling)
        : "r8", "q0","q1","q2","q3","q4","q5","q6","q7","q8","q9",
          "q10","q11","q12","q13","q15","memory");
#else
    /* Default routine, works fine for non-arm cpus with current compilers. */
    const v4sf vscal{ld_ps1(scaling)};
    for(size_t i{0};i < Ncvec;i += 2)
    {
        v4sf ar4{va[2*i+0]}, ai4{va[2*i+1]};
        v4sf br4{vb[2*i+0]}, bi4{vb[2*i+1]};
        vcplxmul(ar4, ai4, br4, bi4);
        vab[2*i+0] = vmadd(ar4, vscal, vab[2*i+0]);
        vab[2*i+1] = vmadd(ai4, vscal, vab[2*i+1]);
        ar4 = va[2*i+2]; ai4 = va[2*i+3];
        br4 = vb[2*i+2]; bi4 = vb[2*i+3];
        vcplxmul(ar4, ai4, br4, bi4);
        vab[2*i+2] = vmadd(ar4, vscal, vab[2*i+2]);
        vab[2*i+3] = vmadd(ai4, vscal, vab[2*i+3]);
    }
#endif

    if(s->transform == PFFFT_REAL)
    {
        vab[0] = vinsert0(vab[0], abr1 + ar1*br1*scaling);
        vab[1] = vinsert0(vab[1], abi1 + ai1*bi1*scaling);
    }
}
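
/* An illustrative use of the accumulating convolve (a sketch, not part of this
 * file's API surface): both inputs are transformed with the unordered
 * pffft_transform(), multiplied and accumulated in the z-domain, then
 * transformed back. The accumulator must be cleared first since the function
 * only adds, and a 1/N scale is the usual choice for a normalized round trip.
 *
 *   pffft_transform(setup, x, fx, work, PFFFT_FORWARD);
 *   pffft_transform(setup, h, fh, work, PFFFT_FORWARD);
 *   std::fill_n(fy, N, 0.0f);
 *   pffft_zconvolve_scale_accumulate(setup, fx, fh, fy, 1.0f/static_cast<float>(N));
 *   pffft_transform(setup, fy, y, work, PFFFT_BACKWARD);
 */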
void pffft_zconvolve_accumulate(const PFFFT_Setup *s, const float *a, const float *b, float *ab)
{
    const size_t Ncvec{s->Ncvec};
    const v4sf *RESTRICT va{reinterpret_cast<const v4sf*>(a)};
    const v4sf *RESTRICT vb{reinterpret_cast<const v4sf*>(b)};
    v4sf *RESTRICT vab{reinterpret_cast<v4sf*>(ab)};

#ifdef __arm__
    __builtin_prefetch(va);
    __builtin_prefetch(vb);
    __builtin_prefetch(vab);
    __builtin_prefetch(va+2);
    __builtin_prefetch(vb+2);
    __builtin_prefetch(vab+2);
    __builtin_prefetch(va+4);
    __builtin_prefetch(vb+4);
    __builtin_prefetch(vab+4);
    __builtin_prefetch(va+6);
    __builtin_prefetch(vb+6);
    __builtin_prefetch(vab+6);
#endif

    const float ar1{vextract0(va[0])};
    const float ai1{vextract0(va[1])};
    const float br1{vextract0(vb[0])};
    const float bi1{vextract0(vb[1])};
    const float abr1{vextract0(vab[0])};
    const float abi1{vextract0(vab[1])};

    /* No inline assembly for this version. I'm not familiar enough with NEON
     * assembly, and I don't know that it's needed with today's optimizers.
     */
    for(size_t i{0};i < Ncvec;i += 2)
    {
        v4sf ar4{va[2*i+0]}, ai4{va[2*i+1]};
        v4sf br4{vb[2*i+0]}, bi4{vb[2*i+1]};
        vcplxmul(ar4, ai4, br4, bi4);
        vab[2*i+0] = vadd(ar4, vab[2*i+0]);
        vab[2*i+1] = vadd(ai4, vab[2*i+1]);
        ar4 = va[2*i+2]; ai4 = va[2*i+3];
        br4 = vb[2*i+2]; bi4 = vb[2*i+3];
        vcplxmul(ar4, ai4, br4, bi4);
        vab[2*i+2] = vadd(ar4, vab[2*i+2]);
        vab[2*i+3] = vadd(ai4, vab[2*i+3]);
    }

    if(s->transform == PFFFT_REAL)
    {
        vab[0] = vinsert0(vab[0], abr1 + ar1*br1);
        vab[1] = vinsert0(vab[1], abi1 + ai1*bi1);
    }
}
void pffft_transform(const PFFFT_Setup *setup, const float *input, float *output, float *work,
    pffft_direction_t direction)
{
    assert(valigned(input) && valigned(output) && valigned(work));
    pffft_transform_internal(setup, reinterpret_cast<const v4sf*>(al::assume_aligned<16>(input)),
        reinterpret_cast<v4sf*>(al::assume_aligned<16>(output)),
        reinterpret_cast<v4sf*>(al::assume_aligned<16>(work)), direction, false);
}
void pffft_transform_ordered(const PFFFT_Setup *setup, const float *input, float *output,
    float *work, pffft_direction_t direction)
{
    assert(valigned(input) && valigned(output) && valigned(work));
    pffft_transform_internal(setup, reinterpret_cast<const v4sf*>(al::assume_aligned<16>(input)),
        reinterpret_cast<v4sf*>(al::assume_aligned<16>(output)),
        reinterpret_cast<v4sf*>(al::assume_aligned<16>(work)), direction, true);
}
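
/* Typical round trip through the public entry points (a minimal sketch,
 * assuming the usual pffft helpers pffft_new_setup()/pffft_destroy_setup()
 * and 16-byte-aligned buffers, e.g. from pffft_aligned_malloc()):
 *
 *   PFFFT_Setup *setup{pffft_new_setup(1024, PFFFT_REAL)};
 *   pffft_transform_ordered(setup, input, freq, work, PFFFT_FORWARD);
 *   // ... process freq[] in canonical order ...
 *   pffft_transform_ordered(setup, freq, output, work, PFFFT_BACKWARD);
 *   pffft_destroy_setup(setup);
 *
 * The backward transform is unnormalized; scale by 1/N to recover the input.
 */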
#else // defined(PFFFT_SIMD_DISABLE)

// standard routine using scalar floats, without SIMD stuff.
void pffft_transform_internal(const PFFFT_Setup *setup, const float *input, float *output,
    float *scratch, const pffft_direction_t direction, bool ordered)
{
    const size_t Ncvec{setup->Ncvec};
    const bool nf_odd{(setup->ifac[1]&1) != 0};

    assert(scratch != nullptr);

    /* z-domain data for complex transforms is already ordered without SIMD. */
    if(setup->transform == PFFFT_COMPLEX)
        ordered = false;

    float *buff[2]{output, scratch};
    bool ib{nf_odd != ordered};
    if(direction == PFFFT_FORWARD)
    {
        if(setup->transform == PFFFT_REAL)
            ib = (rfftf1_ps(Ncvec*2, input, buff[ib], buff[!ib], setup->twiddle,
                setup->ifac) == buff[1]);
        else
            ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], setup->twiddle, setup->ifac,
                -1.0f) == buff[1]);
        if(ordered)
        {
            pffft_zreorder(setup, buff[ib], buff[!ib], PFFFT_FORWARD);
            ib = !ib;
        }
    }
    else
    {
        if(input == buff[ib])
            ib = !ib; // may happen when finput == foutput

        if(ordered)
        {
            pffft_zreorder(setup, input, buff[ib], PFFFT_BACKWARD);
            input = buff[ib];
            ib = !ib;
        }
        if(setup->transform == PFFFT_REAL)
            ib = (rfftb1_ps(Ncvec*2, input, buff[ib], buff[!ib], setup->twiddle,
                setup->ifac) == buff[1]);
        else
            ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], setup->twiddle, setup->ifac,
                +1.0f) == buff[1]);
    }

    if(buff[ib] != output)
    {
        // extra copy required -- this situation should only happen when finput == foutput
        assert(input == output);
        for(size_t k{0};k < Ncvec;++k)
        {
            float a{buff[ib][2*k]}, b{buff[ib][2*k+1]};
            output[2*k] = a; output[2*k+1] = b;
        }
    }
}
void pffft_zreorder(const PFFFT_Setup *setup, const float *in, float *RESTRICT out,
    pffft_direction_t direction)
{
    const size_t N{setup->N};
    if(setup->transform == PFFFT_COMPLEX)
    {
        for(size_t k{0};k < 2*N;++k)
            out[k] = in[k];
    }
    else if(direction == PFFFT_FORWARD)
    {
        const float x_N{in[N-1]};
        for(size_t k{N-1};k > 1;--k)
            out[k] = in[k-1];
        out[0] = in[0];
        out[1] = x_N;
    }
    else
    {
        const float x_N{in[1]};
        for(size_t k{1};k < N-1;++k)
            out[k] = in[k+1];
        out[0] = in[0];
        out[N-1] = x_N;
    }
}
void pffft_zconvolve_scale_accumulate(const PFFFT_Setup *s, const float *a, const float *b,
    float *ab, float scaling)
{
    size_t Ncvec{s->Ncvec};

    if(s->transform == PFFFT_REAL)
    {
        // take care of the fftpack ordering
        ab[0] += a[0]*b[0]*scaling;
        ab[2*Ncvec-1] += a[2*Ncvec-1]*b[2*Ncvec-1]*scaling;
        ++ab; ++a; ++b; --Ncvec;
    }
    for(size_t i{0};i < Ncvec;++i)
    {
        float ar{a[2*i+0]}, ai{a[2*i+1]};
        const float br{b[2*i+0]}, bi{b[2*i+1]};
        vcplxmul(ar, ai, br, bi);
        ab[2*i+0] += ar*scaling;
        ab[2*i+1] += ai*scaling;
    }
}
void pffft_zconvolve_accumulate(const PFFFT_Setup *s, const float *a, const float *b, float *ab)
{
    size_t Ncvec{s->Ncvec};

    if(s->transform == PFFFT_REAL)
    {
        // take care of the fftpack ordering
        ab[0] += a[0]*b[0];
        ab[2*Ncvec-1] += a[2*Ncvec-1]*b[2*Ncvec-1];
        ++ab; ++a; ++b; --Ncvec;
    }
    for(size_t i{0};i < Ncvec;++i)
    {
        float ar{a[2*i+0]}, ai{a[2*i+1]};
        const float br{b[2*i+0]}, bi{b[2*i+1]};
        vcplxmul(ar, ai, br, bi);
        ab[2*i+0] += ar;
        ab[2*i+1] += ai;
    }
}
void pffft_transform(const PFFFT_Setup *setup, const float *input, float *output, float *work,
    pffft_direction_t direction)
{
    pffft_transform_internal(setup, input, output, work, direction, false);
}
void pffft_transform_ordered(const PFFFT_Setup *setup, const float *input, float *output,
    float *work, pffft_direction_t direction)
{
    pffft_transform_internal(setup, input, output, work, direction, true);
}
#endif /* defined(PFFFT_SIMD_DISABLE) */
/* NOLINTEND(cppcoreguidelines-pro-bounds-pointer-arithmetic) */