/* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
 * Copyright (c) 2023  Christopher Robinson
 *
 * Based on original fortran 77 code from FFTPACKv4 from NETLIB
 * (http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber
 *
 * As confirmed by the NCAR fftpack software curators, the following
 * FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
 * released under the same terms.
 *
 * http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
 *
 * Copyright (c) 2004 the University Corporation for Atmospheric
 * Research ("UCAR"). All rights reserved. Developed by NCAR's
 * Computational and Information Systems Laboratory, UCAR,
 * www.cisl.ucar.edu.
 *
 * Redistribution and use of the Software in source and binary forms,
 * with or without modification, is permitted provided that the
 * following conditions are met:
 *
 * - Neither the names of NCAR's Computational and Information Systems
 * Laboratory, the University Corporation for Atmospheric Research,
 * nor the names of its sponsors or contributors may be used to
 * endorse or promote products derived from this Software without
 * specific prior written permission.
 *
 * - Redistributions of source code must retain the above copyright
 * notices, this list of conditions, and the disclaimer below.
 *
 * - Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions, and the disclaimer below in the
 * documentation and/or other materials provided with the
 * distribution.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
 * SOFTWARE.
 *
 *
 * PFFFT : a Pretty Fast FFT.
 *
 * This file is largely based on the original FFTPACK implementation, modified
 * in order to take advantage of SIMD instructions of modern CPUs.
 */
#include "pffft.h"

#include <algorithm>
#include <array>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <memory>
#include <new>
#include <utility>
#include <vector>

#include "alnumbers.h"
#include "alnumeric.h"
#include "opthelpers.h"


using uint = unsigned int;


#if defined(__GNUC__) || defined(_MSC_VER)
#define RESTRICT __restrict
#else
#define RESTRICT
#endif
/* Vector support macros: the rest of the code is independent of
 * SSE/Altivec/NEON -- adding support for other platforms with 4-element
 * vectors should be limited to these macros
 */

/* Define PFFFT_SIMD_DISABLE if you want to use scalar code instead of SIMD code */
//#define PFFFT_SIMD_DISABLE

#ifndef PFFFT_SIMD_DISABLE
/*
 * Altivec support macros
 */
#if defined(__ppc__) || defined(__ppc64__) || defined(__powerpc__) || defined(__powerpc64__)
#include <altivec.h>

using v4sf = vector float;
constexpr uint SimdSize{4};
force_inline v4sf vzero() noexcept { return (vector float)vec_splat_u8(0); }
force_inline v4sf vmul(v4sf a, v4sf b) noexcept { return vec_madd(a, b, vzero()); }
force_inline v4sf vadd(v4sf a, v4sf b) noexcept { return vec_add(a, b); }
force_inline v4sf vmadd(v4sf a, v4sf b, v4sf c) noexcept { return vec_madd(a, b, c); }
force_inline v4sf vsub(v4sf a, v4sf b) noexcept { return vec_sub(a, b); }
force_inline v4sf ld_ps1(float a) noexcept { return vec_splats(a); }

force_inline v4sf vset4(float a, float b, float c, float d) noexcept
{
    /* Is there a more efficient way to do this? */
    alignas(16) std::array<float,4> vals{{a, b, c, d}};
    return vec_ld(0, vals.data());
}
force_inline v4sf vinsert0(v4sf v, float a) noexcept { return vec_insert(a, v, 0); }
force_inline float vextract0(v4sf v) noexcept { return vec_extract(v, 0); }

force_inline void interleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
{
    out1 = vec_mergeh(in1, in2);
    out2 = vec_mergel(in1, in2);
}
force_inline void uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
{
    out1 = vec_perm(in1, in2, (vector unsigned char){0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27});
    out2 = vec_perm(in1, in2, (vector unsigned char){4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31});
}

force_inline void vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept
{
    v4sf y0{vec_mergeh(x0, x2)};
    v4sf y1{vec_mergel(x0, x2)};
    v4sf y2{vec_mergeh(x1, x3)};
    v4sf y3{vec_mergel(x1, x3)};
    x0 = vec_mergeh(y0, y2);
    x1 = vec_mergel(y0, y2);
    x2 = vec_mergeh(y1, y3);
    x3 = vec_mergel(y1, y3);
}

force_inline v4sf vswaphl(v4sf a, v4sf b) noexcept
{ return vec_perm(a,b, (vector unsigned char){16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15}); }
/*
 * SSE1 support macros
 */
#elif defined(__x86_64__) || defined(__SSE__) || defined(_M_X64) || \
    (defined(_M_IX86_FP) && _M_IX86_FP >= 1)

#include <xmmintrin.h>

using v4sf = __m128;
/* 4 floats by simd vector -- this is pretty much hardcoded in the preprocess/
 * finalize functions anyway so you will have to work if you want to enable AVX
 * with its 256-bit vectors.
 */
constexpr uint SimdSize{4};
force_inline v4sf vzero() noexcept { return _mm_setzero_ps(); }
force_inline v4sf vmul(v4sf a, v4sf b) noexcept { return _mm_mul_ps(a, b); }
force_inline v4sf vadd(v4sf a, v4sf b) noexcept { return _mm_add_ps(a, b); }
force_inline v4sf vmadd(v4sf a, v4sf b, v4sf c) noexcept { return _mm_add_ps(_mm_mul_ps(a,b), c); }
force_inline v4sf vsub(v4sf a, v4sf b) noexcept { return _mm_sub_ps(a, b); }
force_inline v4sf ld_ps1(float a) noexcept { return _mm_set1_ps(a); }

force_inline v4sf vset4(float a, float b, float c, float d) noexcept
{ return _mm_setr_ps(a, b, c, d); }
force_inline v4sf vinsert0(const v4sf v, const float a) noexcept
{ return _mm_move_ss(v, _mm_set_ss(a)); }
force_inline float vextract0(v4sf v) noexcept
{ return _mm_cvtss_f32(v); }

force_inline void interleave2(const v4sf in1, const v4sf in2, v4sf &out1, v4sf &out2) noexcept
{
    out1 = _mm_unpacklo_ps(in1, in2);
    out2 = _mm_unpackhi_ps(in1, in2);
}
force_inline void uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
{
    out1 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0));
    out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1));
}

force_inline void vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept
{ _MM_TRANSPOSE4_PS(x0, x1, x2, x3); }

force_inline v4sf vswaphl(v4sf a, v4sf b) noexcept
{ return _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0)); }
/*
 * ARM NEON support macros
 */
#elif defined(__ARM_NEON) || defined(__aarch64__) || defined(__arm64) || defined(_M_ARM64)

#include <arm_neon.h>
using v4sf = float32x4_t;
constexpr uint SimdSize{4};
force_inline v4sf vzero() noexcept { return vdupq_n_f32(0.0f); }
force_inline v4sf vmul(v4sf a, v4sf b) noexcept { return vmulq_f32(a, b); }
force_inline v4sf vadd(v4sf a, v4sf b) noexcept { return vaddq_f32(a, b); }
force_inline v4sf vmadd(v4sf a, v4sf b, v4sf c) noexcept { return vmlaq_f32(c, a, b); }
force_inline v4sf vsub(v4sf a, v4sf b) noexcept { return vsubq_f32(a, b); }
force_inline v4sf ld_ps1(float a) noexcept { return vdupq_n_f32(a); }

force_inline v4sf vset4(float a, float b, float c, float d) noexcept
{
    float32x4_t ret{vmovq_n_f32(a)};
    ret = vsetq_lane_f32(b, ret, 1);
    ret = vsetq_lane_f32(c, ret, 2);
    ret = vsetq_lane_f32(d, ret, 3);
    return ret;
}
force_inline v4sf vinsert0(v4sf v, float a) noexcept
{ return vsetq_lane_f32(a, v, 0); }
force_inline float vextract0(v4sf v) noexcept
{ return vgetq_lane_f32(v, 0); }

force_inline void interleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
{
    float32x4x2_t tmp{vzipq_f32(in1, in2)};
    out1 = tmp.val[0];
    out2 = tmp.val[1];
}
force_inline void uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
{
    float32x4x2_t tmp{vuzpq_f32(in1, in2)};
    out1 = tmp.val[0];
    out2 = tmp.val[1];
}

force_inline void vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept
{
    /* marginally faster version:
     * asm("vtrn.32 %q0, %q1;\n"
     *     "vtrn.32 %q2, %q3\n"
     *     "vswp %f0, %e2\n"
     *     "vswp %f1, %e3"
     *     : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::);
     */
    float32x4x2_t t0_{vzipq_f32(x0, x2)};
    float32x4x2_t t1_{vzipq_f32(x1, x3)};
    float32x4x2_t u0_{vzipq_f32(t0_.val[0], t1_.val[0])};
    float32x4x2_t u1_{vzipq_f32(t0_.val[1], t1_.val[1])};
    x0 = u0_.val[0];
    x1 = u0_.val[1];
    x2 = u1_.val[0];
    x3 = u1_.val[1];
}

force_inline v4sf vswaphl(v4sf a, v4sf b) noexcept
{ return vcombine_f32(vget_low_f32(b), vget_high_f32(a)); }
/*
 * Generic GCC vector macros
 */
#elif defined(__GNUC__)

using v4sf [[gnu::vector_size(16), gnu::aligned(16)]] = float;
constexpr uint SimdSize{4};
force_inline constexpr v4sf vzero() noexcept { return v4sf{0.0f, 0.0f, 0.0f, 0.0f}; }
force_inline constexpr v4sf vmul(v4sf a, v4sf b) noexcept { return a * b; }
force_inline constexpr v4sf vadd(v4sf a, v4sf b) noexcept { return a + b; }
force_inline constexpr v4sf vmadd(v4sf a, v4sf b, v4sf c) noexcept { return a*b + c; }
force_inline constexpr v4sf vsub(v4sf a, v4sf b) noexcept { return a - b; }
force_inline constexpr v4sf ld_ps1(float a) noexcept { return v4sf{a, a, a, a}; }

force_inline constexpr v4sf vset4(float a, float b, float c, float d) noexcept
{ return v4sf{a, b, c, d}; }
force_inline constexpr v4sf vinsert0(v4sf v, float a) noexcept
{ return v4sf{a, v[1], v[2], v[3]}; }
force_inline float vextract0(v4sf v) noexcept
{ return v[0]; }

force_inline v4sf unpacklo(v4sf a, v4sf b) noexcept
{ return v4sf{a[0], b[0], a[1], b[1]}; }
force_inline v4sf unpackhi(v4sf a, v4sf b) noexcept
{ return v4sf{a[2], b[2], a[3], b[3]}; }

force_inline void interleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
{
    out1 = unpacklo(in1, in2);
    out2 = unpackhi(in1, in2);
}
force_inline void uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
{
    out1 = v4sf{in1[0], in1[2], in2[0], in2[2]};
    out2 = v4sf{in1[1], in1[3], in2[1], in2[3]};
}

force_inline void vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept
{
    v4sf tmp0{unpacklo(x0, x1)};
    v4sf tmp2{unpacklo(x2, x3)};
    v4sf tmp1{unpackhi(x0, x1)};
    v4sf tmp3{unpackhi(x2, x3)};
    x0 = v4sf{tmp0[0], tmp0[1], tmp2[0], tmp2[1]};
    x1 = v4sf{tmp0[2], tmp0[3], tmp2[2], tmp2[3]};
    x2 = v4sf{tmp1[0], tmp1[1], tmp3[0], tmp3[1]};
    x3 = v4sf{tmp1[2], tmp1[3], tmp3[2], tmp3[3]};
}

force_inline v4sf vswaphl(v4sf a, v4sf b) noexcept
{ return v4sf{b[0], b[1], a[2], a[3]}; }
#else

#warning "building with simd disabled !\n";
#define PFFFT_SIMD_DISABLE // fallback to scalar code

#endif

#endif /* PFFFT_SIMD_DISABLE */
// fallback mode for situations where SIMD is not available, use scalar mode instead
#ifdef PFFFT_SIMD_DISABLE
using v4sf = float;
constexpr uint SimdSize{1};
force_inline constexpr v4sf vmul(v4sf a, v4sf b) noexcept { return a * b; }
force_inline constexpr v4sf vadd(v4sf a, v4sf b) noexcept { return a + b; }
force_inline constexpr v4sf vmadd(v4sf a, v4sf b, v4sf c) noexcept { return a*b + c; }
force_inline constexpr v4sf vsub(v4sf a, v4sf b) noexcept { return a - b; }
force_inline constexpr v4sf ld_ps1(float a) noexcept { return a; }
#endif
[[maybe_unused]] inline
auto valigned(const float *ptr) noexcept -> bool
{
    static constexpr uintptr_t alignmask{SimdSize*sizeof(float) - 1};
    return (reinterpret_cast<uintptr_t>(ptr) & alignmask) == 0;
}
// shortcuts for complex multiplications
force_inline void vcplxmul(v4sf &ar, v4sf &ai, v4sf br, v4sf bi) noexcept
{
    v4sf tmp{vmul(ar, bi)};
    ar = vsub(vmul(ar, br), vmul(ai, bi));
    ai = vmadd(ai, br, tmp);
}
force_inline void vcplxmulconj(v4sf &ar, v4sf &ai, v4sf br, v4sf bi) noexcept
{
    v4sf tmp{vmul(ar, bi)};
    ar = vmadd(ai, bi, vmul(ar, br));
    ai = vsub(vmul(ai, br), tmp);
}
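/* Note: per SIMD lane, vcplxmul() computes (ar,ai) *= (br,bi) as a complex
 * product (ar = ar*br - ai*bi, ai = ai*br + ar*bi), while vcplxmulconj()
 * multiplies by the conjugate (br,-bi) instead (ar = ar*br + ai*bi,
 * ai = ai*br - ar*bi).
 */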
#if !defined(PFFFT_SIMD_DISABLE)

inline void assertv4(const al::span<const float,4> v_f [[maybe_unused]],
    const float f0 [[maybe_unused]], const float f1 [[maybe_unused]],
    const float f2 [[maybe_unused]], const float f3 [[maybe_unused]])
{ assert(v_f[0] == f0 && v_f[1] == f1 && v_f[2] == f2 && v_f[3] == f3); }

template<typename T, T ...N>
constexpr auto make_float_array(std::integer_sequence<T,N...>)
{ return std::array{static_cast<float>(N)...}; }
/* detect bugs with the vector support macros */
[[maybe_unused]] void validate_pffft_simd()
{
    using float4 = std::array<float,4>;
    static constexpr auto f = make_float_array(std::make_index_sequence<16>{});

    v4sf a0_v{vset4(f[ 0], f[ 1], f[ 2], f[ 3])};
    v4sf a1_v{vset4(f[ 4], f[ 5], f[ 6], f[ 7])};
    v4sf a2_v{vset4(f[ 8], f[ 9], f[10], f[11])};
    v4sf a3_v{vset4(f[12], f[13], f[14], f[15])};
    v4sf t_v{vzero()}, u_v{vzero()};

    auto t_f = al::bit_cast<float4>(t_v);
    printf("VZERO=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]);
    assertv4(t_f, 0, 0, 0, 0);

    t_v = vadd(a1_v, a2_v);
    t_f = al::bit_cast<float4>(t_v);
    printf("VADD(4:7,8:11)=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]);
    assertv4(t_f, 12, 14, 16, 18);

    t_v = vmul(a1_v, a2_v);
    t_f = al::bit_cast<float4>(t_v);
    printf("VMUL(4:7,8:11)=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]);
    assertv4(t_f, 32, 45, 60, 77);

    t_v = vmadd(a1_v, a2_v, a0_v);
    t_f = al::bit_cast<float4>(t_v);
    printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]);
    assertv4(t_f, 32, 46, 62, 80);

    interleave2(a1_v, a2_v, t_v, u_v);
    t_f = al::bit_cast<float4>(t_v);
    auto u_f = al::bit_cast<float4>(u_v);
    printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n",
        t_f[0], t_f[1], t_f[2], t_f[3], u_f[0], u_f[1], u_f[2], u_f[3]);
    assertv4(t_f, 4, 8, 5, 9);
    assertv4(u_f, 6, 10, 7, 11);

    uninterleave2(a1_v, a2_v, t_v, u_v);
    t_f = al::bit_cast<float4>(t_v);
    u_f = al::bit_cast<float4>(u_v);
    printf("UNINTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n",
        t_f[0], t_f[1], t_f[2], t_f[3], u_f[0], u_f[1], u_f[2], u_f[3]);
    assertv4(t_f, 4, 6, 8, 10);
    assertv4(u_f, 5, 7, 9, 11);

    t_v = ld_ps1(f[15]);
    t_f = al::bit_cast<float4>(t_v);
    printf("LD_PS1(15)=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]);
    assertv4(t_f, 15, 15, 15, 15);

    t_v = vswaphl(a1_v, a2_v);
    t_f = al::bit_cast<float4>(t_v);
    printf("VSWAPHL(4:7,8:11)=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]);
    assertv4(t_f, 8, 9, 6, 7);

    vtranspose4(a0_v, a1_v, a2_v, a3_v);
    auto a0_f = al::bit_cast<float4>(a0_v);
    auto a1_f = al::bit_cast<float4>(a1_v);
    auto a2_f = al::bit_cast<float4>(a2_v);
    auto a3_f = al::bit_cast<float4>(a3_v);
    printf("VTRANSPOSE4(0:3,4:7,8:11,12:15)=[%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g]\n",
        a0_f[0], a0_f[1], a0_f[2], a0_f[3], a1_f[0], a1_f[1], a1_f[2], a1_f[3],
        a2_f[0], a2_f[1], a2_f[2], a2_f[3], a3_f[0], a3_f[1], a3_f[2], a3_f[3]);
    assertv4(a0_f, 0, 4, 8, 12);
    assertv4(a1_f, 1, 5, 9, 13);
    assertv4(a2_f, 2, 6, 10, 14);
    assertv4(a3_f, 3, 7, 11, 15);
}
#endif //!PFFFT_SIMD_DISABLE
/* SSE and co like 16-bytes aligned pointers */
/* with a 64-byte alignment, we are even aligned on L2 cache lines... */
constexpr auto V4sfAlignment = size_t(64);
constexpr auto V4sfAlignVal = std::align_val_t(V4sfAlignment);

/* NOLINTBEGIN(cppcoreguidelines-pro-bounds-pointer-arithmetic)
 * FIXME: Converting this from raw pointers to spans or something will probably
 * need significant work to maintain performance, given non-sequential range-
 * checked accesses and lack of 'restrict' to indicate non-aliased memory. At
 * least, some tests should be done to check the impact of using range-checked
 * spans here before blindly switching.
 */
/*
 * passf2 and passb2 have been merged here, fsign = -1 for passf2, +1 for passb2
 */
NOINLINE void passf2_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *RESTRICT ch,
    const float *const wa1, const float fsign)
{
    const size_t l1ido{l1*ido};
    if(ido <= 2)
    {
        for(size_t k{0};k < l1ido;k += ido, ch += ido, cc += 2*ido)
        {
            ch[0]         = vadd(cc[0], cc[ido+0]);
            ch[l1ido]     = vsub(cc[0], cc[ido+0]);
            ch[1]         = vadd(cc[1], cc[ido+1]);
            ch[l1ido + 1] = vsub(cc[1], cc[ido+1]);
        }
    }
    else
    {
        for(size_t k{0};k < l1ido;k += ido, ch += ido, cc += 2*ido)
        {
            for(size_t i{0};i < ido-1;i += 2)
            {
                v4sf tr2{vsub(cc[i+0], cc[i+ido+0])};
                v4sf ti2{vsub(cc[i+1], cc[i+ido+1])};
                v4sf wr{ld_ps1(wa1[i])}, wi{ld_ps1(wa1[i+1]*fsign)};
                ch[i]   = vadd(cc[i+0], cc[i+ido+0]);
                ch[i+1] = vadd(cc[i+1], cc[i+ido+1]);
                vcplxmul(tr2, ti2, wr, wi);
                ch[i+l1ido]   = tr2;
                ch[i+l1ido+1] = ti2;
            }
        }
    }
}
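/* Note that the imaginary twiddle component is pre-scaled by fsign above
 * (wi = wa1[i+1]*fsign), which is all it takes to fold the forward and
 * backward radix-2 passes into this single routine.
 */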
/*
 * passf3 and passb3 have been merged here, fsign = -1 for passf3, +1 for passb3
 */
NOINLINE void passf3_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *RESTRICT ch,
    const float *const wa1, const float fsign)
{
    const v4sf taur{ld_ps1(-0.5f)};
    const v4sf taui{ld_ps1(0.866025403784439f*fsign)};
    const size_t l1ido{l1*ido};
    const auto wa2 = wa1 + ido;
    for(size_t k{0};k < l1ido;k += ido, cc += 3*ido, ch +=ido)
    {
        for(size_t i{0};i < ido-1;i += 2)
        {
            v4sf tr2{vadd(cc[i+ido], cc[i+2*ido])};
            v4sf cr2{vmadd(taur, tr2, cc[i])};
            ch[i] = vadd(tr2, cc[i]);
            v4sf ti2{vadd(cc[i+ido+1], cc[i+2*ido+1])};
            v4sf ci2{vmadd(taur, ti2, cc[i+1])};
            ch[i+1] = vadd(cc[i+1], ti2);
            v4sf cr3{vmul(taui, vsub(cc[i+ido], cc[i+2*ido]))};
            v4sf ci3{vmul(taui, vsub(cc[i+ido+1], cc[i+2*ido+1]))};
            v4sf dr2{vsub(cr2, ci3)};
            v4sf dr3{vadd(cr2, ci3)};
            v4sf di2{vadd(ci2, cr3)};
            v4sf di3{vsub(ci2, cr3)};
            float wr1{wa1[i]}, wi1{fsign*wa1[i+1]}, wr2{wa2[i]}, wi2{fsign*wa2[i+1]};
            vcplxmul(dr2, di2, ld_ps1(wr1), ld_ps1(wi1));
            ch[i+l1ido]     = dr2;
            ch[i+l1ido + 1] = di2;
            vcplxmul(dr3, di3, ld_ps1(wr2), ld_ps1(wi2));
            ch[i+2*l1ido]   = dr3;
            ch[i+2*l1ido+1] = di3;
        }
    }
}
NOINLINE void passf4_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *RESTRICT ch,
    const float *const wa1, const float fsign)
{
    /* fsign == -1 for forward transform and +1 for backward transform */
    const v4sf vsign{ld_ps1(fsign)};
    const size_t l1ido{l1*ido};
    if(ido == 2)
    {
        for(size_t k{0};k < l1ido;k += ido, ch += ido, cc += 4*ido)
        {
            v4sf tr1{vsub(cc[0], cc[2*ido + 0])};
            v4sf tr2{vadd(cc[0], cc[2*ido + 0])};
            v4sf ti1{vsub(cc[1], cc[2*ido + 1])};
            v4sf ti2{vadd(cc[1], cc[2*ido + 1])};
            v4sf ti4{vmul(vsub(cc[1*ido + 0], cc[3*ido + 0]), vsign)};
            v4sf tr4{vmul(vsub(cc[3*ido + 1], cc[1*ido + 1]), vsign)};
            v4sf tr3{vadd(cc[ido + 0], cc[3*ido + 0])};
            v4sf ti3{vadd(cc[ido + 1], cc[3*ido + 1])};

            ch[0*l1ido + 0] = vadd(tr2, tr3);
            ch[0*l1ido + 1] = vadd(ti2, ti3);
            ch[1*l1ido + 0] = vadd(tr1, tr4);
            ch[1*l1ido + 1] = vadd(ti1, ti4);
            ch[2*l1ido + 0] = vsub(tr2, tr3);
            ch[2*l1ido + 1] = vsub(ti2, ti3);
            ch[3*l1ido + 0] = vsub(tr1, tr4);
            ch[3*l1ido + 1] = vsub(ti1, ti4);
        }
    }
    else
    {
        const auto wa2 = wa1 + ido;
        const auto wa3 = wa2 + ido;
        for(size_t k{0};k < l1ido;k += ido, ch+=ido, cc += 4*ido)
        {
            for(size_t i{0};i < ido-1;i+=2)
            {
                v4sf tr1{vsub(cc[i + 0], cc[i + 2*ido + 0])};
                v4sf tr2{vadd(cc[i + 0], cc[i + 2*ido + 0])};
                v4sf ti1{vsub(cc[i + 1], cc[i + 2*ido + 1])};
                v4sf ti2{vadd(cc[i + 1], cc[i + 2*ido + 1])};
                v4sf tr4{vmul(vsub(cc[i + 3*ido + 1], cc[i + 1*ido + 1]), vsign)};
                v4sf ti4{vmul(vsub(cc[i + 1*ido + 0], cc[i + 3*ido + 0]), vsign)};
                v4sf tr3{vadd(cc[i + ido + 0], cc[i + 3*ido + 0])};
                v4sf ti3{vadd(cc[i + ido + 1], cc[i + 3*ido + 1])};

                ch[i] = vadd(tr2, tr3);
                v4sf cr3{vsub(tr2, tr3)};
                ch[i + 1] = vadd(ti2, ti3);
                v4sf ci3{vsub(ti2, ti3)};

                v4sf cr2{vadd(tr1, tr4)};
                v4sf cr4{vsub(tr1, tr4)};
                v4sf ci2{vadd(ti1, ti4)};
                v4sf ci4{vsub(ti1, ti4)};
                float wr1{wa1[i]}, wi1{fsign*wa1[i+1]};
                vcplxmul(cr2, ci2, ld_ps1(wr1), ld_ps1(wi1));
                float wr2{wa2[i]}, wi2{fsign*wa2[i+1]};
                ch[i + l1ido]     = cr2;
                ch[i + l1ido + 1] = ci2;

                vcplxmul(cr3, ci3, ld_ps1(wr2), ld_ps1(wi2));
                float wr3{wa3[i]}, wi3{fsign*wa3[i+1]};
                ch[i + 2*l1ido]     = cr3;
                ch[i + 2*l1ido + 1] = ci3;

                vcplxmul(cr4, ci4, ld_ps1(wr3), ld_ps1(wi3));
                ch[i + 3*l1ido]     = cr4;
                ch[i + 3*l1ido + 1] = ci4;
            }
        }
    }
}
/*
 * passf5 and passb5 have been merged here, fsign = -1 for passf5, +1 for passb5
 */
NOINLINE void passf5_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *RESTRICT ch,
    const float *const wa1, const float fsign)
{
    const v4sf tr11{ld_ps1(0.309016994374947f)};
    const v4sf tr12{ld_ps1(-0.809016994374947f)};
    const v4sf ti11{ld_ps1(0.951056516295154f*fsign)};
    const v4sf ti12{ld_ps1(0.587785252292473f*fsign)};

    auto cc_ref = [&cc,ido](size_t a_1, size_t a_2) noexcept -> auto&
    { return cc[(a_2-1)*ido + a_1 + 1]; };
    auto ch_ref = [&ch,ido,l1](size_t a_1, size_t a_3) noexcept -> auto&
    { return ch[(a_3-1)*l1*ido + a_1 + 1]; };

    const auto wa2 = wa1 + ido;
    const auto wa3 = wa2 + ido;
    const auto wa4 = wa3 + ido;
    for(size_t k{0};k < l1;++k, cc += 5*ido, ch += ido)
    {
        for(size_t i{0};i < ido-1;i += 2)
        {
            v4sf ti5{vsub(cc_ref(i  , 2), cc_ref(i  , 5))};
            v4sf ti2{vadd(cc_ref(i  , 2), cc_ref(i  , 5))};
            v4sf ti4{vsub(cc_ref(i  , 3), cc_ref(i  , 4))};
            v4sf ti3{vadd(cc_ref(i  , 3), cc_ref(i  , 4))};
            v4sf tr5{vsub(cc_ref(i-1, 2), cc_ref(i-1, 5))};
            v4sf tr2{vadd(cc_ref(i-1, 2), cc_ref(i-1, 5))};
            v4sf tr4{vsub(cc_ref(i-1, 3), cc_ref(i-1, 4))};
            v4sf tr3{vadd(cc_ref(i-1, 3), cc_ref(i-1, 4))};
            ch_ref(i-1, 1) = vadd(cc_ref(i-1, 1), vadd(tr2, tr3));
            ch_ref(i  , 1) = vadd(cc_ref(i  , 1), vadd(ti2, ti3));
            v4sf cr2{vadd(cc_ref(i-1, 1), vmadd(tr11, tr2, vmul(tr12, tr3)))};
            v4sf ci2{vadd(cc_ref(i  , 1), vmadd(tr11, ti2, vmul(tr12, ti3)))};
            v4sf cr3{vadd(cc_ref(i-1, 1), vmadd(tr12, tr2, vmul(tr11, tr3)))};
            v4sf ci3{vadd(cc_ref(i  , 1), vmadd(tr12, ti2, vmul(tr11, ti3)))};
            v4sf cr5{vmadd(ti11, tr5, vmul(ti12, tr4))};
            v4sf ci5{vmadd(ti11, ti5, vmul(ti12, ti4))};
            v4sf cr4{vsub(vmul(ti12, tr5), vmul(ti11, tr4))};
            v4sf ci4{vsub(vmul(ti12, ti5), vmul(ti11, ti4))};
            v4sf dr3{vsub(cr3, ci4)};
            v4sf dr4{vadd(cr3, ci4)};
            v4sf di3{vadd(ci3, cr4)};
            v4sf di4{vsub(ci3, cr4)};
            v4sf dr5{vadd(cr2, ci5)};
            v4sf dr2{vsub(cr2, ci5)};
            v4sf di5{vsub(ci2, cr5)};
            v4sf di2{vadd(ci2, cr5)};
            float wr1{wa1[i]}, wi1{fsign*wa1[i+1]}, wr2{wa2[i]}, wi2{fsign*wa2[i+1]};
            float wr3{wa3[i]}, wi3{fsign*wa3[i+1]}, wr4{wa4[i]}, wi4{fsign*wa4[i+1]};
            vcplxmul(dr2, di2, ld_ps1(wr1), ld_ps1(wi1));
            ch_ref(i - 1, 2) = dr2;
            ch_ref(i,     2) = di2;
            vcplxmul(dr3, di3, ld_ps1(wr2), ld_ps1(wi2));
            ch_ref(i - 1, 3) = dr3;
            ch_ref(i,     3) = di3;
            vcplxmul(dr4, di4, ld_ps1(wr3), ld_ps1(wi3));
            ch_ref(i - 1, 4) = dr4;
            ch_ref(i,     4) = di4;
            vcplxmul(dr5, di5, ld_ps1(wr4), ld_ps1(wi4));
            ch_ref(i - 1, 5) = dr5;
            ch_ref(i,     5) = di5;
        }
    }
}
NOINLINE void radf2_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc,
    v4sf *RESTRICT ch, const float *const wa1)
{
    const size_t l1ido{l1*ido};
    for(size_t k{0};k < l1ido;k += ido)
    {
        v4sf a{cc[k]}, b{cc[k + l1ido]};
        ch[2*k] = vadd(a, b);
        ch[2*(k+ido)-1] = vsub(a, b);
    }
    if(ido < 2)
        return;
    if(ido != 2)
    {
        for(size_t k{0};k < l1ido;k += ido)
        {
            for(size_t i{2};i < ido;i += 2)
            {
                v4sf tr2{cc[i - 1 + k + l1ido]}, ti2{cc[i + k + l1ido]};
                v4sf br{cc[i - 1 + k]}, bi{cc[i + k]};
                vcplxmulconj(tr2, ti2, ld_ps1(wa1[i - 2]), ld_ps1(wa1[i - 1]));
                ch[i + 2*k] = vadd(bi, ti2);
                ch[2*(k+ido) - i] = vsub(ti2, bi);
                ch[i - 1 + 2*k] = vadd(br, tr2);
                ch[2*(k+ido) - i -1] = vsub(br, tr2);
            }
        }
        if((ido&1) == 1)
            return;
    }
    const v4sf minus_one{ld_ps1(-1.0f)};
    for(size_t k{0};k < l1ido;k += ido)
    {
        ch[2*k + ido] = vmul(minus_one, cc[ido-1 + k + l1ido]);
        ch[2*k + ido-1] = cc[k + ido-1];
    }
}
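/* These radf*_ps/radb*_ps passes operate on FFTPACK's halfcomplex packing of
 * the real-signal spectrum (f0r f1r f1i ... f(n)r, as noted in
 * pffft_real_finalize() further below), performing one radix factor per pass
 * with the per-pass twiddles taken from the wa table built by rffti1_ps().
 */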
NOINLINE void radb2_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *RESTRICT ch,
    const float *const wa1)
{
    const size_t l1ido{l1*ido};
    for(size_t k{0};k < l1ido;k += ido)
    {
        v4sf a{cc[2*k]};
        v4sf b{cc[2*(k+ido) - 1]};
        ch[k] = vadd(a, b);
        ch[k + l1ido] = vsub(a, b);
    }
    if(ido < 2)
        return;
    if(ido != 2)
    {
        for(size_t k{0};k < l1ido;k += ido)
        {
            for(size_t i{2};i < ido;i += 2)
            {
                v4sf a{cc[i-1 + 2*k]};
                v4sf b{cc[2*(k + ido) - i - 1]};
                v4sf c{cc[i+0 + 2*k]};
                v4sf d{cc[2*(k + ido) - i + 0]};
                ch[i-1 + k] = vadd(a, b);
                v4sf tr2{vsub(a, b)};
                ch[i+0 + k] = vsub(c, d);
                v4sf ti2{vadd(c, d)};
                vcplxmul(tr2, ti2, ld_ps1(wa1[i - 2]), ld_ps1(wa1[i - 1]));
                ch[i-1 + k + l1ido] = tr2;
                ch[i+0 + k + l1ido] = ti2;
            }
        }
        if((ido&1) == 1)
            return;
    }
    const v4sf minus_two{ld_ps1(-2.0f)};
    for(size_t k{0};k < l1ido;k += ido)
    {
        v4sf a{cc[2*k + ido-1]};
        v4sf b{cc[2*k + ido]};
        ch[k + ido-1] = vadd(a,a);
        ch[k + ido-1 + l1ido] = vmul(minus_two, b);
    }
}
void radf3_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
    const float *const wa1)
{
    const v4sf taur{ld_ps1(-0.5f)};
    const v4sf taui{ld_ps1(0.866025403784439f)};
    for(size_t k{0};k < l1;++k)
    {
        v4sf cr2{vadd(cc[(k + l1)*ido], cc[(k + 2*l1)*ido])};
        ch[        (3*k    )*ido] = vadd(cc[k*ido], cr2);
        ch[        (3*k + 2)*ido] = vmul(taui, vsub(cc[(k + l1*2)*ido], cc[(k + l1)*ido]));
        ch[ido-1 + (3*k + 1)*ido] = vmadd(taur, cr2, cc[k*ido]);
    }
    if(ido == 1)
        return;

    const auto wa2 = wa1 + ido;
    for(size_t k{0};k < l1;++k)
    {
        for(size_t i{2};i < ido;i += 2)
        {
            const size_t ic{ido - i};
            v4sf wr1{ld_ps1(wa1[i - 2])};
            v4sf wi1{ld_ps1(wa1[i - 1])};
            v4sf dr2{cc[i - 1 + (k + l1)*ido]};
            v4sf di2{cc[i     + (k + l1)*ido]};
            vcplxmulconj(dr2, di2, wr1, wi1);

            v4sf wr2{ld_ps1(wa2[i - 2])};
            v4sf wi2{ld_ps1(wa2[i - 1])};
            v4sf dr3{cc[i - 1 + (k + l1*2)*ido]};
            v4sf di3{cc[i     + (k + l1*2)*ido]};
            vcplxmulconj(dr3, di3, wr2, wi2);

            v4sf cr2{vadd(dr2, dr3)};
            v4sf ci2{vadd(di2, di3)};
            ch[i - 1 + 3*k*ido] = vadd(cc[i - 1 + k*ido], cr2);
            ch[i     + 3*k*ido] = vadd(cc[i     + k*ido], ci2);
            v4sf tr2{vmadd(taur, cr2, cc[i - 1 + k*ido])};
            v4sf ti2{vmadd(taur, ci2, cc[i     + k*ido])};
            v4sf tr3{vmul(taui, vsub(di2, di3))};
            v4sf ti3{vmul(taui, vsub(dr3, dr2))};
            ch[i  - 1 + (3*k + 2)*ido] = vadd(tr2, tr3);
            ch[ic - 1 + (3*k + 1)*ido] = vsub(tr2, tr3);
            ch[i      + (3*k + 2)*ido] = vadd(ti2, ti3);
            ch[ic     + (3*k + 1)*ido] = vsub(ti3, ti2);
        }
    }
}
void radb3_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
    const float *const wa1)
{
    static constexpr float taur{-0.5f};
    static constexpr float taui{0.866025403784439f};
    static constexpr float taui_2{taui*2.0f};

    const v4sf vtaur{ld_ps1(taur)};
    const v4sf vtaui_2{ld_ps1(taui_2)};
    for(size_t k{0};k < l1;++k)
    {
        v4sf tr2 = cc[ido-1 + (3*k + 1)*ido];
        tr2 = vadd(tr2, tr2);
        v4sf cr2 = vmadd(vtaur, tr2, cc[3*k*ido]);
        ch[k*ido] = vadd(cc[3*k*ido], tr2);
        v4sf ci3 = vmul(vtaui_2, cc[(3*k + 2)*ido]);
        ch[(k + l1)*ido] = vsub(cr2, ci3);
        ch[(k + 2*l1)*ido] = vadd(cr2, ci3);
    }
    if(ido == 1)
        return;

    const auto wa2 = wa1 + ido;
    const v4sf vtaui{ld_ps1(taui)};
    for(size_t k{0};k < l1;++k)
    {
        for(size_t i{2};i < ido;i += 2)
        {
            const size_t ic{ido - i};
            v4sf tr2{vadd(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido])};
            v4sf cr2{vmadd(vtaur, tr2, cc[i - 1 + 3*k*ido])};
            ch[i - 1 + k*ido] = vadd(cc[i - 1 + 3*k*ido], tr2);
            v4sf ti2{vsub(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido])};
            v4sf ci2{vmadd(vtaur, ti2, cc[i + 3*k*ido])};
            ch[i + k*ido] = vadd(cc[i + 3*k*ido], ti2);
            v4sf cr3{vmul(vtaui, vsub(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]))};
            v4sf ci3{vmul(vtaui, vadd(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]))};
            v4sf dr2{vsub(cr2, ci3)};
            v4sf dr3{vadd(cr2, ci3)};
            v4sf di2{vadd(ci2, cr3)};
            v4sf di3{vsub(ci2, cr3)};
            vcplxmul(dr2, di2, ld_ps1(wa1[i-2]), ld_ps1(wa1[i-1]));
            ch[i - 1 + (k + l1)*ido] = dr2;
            ch[i     + (k + l1)*ido] = di2;
            vcplxmul(dr3, di3, ld_ps1(wa2[i-2]), ld_ps1(wa2[i-1]));
            ch[i - 1 + (k + 2*l1)*ido] = dr3;
            ch[i     + (k + 2*l1)*ido] = di3;
        }
    }
}
NOINLINE void radf4_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc,
    v4sf *RESTRICT ch, const float *const wa1)
{
    const size_t l1ido{l1*ido};
    {
        const v4sf *RESTRICT cc_{cc}, *RESTRICT cc_end{cc + l1ido};
        v4sf *RESTRICT ch_{ch};
        while(cc != cc_end)
        {
            // this loop represents between 25% and 40% of total radf4_ps cost !
            v4sf a0{cc[0]}, a1{cc[l1ido]};
            v4sf a2{cc[2*l1ido]}, a3{cc[3*l1ido]};
            v4sf tr1{vadd(a1, a3)};
            v4sf tr2{vadd(a0, a2)};
            ch[2*ido-1] = vsub(a0, a2);
            ch[2*ido  ] = vsub(a3, a1);
            ch[0      ] = vadd(tr1, tr2);
            ch[4*ido-1] = vsub(tr2, tr1);
            cc += ido; ch += 4*ido;
        }
        cc = cc_;
        ch = ch_;
    }
    if(ido < 2)
        return;
    if(ido != 2)
    {
        const auto wa2 = wa1 + ido;
        const auto wa3 = wa2 + ido;

        for(size_t k{0};k < l1ido;k += ido)
        {
            const v4sf *RESTRICT pc{cc + 1 + k};
            for(size_t i{2};i < ido;i += 2, pc += 2)
            {
                const size_t ic{ido - i};

                v4sf cr2{pc[1*l1ido+0]};
                v4sf ci2{pc[1*l1ido+1]};
                v4sf wr{ld_ps1(wa1[i - 2])};
                v4sf wi{ld_ps1(wa1[i - 1])};
                vcplxmulconj(cr2,ci2,wr,wi);

                v4sf cr3{pc[2*l1ido+0]};
                v4sf ci3{pc[2*l1ido+1]};
                wr = ld_ps1(wa2[i-2]);
                wi = ld_ps1(wa2[i-1]);
                vcplxmulconj(cr3, ci3, wr, wi);

                v4sf cr4{pc[3*l1ido]};
                v4sf ci4{pc[3*l1ido+1]};
                wr = ld_ps1(wa3[i-2]);
                wi = ld_ps1(wa3[i-1]);
                vcplxmulconj(cr4, ci4, wr, wi);

                /* at this point, on SSE, five of "cr2 cr3 cr4 ci2 ci3 ci4" should be loaded in registers */

                v4sf tr1{vadd(cr2,cr4)};
                v4sf tr4{vsub(cr4,cr2)};
                v4sf tr2{vadd(pc[0],cr3)};
                v4sf tr3{vsub(pc[0],cr3)};
                ch[i - 1 + 4*k] = vadd(tr2,tr1);
                ch[ic - 1 + 4*k + 3*ido] = vsub(tr2,tr1); // at this point tr1 and tr2 can be disposed
                v4sf ti1{vadd(ci2,ci4)};
                v4sf ti4{vsub(ci2,ci4)};
                ch[i - 1 + 4*k + 2*ido] = vadd(tr3,ti4);
                ch[ic - 1 + 4*k + 1*ido] = vsub(tr3,ti4); // dispose tr3, ti4
                v4sf ti2{vadd(pc[1],ci3)};
                v4sf ti3{vsub(pc[1],ci3)};
                ch[i + 4*k] = vadd(ti1, ti2);
                ch[ic + 4*k + 3*ido] = vsub(ti1, ti2);
                ch[i + 4*k + 2*ido] = vadd(tr4, ti3);
                ch[ic + 4*k + 1*ido] = vsub(tr4, ti3);
            }
        }
        if((ido&1) == 1)
            return;
    }
    const v4sf minus_hsqt2{ld_ps1(al::numbers::sqrt2_v<float> * -0.5f)};
    for(size_t k{0};k < l1ido;k += ido)
    {
        v4sf a{cc[ido-1 + k + l1ido]}, b{cc[ido-1 + k + 3*l1ido]};
        v4sf c{cc[ido-1 + k]}, d{cc[ido-1 + k + 2*l1ido]};
        v4sf ti1{vmul(minus_hsqt2, vadd(b, a))};
        v4sf tr1{vmul(minus_hsqt2, vsub(b, a))};
        ch[ido-1 + 4*k] = vadd(c, tr1);
        ch[ido-1 + 4*k + 2*ido] = vsub(c, tr1);
        ch[4*k + 1*ido] = vsub(ti1, d);
        ch[4*k + 3*ido] = vadd(ti1, d);
    }
}
NOINLINE void radb4_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc,
    v4sf *RESTRICT ch, const float *const wa1)
{
    const v4sf two{ld_ps1(2.0f)};
    const size_t l1ido{l1*ido};
    {
        const v4sf *RESTRICT cc_{cc}, *RESTRICT ch_end{ch + l1ido};
        v4sf *RESTRICT ch_{ch};
        while(ch != ch_end)
        {
            v4sf a{cc[0]}, b{cc[4*ido-1]};
            v4sf c{cc[2*ido]}, d{cc[2*ido-1]};
            v4sf tr3{vmul(two,d)};
            v4sf tr2{vadd(a,b)};
            v4sf tr1{vsub(a,b)};
            v4sf tr4{vmul(two,c)};
            ch[0*l1ido] = vadd(tr2, tr3);
            ch[2*l1ido] = vsub(tr2, tr3);
            ch[1*l1ido] = vsub(tr1, tr4);
            ch[3*l1ido] = vadd(tr1, tr4);

            cc += 4*ido; ch += ido;
        }
        cc = cc_;
        ch = ch_;
    }
    if(ido < 2)
        return;
    if(ido != 2)
    {
        const auto wa2 = wa1 + ido;
        const auto wa3 = wa2 + ido;

        for(size_t k{0};k < l1ido;k += ido)
        {
            const v4sf *RESTRICT pc{cc - 1 + 4*k};
            v4sf *RESTRICT ph{ch + k + 1};
            for(size_t i{2};i < ido;i += 2)
            {
                v4sf tr1{vsub(pc[i], pc[4*ido - i])};
                v4sf tr2{vadd(pc[i], pc[4*ido - i])};
                v4sf ti4{vsub(pc[2*ido + i], pc[2*ido - i])};
                v4sf tr3{vadd(pc[2*ido + i], pc[2*ido - i])};
                ph[0] = vadd(tr2, tr3);
                v4sf cr3{vsub(tr2, tr3)};

                v4sf ti3{vsub(pc[2*ido + i + 1], pc[2*ido - i + 1])};
                v4sf tr4{vadd(pc[2*ido + i + 1], pc[2*ido - i + 1])};
                v4sf cr2{vsub(tr1, tr4)};
                v4sf cr4{vadd(tr1, tr4)};

                v4sf ti1{vadd(pc[i + 1], pc[4*ido - i + 1])};
                v4sf ti2{vsub(pc[i + 1], pc[4*ido - i + 1])};

                ph[1] = vadd(ti2, ti3); ph += l1ido;
                v4sf ci3{vsub(ti2, ti3)};
                v4sf ci2{vadd(ti1, ti4)};
                v4sf ci4{vsub(ti1, ti4)};
                vcplxmul(cr2, ci2, ld_ps1(wa1[i-2]), ld_ps1(wa1[i-1]));
                ph[0] = cr2;
                ph[1] = ci2; ph += l1ido;
                vcplxmul(cr3, ci3, ld_ps1(wa2[i-2]), ld_ps1(wa2[i-1]));
                ph[0] = cr3;
                ph[1] = ci3; ph += l1ido;
                vcplxmul(cr4, ci4, ld_ps1(wa3[i-2]), ld_ps1(wa3[i-1]));
                ph[0] = cr4;
                ph[1] = ci4; ph = ph - 3*l1ido + 2;
            }
        }
        if((ido&1) == 1)
            return;
    }
    const v4sf minus_sqrt2{ld_ps1(-1.414213562373095f)};
    for(size_t k{0};k < l1ido;k += ido)
    {
        const size_t i0{4*k + ido};
        v4sf c{cc[i0-1]}, d{cc[i0 + 2*ido-1]};
        v4sf a{cc[i0+0]}, b{cc[i0 + 2*ido+0]};
        v4sf tr1{vsub(c,d)};
        v4sf tr2{vadd(c,d)};
        v4sf ti1{vadd(b,a)};
        v4sf ti2{vsub(b,a)};
        ch[ido-1 + k + 0*l1ido] = vadd(tr2,tr2);
        ch[ido-1 + k + 1*l1ido] = vmul(minus_sqrt2, vsub(ti1, tr1));
        ch[ido-1 + k + 2*l1ido] = vadd(ti2, ti2);
        ch[ido-1 + k + 3*l1ido] = vmul(minus_sqrt2, vadd(ti1, tr1));
    }
}
void radf5_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
    const float *const wa1)
{
    const v4sf tr11{ld_ps1(0.309016994374947f)};
    const v4sf ti11{ld_ps1(0.951056516295154f)};
    const v4sf tr12{ld_ps1(-0.809016994374947f)};
    const v4sf ti12{ld_ps1(0.587785252292473f)};

    auto cc_ref = [&cc,l1,ido](size_t a_1, size_t a_2, size_t a_3) noexcept -> auto&
    { return cc[(a_3*l1 + a_2)*ido + a_1]; };
    auto ch_ref = [&ch,ido](size_t a_1, size_t a_2, size_t a_3) noexcept -> auto&
    { return ch[(a_3*5 + a_2)*ido + a_1]; };

    /* Parameter adjustments */
    ch -= 1 + ido * 6;
    cc -= 1 + ido * (1 + l1);

    const auto wa2 = wa1 + ido;
    const auto wa3 = wa2 + ido;
    const auto wa4 = wa3 + ido;

    /* Function Body */
    for(size_t k{1};k <= l1;++k)
    {
        v4sf cr2{vadd(cc_ref(1, k, 5), cc_ref(1, k, 2))};
        v4sf ci5{vsub(cc_ref(1, k, 5), cc_ref(1, k, 2))};
        v4sf cr3{vadd(cc_ref(1, k, 4), cc_ref(1, k, 3))};
        v4sf ci4{vsub(cc_ref(1, k, 4), cc_ref(1, k, 3))};
        ch_ref(1, 1, k) = vadd(cc_ref(1, k, 1), vadd(cr2, cr3));
        ch_ref(ido, 2, k) = vadd(cc_ref(1, k, 1), vmadd(tr11, cr2, vmul(tr12, cr3)));
        ch_ref(1, 3, k) = vmadd(ti11, ci5, vmul(ti12, ci4));
        ch_ref(ido, 4, k) = vadd(cc_ref(1, k, 1), vmadd(tr12, cr2, vmul(tr11, cr3)));
        ch_ref(1, 5, k) = vsub(vmul(ti12, ci5), vmul(ti11, ci4));
        //printf("pffft: radf5, k=%d ch_ref=%f, ci4=%f\n", k, ch_ref(1, 5, k), ci4);
    }
    if(ido == 1)
        return;

    const size_t idp2{ido + 2};
    for(size_t k{1};k <= l1;++k)
    {
        for(size_t i{3};i <= ido;i += 2)
        {
            const size_t ic{idp2 - i};
            v4sf dr2{ld_ps1(wa1[i-3])};
            v4sf di2{ld_ps1(wa1[i-2])};
            v4sf dr3{ld_ps1(wa2[i-3])};
            v4sf di3{ld_ps1(wa2[i-2])};
            v4sf dr4{ld_ps1(wa3[i-3])};
            v4sf di4{ld_ps1(wa3[i-2])};
            v4sf dr5{ld_ps1(wa4[i-3])};
            v4sf di5{ld_ps1(wa4[i-2])};
            vcplxmulconj(dr2, di2, cc_ref(i-1, k, 2), cc_ref(i, k, 2));
            vcplxmulconj(dr3, di3, cc_ref(i-1, k, 3), cc_ref(i, k, 3));
            vcplxmulconj(dr4, di4, cc_ref(i-1, k, 4), cc_ref(i, k, 4));
            vcplxmulconj(dr5, di5, cc_ref(i-1, k, 5), cc_ref(i, k, 5));
            v4sf cr2{vadd(dr2, dr5)};
            v4sf ci5{vsub(dr5, dr2)};
            v4sf cr5{vsub(di2, di5)};
            v4sf ci2{vadd(di2, di5)};
            v4sf cr3{vadd(dr3, dr4)};
            v4sf ci4{vsub(dr4, dr3)};
            v4sf cr4{vsub(di3, di4)};
            v4sf ci3{vadd(di3, di4)};
            ch_ref(i - 1, 1, k) = vadd(cc_ref(i - 1, k, 1), vadd(cr2, cr3));
            ch_ref(i, 1, k) = vsub(cc_ref(i, k, 1), vadd(ci2, ci3));
            v4sf tr2{vadd(cc_ref(i - 1, k, 1), vmadd(tr11, cr2, vmul(tr12, cr3)))};
            v4sf ti2{vsub(cc_ref(i, k, 1), vmadd(tr11, ci2, vmul(tr12, ci3)))};
            v4sf tr3{vadd(cc_ref(i - 1, k, 1), vmadd(tr12, cr2, vmul(tr11, cr3)))};
            v4sf ti3{vsub(cc_ref(i, k, 1), vmadd(tr12, ci2, vmul(tr11, ci3)))};
            v4sf tr5{vmadd(ti11, cr5, vmul(ti12, cr4))};
            v4sf ti5{vmadd(ti11, ci5, vmul(ti12, ci4))};
            v4sf tr4{vsub(vmul(ti12, cr5), vmul(ti11, cr4))};
            v4sf ti4{vsub(vmul(ti12, ci5), vmul(ti11, ci4))};
            ch_ref(i - 1, 3, k) = vsub(tr2, tr5);
            ch_ref(ic - 1, 2, k) = vadd(tr2, tr5);
            ch_ref(i, 3, k) = vadd(ti5, ti2);
            ch_ref(ic, 2, k) = vsub(ti5, ti2);
            ch_ref(i - 1, 5, k) = vsub(tr3, tr4);
            ch_ref(ic - 1, 4, k) = vadd(tr3, tr4);
            ch_ref(i, 5, k) = vadd(ti4, ti3);
            ch_ref(ic, 4, k) = vsub(ti4, ti3);
        }
    }
}
void radb5_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
    const float *const wa1)
{
    const v4sf tr11{ld_ps1(0.309016994374947f)};
    const v4sf ti11{ld_ps1(0.951056516295154f)};
    const v4sf tr12{ld_ps1(-0.809016994374947f)};
    const v4sf ti12{ld_ps1(0.587785252292473f)};

    auto cc_ref = [&cc,ido](size_t a_1, size_t a_2, size_t a_3) noexcept -> auto&
    { return cc[(a_3*5 + a_2)*ido + a_1]; };
    auto ch_ref = [&ch,ido,l1](size_t a_1, size_t a_2, size_t a_3) noexcept -> auto&
    { return ch[(a_3*l1 + a_2)*ido + a_1]; };

    /* Parameter adjustments */
    ch -= 1 + ido*(1 + l1);
    cc -= 1 + ido*6;

    const auto wa2 = wa1 + ido;
    const auto wa3 = wa2 + ido;
    const auto wa4 = wa3 + ido;

    /* Function Body */
    for(size_t k{1};k <= l1;++k)
    {
        v4sf ti5{vadd(cc_ref(  1, 3, k), cc_ref(1, 3, k))};
        v4sf ti4{vadd(cc_ref(  1, 5, k), cc_ref(1, 5, k))};
        v4sf tr2{vadd(cc_ref(ido, 2, k), cc_ref(ido, 2, k))};
        v4sf tr3{vadd(cc_ref(ido, 4, k), cc_ref(ido, 4, k))};
        ch_ref(1, k, 1) = vadd(cc_ref(1, 1, k), vadd(tr2, tr3));
        v4sf cr2{vadd(cc_ref(1, 1, k), vmadd(tr11, tr2, vmul(tr12, tr3)))};
        v4sf cr3{vadd(cc_ref(1, 1, k), vmadd(tr12, tr2, vmul(tr11, tr3)))};
        v4sf ci5{vmadd(ti11, ti5, vmul(ti12, ti4))};
        v4sf ci4{vsub(vmul(ti12, ti5), vmul(ti11, ti4))};
        ch_ref(1, k, 2) = vsub(cr2, ci5);
        ch_ref(1, k, 3) = vsub(cr3, ci4);
        ch_ref(1, k, 4) = vadd(cr3, ci4);
        ch_ref(1, k, 5) = vadd(cr2, ci5);
    }
    if(ido == 1)
        return;

    const size_t idp2{ido + 2};
    for(size_t k{1};k <= l1;++k)
    {
        for(size_t i{3};i <= ido;i += 2)
        {
            const size_t ic{idp2 - i};
            v4sf ti5{vadd(cc_ref(i, 3, k), cc_ref(ic, 2, k))};
            v4sf ti2{vsub(cc_ref(i, 3, k), cc_ref(ic, 2, k))};
            v4sf ti4{vadd(cc_ref(i, 5, k), cc_ref(ic, 4, k))};
            v4sf ti3{vsub(cc_ref(i, 5, k), cc_ref(ic, 4, k))};
            v4sf tr5{vsub(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k))};
            v4sf tr2{vadd(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k))};
            v4sf tr4{vsub(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k))};
            v4sf tr3{vadd(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k))};
            ch_ref(i - 1, k, 1) = vadd(cc_ref(i-1, 1, k), vadd(tr2, tr3));
            ch_ref(i, k, 1) = vadd(cc_ref(i, 1, k), vadd(ti2, ti3));
            v4sf cr2{vadd(cc_ref(i-1, 1, k), vmadd(tr11, tr2, vmul(tr12, tr3)))};
            v4sf ci2{vadd(cc_ref(i  , 1, k), vmadd(tr11, ti2, vmul(tr12, ti3)))};
            v4sf cr3{vadd(cc_ref(i-1, 1, k), vmadd(tr12, tr2, vmul(tr11, tr3)))};
            v4sf ci3{vadd(cc_ref(i  , 1, k), vmadd(tr12, ti2, vmul(tr11, ti3)))};
            v4sf cr5{vmadd(ti11, tr5, vmul(ti12, tr4))};
            v4sf ci5{vmadd(ti11, ti5, vmul(ti12, ti4))};
            v4sf cr4{vsub(vmul(ti12, tr5), vmul(ti11, tr4))};
            v4sf ci4{vsub(vmul(ti12, ti5), vmul(ti11, ti4))};
            v4sf dr3{vsub(cr3, ci4)};
            v4sf dr4{vadd(cr3, ci4)};
            v4sf di3{vadd(ci3, cr4)};
            v4sf di4{vsub(ci3, cr4)};
            v4sf dr5{vadd(cr2, ci5)};
            v4sf dr2{vsub(cr2, ci5)};
            v4sf di5{vsub(ci2, cr5)};
            v4sf di2{vadd(ci2, cr5)};
            vcplxmul(dr2, di2, ld_ps1(wa1[i-3]), ld_ps1(wa1[i-2]));
            vcplxmul(dr3, di3, ld_ps1(wa2[i-3]), ld_ps1(wa2[i-2]));
            vcplxmul(dr4, di4, ld_ps1(wa3[i-3]), ld_ps1(wa3[i-2]));
            vcplxmul(dr5, di5, ld_ps1(wa4[i-3]), ld_ps1(wa4[i-2]));

            ch_ref(i-1, k, 2) = dr2; ch_ref(i, k, 2) = di2;
            ch_ref(i-1, k, 3) = dr3; ch_ref(i, k, 3) = di3;
            ch_ref(i-1, k, 4) = dr4; ch_ref(i, k, 4) = di4;
            ch_ref(i-1, k, 5) = dr5; ch_ref(i, k, 5) = di5;
        }
    }
}
NOINLINE v4sf *rfftf1_ps(const size_t n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
    const float *wa, const al::span<const uint,15> ifac)
{
    assert(work1 != work2);

    const v4sf *in{input_readonly};
    v4sf *out{in == work2 ? work1 : work2};
    const size_t nf{ifac[1]};
    size_t l2{n};
    size_t iw{n-1};
    for(size_t k1{1};k1 <= nf;++k1)
    {
        const size_t kh{nf - k1};
        const size_t ip{ifac[kh + 2]};
        const size_t l1{l2 / ip};
        const size_t ido{n / l2};
        iw -= (ip - 1)*ido;
        switch(ip)
        {
        case 5:
            radf5_ps(ido, l1, in, out, &wa[iw]);
            break;
        case 4:
            radf4_ps(ido, l1, in, out, &wa[iw]);
            break;
        case 3:
            radf3_ps(ido, l1, in, out, &wa[iw]);
            break;
        case 2:
            radf2_ps(ido, l1, in, out, &wa[iw]);
            break;
        default:
            assert(0);
        }
        l2 = l1;
        if(out == work2)
        {
            out = work1;
            in = work2;
        }
        else
        {
            out = work2;
            in = work1;
        }
    }
    return const_cast<v4sf*>(in); /* this is in fact the output .. */
}
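/* The returned pointer identifies which of work1/work2 holds the final result
 * after the ping-ponging passes; callers compare it against their own buffers
 * (see pffft_transform_internal() below) rather than copying the data.
 */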
NOINLINE v4sf *rfftb1_ps(const size_t n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
    const float *wa, const al::span<const uint,15> ifac)
{
    assert(work1 != work2);

    const v4sf *in{input_readonly};
    v4sf *out{in == work2 ? work1 : work2};
    const size_t nf{ifac[1]};
    size_t l1{1};
    size_t iw{0};
    for(size_t k1{1};k1 <= nf;++k1)
    {
        const size_t ip{ifac[k1 + 1]};
        const size_t l2{ip*l1};
        const size_t ido{n / l2};
        switch(ip)
        {
        case 5:
            radb5_ps(ido, l1, in, out, &wa[iw]);
            break;
        case 4:
            radb4_ps(ido, l1, in, out, &wa[iw]);
            break;
        case 3:
            radb3_ps(ido, l1, in, out, &wa[iw]);
            break;
        case 2:
            radb2_ps(ido, l1, in, out, &wa[iw]);
            break;
        default:
            assert(0);
        }
        l1 = l2;
        iw += (ip - 1)*ido;
        if(out == work2)
        {
            out = work1;
            in = work2;
        }
        else
        {
            out = work2;
            in = work1;
        }
    }
    return const_cast<v4sf*>(in); /* this is in fact the output .. */
}
v4sf *cfftf1_ps(const size_t n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
    const float *wa, const al::span<const uint,15> ifac, const float fsign)
{
    assert(work1 != work2);

    const v4sf *in{input_readonly};
    v4sf *out{in == work2 ? work1 : work2};
    const size_t nf{ifac[1]};
    size_t l1{1}, iw{0};
    for(size_t k1{2};k1 <= nf+1;++k1)
    {
        const size_t ip{ifac[k1]};
        const size_t l2{ip*l1};
        const size_t ido{n / l2};
        const size_t idot{ido + ido};
        switch(ip)
        {
        case 5:
            passf5_ps(idot, l1, in, out, &wa[iw], fsign);
            break;
        case 4:
            passf4_ps(idot, l1, in, out, &wa[iw], fsign);
            break;
        case 3:
            passf3_ps(idot, l1, in, out, &wa[iw], fsign);
            break;
        case 2:
            passf2_ps(idot, l1, in, out, &wa[iw], fsign);
            break;
        default:
            assert(0);
        }
        l1 = l2;
        iw += (ip - 1)*idot;
        if(out == work2)
        {
            out = work1;
            in = work2;
        }
        else
        {
            out = work2;
            in = work1;
        }
    }
    return const_cast<v4sf*>(in); /* this is in fact the output .. */
}
uint decompose(const uint n, const al::span<uint,15> ifac, const al::span<const uint,4> ntryh)
{
    uint nl{n}, nf{0};
    for(const uint ntry : ntryh)
    {
        while(nl != 1)
        {
            const uint nq{nl / ntry};
            const uint nr{nl % ntry};
            if(nr != 0) break;

            ifac[2+nf++] = ntry;
            nl = nq;
            if(ntry == 2 && nf != 1)
            {
                for(size_t i{2};i <= nf;++i)
                {
                    size_t ib{nf - i + 2};
                    ifac[ib + 1] = ifac[ib];
                }
                ifac[2] = 2;
            }
        }
    }
    ifac[0] = n;
    ifac[1] = nf;
    return nf;
}
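/* The factor count ends up in ifac[1] with the individual radix factors
 * stored from ifac[2] onward, which is how rfftf1_ps(), rfftb1_ps() and
 * cfftf1_ps() above read the factorization back.
 */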
void rffti1_ps(const uint n, float *wa, const al::span<uint,15> ifac)
{
    static constexpr std::array ntryh{4u,2u,3u,5u};

    const uint nf{decompose(n, ifac, ntryh)};
    const double argh{2.0*al::numbers::pi / n};
    size_t is{0};
    size_t nfm1{nf - 1};
    size_t l1{1};
    for(size_t k1{0};k1 < nfm1;++k1)
    {
        const size_t ip{ifac[k1+2]};
        const size_t l2{l1*ip};
        const size_t ido{n / l2};
        const size_t ipm{ip - 1};
        size_t ld{0};
        for(size_t j{0};j < ipm;++j)
        {
            size_t i{is};
            ld += l1;
            const double argld{static_cast<double>(ld)*argh};
            double fi{0.0};
            for(size_t ii{2};ii < ido;ii += 2)
            {
                fi += 1.0;
                wa[i++] = static_cast<float>(std::cos(fi*argld));
                wa[i++] = static_cast<float>(std::sin(fi*argld));
            }
            is += ido;
        }
        l1 = l2;
    }
}
void cffti1_ps(const uint n, float *wa, const al::span<uint,15> ifac)
{
    static constexpr std::array ntryh{5u,3u,4u,2u};

    const uint nf{decompose(n, ifac, ntryh)};
    const double argh{2.0*al::numbers::pi / n};
    size_t i{1};
    size_t l1{1};
    for(size_t k1{0};k1 < nf;++k1)
    {
        const size_t ip{ifac[k1+2]};
        const size_t l2{l1*ip};
        const size_t ido{n / l2};
        const size_t idot{ido + ido + 2};
        const size_t ipm{ip - 1};
        size_t ld{0};
        for(size_t j{0};j < ipm;++j)
        {
            wa[i-1] = 1.0f;
            wa[i] = 0.0f;
            ld += l1;
            const double argld{static_cast<double>(ld)*argh};
            double fi{0.0};
            for(size_t ii{3};ii < idot;ii += 2)
            {
                fi += 1.0;
                wa[++i] = static_cast<float>(std::cos(fi*argld));
                wa[++i] = static_cast<float>(std::sin(fi*argld));
            }
            i += 2;
        }
        l1 = l2;
    }
}
/* NOLINTNEXTLINE(clang-analyzer-optin.performance.Padding) */
struct PFFFT_Setup {
    uint N{};
    uint Ncvec{}; /* nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL) */
    std::array<uint,15> ifac{};
    pffft_transform_t transform{};

    float *twiddle{}; /* N/4 elements */
    al::span<v4sf> e; /* N/4*3 elements */

    alignas(V4sfAlignment) std::byte end;
};
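/* The e twiddle vectors and the scalar twiddle table are not separate
 * allocations: pffft_new_setup() below over-allocates the object and places
 * both in the aligned storage that begins at 'end'.
 */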
PFFFTSetupPtr pffft_new_setup(unsigned int N, pffft_transform_t transform)
{
    assert(transform == PFFFT_REAL || transform == PFFFT_COMPLEX);

    /* unfortunately, the fft size must be a multiple of 16 for complex FFTs
     * and 32 for real FFTs -- a lot of stuff would need to be rewritten to
     * handle other cases (or maybe just switch to a scalar fft, I don't know..)
     */
    if(transform == PFFFT_REAL)
        assert((N%(2*SimdSize*SimdSize)) == 0);
    else
        assert((N%(SimdSize*SimdSize)) == 0);

    const uint Ncvec{(transform == PFFFT_REAL ? N/2 : N) / SimdSize};

    const size_t storelen{std::max(offsetof(PFFFT_Setup, end) + 2_zu*Ncvec*sizeof(v4sf),
        sizeof(PFFFT_Setup))};
    auto storage = static_cast<gsl::owner<std::byte*>>(::operator new[](storelen, V4sfAlignVal));
    al::span extrastore{&storage[offsetof(PFFFT_Setup, end)], 2_zu*Ncvec*sizeof(v4sf)};

    PFFFTSetupPtr s{::new(storage) PFFFT_Setup{}};
    s->N = N;
    s->transform = transform;
    s->Ncvec = Ncvec;

    const size_t ecount{2_zu*Ncvec*(SimdSize-1)/SimdSize};
    s->e = {std::launder(reinterpret_cast<v4sf*>(extrastore.data())), ecount};
    s->twiddle = std::launder(reinterpret_cast<float*>(&extrastore[ecount*sizeof(v4sf)]));

    if constexpr(SimdSize > 1)
    {
        auto e = std::vector<float>(s->e.size()*SimdSize, 0.0f);
        for(size_t k{0};k < s->Ncvec;++k)
        {
            const size_t i{k / SimdSize};
            const size_t j{k % SimdSize};
            for(size_t m{0};m < SimdSize-1;++m)
            {
                const double A{-2.0*al::numbers::pi*static_cast<double>((m+1)*k) / N};
                e[((i*3 + m)*2 + 0)*SimdSize + j] = static_cast<float>(std::cos(A));
                e[((i*3 + m)*2 + 1)*SimdSize + j] = static_cast<float>(std::sin(A));
            }
        }
        std::memcpy(s->e.data(), e.data(), e.size()*sizeof(float));
    }
    if(transform == PFFFT_REAL)
        rffti1_ps(N/SimdSize, s->twiddle, s->ifac);
    else
        cffti1_ps(N/SimdSize, s->twiddle, s->ifac);

    /* check that N is decomposable with allowed prime factors */
    size_t m{1};
    for(size_t k{0};k < s->ifac[1];++k)
        m *= s->ifac[2+k];
    if(m != N/SimdSize)
        s = nullptr;

    return s;
}
void pffft_destroy_setup(gsl::owner<PFFFT_Setup*> s) noexcept
{
    std::destroy_at(s);
    ::operator delete[](gsl::owner<void*>{s}, V4sfAlignVal);
}
#if !defined(PFFFT_SIMD_DISABLE)

/* [0 0 1 2 3 4 5 6 7 8] -> [0 8 7 6 5 4 3 2 1] */
void reversed_copy(const size_t N, const v4sf *in, const int in_stride, v4sf *RESTRICT out)
{
    v4sf g0{}, g1{};
    interleave2(in[0], in[1], g0, g1);
    in += in_stride;

    *--out = vswaphl(g0, g1); // [g0l, g0h], [g1l g1h] -> [g1l, g0h]
    for(size_t k{1};k < N;++k)
    {
        v4sf h0{}, h1{};
        interleave2(in[0], in[1], h0, h1);
        in += in_stride;
        *--out = vswaphl(g1, h0);
        *--out = vswaphl(h0, h1);
        g1 = h1;
    }
    *--out = vswaphl(g1, g0);
}
void unreversed_copy(const size_t N, const v4sf *in, v4sf *RESTRICT out, const int out_stride)
{
    v4sf g0{in[0]}, g1{g0};
    ++in;
    for(size_t k{1};k < N;++k)
    {
        v4sf h0{*in++}; v4sf h1{*in++};
        g1 = vswaphl(g1, h0);
        h0 = vswaphl(h0, h1);
        uninterleave2(h0, g1, out[0], out[1]);
        out += out_stride;
        g1 = h1;
    }
    v4sf h0{*in++}, h1{g0};
    g1 = vswaphl(g1, h0);
    h0 = vswaphl(h0, h1);
    uninterleave2(h0, g1, out[0], out[1]);
}
void pffft_cplx_finalize(const size_t Ncvec, const v4sf *in, v4sf *RESTRICT out, const v4sf *e)
{
    assert(in != out);

    const size_t dk{Ncvec/SimdSize}; // number of 4x4 matrix blocks
    for(size_t k{0};k < dk;++k)
    {
        v4sf r0{in[8*k+0]}, i0{in[8*k+1]};
        v4sf r1{in[8*k+2]}, i1{in[8*k+3]};
        v4sf r2{in[8*k+4]}, i2{in[8*k+5]};
        v4sf r3{in[8*k+6]}, i3{in[8*k+7]};
        vtranspose4(r0,r1,r2,r3);
        vtranspose4(i0,i1,i2,i3);
        vcplxmul(r1,i1,e[k*6+0],e[k*6+1]);
        vcplxmul(r2,i2,e[k*6+2],e[k*6+3]);
        vcplxmul(r3,i3,e[k*6+4],e[k*6+5]);

        v4sf sr0{vadd(r0,r2)}, dr0{vsub(r0, r2)};
        v4sf sr1{vadd(r1,r3)}, dr1{vsub(r1, r3)};
        v4sf si0{vadd(i0,i2)}, di0{vsub(i0, i2)};
        v4sf si1{vadd(i1,i3)}, di1{vsub(i1, i3)};

        /* transformation for each column is:
         *
         * [1  1  1  1  0  0  0  0]   [r0]
         * [1  0 -1  0  0 -1  0  1]   [r1]
         * [1 -1  1 -1  0  0  0  0]   [r2]
         * [1  0 -1  0  0  1  0 -1]   [r3]
         * [0  0  0  0  1  1  1  1] * [i0]
         * [0  1  0 -1  1  0 -1  0]   [i1]
         * [0  0  0  0  1 -1  1 -1]   [i2]
         * [0 -1  0  1  1  0 -1  0]   [i3]
         */

        r0 = vadd(sr0, sr1); i0 = vadd(si0, si1);
        r1 = vadd(dr0, di1); i1 = vsub(di0, dr1);
        r2 = vsub(sr0, sr1); i2 = vsub(si0, si1);
        r3 = vsub(dr0, di1); i3 = vadd(di0, dr1);

        *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1;
        *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3;
    }
}
void pffft_cplx_preprocess(const size_t Ncvec, const v4sf *in, v4sf *RESTRICT out, const v4sf *e)
{
    assert(in != out);

    const size_t dk{Ncvec/SimdSize}; // number of 4x4 matrix blocks
    for(size_t k{0};k < dk;++k)
    {
        v4sf r0{in[8*k+0]}, i0{in[8*k+1]};
        v4sf r1{in[8*k+2]}, i1{in[8*k+3]};
        v4sf r2{in[8*k+4]}, i2{in[8*k+5]};
        v4sf r3{in[8*k+6]}, i3{in[8*k+7]};

        v4sf sr0{vadd(r0,r2)}, dr0{vsub(r0, r2)};
        v4sf sr1{vadd(r1,r3)}, dr1{vsub(r1, r3)};
        v4sf si0{vadd(i0,i2)}, di0{vsub(i0, i2)};
        v4sf si1{vadd(i1,i3)}, di1{vsub(i1, i3)};

        r0 = vadd(sr0, sr1); i0 = vadd(si0, si1);
        r1 = vsub(dr0, di1); i1 = vadd(di0, dr1);
        r2 = vsub(sr0, sr1); i2 = vsub(si0, si1);
        r3 = vadd(dr0, di1); i3 = vsub(di0, dr1);

        vcplxmulconj(r1,i1,e[k*6+0],e[k*6+1]);
        vcplxmulconj(r2,i2,e[k*6+2],e[k*6+3]);
        vcplxmulconj(r3,i3,e[k*6+4],e[k*6+5]);

        vtranspose4(r0,r1,r2,r3);
        vtranspose4(i0,i1,i2,i3);

        *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1;
        *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3;
    }
}
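/* Note: this is the inverse of pffft_cplx_finalize() above -- the same 4x4
 * block butterfly, but with conjugated twiddles and with the transposes done
 * after the twiddle multiplies instead of before them.
 */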
force_inline void pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, const v4sf *in,
    const v4sf *e, v4sf *RESTRICT out)
{
    v4sf r0{*in0}, i0{*in1};
    v4sf r1{*in++}; v4sf i1{*in++};
    v4sf r2{*in++}; v4sf i2{*in++};
    v4sf r3{*in++}; v4sf i3{*in++};
    vtranspose4(r0,r1,r2,r3);
    vtranspose4(i0,i1,i2,i3);

    /* transformation for each column is:
     *
     * [1  1  1  1  0  0  0  0]   [r0]
     * [1  0 -1  0  0 -1  0  1]   [r1]
     * [1  0 -1  0  0  1  0 -1]   [r2]
     * [1 -1  1 -1  0  0  0  0]   [r3]
     * [0  0  0  0  1  1  1  1] * [i0]
     * [0 -1  0  1 -1  0  1  0]   [i1]
     * [0 -1  0  1  1  0 -1  0]   [i2]
     * [0  0  0  0 -1  1 -1  1]   [i3]
     */

    //cerr << "matrix initial, before e , REAL:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
    //cerr << "matrix initial, before e, IMAG :\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";

    vcplxmul(r1,i1,e[0],e[1]);
    vcplxmul(r2,i2,e[2],e[3]);
    vcplxmul(r3,i3,e[4],e[5]);

    //cerr << "matrix initial, real part:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
    //cerr << "matrix initial, imag part:\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";

    v4sf sr0{vadd(r0,r2)}, dr0{vsub(r0,r2)};
    v4sf sr1{vadd(r1,r3)}, dr1{vsub(r3,r1)};
    v4sf si0{vadd(i0,i2)}, di0{vsub(i0,i2)};
    v4sf si1{vadd(i1,i3)}, di1{vsub(i3,i1)};

    r0 = vadd(sr0, sr1);
    r3 = vsub(sr0, sr1);
    i0 = vadd(si0, si1);
    i3 = vsub(si1, si0);
    r1 = vadd(dr0, di1);
    r2 = vsub(dr0, di1);
    i1 = vsub(dr1, di0);
    i2 = vadd(dr1, di0);

    *out++ = r0;
    *out++ = i0;
    *out++ = r1;
    *out++ = i1;
    *out++ = r2;
    *out++ = i2;
    *out++ = r3;
    *out++ = i3;
}
NOINLINE void pffft_real_finalize(const size_t Ncvec, const v4sf *in, v4sf *RESTRICT out,
    const v4sf *e)
{
    static constexpr float s{al::numbers::sqrt2_v<float>/2.0f};

    assert(in != out);

    const size_t dk{Ncvec/SimdSize}; // number of 4x4 matrix blocks
    /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */

    const v4sf zero{vzero()};
    const auto cr = al::bit_cast<std::array<float,SimdSize>>(in[0]);
    const auto ci = al::bit_cast<std::array<float,SimdSize>>(in[Ncvec*2-1]);
    pffft_real_finalize_4x4(&zero, &zero, in+1, e, out);

    /* [cr0 cr1 cr2 cr3 ci0 ci1 ci2 ci3]
     *
     * [Xr(1)   ] [1   1   1   1   0   0   0   0]
     * [Xr(N/4) ] [0   0   0   0   1   s   0  -s]
     * [Xr(N/2) ] [1   0  -1   0   0   0   0   0]
     * [Xr(3N/4)] [0   0   0   0   1  -s   0   s]
     * [Xi(1)   ] [1  -1   1  -1   0   0   0   0]
     * [Xi(N/4) ] [0   0   0   0   0  -s  -1  -s]
     * [Xi(N/2) ] [0  -1   0   1   0   0   0   0]
     * [Xi(3N/4)] [0   0   0   0   0  -s   1  -s]
     */

    const float xr0{(cr[0]+cr[2]) + (cr[1]+cr[3])}; out[0] = vinsert0(out[0], xr0);
    const float xi0{(cr[0]+cr[2]) - (cr[1]+cr[3])}; out[1] = vinsert0(out[1], xi0);
    const float xr2{(cr[0]-cr[2])};                 out[4] = vinsert0(out[4], xr2);
    const float xi2{(cr[3]-cr[1])};                 out[5] = vinsert0(out[5], xi2);
    const float xr1{ ci[0] + s*(ci[1]-ci[3])};      out[2] = vinsert0(out[2], xr1);
    const float xi1{-ci[2] - s*(ci[1]+ci[3])};      out[3] = vinsert0(out[3], xi1);
    const float xr3{ ci[0] - s*(ci[1]-ci[3])};      out[6] = vinsert0(out[6], xr3);
    const float xi3{ ci[2] - s*(ci[1]+ci[3])};      out[7] = vinsert0(out[7], xi3);

    for(size_t k{1};k < dk;++k)
        pffft_real_finalize_4x4(&in[8*k-1], &in[8*k+0], in + 8*k+1, e + k*6, out + k*8);
}
force_inline void pffft_real_preprocess_4x4(const v4sf *in, const v4sf *e, v4sf *RESTRICT out,
    const bool first)
{
    v4sf r0{in[0]}, i0{in[1]}, r1{in[2]}, i1{in[3]};
    v4sf r2{in[4]}, i2{in[5]}, r3{in[6]}, i3{in[7]};

    /* transformation for each column is:
     *
     * [1  1  1  1  0  0  0  0]   [r0]
     * [1  0  0 -1  0 -1 -1  0]   [r1]
     * [1 -1 -1  1  0  0  0  0]   [r2]
     * [1  0  0 -1  0  1  1  0]   [r3]
     * [0  0  0  0  1 -1  1 -1] * [i0]
     * [0 -1  1  0  1  0  0  1]   [i1]
     * [0  0  0  0  1  1 -1 -1]   [i2]
     * [0  1 -1  0  1  0  0  1]   [i3]
     */

    v4sf sr0{vadd(r0,r3)}, dr0{vsub(r0,r3)};
    v4sf sr1{vadd(r1,r2)}, dr1{vsub(r1,r2)};
    v4sf si0{vadd(i0,i3)}, di0{vsub(i0,i3)};
    v4sf si1{vadd(i1,i2)}, di1{vsub(i1,i2)};

    r0 = vadd(sr0, sr1);
    r2 = vsub(sr0, sr1);
    r1 = vsub(dr0, si1);
    r3 = vadd(dr0, si1);
    i0 = vsub(di0, di1);
    i2 = vadd(di0, di1);
    i1 = vsub(si0, dr1);
    i3 = vadd(si0, dr1);

    vcplxmulconj(r1,i1,e[0],e[1]);
    vcplxmulconj(r2,i2,e[2],e[3]);
    vcplxmulconj(r3,i3,e[4],e[5]);

    vtranspose4(r0,r1,r2,r3);
    vtranspose4(i0,i1,i2,i3);

    if(!first)
    {
        *out++ = r0;
        *out++ = i0;
    }
    *out++ = r1;
    *out++ = i1;
    *out++ = r2;
    *out++ = i2;
    *out++ = r3;
    *out++ = i3;
}
NOINLINE void pffft_real_preprocess(const size_t Ncvec, const v4sf *in, v4sf *RESTRICT out,
    const v4sf *e)
{
    static constexpr float sqrt2{al::numbers::sqrt2_v<float>};

    const size_t dk{Ncvec/SimdSize}; // number of 4x4 matrix blocks
    /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */

    std::array<float,SimdSize> Xr{}, Xi{};
    for(size_t k{0};k < SimdSize;++k)
    {
        Xr[k] = vextract0(in[2*k]);
        Xi[k] = vextract0(in[2*k + 1]);
    }

    pffft_real_preprocess_4x4(in, e, out+1, true); // will write only 6 values

    /* [Xr0 Xr1 Xr2 Xr3 Xi0 Xi1 Xi2 Xi3]
     *
     * [cr0] [1   0   2   0   1   0   0   0]
     * [cr1] [1   0   0   0  -1   0  -2   0]
     * [cr2] [1   0  -2   0   1   0   0   0]
     * [cr3] [1   0   0   0  -1   0   2   0]
     * [ci0] [0   2   0   2   0   0   0   0]
     * [ci1] [0   s   0  -s   0  -s   0  -s]
     * [ci2] [0   0   0   0   0  -2   0   2]
     * [ci3] [0  -s   0   s   0  -s   0  -s]
     */
    for(size_t k{1};k < dk;++k)
        pffft_real_preprocess_4x4(in+8*k, e + k*6, out-1+k*8, false);

    const float cr0{(Xr[0]+Xi[0]) + 2*Xr[2]};
    const float cr1{(Xr[0]-Xi[0]) - 2*Xi[2]};
    const float cr2{(Xr[0]+Xi[0]) - 2*Xr[2]};
    const float cr3{(Xr[0]-Xi[0]) + 2*Xi[2]};
    out[0] = vset4(cr0, cr1, cr2, cr3);
    const float ci0{ 2*(Xr[1]+Xr[3])};
    const float ci1{ sqrt2*(Xr[1]-Xr[3]) - sqrt2*(Xi[1]+Xi[3])};
    const float ci2{ 2*(Xi[3]-Xi[1])};
    const float ci3{-sqrt2*(Xr[1]-Xr[3]) - sqrt2*(Xi[1]+Xi[3])};
    out[2*Ncvec-1] = vset4(ci0, ci1, ci2, ci3);
}
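/* As in pffft_real_finalize(), the lane-0 values gathered into Xr/Xi before
 * the 4x4 blocks run are combined through the 8x8 matrix in the comment
 * above; the results become the specially packed first (out[0]) and last
 * (out[2*Ncvec-1]) vectors of the fftpack-ordered data handed to
 * rfftb1_ps().
 */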
void pffft_transform_internal(const PFFFT_Setup *setup, const v4sf *vinput, v4sf *voutput,
    v4sf *scratch, const pffft_direction_t direction, const bool ordered)
{
    assert(scratch != nullptr);
    assert(voutput != scratch);

    const size_t Ncvec{setup->Ncvec};
    const bool nf_odd{(setup->ifac[1]&1) != 0};

    std::array buff{voutput, scratch};
    bool ib{nf_odd != ordered};
    if(direction == PFFFT_FORWARD)
    {
        /* Swap the initial work buffer for forward FFTs, which helps avoid an
         * extra copy for output.
         */
        ib = !ib;
        if(setup->transform == PFFFT_REAL)
        {
            ib = (rfftf1_ps(Ncvec*2, vinput, buff[ib], buff[!ib], setup->twiddle, setup->ifac) == buff[1]);
            pffft_real_finalize(Ncvec, buff[ib], buff[!ib], setup->e.data());
        }
        else
        {
            v4sf *tmp{buff[ib]};
            for(size_t k=0; k < Ncvec; ++k)
                uninterleave2(vinput[k*2], vinput[k*2+1], tmp[k*2], tmp[k*2+1]);

            ib = (cfftf1_ps(Ncvec, buff[ib], buff[!ib], buff[ib], setup->twiddle, setup->ifac, -1.0f) == buff[1]);
            pffft_cplx_finalize(Ncvec, buff[ib], buff[!ib], setup->e.data());
        }
        if(ordered)
            pffft_zreorder(setup, reinterpret_cast<float*>(buff[!ib]),
                reinterpret_cast<float*>(buff[ib]), PFFFT_FORWARD);
        else
            ib = !ib;
    }
    else
    {
        if(vinput == buff[ib])
            ib = !ib; // may happen when finput == foutput

        if(ordered)
        {
            pffft_zreorder(setup, reinterpret_cast<const float*>(vinput),
                reinterpret_cast<float*>(buff[ib]), PFFFT_BACKWARD);
            vinput = buff[ib];
            ib = !ib;
        }
        if(setup->transform == PFFFT_REAL)
        {
            pffft_real_preprocess(Ncvec, vinput, buff[ib], setup->e.data());
            ib = (rfftb1_ps(Ncvec*2, buff[ib], buff[0], buff[1], setup->twiddle, setup->ifac) == buff[1]);
        }
        else
        {
            pffft_cplx_preprocess(Ncvec, vinput, buff[ib], setup->e.data());
            ib = (cfftf1_ps(Ncvec, buff[ib], buff[0], buff[1], setup->twiddle, setup->ifac, +1.0f) == buff[1]);
        }
        for(size_t k{0};k < Ncvec;++k)
            interleave2(buff[ib][k*2], buff[ib][k*2+1], buff[ib][k*2], buff[ib][k*2+1]);
    }

    if(buff[ib] != voutput)
    {
        /* Extra copy required -- this situation should only happen when
         * finput == foutput.
         */
        assert(vinput == voutput);
        for(size_t k{0};k < Ncvec;++k)
        {
            v4sf a{buff[ib][2*k]}, b{buff[ib][2*k+1]};
            voutput[2*k] = a; voutput[2*k+1] = b;
        }
    }
}
void pffft_zreorder(const PFFFT_Setup *setup, const float *in, float *out,
    pffft_direction_t direction)
{
    const size_t N{setup->N}, Ncvec{setup->Ncvec};
    const v4sf *vin{reinterpret_cast<const v4sf*>(in)};
    v4sf *RESTRICT vout{reinterpret_cast<v4sf*>(out)};
    if(setup->transform == PFFFT_REAL)
    {
        const size_t dk{N/32};
        if(direction == PFFFT_FORWARD)
        {
            for(size_t k{0};k < dk;++k)
            {
                interleave2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]);
                interleave2(vin[k*8 + 4], vin[k*8 + 5], vout[2*(2*dk + k) + 0], vout[2*(2*dk + k) + 1]);
            }
            reversed_copy(dk, vin+2, 8, vout + N/SimdSize/2);
            reversed_copy(dk, vin+6, 8, vout + N/SimdSize);
        }
        else
        {
            for(size_t k{0};k < dk;++k)
            {
                uninterleave2(vin[2*(0*dk + k) + 0], vin[2*(0*dk + k) + 1], vout[k*8 + 0], vout[k*8 + 1]);
                uninterleave2(vin[2*(2*dk + k) + 0], vin[2*(2*dk + k) + 1], vout[k*8 + 4], vout[k*8 + 5]);
            }
            unreversed_copy(dk, vin + N/SimdSize/4, vout + N/SimdSize - 6, -8);
            unreversed_copy(dk, vin + 3_uz*N/SimdSize/4, vout + N/SimdSize - 2, -8);
        }
    }
    else
    {
        if(direction == PFFFT_FORWARD)
        {
            for(size_t k{0};k < Ncvec;++k)
            {
                size_t kk{(k/4) + (k%4)*(Ncvec/4)};
                interleave2(vin[k*2], vin[k*2+1], vout[kk*2], vout[kk*2+1]);
            }
        }
        else
        {
            for(size_t k{0};k < Ncvec;++k)
            {
                size_t kk{(k/4) + (k%4)*(Ncvec/4)};
                uninterleave2(vin[kk*2], vin[kk*2+1], vout[k*2], vout[k*2+1]);
            }
        }
    }
}
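/* Illustration of the complex branch above: the forward reorder writes the
 * interleaved pair for internal vector k to position kk = (k/4) +
 * (k%4)*(Ncvec/4), and the backward reorder applies the exact inverse
 * (uninterleaving from kk back to k), so a FORWARD reorder followed by a
 * BACKWARD reorder reproduces the original data.
 */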
void pffft_zconvolve_scale_accumulate(const PFFFT_Setup *s, const float *a, const float *b,
    float *ab, float scaling)
{
    const size_t Ncvec{s->Ncvec};
    const v4sf *RESTRICT va{reinterpret_cast<const v4sf*>(a)};
    const v4sf *RESTRICT vb{reinterpret_cast<const v4sf*>(b)};
    v4sf *RESTRICT vab{reinterpret_cast<v4sf*>(ab)};

#ifdef __arm__
    __builtin_prefetch(va);
    __builtin_prefetch(vb);
    __builtin_prefetch(vab);
    __builtin_prefetch(va+2);
    __builtin_prefetch(vb+2);
    __builtin_prefetch(vab+2);
    __builtin_prefetch(va+4);
    __builtin_prefetch(vb+4);
    __builtin_prefetch(vab+4);
    __builtin_prefetch(va+6);
    __builtin_prefetch(vb+6);
    __builtin_prefetch(vab+6);
#ifndef __clang__
#define ZCONVOLVE_USING_INLINE_NEON_ASM
#endif
#endif

    const float ar1{vextract0(va[0])};
    const float ai1{vextract0(va[1])};
    const float br1{vextract0(vb[0])};
    const float bi1{vextract0(vb[1])};
    const float abr1{vextract0(vab[0])};
    const float abi1{vextract0(vab[1])};

#ifdef ZCONVOLVE_USING_INLINE_ASM
    /* Inline asm version, unfortunately miscompiled by clang 3.2, at least on
     * Ubuntu. So this will be restricted to GCC.
     *
     * Does it still miscompile with Clang? Is it even needed with today's
     * optimizers?
     */
    const float *a_{a}, *b_{b}; float *ab_{ab};
    size_t N{Ncvec};
    asm volatile("mov r8, %2 \n"
        "vdup.f32 q15, %4 \n"
        "1: \n"
        "vld1.f32 {q0,q1}, [%0,:128]! \n"
        "vld1.f32 {q4,q5}, [%1,:128]! \n"
        "vld1.f32 {q2,q3}, [%0,:128]! \n"
        "vld1.f32 {q6,q7}, [%1,:128]! \n"
        "vld1.f32 {q8,q9}, [r8,:128]! \n"
        "vmul.f32 q10, q0, q4 \n"
        "vmul.f32 q11, q0, q5 \n"
        "vmul.f32 q12, q2, q6 \n"
        "vmul.f32 q13, q2, q7 \n"
        "vmls.f32 q10, q1, q5 \n"
        "vmla.f32 q11, q1, q4 \n"
        "vld1.f32 {q0,q1}, [r8,:128]! \n"
        "vmls.f32 q12, q3, q7 \n"
        "vmla.f32 q13, q3, q6 \n"
        "vmla.f32 q8, q10, q15 \n"
        "vmla.f32 q9, q11, q15 \n"
        "vmla.f32 q0, q12, q15 \n"
        "vmla.f32 q1, q13, q15 \n"
        "vst1.f32 {q8,q9},[%2,:128]! \n"
        "vst1.f32 {q0,q1},[%2,:128]! \n"
        "subs %3, #2 \n"
        "bne 1b \n"
        : "+r"(a_), "+r"(b_), "+r"(ab_), "+r"(N) : "r"(scaling)
        : "r8", "q0","q1","q2","q3","q4","q5","q6","q7","q8","q9", "q10","q11","q12","q13","q15","memory");
#else
    /* Default routine, works fine for non-arm cpus with current compilers. */
    const v4sf vscal{ld_ps1(scaling)};
    for(size_t i{0};i < Ncvec;i += 2)
    {
        v4sf ar4{va[2*i+0]}, ai4{va[2*i+1]};
        v4sf br4{vb[2*i+0]}, bi4{vb[2*i+1]};
        vcplxmul(ar4, ai4, br4, bi4);
        vab[2*i+0] = vmadd(ar4, vscal, vab[2*i+0]);
        vab[2*i+1] = vmadd(ai4, vscal, vab[2*i+1]);
        ar4 = va[2*i+2]; ai4 = va[2*i+3];
        br4 = vb[2*i+2]; bi4 = vb[2*i+3];
        vcplxmul(ar4, ai4, br4, bi4);
        vab[2*i+2] = vmadd(ar4, vscal, vab[2*i+2]);
        vab[2*i+3] = vmadd(ai4, vscal, vab[2*i+3]);
    }
#endif

    if(s->transform == PFFFT_REAL)
    {
        vab[0] = vinsert0(vab[0], abr1 + ar1*br1*scaling);
        vab[1] = vinsert0(vab[1], abi1 + ai1*bi1*scaling);
    }
}
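/* Usage sketch (illustrative only; setup creation and aligned buffer
 * allocation are omitted, and the buffer names are placeholders): the spectra
 * passed to this routine are expected in the unordered z-domain layout
 * produced by pffft_transform(), not the ordered variant. A typical FFT
 * convolution accumulates into a zeroed spectrum and folds the 1/N
 * normalization of the unscaled backward transform into `scaling`:
 *
 *     pffft_transform(setup, x, spec_x, work, PFFFT_FORWARD);
 *     pffft_transform(setup, h, spec_h, work, PFFFT_FORWARD);
 *     pffft_zconvolve_scale_accumulate(setup, spec_x, spec_h, spec_y, 1.0f/float(N));
 *     pffft_transform(setup, spec_y, y, work, PFFFT_BACKWARD);
 */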
void pffft_zconvolve_accumulate(const PFFFT_Setup *s, const float *a, const float *b, float *ab)
{
    const size_t Ncvec{s->Ncvec};
    const v4sf *RESTRICT va{reinterpret_cast<const v4sf*>(a)};
    const v4sf *RESTRICT vb{reinterpret_cast<const v4sf*>(b)};
    v4sf *RESTRICT vab{reinterpret_cast<v4sf*>(ab)};

#ifdef __arm__
    __builtin_prefetch(va);
    __builtin_prefetch(vb);
    __builtin_prefetch(vab);
    __builtin_prefetch(va+2);
    __builtin_prefetch(vb+2);
    __builtin_prefetch(vab+2);
    __builtin_prefetch(va+4);
    __builtin_prefetch(vb+4);
    __builtin_prefetch(vab+4);
    __builtin_prefetch(va+6);
    __builtin_prefetch(vb+6);
    __builtin_prefetch(vab+6);
#endif

    const float ar1{vextract0(va[0])};
    const float ai1{vextract0(va[1])};
    const float br1{vextract0(vb[0])};
    const float bi1{vextract0(vb[1])};
    const float abr1{vextract0(vab[0])};
    const float abi1{vextract0(vab[1])};

    /* No inline assembly for this version. I'm not familiar enough with NEON
     * assembly, and I don't know that it's needed with today's optimizers.
     */
    for(size_t i{0};i < Ncvec;i += 2)
    {
        v4sf ar4{va[2*i+0]}, ai4{va[2*i+1]};
        v4sf br4{vb[2*i+0]}, bi4{vb[2*i+1]};
        vcplxmul(ar4, ai4, br4, bi4);
        vab[2*i+0] = vadd(ar4, vab[2*i+0]);
        vab[2*i+1] = vadd(ai4, vab[2*i+1]);
        ar4 = va[2*i+2]; ai4 = va[2*i+3];
        br4 = vb[2*i+2]; bi4 = vb[2*i+3];
        vcplxmul(ar4, ai4, br4, bi4);
        vab[2*i+2] = vadd(ar4, vab[2*i+2]);
        vab[2*i+3] = vadd(ai4, vab[2*i+3]);
    }

    if(s->transform == PFFFT_REAL)
    {
        vab[0] = vinsert0(vab[0], abr1 + ar1*br1);
        vab[1] = vinsert0(vab[1], abi1 + ai1*bi1);
    }
}
void pffft_transform(const PFFFT_Setup *setup, const float *input, float *output, float *work,
    pffft_direction_t direction)
{
    assert(valigned(input) && valigned(output) && valigned(work));
    pffft_transform_internal(setup, reinterpret_cast<const v4sf*>(al::assume_aligned<16>(input)),
        reinterpret_cast<v4sf*>(al::assume_aligned<16>(output)),
        reinterpret_cast<v4sf*>(al::assume_aligned<16>(work)), direction, false);
}
void pffft_transform_ordered(const PFFFT_Setup *setup, const float *input, float *output,
    float *work, pffft_direction_t direction)
{
    assert(valigned(input) && valigned(output) && valigned(work));
    pffft_transform_internal(setup, reinterpret_cast<const v4sf*>(al::assume_aligned<16>(input)),
        reinterpret_cast<v4sf*>(al::assume_aligned<16>(output)),
        reinterpret_cast<v4sf*>(al::assume_aligned<16>(work)), direction, true);
}
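/* Usage sketch for the ordered interface (illustrative only; setup creation
 * and aligned allocation are omitted): a forward/backward round trip over N
 * real samples. All three buffers must satisfy valigned(), per the asserts
 * above, and the backward transform is unscaled, so dividing by N recovers
 * the original signal.
 *
 *     pffft_transform_ordered(setup, sig, freq, work, PFFFT_FORWARD);
 *     pffft_transform_ordered(setup, freq, sig, work, PFFFT_BACKWARD);
 *     for(unsigned int i{0};i < N;++i)
 *         sig[i] /= static_cast<float>(N);
 */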
#else // defined(PFFFT_SIMD_DISABLE)

// standard routine using scalar floats, without SIMD stuff.
void pffft_transform_internal(const PFFFT_Setup *setup, const float *input, float *output,
    float *scratch, const pffft_direction_t direction, bool ordered)
{
    const size_t Ncvec{setup->Ncvec};
    const bool nf_odd{(setup->ifac[1]&1) != 0};

    assert(scratch != nullptr);

    /* z-domain data for complex transforms is already ordered without SIMD. */
    if(setup->transform == PFFFT_COMPLEX)
        ordered = false;

    float *buff[2]{output, scratch};
    bool ib{nf_odd != ordered};
    if(direction == PFFFT_FORWARD)
    {
        if(setup->transform == PFFFT_REAL)
            ib = (rfftf1_ps(Ncvec*2, input, buff[ib], buff[!ib], setup->twiddle, setup->ifac) == buff[1]);
        else
            ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], setup->twiddle, setup->ifac, -1.0f) == buff[1]);
        if(ordered)
        {
            pffft_zreorder(setup, buff[ib], buff[!ib], PFFFT_FORWARD);
            ib = !ib;
        }
    }
    else
    {
        if(input == buff[ib])
            ib = !ib; // may happen when finput == foutput

        if(ordered)
        {
            pffft_zreorder(setup, input, buff[ib], PFFFT_BACKWARD);
            input = buff[ib];
            ib = !ib;
        }
        if(setup->transform == PFFFT_REAL)
            ib = (rfftb1_ps(Ncvec*2, input, buff[ib], buff[!ib], setup->twiddle, setup->ifac) == buff[1]);
        else
            ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], setup->twiddle, setup->ifac, +1.0f) == buff[1]);
    }
    if(buff[ib] != output)
    {
        // extra copy required -- this situation should only happen when finput == foutput
        assert(input == output);
        for(size_t k{0};k < Ncvec;++k)
        {
            float a{buff[ib][2*k]}, b{buff[ib][2*k+1]};
            output[2*k] = a; output[2*k+1] = b;
        }
    }
}
void pffft_zreorder(const PFFFT_Setup *setup, const float *in, float *RESTRICT out,
    pffft_direction_t direction)
{
    const size_t N{setup->N};
    if(setup->transform == PFFFT_COMPLEX)
    {
        for(size_t k{0};k < 2*N;++k)
            out[k] = in[k];
    }
    else if(direction == PFFFT_FORWARD)
    {
        const float x_N{in[N-1]};
        for(size_t k{N-1};k > 1;--k)
            out[k] = in[k-1];
        out[0] = in[0];
        out[1] = x_N;
    }
    else
    {
        const float x_N{in[1]};
        for(size_t k{1};k < N-1;++k)
            out[k] = in[k+1];
        out[0] = in[0];
        out[N-1] = x_N;
    }
}
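/* For the real transform, fftpack packs the half-spectrum as
 * [r0, r1, i1, r2, i2, ..., r(N/2)], while pffft's ordered layout is
 * [r0, r(N/2), r1, i1, r2, i2, ...]; the forward and backward branches above
 * are the one-slot shifts that move the Nyquist term between the end of the
 * array and index 1.
 */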
void pffft_zconvolve_scale_accumulate(const PFFFT_Setup *s, const float *a, const float *b,
    float *ab, float scaling)
{
    size_t Ncvec{s->Ncvec};

    if(s->transform == PFFFT_REAL)
    {
        // take care of the fftpack ordering
        ab[0] += a[0]*b[0]*scaling;
        ab[2*Ncvec-1] += a[2*Ncvec-1]*b[2*Ncvec-1]*scaling;
        ++ab; ++a; ++b; --Ncvec;
    }
    for(size_t i{0};i < Ncvec;++i)
    {
        float ar{a[2*i+0]}, ai{a[2*i+1]};
        const float br{b[2*i+0]}, bi{b[2*i+1]};
        vcplxmul(ar, ai, br, bi);
        ab[2*i+0] += ar*scaling;
        ab[2*i+1] += ai*scaling;
    }
}
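/* The first and last elements are special-cased above because, in fftpack
 * order, ab[0] and ab[2*Ncvec-1] are the purely real DC and Nyquist bins;
 * advancing the pointers afterwards lets the loop treat everything in between
 * as ordinary interleaved complex pairs.
 */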
void pffft_zconvolve_accumulate(const PFFFT_Setup *s, const float *a, const float *b, float *ab)
{
    size_t Ncvec{s->Ncvec};

    if(s->transform == PFFFT_REAL)
    {
        // take care of the fftpack ordering
        ab[0] += a[0]*b[0];
        ab[2*Ncvec-1] += a[2*Ncvec-1]*b[2*Ncvec-1];
        ++ab; ++a; ++b; --Ncvec;
    }
    for(size_t i{0};i < Ncvec;++i)
    {
        float ar{a[2*i+0]}, ai{a[2*i+1]};
        const float br{b[2*i+0]}, bi{b[2*i+1]};
        vcplxmul(ar, ai, br, bi);
        ab[2*i+0] += ar;
        ab[2*i+1] += ai;
    }
}
void pffft_transform(const PFFFT_Setup *setup, const float *input, float *output, float *work,
    pffft_direction_t direction)
{
    pffft_transform_internal(setup, input, output, work, direction, false);
}
void pffft_transform_ordered(const PFFFT_Setup *setup, const float *input, float *output,
    float *work, pffft_direction_t direction)
{
    pffft_transform_internal(setup, input, output, work, direction, true);
}
#endif /* defined(PFFFT_SIMD_DISABLE) */
/* NOLINTEND(cppcoreguidelines-pro-bounds-pointer-arithmetic) */