1 #ifndef PHASE_SHIFTER_H
2 #define PHASE_SHIFTER_H
4 #ifdef HAVE_SSE_INTRINSICS
6 #elif defined(HAVE_NEON)
14 #include "alnumbers.h"
16 #include "opthelpers.h"
19 /* Implements a wide-band +90 degree phase-shift. Note that this should be
20 * given one sample less of a delay (FilterSize/2 - 1) compared to the direct
21 * signal delay (FilterSize/2) to properly align.
23 template<std::size_t FilterSize
>
24 struct SIMDALIGN PhaseShifterT
{
25 static_assert(FilterSize
>= 16, "FilterSize needs to be at least 16");
26 static_assert((FilterSize
&(FilterSize
-1)) == 0, "FilterSize needs to be power-of-two");
28 alignas(16) std::array
<float,FilterSize
/2> mCoeffs
{};
32 /* Every other coefficient is 0, so we only need to calculate and store
33 * the non-0 terms and double-step over the input to apply it. The
34 * calculated coefficients are in reverse to make applying in the time-
35 * domain more efficient.
37 for(std::size_t i
{0};i
< FilterSize
/2;++i
)
39 const int k
{static_cast<int>(i
*2 + 1) - int{FilterSize
/2}};
41 /* Calculate the Blackman window value for this coefficient. */
42 const double w
{2.0*al::numbers::pi
* static_cast<double>(i
*2 + 1)
43 / double{FilterSize
}};
44 const double window
{0.3635819 - 0.4891775*std::cos(w
) + 0.1365995*std::cos(2.0*w
)
45 - 0.0106411*std::cos(3.0*w
)};
47 const double pk
{al::numbers::pi
* static_cast<double>(k
)};
48 mCoeffs
[i
] = static_cast<float>(window
* (1.0-std::cos(pk
)) / pk
);
52 void process(const al::span
<float> dst
, const al::span
<const float> src
) const;
55 #if defined(HAVE_NEON)
56 static auto unpacklo(float32x4_t a
, float32x4_t b
)
58 float32x2x2_t result
{vzip_f32(vget_low_f32(a
), vget_low_f32(b
))};
59 return vcombine_f32(result
.val
[0], result
.val
[1]);
61 static auto unpackhi(float32x4_t a
, float32x4_t b
)
63 float32x2x2_t result
{vzip_f32(vget_high_f32(a
), vget_high_f32(b
))};
64 return vcombine_f32(result
.val
[0], result
.val
[1]);
66 static auto load4(float32_t a
, float32_t b
, float32_t c
, float32_t d
)
68 float32x4_t ret
{vmovq_n_f32(a
)};
69 ret
= vsetq_lane_f32(b
, ret
, 1);
70 ret
= vsetq_lane_f32(c
, ret
, 2);
71 ret
= vsetq_lane_f32(d
, ret
, 3);
74 static void vtranspose4(float32x4_t
&x0
, float32x4_t
&x1
, float32x4_t
&x2
, float32x4_t
&x3
)
76 float32x4x2_t t0_
{vzipq_f32(x0
, x2
)};
77 float32x4x2_t t1_
{vzipq_f32(x1
, x3
)};
78 float32x4x2_t u0_
{vzipq_f32(t0_
.val
[0], t1_
.val
[0])};
79 float32x4x2_t u1_
{vzipq_f32(t0_
.val
[1], t1_
.val
[1])};
88 template<std::size_t S
>
90 void PhaseShifterT
<S
>::process(const al::span
<float> dst
, const al::span
<const float> src
) const
92 auto in
= src
.begin();
93 #ifdef HAVE_SSE_INTRINSICS
94 if(const std::size_t todo
{dst
.size()>>2})
96 auto out
= al::span
{reinterpret_cast<__m128
*>(dst
.data()), todo
};
97 std::generate(out
.begin(), out
.end(), [&in
,this]
99 __m128 r0
{_mm_setzero_ps()};
100 __m128 r1
{_mm_setzero_ps()};
101 __m128 r2
{_mm_setzero_ps()};
102 __m128 r3
{_mm_setzero_ps()};
103 for(std::size_t j
{0};j
< mCoeffs
.size();j
+=4)
105 const __m128 coeffs
{_mm_load_ps(&mCoeffs
[j
])};
106 const __m128 s0
{_mm_loadu_ps(&in
[j
*2])};
107 const __m128 s1
{_mm_loadu_ps(&in
[j
*2 + 4])};
108 const __m128 s2
{_mm_movehl_ps(_mm_movelh_ps(s1
, s1
), s0
)};
109 const __m128 s3
{_mm_loadh_pi(_mm_movehl_ps(s1
, s1
),
110 reinterpret_cast<const __m64
*>(&in
[j
*2 + 8]))};
112 __m128 s
{_mm_shuffle_ps(s0
, s1
, _MM_SHUFFLE(2, 0, 2, 0))};
113 r0
= _mm_add_ps(r0
, _mm_mul_ps(s
, coeffs
));
115 s
= _mm_shuffle_ps(s0
, s1
, _MM_SHUFFLE(3, 1, 3, 1));
116 r1
= _mm_add_ps(r1
, _mm_mul_ps(s
, coeffs
));
118 s
= _mm_shuffle_ps(s2
, s3
, _MM_SHUFFLE(2, 0, 2, 0));
119 r2
= _mm_add_ps(r2
, _mm_mul_ps(s
, coeffs
));
121 s
= _mm_shuffle_ps(s2
, s3
, _MM_SHUFFLE(3, 1, 3, 1));
122 r3
= _mm_add_ps(r3
, _mm_mul_ps(s
, coeffs
));
126 _MM_TRANSPOSE4_PS(r0
, r1
, r2
, r3
);
127 return _mm_add_ps(_mm_add_ps(r0
, r1
), _mm_add_ps(r2
, r3
));
130 if(const std::size_t todo
{dst
.size()&3})
132 auto out
= dst
.last(todo
);
133 std::generate(out
.begin(), out
.end(), [&in
,this]
135 __m128 r4
{_mm_setzero_ps()};
136 for(std::size_t j
{0};j
< mCoeffs
.size();j
+=4)
138 const __m128 coeffs
{_mm_load_ps(&mCoeffs
[j
])};
139 const __m128 s
{_mm_setr_ps(in
[j
*2], in
[j
*2 + 2], in
[j
*2 + 4], in
[j
*2 + 6])};
140 r4
= _mm_add_ps(r4
, _mm_mul_ps(s
, coeffs
));
143 r4
= _mm_add_ps(r4
, _mm_shuffle_ps(r4
, r4
, _MM_SHUFFLE(0, 1, 2, 3)));
144 r4
= _mm_add_ps(r4
, _mm_movehl_ps(r4
, r4
));
145 return _mm_cvtss_f32(r4
);
149 #elif defined(HAVE_NEON)
151 if(const std::size_t todo
{dst
.size()>>2})
153 auto out
= al::span
{reinterpret_cast<float32x4_t
*>(dst
.data()), todo
};
154 std::generate(out
.begin(), out
.end(), [&in
,this]
156 float32x4_t r0
{vdupq_n_f32(0.0f
)};
157 float32x4_t r1
{vdupq_n_f32(0.0f
)};
158 float32x4_t r2
{vdupq_n_f32(0.0f
)};
159 float32x4_t r3
{vdupq_n_f32(0.0f
)};
160 for(std::size_t j
{0};j
< mCoeffs
.size();j
+=4)
162 const float32x4_t coeffs
{vld1q_f32(&mCoeffs
[j
])};
163 const float32x4_t s0
{vld1q_f32(&in
[j
*2])};
164 const float32x4_t s1
{vld1q_f32(&in
[j
*2 + 4])};
165 const float32x4_t s2
{vcombine_f32(vget_high_f32(s0
), vget_low_f32(s1
))};
166 const float32x4_t s3
{vcombine_f32(vget_high_f32(s1
), vld1_f32(&in
[j
*2 + 8]))};
167 const float32x4x2_t values0
{vuzpq_f32(s0
, s1
)};
168 const float32x4x2_t values1
{vuzpq_f32(s2
, s3
)};
170 r0
= vmlaq_f32(r0
, values0
.val
[0], coeffs
);
171 r1
= vmlaq_f32(r1
, values0
.val
[1], coeffs
);
172 r2
= vmlaq_f32(r2
, values1
.val
[0], coeffs
);
173 r3
= vmlaq_f32(r3
, values1
.val
[1], coeffs
);
177 vtranspose4(r0
, r1
, r2
, r3
);
178 return vaddq_f32(vaddq_f32(r0
, r1
), vaddq_f32(r2
, r3
));
181 if(const std::size_t todo
{dst
.size()&3})
183 auto out
= dst
.last(todo
);
184 std::generate(out
.begin(), out
.end(), [&in
,this]
186 float32x4_t r4
{vdupq_n_f32(0.0f
)};
187 for(std::size_t j
{0};j
< mCoeffs
.size();j
+=4)
189 const float32x4_t coeffs
{vld1q_f32(&mCoeffs
[j
])};
190 const float32x4_t s
{load4(in
[j
*2], in
[j
*2 + 2], in
[j
*2 + 4], in
[j
*2 + 6])};
191 r4
= vmlaq_f32(r4
, s
, coeffs
);
194 r4
= vaddq_f32(r4
, vrev64q_f32(r4
));
195 return vget_lane_f32(vadd_f32(vget_low_f32(r4
), vget_high_f32(r4
)), 0);
201 std::generate(dst
.begin(), dst
.end(), [&in
,this]
204 for(std::size_t j
{0};j
< mCoeffs
.size();++j
)
205 ret
+= in
[j
*2] * mCoeffs
[j
];
212 #endif /* PHASE_SHIFTER_H */