1 #ifndef PHASE_SHIFTER_H
2 #define PHASE_SHIFTER_H
4 #include "config_simd.h"
6 #if HAVE_SSE_INTRINSICS
16 #include "alnumbers.h"
18 #include "opthelpers.h"
21 /* Implements a wide-band +90 degree phase-shift. Note that this should be
22 * given one sample less of a delay (FilterSize/2 - 1) compared to the direct
23 * signal delay (FilterSize/2) to properly align.
25 template<std::size_t FilterSize
>
26 struct SIMDALIGN PhaseShifterT
{
27 static_assert(FilterSize
>= 16, "FilterSize needs to be at least 16");
28 static_assert((FilterSize
&(FilterSize
-1)) == 0, "FilterSize needs to be power-of-two");
30 alignas(16) std::array
<float,FilterSize
/2> mCoeffs
{};
34 /* Every other coefficient is 0, so we only need to calculate and store
35 * the non-0 terms and double-step over the input to apply it. The
36 * calculated coefficients are in reverse to make applying in the time-
37 * domain more efficient.
39 for(std::size_t i
{0};i
< FilterSize
/2;++i
)
41 const int k
{static_cast<int>(i
*2 + 1) - int{FilterSize
/2}};
43 /* Calculate the Blackman window value for this coefficient. */
44 const double w
{2.0*al::numbers::pi
* static_cast<double>(i
*2 + 1)
45 / double{FilterSize
}};
46 const double window
{0.3635819 - 0.4891775*std::cos(w
) + 0.1365995*std::cos(2.0*w
)
47 - 0.0106411*std::cos(3.0*w
)};
49 const double pk
{al::numbers::pi
* static_cast<double>(k
)};
50 mCoeffs
[i
] = static_cast<float>(window
* (1.0-std::cos(pk
)) / pk
);
54 void process(const al::span
<float> dst
, const al::span
<const float> src
) const;
58 static auto unpacklo(float32x4_t a
, float32x4_t b
)
60 float32x2x2_t result
{vzip_f32(vget_low_f32(a
), vget_low_f32(b
))};
61 return vcombine_f32(result
.val
[0], result
.val
[1]);
63 static auto unpackhi(float32x4_t a
, float32x4_t b
)
65 float32x2x2_t result
{vzip_f32(vget_high_f32(a
), vget_high_f32(b
))};
66 return vcombine_f32(result
.val
[0], result
.val
[1]);
68 static auto load4(float32_t a
, float32_t b
, float32_t c
, float32_t d
)
70 float32x4_t ret
{vmovq_n_f32(a
)};
71 ret
= vsetq_lane_f32(b
, ret
, 1);
72 ret
= vsetq_lane_f32(c
, ret
, 2);
73 ret
= vsetq_lane_f32(d
, ret
, 3);
76 static void vtranspose4(float32x4_t
&x0
, float32x4_t
&x1
, float32x4_t
&x2
, float32x4_t
&x3
)
78 float32x4x2_t t0_
{vzipq_f32(x0
, x2
)};
79 float32x4x2_t t1_
{vzipq_f32(x1
, x3
)};
80 float32x4x2_t u0_
{vzipq_f32(t0_
.val
[0], t1_
.val
[0])};
81 float32x4x2_t u1_
{vzipq_f32(t0_
.val
[1], t1_
.val
[1])};
90 template<std::size_t S
>
92 void PhaseShifterT
<S
>::process(const al::span
<float> dst
, const al::span
<const float> src
) const
94 auto in
= src
.begin();
95 #if HAVE_SSE_INTRINSICS
96 if(const std::size_t todo
{dst
.size()>>2})
98 auto out
= al::span
{reinterpret_cast<__m128
*>(dst
.data()), todo
};
99 std::generate(out
.begin(), out
.end(), [&in
,this]
101 __m128 r0
{_mm_setzero_ps()};
102 __m128 r1
{_mm_setzero_ps()};
103 __m128 r2
{_mm_setzero_ps()};
104 __m128 r3
{_mm_setzero_ps()};
105 for(std::size_t j
{0};j
< mCoeffs
.size();j
+=4)
107 const __m128 coeffs
{_mm_load_ps(&mCoeffs
[j
])};
108 const __m128 s0
{_mm_loadu_ps(&in
[j
*2])};
109 const __m128 s1
{_mm_loadu_ps(&in
[j
*2 + 4])};
110 const __m128 s2
{_mm_movehl_ps(_mm_movelh_ps(s1
, s1
), s0
)};
111 const __m128 s3
{_mm_loadh_pi(_mm_movehl_ps(s1
, s1
),
112 reinterpret_cast<const __m64
*>(&in
[j
*2 + 8]))};
114 __m128 s
{_mm_shuffle_ps(s0
, s1
, _MM_SHUFFLE(2, 0, 2, 0))};
115 r0
= _mm_add_ps(r0
, _mm_mul_ps(s
, coeffs
));
117 s
= _mm_shuffle_ps(s0
, s1
, _MM_SHUFFLE(3, 1, 3, 1));
118 r1
= _mm_add_ps(r1
, _mm_mul_ps(s
, coeffs
));
120 s
= _mm_shuffle_ps(s2
, s3
, _MM_SHUFFLE(2, 0, 2, 0));
121 r2
= _mm_add_ps(r2
, _mm_mul_ps(s
, coeffs
));
123 s
= _mm_shuffle_ps(s2
, s3
, _MM_SHUFFLE(3, 1, 3, 1));
124 r3
= _mm_add_ps(r3
, _mm_mul_ps(s
, coeffs
));
128 _MM_TRANSPOSE4_PS(r0
, r1
, r2
, r3
);
129 return _mm_add_ps(_mm_add_ps(r0
, r1
), _mm_add_ps(r2
, r3
));
132 if(const std::size_t todo
{dst
.size()&3})
134 auto out
= dst
.last(todo
);
135 std::generate(out
.begin(), out
.end(), [&in
,this]
137 __m128 r4
{_mm_setzero_ps()};
138 for(std::size_t j
{0};j
< mCoeffs
.size();j
+=4)
140 const __m128 coeffs
{_mm_load_ps(&mCoeffs
[j
])};
141 const __m128 s
{_mm_setr_ps(in
[j
*2], in
[j
*2 + 2], in
[j
*2 + 4], in
[j
*2 + 6])};
142 r4
= _mm_add_ps(r4
, _mm_mul_ps(s
, coeffs
));
145 r4
= _mm_add_ps(r4
, _mm_shuffle_ps(r4
, r4
, _MM_SHUFFLE(0, 1, 2, 3)));
146 r4
= _mm_add_ps(r4
, _mm_movehl_ps(r4
, r4
));
147 return _mm_cvtss_f32(r4
);
153 if(const std::size_t todo
{dst
.size()>>2})
155 auto out
= al::span
{reinterpret_cast<float32x4_t
*>(dst
.data()), todo
};
156 std::generate(out
.begin(), out
.end(), [&in
,this]
158 float32x4_t r0
{vdupq_n_f32(0.0f
)};
159 float32x4_t r1
{vdupq_n_f32(0.0f
)};
160 float32x4_t r2
{vdupq_n_f32(0.0f
)};
161 float32x4_t r3
{vdupq_n_f32(0.0f
)};
162 for(std::size_t j
{0};j
< mCoeffs
.size();j
+=4)
164 const float32x4_t coeffs
{vld1q_f32(&mCoeffs
[j
])};
165 const float32x4_t s0
{vld1q_f32(&in
[j
*2])};
166 const float32x4_t s1
{vld1q_f32(&in
[j
*2 + 4])};
167 const float32x4_t s2
{vcombine_f32(vget_high_f32(s0
), vget_low_f32(s1
))};
168 const float32x4_t s3
{vcombine_f32(vget_high_f32(s1
), vld1_f32(&in
[j
*2 + 8]))};
169 const float32x4x2_t values0
{vuzpq_f32(s0
, s1
)};
170 const float32x4x2_t values1
{vuzpq_f32(s2
, s3
)};
172 r0
= vmlaq_f32(r0
, values0
.val
[0], coeffs
);
173 r1
= vmlaq_f32(r1
, values0
.val
[1], coeffs
);
174 r2
= vmlaq_f32(r2
, values1
.val
[0], coeffs
);
175 r3
= vmlaq_f32(r3
, values1
.val
[1], coeffs
);
179 vtranspose4(r0
, r1
, r2
, r3
);
180 return vaddq_f32(vaddq_f32(r0
, r1
), vaddq_f32(r2
, r3
));
183 if(const std::size_t todo
{dst
.size()&3})
185 auto out
= dst
.last(todo
);
186 std::generate(out
.begin(), out
.end(), [&in
,this]
188 float32x4_t r4
{vdupq_n_f32(0.0f
)};
189 for(std::size_t j
{0};j
< mCoeffs
.size();j
+=4)
191 const float32x4_t coeffs
{vld1q_f32(&mCoeffs
[j
])};
192 const float32x4_t s
{load4(in
[j
*2], in
[j
*2 + 2], in
[j
*2 + 4], in
[j
*2 + 6])};
193 r4
= vmlaq_f32(r4
, s
, coeffs
);
196 r4
= vaddq_f32(r4
, vrev64q_f32(r4
));
197 return vget_lane_f32(vadd_f32(vget_low_f32(r4
), vget_high_f32(r4
)), 0);
203 std::generate(dst
.begin(), dst
.end(), [&in
,this]
206 for(std::size_t j
{0};j
< mCoeffs
.size();++j
)
207 ret
+= in
[j
*2] * mCoeffs
[j
];
214 #endif /* PHASE_SHIFTER_H */