Use 0/1 defines for backend and simd macros
[openal-soft.git] / common / phase_shifter.h
blob 5ff867400121710d3b664a7bef0d5472b61c88d7
#ifndef PHASE_SHIFTER_H
#define PHASE_SHIFTER_H

#include "config_simd.h"

#if HAVE_SSE_INTRINSICS
#include <xmmintrin.h>
#elif HAVE_NEON
#include <arm_neon.h>
#endif

#include <algorithm>
#include <array>
#include <cmath>
#include <cstddef>

#include "alnumbers.h"
#include "alspan.h"
#include "opthelpers.h"

/* Implements a wide-band +90 degree phase-shift. Note that this should be
 * given one sample less of a delay (FilterSize/2 - 1) compared to the direct
 * signal delay (FilterSize/2) to properly align.
 */
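/* For example, with FilterSize=256 the direct signal would be delayed by 128
 * samples and the signal fed to process() by 127 samples so the two outputs
 * line up. (Illustrative figures only; the same rule applies to any valid
 * filter size.)
 */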
template<std::size_t FilterSize>
struct SIMDALIGN PhaseShifterT {
    static_assert(FilterSize >= 16, "FilterSize needs to be at least 16");
    static_assert((FilterSize&(FilterSize-1)) == 0, "FilterSize needs to be power-of-two");

    alignas(16) std::array<float,FilterSize/2> mCoeffs{};

    PhaseShifterT()
    {
        /* Every other coefficient is 0, so we only need to calculate and store
         * the non-0 terms and double-step over the input to apply it. The
         * calculated coefficients are in reverse to make applying in the time-
         * domain more efficient.
         */
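        /* Each stored tap is the ideal +90 degree phase-shifter response for
         * an odd offset k, (1 - cos(pi*k))/(pi*k) = 2/(pi*k), scaled by a
         * Blackman window to reduce the ripple caused by truncating the
         * filter.
         */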
        for(std::size_t i{0};i < FilterSize/2;++i)
        {
            const int k{static_cast<int>(i*2 + 1) - int{FilterSize/2}};

            /* Calculate the Blackman window value for this coefficient. */
            const double w{2.0*al::numbers::pi * static_cast<double>(i*2 + 1)
                / double{FilterSize}};
            const double window{0.3635819 - 0.4891775*std::cos(w) + 0.1365995*std::cos(2.0*w)
                - 0.0106411*std::cos(3.0*w)};

            const double pk{al::numbers::pi * static_cast<double>(k)};
            mCoeffs[i] = static_cast<float>(window * (1.0-std::cos(pk)) / pk);
        }
    }

    void process(const al::span<float> dst, const al::span<const float> src) const;

private:
#if HAVE_NEON
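    /* NEON helpers mirroring SSE-style unpack, scalar load, and 4x4 transpose
     * operations on float32x4_t vectors.
     */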
    static auto unpacklo(float32x4_t a, float32x4_t b)
    {
        float32x2x2_t result{vzip_f32(vget_low_f32(a), vget_low_f32(b))};
        return vcombine_f32(result.val[0], result.val[1]);
    }
    static auto unpackhi(float32x4_t a, float32x4_t b)
    {
        float32x2x2_t result{vzip_f32(vget_high_f32(a), vget_high_f32(b))};
        return vcombine_f32(result.val[0], result.val[1]);
    }
    static auto load4(float32_t a, float32_t b, float32_t c, float32_t d)
    {
        float32x4_t ret{vmovq_n_f32(a)};
        ret = vsetq_lane_f32(b, ret, 1);
        ret = vsetq_lane_f32(c, ret, 2);
        ret = vsetq_lane_f32(d, ret, 3);
        return ret;
    }
    static void vtranspose4(float32x4_t &x0, float32x4_t &x1, float32x4_t &x2, float32x4_t &x3)
    {
        float32x4x2_t t0_{vzipq_f32(x0, x2)};
        float32x4x2_t t1_{vzipq_f32(x1, x3)};
        float32x4x2_t u0_{vzipq_f32(t0_.val[0], t1_.val[0])};
        float32x4x2_t u1_{vzipq_f32(t0_.val[1], t1_.val[1])};
        x0 = u0_.val[0];
        x1 = u0_.val[1];
        x2 = u1_.val[0];
        x3 = u1_.val[1];
    }
#endif
};

template<std::size_t S>
NOINLINE inline
void PhaseShifterT<S>::process(const al::span<float> dst, const al::span<const float> src) const
{
    auto in = src.begin();
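    /* Each path below first produces output samples four at a time, then
     * handles any remaining 1-3 samples individually. Every output sample is
     * the dot product of mCoeffs with every other input sample, with the
     * input window advancing by one sample per output.
     */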
#if HAVE_SSE_INTRINSICS
    if(const std::size_t todo{dst.size()>>2})
    {
        auto out = al::span{reinterpret_cast<__m128*>(dst.data()), todo};
        std::generate(out.begin(), out.end(), [&in,this]
        {
            __m128 r0{_mm_setzero_ps()};
            __m128 r1{_mm_setzero_ps()};
            __m128 r2{_mm_setzero_ps()};
            __m128 r3{_mm_setzero_ps()};
            for(std::size_t j{0};j < mCoeffs.size();j+=4)
            {
                const __m128 coeffs{_mm_load_ps(&mCoeffs[j])};
                const __m128 s0{_mm_loadu_ps(&in[j*2])};
                const __m128 s1{_mm_loadu_ps(&in[j*2 + 4])};
                const __m128 s2{_mm_movehl_ps(_mm_movelh_ps(s1, s1), s0)};
                const __m128 s3{_mm_loadh_pi(_mm_movehl_ps(s1, s1),
                    reinterpret_cast<const __m64*>(&in[j*2 + 8]))};
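
                /* s0/s1 hold in[j*2]..in[j*2 + 7], while s2/s3 hold the same
                 * span shifted ahead by two samples. The shuffles below then
                 * select every other sample, producing the input runs for
                 * four successive output samples.
                 */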
                __m128 s{_mm_shuffle_ps(s0, s1, _MM_SHUFFLE(2, 0, 2, 0))};
                r0 = _mm_add_ps(r0, _mm_mul_ps(s, coeffs));

                s = _mm_shuffle_ps(s0, s1, _MM_SHUFFLE(3, 1, 3, 1));
                r1 = _mm_add_ps(r1, _mm_mul_ps(s, coeffs));

                s = _mm_shuffle_ps(s2, s3, _MM_SHUFFLE(2, 0, 2, 0));
                r2 = _mm_add_ps(r2, _mm_mul_ps(s, coeffs));

                s = _mm_shuffle_ps(s2, s3, _MM_SHUFFLE(3, 1, 3, 1));
                r3 = _mm_add_ps(r3, _mm_mul_ps(s, coeffs));
            }
            in += 4;

            _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
            return _mm_add_ps(_mm_add_ps(r0, r1), _mm_add_ps(r2, r3));
        });
    }
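
    /* Process any remaining 1-3 output samples one at a time, accumulating
     * four partial sums per output in r4 and then reducing them to a single
     * float.
     */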
    if(const std::size_t todo{dst.size()&3})
    {
        auto out = dst.last(todo);
        std::generate(out.begin(), out.end(), [&in,this]
        {
            __m128 r4{_mm_setzero_ps()};
            for(std::size_t j{0};j < mCoeffs.size();j+=4)
            {
                const __m128 coeffs{_mm_load_ps(&mCoeffs[j])};
                const __m128 s{_mm_setr_ps(in[j*2], in[j*2 + 2], in[j*2 + 4], in[j*2 + 6])};
                r4 = _mm_add_ps(r4, _mm_mul_ps(s, coeffs));
            }
            ++in;
            r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3)));
            r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
            return _mm_cvtss_f32(r4);
        });
    }

#elif HAVE_NEON

    if(const std::size_t todo{dst.size()>>2})
    {
        auto out = al::span{reinterpret_cast<float32x4_t*>(dst.data()), todo};
        std::generate(out.begin(), out.end(), [&in,this]
        {
            float32x4_t r0{vdupq_n_f32(0.0f)};
            float32x4_t r1{vdupq_n_f32(0.0f)};
            float32x4_t r2{vdupq_n_f32(0.0f)};
            float32x4_t r3{vdupq_n_f32(0.0f)};
            for(std::size_t j{0};j < mCoeffs.size();j+=4)
            {
                const float32x4_t coeffs{vld1q_f32(&mCoeffs[j])};
                const float32x4_t s0{vld1q_f32(&in[j*2])};
                const float32x4_t s1{vld1q_f32(&in[j*2 + 4])};
                const float32x4_t s2{vcombine_f32(vget_high_f32(s0), vget_low_f32(s1))};
                const float32x4_t s3{vcombine_f32(vget_high_f32(s1), vld1_f32(&in[j*2 + 8]))};
                const float32x4x2_t values0{vuzpq_f32(s0, s1)};
                const float32x4x2_t values1{vuzpq_f32(s2, s3)};
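
                /* The unzips de-interleave the loads so each vector holds
                 * every other input sample, starting one sample later for
                 * each successive output, matching the SSE shuffles above.
                 */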
                r0 = vmlaq_f32(r0, values0.val[0], coeffs);
                r1 = vmlaq_f32(r1, values0.val[1], coeffs);
                r2 = vmlaq_f32(r2, values1.val[0], coeffs);
                r3 = vmlaq_f32(r3, values1.val[1], coeffs);
            }
            in += 4;

            vtranspose4(r0, r1, r2, r3);
            return vaddq_f32(vaddq_f32(r0, r1), vaddq_f32(r2, r3));
        });
    }
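
    /* As with SSE, handle any remaining 1-3 output samples one at a time and
     * reduce the four partial sums in r4 horizontally.
     */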
    if(const std::size_t todo{dst.size()&3})
    {
        auto out = dst.last(todo);
        std::generate(out.begin(), out.end(), [&in,this]
        {
            float32x4_t r4{vdupq_n_f32(0.0f)};
            for(std::size_t j{0};j < mCoeffs.size();j+=4)
            {
                const float32x4_t coeffs{vld1q_f32(&mCoeffs[j])};
                const float32x4_t s{load4(in[j*2], in[j*2 + 2], in[j*2 + 4], in[j*2 + 6])};
                r4 = vmlaq_f32(r4, s, coeffs);
            }
            ++in;
            r4 = vaddq_f32(r4, vrev64q_f32(r4));
            return vget_lane_f32(vadd_f32(vget_low_f32(r4), vget_high_f32(r4)), 0);
        });
    }

#else
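
    /* Generic scalar fallback: each output is a direct FIR dot product over
     * the non-zero taps, stepping the input by two samples per coefficient.
     */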
    std::generate(dst.begin(), dst.end(), [&in,this]
    {
        float ret{0.0f};
        for(std::size_t j{0};j < mCoeffs.size();++j)
            ret += in[j*2] * mCoeffs[j];
        ++in;
        return ret;
    });
#endif
}

#endif /* PHASE_SHIFTER_H */