/* core/mixer/mixer_neon.cpp (openal-soft) */

#include "config.h"

#include <arm_neon.h>

#include <algorithm>
#include <array>
#include <cstddef>
#include <limits>
#include <variant>

#include "alnumeric.h"
#include "alspan.h"
#include "core/bsinc_defs.h"
#include "core/bufferline.h"
#include "core/cubic_defs.h"
#include "core/mixer/hrtfdefs.h"
#include "core/resampler_limits.h"
#include "defs.h"
#include "hrtfbase.h"
#include "opthelpers.h"

struct CTag;
struct NEONTag;
struct LerpTag;
struct CubicTag;
struct BSincTag;
struct FastBSincTag;

#if defined(__GNUC__) && !defined(__clang__) && !defined(__ARM_NEON)
#pragma GCC target("fpu=neon")
#endif

using uint = unsigned int;

namespace {

constexpr uint BSincPhaseDiffBits{MixerFracBits - BSincPhaseBits};
constexpr uint BSincPhaseDiffOne{1 << BSincPhaseDiffBits};
constexpr uint BSincPhaseDiffMask{BSincPhaseDiffOne - 1u};

constexpr uint CubicPhaseDiffBits{MixerFracBits - CubicPhaseBits};
constexpr uint CubicPhaseDiffOne{1 << CubicPhaseDiffBits};
constexpr uint CubicPhaseDiffMask{CubicPhaseDiffOne - 1u};
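
/* Illustrative note (not in the original source): the resamplers below split
 * the MixerFracBits-bit fractional position into a phase-table index (the
 * top CubicPhaseBits or BSincPhaseBits bits) and a float interpolation
 * factor built from the remaining low bits, e.g. for the cubic tables:
 *
 *   const uint pi{frac >> CubicPhaseDiffBits};       // phase index
 *   const float pf{static_cast<float>(frac&CubicPhaseDiffMask)
 *       * (1.0f/CubicPhaseDiffOne)};                 // phase factor in [0,1)
 */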

force_inline
void vtranspose4(float32x4_t &x0, float32x4_t &x1, float32x4_t &x2, float32x4_t &x3) noexcept
{
    float32x4x2_t t0_{vzipq_f32(x0, x2)};
    float32x4x2_t t1_{vzipq_f32(x1, x3)};
    float32x4x2_t u0_{vzipq_f32(t0_.val[0], t1_.val[0])};
    float32x4x2_t u1_{vzipq_f32(t0_.val[1], t1_.val[1])};
    x0 = u0_.val[0];
    x1 = u0_.val[1];
    x2 = u1_.val[0];
    x3 = u1_.val[1];
}
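
/* Example (illustrative only): treating x0..x3 as rows of a 4x4 matrix, the
 * two rounds of vzipq_f32 above produce the transpose in place:
 *
 *   x0 = {a0,a1,a2,a3}                x0 = {a0,b0,c0,d0}
 *   x1 = {b0,b1,b2,b3}   vtranspose4  x1 = {a1,b1,c1,d1}
 *   x2 = {c0,c1,c2,c3}  ===========>  x2 = {a2,b2,c2,d2}
 *   x3 = {d0,d1,d2,d3}                x3 = {a3,b3,c3,d3}
 *
 * The cubic resampler below uses this to turn four per-sample dot products
 * into three vertical vector adds.
 */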

inline float32x4_t set_f4(float l0, float l1, float l2, float l3)
{
    float32x4_t ret{vmovq_n_f32(l0)};
    ret = vsetq_lane_f32(l1, ret, 1);
    ret = vsetq_lane_f32(l2, ret, 2);
    ret = vsetq_lane_f32(l3, ret, 3);
    return ret;
}

inline void ApplyCoeffs(const al::span<float2> Values, const size_t IrSize,
    const ConstHrirSpan Coeffs, const float left, const float right)
{
    ASSUME(IrSize >= MinIrLength);
    ASSUME(IrSize <= HrirLength);

    auto dup_samples = [left,right]() -> float32x4_t
    {
        float32x2_t leftright2{vset_lane_f32(right, vmov_n_f32(left), 1)};
        return vcombine_f32(leftright2, leftright2);
    };
    const auto leftright4 = dup_samples();

    /* Using a loop here instead of std::transform since some builds seem to
     * have an issue with accessing an array/span of float32x4_t.
     */
    for(size_t c{0};c < IrSize;c += 2)
    {
        auto vals = vld1q_f32(&Values[c][0]);
        vals = vmlaq_f32(vals, vld1q_f32(&Coeffs[c][0]), leftright4);
        vst1q_f32(&Values[c][0], vals);
    }
}
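
/* Scalar equivalent of the ApplyCoeffs loop above, for reference (a sketch;
 * it assumes IrSize is even, which the two-float2-per-load vector path also
 * relies on):
 *
 *   for(size_t c{0};c < IrSize;++c)
 *   {
 *       Values[c][0] += Coeffs[c][0] * left;
 *       Values[c][1] += Coeffs[c][1] * right;
 *   }
 */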

force_inline void MixLine(const al::span<const float> InSamples, const al::span<float> dst,
    float &CurrentGain, const float TargetGain, const float delta, const size_t fade_len,
    const size_t realign_len, size_t Counter)
{
    const auto step = float{(TargetGain-CurrentGain) * delta};

    auto pos = size_t{0};
    if(std::abs(step) > std::numeric_limits<float>::epsilon())
    {
        const auto gain = float{CurrentGain};
        auto step_count = float{0.0f};
        /* Mix with applying gain steps in aligned multiples of 4. */
        if(const size_t todo{fade_len >> 2})
        {
            const auto four4 = vdupq_n_f32(4.0f);
            const auto step4 = vdupq_n_f32(step);
            const auto gain4 = vdupq_n_f32(gain);
            auto step_count4 = set_f4(0.0f, 1.0f, 2.0f, 3.0f);

            const auto in4 = al::span{reinterpret_cast<const float32x4_t*>(InSamples.data()),
                InSamples.size()/4}.first(todo);
            const auto out4 = al::span{reinterpret_cast<float32x4_t*>(dst.data()), dst.size()/4};
            std::transform(in4.begin(), in4.end(), out4.begin(), out4.begin(),
                [gain4,step4,four4,&step_count4](const float32x4_t val4, float32x4_t dry4)
                {
                    /* dry += val * (gain + step*step_count) */
                    dry4 = vmlaq_f32(dry4, val4, vmlaq_f32(gain4, step4, step_count4));
                    step_count4 = vaddq_f32(step_count4, four4);
                    return dry4;
                });
            pos += in4.size()*4;

            /* NOTE: step_count4 now represents the next four counts after the
             * last four mixed samples, so the lowest element represents the
             * next step count to apply.
             */
            step_count = vgetq_lane_f32(step_count4, 0);
        }
        /* Mix with applying left over gain steps that aren't aligned multiples of 4. */
        if(const size_t leftover{fade_len&3})
        {
            const auto in = InSamples.subspan(pos, leftover);
            const auto out = dst.subspan(pos);

            std::transform(in.begin(), in.end(), out.begin(), out.begin(),
                [gain,step,&step_count](const float val, float dry) noexcept -> float
                {
                    dry += val * (gain + step*step_count);
                    step_count += 1.0f;
                    return dry;
                });
            pos += leftover;
        }
        if(pos < Counter)
        {
            CurrentGain = gain + step*step_count;
            return;
        }

        /* Mix until pos is aligned with 4 or the mix is done. */
        if(const size_t leftover{realign_len&3})
        {
            const auto in = InSamples.subspan(pos, leftover);
            const auto out = dst.subspan(pos);

            std::transform(in.begin(), in.end(), out.begin(), out.begin(),
                [TargetGain](const float val, const float dry) noexcept -> float
                { return dry + val*TargetGain; });
            pos += leftover;
        }
    }
    CurrentGain = TargetGain;

    if(!(std::abs(TargetGain) > GainSilenceThreshold))
        return;
    if(const size_t todo{(InSamples.size()-pos) >> 2})
    {
        const auto in4 = al::span{reinterpret_cast<const float32x4_t*>(InSamples.data()),
            InSamples.size()/4}.last(todo);
        const auto out = dst.subspan(pos);
        const auto out4 = al::span{reinterpret_cast<float32x4_t*>(out.data()), out.size()/4};

        const auto gain4 = vdupq_n_f32(TargetGain);
        std::transform(in4.begin(), in4.end(), out4.begin(), out4.begin(),
            [gain4](const float32x4_t val4, const float32x4_t dry4) -> float32x4_t
            { return vmlaq_f32(dry4, val4, gain4); });
        pos += in4.size()*4;
    }
    if(const size_t leftover{(InSamples.size()-pos)&3})
    {
        const auto in = InSamples.last(leftover);
        const auto out = dst.subspan(pos);

        std::transform(in.begin(), in.end(), out.begin(), out.begin(),
            [TargetGain](const float val, const float dry) noexcept -> float
            { return dry + val*TargetGain; });
    }
}
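
/* Gain-stepping sketch (illustrative): with delta = 1/Counter and
 * step = (TargetGain-CurrentGain)*delta, sample i of the fade is mixed as
 *
 *   dst[i] += src[i] * (gain + step*i);
 *
 * so the applied gain reaches TargetGain after Counter samples. The vector
 * path above simply evaluates four consecutive values of i at once through
 * step_count4.
 */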

} // namespace

template<>
void Resample_<LerpTag,NEONTag>(const InterpState*, const al::span<const float> src, uint frac,
    const uint increment, const al::span<float> dst)
{
    ASSUME(frac < MixerFracOne);

    const uint32x4_t increment4 = vdupq_n_u32(increment*4u);
    const float32x4_t fracOne4 = vdupq_n_f32(1.0f/MixerFracOne);
    const uint32x4_t fracMask4 = vdupq_n_u32(MixerFracMask);

    alignas(16) std::array<uint,4> pos_{}, frac_{};
    InitPosArrays(MaxResamplerEdge, frac, increment, al::span{frac_}, al::span{pos_});
    uint32x4_t frac4 = vld1q_u32(frac_.data());
    uint32x4_t pos4 = vld1q_u32(pos_.data());

    auto vecout = al::span{reinterpret_cast<float32x4_t*>(dst.data()), dst.size()/4};
    std::generate(vecout.begin(), vecout.end(), [=,&pos4,&frac4]() -> float32x4_t
    {
        const uint pos0{vgetq_lane_u32(pos4, 0)};
        const uint pos1{vgetq_lane_u32(pos4, 1)};
        const uint pos2{vgetq_lane_u32(pos4, 2)};
        const uint pos3{vgetq_lane_u32(pos4, 3)};
        ASSUME(pos0 <= pos1); ASSUME(pos1 <= pos2); ASSUME(pos2 <= pos3);
        const float32x4_t val1{set_f4(src[pos0], src[pos1], src[pos2], src[pos3])};
        const float32x4_t val2{set_f4(src[pos0+1_uz], src[pos1+1_uz], src[pos2+1_uz], src[pos3+1_uz])};

        /* val1 + (val2-val1)*mu */
        const float32x4_t r0{vsubq_f32(val2, val1)};
        const float32x4_t mu{vmulq_f32(vcvtq_f32_u32(frac4), fracOne4)};
        const float32x4_t out{vmlaq_f32(val1, mu, r0)};

        frac4 = vaddq_u32(frac4, increment4);
        pos4 = vaddq_u32(pos4, vshrq_n_u32(frac4, MixerFracBits));
        frac4 = vandq_u32(frac4, fracMask4);
        return out;
    });

    if(size_t todo{dst.size()&3})
    {
        auto pos = size_t{vgetq_lane_u32(pos4, 0)};
        frac = vgetq_lane_u32(frac4, 0);

        const auto out = dst.last(todo);
        std::generate(out.begin(), out.end(), [&pos,&frac,src,increment]
        {
            const float output{lerpf(src[pos+0], src[pos+1],
                static_cast<float>(frac) * (1.0f/MixerFracOne))};

            frac += increment;
            pos += frac>>MixerFracBits;
            frac &= MixerFracMask;
            return output;
        });
    }
}
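
/* Scalar form of one output sample of the vectorized lerp loop above; this
 * is the same math the leftover path applies via lerpf (illustrative):
 *
 *   const float mu{static_cast<float>(frac) * (1.0f/MixerFracOne)};
 *   output = src[pos] + (src[pos+1]-src[pos])*mu;
 *   frac += increment;
 *   pos += frac>>MixerFracBits;
 *   frac &= MixerFracMask;
 */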

template<>
void Resample_<CubicTag,NEONTag>(const InterpState *state, const al::span<const float> src,
    uint frac, const uint increment, const al::span<float> dst)
{
    ASSUME(frac < MixerFracOne);

    const auto filter = std::get<CubicState>(*state).filter;

    const uint32x4_t increment4{vdupq_n_u32(increment*4u)};
    const uint32x4_t fracMask4{vdupq_n_u32(MixerFracMask)};
    const float32x4_t fracDiffOne4{vdupq_n_f32(1.0f/CubicPhaseDiffOne)};
    const uint32x4_t fracDiffMask4{vdupq_n_u32(CubicPhaseDiffMask)};

    alignas(16) std::array<uint,4> pos_{}, frac_{};
    InitPosArrays(MaxResamplerEdge-1, frac, increment, al::span{frac_}, al::span{pos_});
    uint32x4_t frac4{vld1q_u32(frac_.data())};
    uint32x4_t pos4{vld1q_u32(pos_.data())};

    auto vecout = al::span{reinterpret_cast<float32x4_t*>(dst.data()), dst.size()/4};
    std::generate(vecout.begin(), vecout.end(), [=,&pos4,&frac4]
    {
        const uint pos0{vgetq_lane_u32(pos4, 0)};
        const uint pos1{vgetq_lane_u32(pos4, 1)};
        const uint pos2{vgetq_lane_u32(pos4, 2)};
        const uint pos3{vgetq_lane_u32(pos4, 3)};
        ASSUME(pos0 <= pos1); ASSUME(pos1 <= pos2); ASSUME(pos2 <= pos3);
        const float32x4_t val0{vld1q_f32(&src[pos0])};
        const float32x4_t val1{vld1q_f32(&src[pos1])};
        const float32x4_t val2{vld1q_f32(&src[pos2])};
        const float32x4_t val3{vld1q_f32(&src[pos3])};

        const uint32x4_t pi4{vshrq_n_u32(frac4, CubicPhaseDiffBits)};
        const uint pi0{vgetq_lane_u32(pi4, 0)}; ASSUME(pi0 < CubicPhaseCount);
        const uint pi1{vgetq_lane_u32(pi4, 1)}; ASSUME(pi1 < CubicPhaseCount);
        const uint pi2{vgetq_lane_u32(pi4, 2)}; ASSUME(pi2 < CubicPhaseCount);
        const uint pi3{vgetq_lane_u32(pi4, 3)}; ASSUME(pi3 < CubicPhaseCount);

        const float32x4_t pf4{vmulq_f32(vcvtq_f32_u32(vandq_u32(frac4, fracDiffMask4)),
            fracDiffOne4)};

        float32x4_t r0{vmulq_f32(val0,
            vmlaq_f32(vld1q_f32(filter[pi0].mCoeffs.data()), vdupq_lane_f32(vget_low_f32(pf4), 0),
                vld1q_f32(filter[pi0].mDeltas.data())))};
        float32x4_t r1{vmulq_f32(val1,
            vmlaq_f32(vld1q_f32(filter[pi1].mCoeffs.data()), vdupq_lane_f32(vget_low_f32(pf4), 1),
                vld1q_f32(filter[pi1].mDeltas.data())))};
        float32x4_t r2{vmulq_f32(val2,
            vmlaq_f32(vld1q_f32(filter[pi2].mCoeffs.data()), vdupq_lane_f32(vget_high_f32(pf4), 0),
                vld1q_f32(filter[pi2].mDeltas.data())))};
        float32x4_t r3{vmulq_f32(val3,
            vmlaq_f32(vld1q_f32(filter[pi3].mCoeffs.data()), vdupq_lane_f32(vget_high_f32(pf4), 1),
                vld1q_f32(filter[pi3].mDeltas.data())))};

        vtranspose4(r0, r1, r2, r3);
        r0 = vaddq_f32(vaddq_f32(r0, r1), vaddq_f32(r2, r3));

        frac4 = vaddq_u32(frac4, increment4);
        pos4 = vaddq_u32(pos4, vshrq_n_u32(frac4, MixerFracBits));
        frac4 = vandq_u32(frac4, fracMask4);
        return r0;
    });

    if(const size_t todo{dst.size()&3})
    {
        auto pos = size_t{vgetq_lane_u32(pos4, 0)};
        frac = vgetq_lane_u32(frac4, 0);

        auto out = dst.last(todo);
        std::generate(out.begin(), out.end(), [&pos,&frac,src,increment,filter]
        {
            const uint pi{frac >> CubicPhaseDiffBits}; ASSUME(pi < CubicPhaseCount);
            const float pf{static_cast<float>(frac&CubicPhaseDiffMask) * (1.0f/CubicPhaseDiffOne)};
            const float32x4_t pf4{vdupq_n_f32(pf)};

            const float32x4_t f4{vmlaq_f32(vld1q_f32(filter[pi].mCoeffs.data()), pf4,
                vld1q_f32(filter[pi].mDeltas.data()))};
            float32x4_t r4{vmulq_f32(f4, vld1q_f32(&src[pos]))};

            r4 = vaddq_f32(r4, vrev64q_f32(r4));
            const float output{vget_lane_f32(vadd_f32(vget_low_f32(r4), vget_high_f32(r4)), 0)};

            frac += increment;
            pos += frac>>MixerFracBits;
            frac &= MixerFracMask;
            return output;
        });
    }
}
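
/* Illustrative scalar sketch of one cubic output sample: the 4-tap filter
 * for the selected phase is itself linearly interpolated between phase
 * tables (mCoeffs plus pf times mDeltas) before being applied:
 *
 *   const auto &f = filter[pi];
 *   float out{0.0f};
 *   for(size_t t{0};t < 4;++t)
 *       out += (f.mCoeffs[t] + pf*f.mDeltas[t]) * src[pos+t];
 */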

template<>
void Resample_<BSincTag,NEONTag>(const InterpState *state, const al::span<const float> src,
    uint frac, const uint increment, const al::span<float> dst)
{
    const auto &bsinc = std::get<BsincState>(*state);
    const auto sf4 = vdupq_n_f32(bsinc.sf);
    const auto m = size_t{bsinc.m};
    ASSUME(m > 0);
    ASSUME(m <= MaxResamplerPadding);
    ASSUME(frac < MixerFracOne);

    const auto filter = bsinc.filter.first(4_uz*BSincPhaseCount*m);

    ASSUME(bsinc.l <= MaxResamplerEdge);
    auto pos = size_t{MaxResamplerEdge-bsinc.l};
    std::generate(dst.begin(), dst.end(), [&pos,&frac,src,increment,sf4,m,filter]() -> float
    {
        // Calculate the phase index and factor.
        const uint pi{frac >> BSincPhaseDiffBits}; ASSUME(pi < BSincPhaseCount);
        const float pf{static_cast<float>(frac&BSincPhaseDiffMask) * (1.0f/BSincPhaseDiffOne)};

        // Apply the scale and phase interpolated filter.
        float32x4_t r4{vdupq_n_f32(0.0f)};
        {
            const float32x4_t pf4{vdupq_n_f32(pf)};
            const auto fil = filter.subspan(2_uz*pi*m);
            const auto phd = fil.subspan(m);
            const auto scd = fil.subspan(2_uz*BSincPhaseCount*m);
            const auto spd = scd.subspan(m);
            size_t td{m >> 2};
            size_t j{0u};

            do {
                /* f = ((fil + sf*scd) + pf*(phd + sf*spd)) */
                const float32x4_t f4 = vmlaq_f32(
                    vmlaq_f32(vld1q_f32(&fil[j]), sf4, vld1q_f32(&scd[j])),
                    pf4, vmlaq_f32(vld1q_f32(&phd[j]), sf4, vld1q_f32(&spd[j])));
                /* r += f*src */
                r4 = vmlaq_f32(r4, f4, vld1q_f32(&src[pos+j]));
                j += 4;
            } while(--td);
        }
        r4 = vaddq_f32(r4, vrev64q_f32(r4));
        const float output{vget_lane_f32(vadd_f32(vget_low_f32(r4), vget_high_f32(r4)), 0)};

        frac += increment;
        pos += frac>>MixerFracBits;
        frac &= MixerFracMask;
        return output;
    });
}
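
/* Filter-table layout assumed by the loop above (an illustrative reading of
 * the subspan offsets, relative to the start of the filter table):
 *
 *   fil = filter + 2*pi*m            // main coefficients for phase pi
 *   phd = fil + m                    // phase-delta coefficients
 *   scd = fil + 2*BSincPhaseCount*m  // band-scale coefficients
 *   spd = scd + m                    // scale-phase-delta coefficients
 *
 * giving the per-tap filter f = (fil + sf*scd) + pf*(phd + sf*spd), i.e. a
 * bilinear interpolation over phase (pf) and band scale (sf).
 */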

template<>
void Resample_<FastBSincTag,NEONTag>(const InterpState *state, const al::span<const float> src,
    uint frac, const uint increment, const al::span<float> dst)
{
    const auto &bsinc = std::get<BsincState>(*state);
    const auto m = size_t{bsinc.m};
    ASSUME(m > 0);
    ASSUME(m <= MaxResamplerPadding);
    ASSUME(frac < MixerFracOne);

    const auto filter = bsinc.filter.first(2_uz*BSincPhaseCount*m);

    ASSUME(bsinc.l <= MaxResamplerEdge);
    auto pos = size_t{MaxResamplerEdge-bsinc.l};
    std::generate(dst.begin(), dst.end(), [&pos,&frac,src,increment,m,filter]() -> float
    {
        // Calculate the phase index and factor.
        const uint pi{frac >> BSincPhaseDiffBits}; ASSUME(pi < BSincPhaseCount);
        const float pf{static_cast<float>(frac&BSincPhaseDiffMask) * (1.0f/BSincPhaseDiffOne)};

        // Apply the phase interpolated filter.
        float32x4_t r4{vdupq_n_f32(0.0f)};
        {
            const float32x4_t pf4{vdupq_n_f32(pf)};
            const auto fil = filter.subspan(2_uz*pi*m);
            const auto phd = fil.subspan(m);
            size_t td{m >> 2};
            size_t j{0u};

            do {
                /* f = fil + pf*phd */
                const float32x4_t f4 = vmlaq_f32(vld1q_f32(&fil[j]), pf4, vld1q_f32(&phd[j]));
                /* r += f*src */
                r4 = vmlaq_f32(r4, f4, vld1q_f32(&src[pos+j]));
                j += 4;
            } while(--td);
        }
        r4 = vaddq_f32(r4, vrev64q_f32(r4));
        const float output{vget_lane_f32(vadd_f32(vget_low_f32(r4), vget_high_f32(r4)), 0)};

        frac += increment;
        pos += frac>>MixerFracBits;
        frac &= MixerFracMask;
        return output;
    });
}
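
/* This fast variant mirrors Resample_<BSincTag> above, but works from a
 * smaller table (note filter.first(2_uz*...) rather than 4_uz*...), with no
 * band-scale rows; the sf4 term and the scd/spd spans drop out, reducing
 * each tap to f = fil + pf*phd.
 */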

template<>
void MixHrtf_<NEONTag>(const al::span<const float> InSamples, const al::span<float2> AccumSamples,
    const uint IrSize, const MixHrtfFilter *hrtfparams, const size_t SamplesToDo)
{ MixHrtfBase<ApplyCoeffs>(InSamples, AccumSamples, IrSize, hrtfparams, SamplesToDo); }

template<>
void MixHrtfBlend_<NEONTag>(const al::span<const float> InSamples,
    const al::span<float2> AccumSamples, const uint IrSize, const HrtfFilter *oldparams,
    const MixHrtfFilter *newparams, const size_t SamplesToDo)
{
    MixHrtfBlendBase<ApplyCoeffs>(InSamples, AccumSamples, IrSize, oldparams, newparams,
        SamplesToDo);
}

template<>
void MixDirectHrtf_<NEONTag>(const FloatBufferSpan LeftOut, const FloatBufferSpan RightOut,
    const al::span<const FloatBufferLine> InSamples, const al::span<float2> AccumSamples,
    const al::span<float,BufferLineSize> TempBuf, const al::span<HrtfChannelState> ChanState,
    const size_t IrSize, const size_t SamplesToDo)
{
    MixDirectHrtfBase<ApplyCoeffs>(LeftOut, RightOut, InSamples, AccumSamples, TempBuf, ChanState,
        IrSize, SamplesToDo);
}

template<>
void Mix_<NEONTag>(const al::span<const float> InSamples, const al::span<FloatBufferLine> OutBuffer,
    const al::span<float> CurrentGains, const al::span<const float> TargetGains,
    const size_t Counter, const size_t OutPos)
{
    if((OutPos&3) != 0) UNLIKELY
        return Mix_<CTag>(InSamples, OutBuffer, CurrentGains, TargetGains, Counter, OutPos);

    const float delta{(Counter > 0) ? 1.0f / static_cast<float>(Counter) : 0.0f};
    const auto fade_len = std::min(Counter, InSamples.size());
    const auto realign_len = std::min((fade_len+3_uz) & ~3_uz, InSamples.size()) - fade_len;

    auto curgains = CurrentGains.begin();
    auto targetgains = TargetGains.cbegin();
    for(FloatBufferLine &output : OutBuffer)
        MixLine(InSamples, al::span{output}.subspan(OutPos), *curgains++, *targetgains++, delta,
            fade_len, realign_len, Counter);
}
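
/* Illustrative note: MixLine reinterprets its spans as float32x4_t, so this
 * wrapper only takes the NEON path when OutPos is a multiple of 4 (assuming
 * FloatBufferLine storage itself is suitably aligned for 16-byte vector
 * access); any other offset falls back to the scalar CTag mixer, just as the
 * single-span overload below does for unaligned output pointers.
 */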

template<>
void Mix_<NEONTag>(const al::span<const float> InSamples, const al::span<float> OutBuffer,
    float &CurrentGain, const float TargetGain, const size_t Counter)
{
    if((reinterpret_cast<uintptr_t>(OutBuffer.data())&15) != 0) UNLIKELY
        return Mix_<CTag>(InSamples, OutBuffer, CurrentGain, TargetGain, Counter);

    const float delta{(Counter > 0) ? 1.0f / static_cast<float>(Counter) : 0.0f};
    const auto fade_len = std::min(Counter, InSamples.size());
    const auto realign_len = std::min((fade_len+3_uz) & ~3_uz, InSamples.size()) - fade_len;

    MixLine(InSamples, OutBuffer, CurrentGain, TargetGain, delta, fade_len, realign_len, Counter);
}