#include "config.h"

#include <arm_neon.h>

#include <algorithm>
#include <array>
#include <cmath>
#include <cstddef>
#include <limits>
#include <variant>

#include "alnumeric.h"
#include "alspan.h"
#include "core/bsinc_defs.h"
#include "core/bufferline.h"
#include "core/cubic_defs.h"
#include "core/mixer/hrtfdefs.h"
#include "core/resampler_limits.h"
#include "defs.h"
#include "hrtfbase.h"
#include "opthelpers.h"

struct CTag;
struct NEONTag;
struct LerpTag;
struct CubicTag;
struct BSincTag;
struct FastBSincTag;

#if defined(__GNUC__) && !defined(__clang__) && !defined(__ARM_NEON)
#pragma GCC target("fpu=neon")
#endif
using uint = unsigned int;

namespace {
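/* Fractional sample offsets carry MixerFracBits of fixed-point precision.
 * For the bsinc and cubic resamplers, the top phase bits of an offset select
 * the filter phase table, and the remaining low bits become the factor used
 * to interpolate between adjacent phases.
 */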
constexpr uint BSincPhaseDiffBits{MixerFracBits - BSincPhaseBits};
constexpr uint BSincPhaseDiffOne{1 << BSincPhaseDiffBits};
constexpr uint BSincPhaseDiffMask{BSincPhaseDiffOne - 1u};

constexpr uint CubicPhaseDiffBits{MixerFracBits - CubicPhaseBits};
constexpr uint CubicPhaseDiffOne{1 << CubicPhaseDiffBits};
constexpr uint CubicPhaseDiffMask{CubicPhaseDiffOne - 1u};
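/* Transposes a 4x4 matrix of floats held in four NEON registers, using two
 * rounds of vzipq_f32 interleaves.
 */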
void vtranspose4(float32x4_t &x0, float32x4_t &x1, float32x4_t &x2, float32x4_t &x3) noexcept
{
    float32x4x2_t t0_{vzipq_f32(x0, x2)};
    float32x4x2_t t1_{vzipq_f32(x1, x3)};
    float32x4x2_t u0_{vzipq_f32(t0_.val[0], t1_.val[0])};
    float32x4x2_t u1_{vzipq_f32(t0_.val[1], t1_.val[1])};
    x0 = u0_.val[0];
    x1 = u0_.val[1];
    x2 = u1_.val[0];
    x3 = u1_.val[1];
}
inline float32x4_t set_f4(float l0, float l1, float l2, float l3)
{
    float32x4_t ret{vmovq_n_f32(l0)};
    ret = vsetq_lane_f32(l1, ret, 1);
    ret = vsetq_lane_f32(l2, ret, 2);
    ret = vsetq_lane_f32(l3, ret, 3);
    return ret;
}
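/* Accumulates the HRIR coefficients, scaled by the left and right input
 * samples, into the accumulation buffer. Each NEON op covers two stereo
 * sample pairs.
 */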
inline void ApplyCoeffs(const al::span<float2> Values, const size_t IrSize,
    const ConstHrirSpan Coeffs, const float left, const float right)
{
    ASSUME(IrSize >= MinIrLength);
    ASSUME(IrSize <= HrirLength);

    auto dup_samples = [left,right]() -> float32x4_t
    {
        float32x2_t leftright2{vset_lane_f32(right, vmov_n_f32(left), 1)};
        return vcombine_f32(leftright2, leftright2);
    };
    const auto leftright4 = dup_samples();

    /* Using a loop here instead of std::transform since some builds seem to
     * have an issue with accessing an array/span of float32x4_t.
     */
    for(size_t c{0};c < IrSize;c += 2)
    {
        auto vals = vld1q_f32(&Values[c][0]);
        vals = vmlaq_f32(vals, vld1q_f32(&Coeffs[c][0]), leftright4);
        vst1q_f32(&Values[c][0], vals);
    }
}
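/* Mixes one input line into one output line, stepping the gain from
 * CurrentGain toward TargetGain over the first Counter samples, then applying
 * TargetGain to the remainder. fade_len and realign_len are precomputed by
 * the callers so the steady-state portion starts on a 4-sample boundary.
 */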
force_inline void MixLine(const al::span<const float> InSamples, const al::span<float> dst,
    float &CurrentGain, const float TargetGain, const float delta, const size_t fade_len,
    const size_t realign_len, size_t Counter)
{
    const auto step = float{(TargetGain-CurrentGain) * delta};

    size_t pos{0};
    if(std::abs(step) > std::numeric_limits<float>::epsilon())
    {
        const auto gain = float{CurrentGain};
        auto step_count = float{0.0f};
        /* Mix with applying gain steps in aligned multiples of 4. */
        if(const size_t todo{fade_len >> 2})
        {
            const auto four4 = vdupq_n_f32(4.0f);
            const auto step4 = vdupq_n_f32(step);
            const auto gain4 = vdupq_n_f32(gain);
            auto step_count4 = set_f4(0.0f, 1.0f, 2.0f, 3.0f);

            const auto in4 = al::span{reinterpret_cast<const float32x4_t*>(InSamples.data()),
                InSamples.size()/4}.first(todo);
            const auto out4 = al::span{reinterpret_cast<float32x4_t*>(dst.data()), dst.size()/4};
            std::transform(in4.begin(), in4.end(), out4.begin(), out4.begin(),
                [gain4,step4,four4,&step_count4](const float32x4_t val4, float32x4_t dry4)
                {
                    /* dry += val * (gain + step*step_count) */
                    dry4 = vmlaq_f32(dry4, val4, vmlaq_f32(gain4, step4, step_count4));
                    step_count4 = vaddq_f32(step_count4, four4);
                    return dry4;
                });
            pos += in4.size()*4;

            /* NOTE: step_count4 now represents the next four counts after the
             * last four mixed samples, so the lowest element represents the
             * next step count to apply.
             */
            step_count = vgetq_lane_f32(step_count4, 0);
        }
        /* Mix with applying left over gain steps that aren't aligned multiples of 4. */
        if(const size_t leftover{fade_len&3})
        {
            const auto in = InSamples.subspan(pos, leftover);
            const auto out = dst.subspan(pos);

            std::transform(in.begin(), in.end(), out.begin(), out.begin(),
                [gain,step,&step_count](const float val, float dry) noexcept -> float
                {
                    dry += val * (gain + step*step_count);
                    step_count += 1.0f;
                    return dry;
                });
            pos += leftover;
        }
        if(pos < Counter)
        {
            CurrentGain = gain + step*step_count;
            return;
        }

        /* Mix until pos is aligned with 4 or the mix is done. */
        if(const size_t leftover{realign_len&3})
        {
            const auto in = InSamples.subspan(pos, leftover);
            const auto out = dst.subspan(pos);

            std::transform(in.begin(), in.end(), out.begin(), out.begin(),
                [TargetGain](const float val, const float dry) noexcept -> float
                { return dry + val*TargetGain; });
            pos += leftover;
        }
    }
    CurrentGain = TargetGain;

    if(!(std::abs(TargetGain) > GainSilenceThreshold))
        return;
    if(const size_t todo{(InSamples.size()-pos) >> 2})
    {
        const auto in4 = al::span{reinterpret_cast<const float32x4_t*>(InSamples.data()),
            InSamples.size()/4}.last(todo);
        const auto out = dst.subspan(pos);
        const auto out4 = al::span{reinterpret_cast<float32x4_t*>(out.data()), out.size()/4};

        const auto gain4 = vdupq_n_f32(TargetGain);
        std::transform(in4.begin(), in4.end(), out4.begin(), out4.begin(),
            [gain4](const float32x4_t val4, const float32x4_t dry4) -> float32x4_t
            { return vmlaq_f32(dry4, val4, gain4); });
        pos += in4.size()*4;
    }
    if(const size_t leftover{(InSamples.size()-pos)&3})
    {
        const auto in = InSamples.last(leftover);
        const auto out = dst.subspan(pos);

        std::transform(in.begin(), in.end(), out.begin(), out.begin(),
            [TargetGain](const float val, const float dry) noexcept -> float
            { return dry + val*TargetGain; });
    }
}

} // namespace
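/* Linear-interpolation resampler. Four output samples are generated per
 * iteration by keeping four positions and fractional offsets in NEON
 * registers; any remaining 1-3 samples are finished with scalar lerpf.
 */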
template<>
void Resample_<LerpTag,NEONTag>(const InterpState*, const al::span<const float> src, uint frac,
    const uint increment, const al::span<float> dst)
{
    ASSUME(frac < MixerFracOne);

    const uint32x4_t increment4 = vdupq_n_u32(increment*4u);
    const float32x4_t fracOne4 = vdupq_n_f32(1.0f/MixerFracOne);
    const uint32x4_t fracMask4 = vdupq_n_u32(MixerFracMask);

    alignas(16) std::array<uint,4> pos_{}, frac_{};
    InitPosArrays(MaxResamplerEdge, frac, increment, al::span{frac_}, al::span{pos_});
    uint32x4_t frac4 = vld1q_u32(frac_.data());
    uint32x4_t pos4 = vld1q_u32(pos_.data());

    auto vecout = al::span{reinterpret_cast<float32x4_t*>(dst.data()), dst.size()/4};
    std::generate(vecout.begin(), vecout.end(), [=,&pos4,&frac4]() -> float32x4_t
    {
        const uint pos0{vgetq_lane_u32(pos4, 0)};
        const uint pos1{vgetq_lane_u32(pos4, 1)};
        const uint pos2{vgetq_lane_u32(pos4, 2)};
        const uint pos3{vgetq_lane_u32(pos4, 3)};
        ASSUME(pos0 <= pos1); ASSUME(pos1 <= pos2); ASSUME(pos2 <= pos3);
        const float32x4_t val1{set_f4(src[pos0], src[pos1], src[pos2], src[pos3])};
        const float32x4_t val2{set_f4(src[pos0+1_uz], src[pos1+1_uz], src[pos2+1_uz],
            src[pos3+1_uz])};

        /* val1 + (val2-val1)*mu */
        const float32x4_t r0{vsubq_f32(val2, val1)};
        const float32x4_t mu{vmulq_f32(vcvtq_f32_u32(frac4), fracOne4)};
        const float32x4_t out{vmlaq_f32(val1, mu, r0)};

        frac4 = vaddq_u32(frac4, increment4);
        pos4 = vaddq_u32(pos4, vshrq_n_u32(frac4, MixerFracBits));
        frac4 = vandq_u32(frac4, fracMask4);
        return out;
    });

    if(size_t todo{dst.size()&3})
    {
        auto pos = size_t{vgetq_lane_u32(pos4, 0)};
        frac = vgetq_lane_u32(frac4, 0);

        const auto out = dst.last(todo);
        std::generate(out.begin(), out.end(), [&pos,&frac,src,increment]
        {
            const float output{lerpf(src[pos+0], src[pos+1],
                static_cast<float>(frac) * (1.0f/MixerFracOne))};

            frac += increment;
            pos += frac>>MixerFracBits;
            frac &= MixerFracMask;
            return output;
        });
    }
}
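/* Cubic resampler. Each output sample applies a 4-tap filter whose
 * coefficients are phase-interpolated from the CubicState table. The vector
 * loop computes four outputs at once and transposes the partial results to
 * sum the taps, with a scalar loop for the remainder.
 */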
template<>
void Resample_<CubicTag,NEONTag>(const InterpState *state, const al::span<const float> src,
    uint frac, const uint increment, const al::span<float> dst)
{
    ASSUME(frac < MixerFracOne);

    const auto filter = std::get<CubicState>(*state).filter;

    const uint32x4_t increment4{vdupq_n_u32(increment*4u)};
    const uint32x4_t fracMask4{vdupq_n_u32(MixerFracMask)};
    const float32x4_t fracDiffOne4{vdupq_n_f32(1.0f/CubicPhaseDiffOne)};
    const uint32x4_t fracDiffMask4{vdupq_n_u32(CubicPhaseDiffMask)};

    alignas(16) std::array<uint,4> pos_{}, frac_{};
    InitPosArrays(MaxResamplerEdge-1, frac, increment, al::span{frac_}, al::span{pos_});
    uint32x4_t frac4{vld1q_u32(frac_.data())};
    uint32x4_t pos4{vld1q_u32(pos_.data())};

    auto vecout = al::span{reinterpret_cast<float32x4_t*>(dst.data()), dst.size()/4};
    std::generate(vecout.begin(), vecout.end(), [=,&pos4,&frac4]
    {
        const uint pos0{vgetq_lane_u32(pos4, 0)};
        const uint pos1{vgetq_lane_u32(pos4, 1)};
        const uint pos2{vgetq_lane_u32(pos4, 2)};
        const uint pos3{vgetq_lane_u32(pos4, 3)};
        ASSUME(pos0 <= pos1); ASSUME(pos1 <= pos2); ASSUME(pos2 <= pos3);
        const float32x4_t val0{vld1q_f32(&src[pos0])};
        const float32x4_t val1{vld1q_f32(&src[pos1])};
        const float32x4_t val2{vld1q_f32(&src[pos2])};
        const float32x4_t val3{vld1q_f32(&src[pos3])};

        const uint32x4_t pi4{vshrq_n_u32(frac4, CubicPhaseDiffBits)};
        const uint pi0{vgetq_lane_u32(pi4, 0)}; ASSUME(pi0 < CubicPhaseCount);
        const uint pi1{vgetq_lane_u32(pi4, 1)}; ASSUME(pi1 < CubicPhaseCount);
        const uint pi2{vgetq_lane_u32(pi4, 2)}; ASSUME(pi2 < CubicPhaseCount);
        const uint pi3{vgetq_lane_u32(pi4, 3)}; ASSUME(pi3 < CubicPhaseCount);

        const float32x4_t pf4{vmulq_f32(vcvtq_f32_u32(vandq_u32(frac4, fracDiffMask4)),
            fracDiffOne4)};

        float32x4_t r0{vmulq_f32(val0,
            vmlaq_f32(vld1q_f32(filter[pi0].mCoeffs.data()), vdupq_lane_f32(vget_low_f32(pf4), 0),
                vld1q_f32(filter[pi0].mDeltas.data())))};
        float32x4_t r1{vmulq_f32(val1,
            vmlaq_f32(vld1q_f32(filter[pi1].mCoeffs.data()), vdupq_lane_f32(vget_low_f32(pf4), 1),
                vld1q_f32(filter[pi1].mDeltas.data())))};
        float32x4_t r2{vmulq_f32(val2,
            vmlaq_f32(vld1q_f32(filter[pi2].mCoeffs.data()), vdupq_lane_f32(vget_high_f32(pf4), 0),
                vld1q_f32(filter[pi2].mDeltas.data())))};
        float32x4_t r3{vmulq_f32(val3,
            vmlaq_f32(vld1q_f32(filter[pi3].mCoeffs.data()), vdupq_lane_f32(vget_high_f32(pf4), 1),
                vld1q_f32(filter[pi3].mDeltas.data())))};

        vtranspose4(r0, r1, r2, r3);
        r0 = vaddq_f32(vaddq_f32(r0, r1), vaddq_f32(r2, r3));

        frac4 = vaddq_u32(frac4, increment4);
        pos4 = vaddq_u32(pos4, vshrq_n_u32(frac4, MixerFracBits));
        frac4 = vandq_u32(frac4, fracMask4);
        return r0;
    });

    if(const size_t todo{dst.size()&3})
    {
        auto pos = size_t{vgetq_lane_u32(pos4, 0)};
        frac = vgetq_lane_u32(frac4, 0);

        auto out = dst.last(todo);
        std::generate(out.begin(), out.end(), [&pos,&frac,src,increment,filter]
        {
            const uint pi{frac >> CubicPhaseDiffBits}; ASSUME(pi < CubicPhaseCount);
            const float pf{static_cast<float>(frac&CubicPhaseDiffMask) * (1.0f/CubicPhaseDiffOne)};
            const float32x4_t pf4{vdupq_n_f32(pf)};

            const float32x4_t f4{vmlaq_f32(vld1q_f32(filter[pi].mCoeffs.data()), pf4,
                vld1q_f32(filter[pi].mDeltas.data()))};
            float32x4_t r4{vmulq_f32(f4, vld1q_f32(&src[pos]))};

            r4 = vaddq_f32(r4, vrev64q_f32(r4));
            const float output{vget_lane_f32(vadd_f32(vget_low_f32(r4), vget_high_f32(r4)), 0)};

            frac += increment;
            pos += frac>>MixerFracBits;
            frac &= MixerFracMask;
            return output;
        });
    }
}
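/* Band-limited sinc resampler. The filter span holds four tables per phase
 * (coefficients, phase deltas, scale deltas, scale-phase deltas), combined
 * per output sample as f = (fil + sf*scd) + pf*(phd + sf*spd) and applied
 * over m taps, four at a time.
 */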
template<>
void Resample_<BSincTag,NEONTag>(const InterpState *state, const al::span<const float> src,
    uint frac, const uint increment, const al::span<float> dst)
{
    const auto &bsinc = std::get<BsincState>(*state);
    const auto sf4 = vdupq_n_f32(bsinc.sf);
    const auto m = size_t{bsinc.m};

    ASSUME(m <= MaxResamplerPadding);
    ASSUME(frac < MixerFracOne);

    const auto filter = bsinc.filter.first(4_uz*BSincPhaseCount*m);

    ASSUME(bsinc.l <= MaxResamplerEdge);
    auto pos = size_t{MaxResamplerEdge-bsinc.l};
    std::generate(dst.begin(), dst.end(), [&pos,&frac,src,increment,sf4,m,filter]() -> float
    {
        // Calculate the phase index and factor.
        const uint pi{frac >> BSincPhaseDiffBits}; ASSUME(pi < BSincPhaseCount);
        const float pf{static_cast<float>(frac&BSincPhaseDiffMask) * (1.0f/BSincPhaseDiffOne)};

        // Apply the scale and phase interpolated filter.
        float32x4_t r4{vdupq_n_f32(0.0f)};
        {
            const float32x4_t pf4{vdupq_n_f32(pf)};
            const auto fil = filter.subspan(2_uz*pi*m);
            const auto phd = fil.subspan(m);
            const auto scd = fil.subspan(2_uz*BSincPhaseCount*m);
            const auto spd = scd.subspan(m);
            size_t td{m >> 2};
            size_t j{0u};

            do {
                /* f = ((fil + sf*scd) + pf*(phd + sf*spd)) */
                const float32x4_t f4 = vmlaq_f32(
                    vmlaq_f32(vld1q_f32(&fil[j]), sf4, vld1q_f32(&scd[j])),
                    pf4, vmlaq_f32(vld1q_f32(&phd[j]), sf4, vld1q_f32(&spd[j])));
                /* r += f*src */
                r4 = vmlaq_f32(r4, f4, vld1q_f32(&src[pos+j]));
                j += 4;
            } while(--td);
        }
        r4 = vaddq_f32(r4, vrev64q_f32(r4));
        const float output{vget_lane_f32(vadd_f32(vget_low_f32(r4), vget_high_f32(r4)), 0)};

        frac += increment;
        pos += frac>>MixerFracBits;
        frac &= MixerFracMask;
        return output;
    });
}
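/* Fast bsinc resampler. Same structure as the full bsinc path, but the filter
 * has no scale deltas to interpolate, so only the coefficient and phase-delta
 * tables are used: f = fil + pf*phd.
 */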
template<>
void Resample_<FastBSincTag,NEONTag>(const InterpState *state, const al::span<const float> src,
    uint frac, const uint increment, const al::span<float> dst)
{
    const auto &bsinc = std::get<BsincState>(*state);
    const auto m = size_t{bsinc.m};

    ASSUME(m <= MaxResamplerPadding);
    ASSUME(frac < MixerFracOne);

    const auto filter = bsinc.filter.first(2_uz*BSincPhaseCount*m);

    ASSUME(bsinc.l <= MaxResamplerEdge);
    auto pos = size_t{MaxResamplerEdge-bsinc.l};
    std::generate(dst.begin(), dst.end(), [&pos,&frac,src,increment,m,filter]() -> float
    {
        // Calculate the phase index and factor.
        const uint pi{frac >> BSincPhaseDiffBits}; ASSUME(pi < BSincPhaseCount);
        const float pf{static_cast<float>(frac&BSincPhaseDiffMask) * (1.0f/BSincPhaseDiffOne)};

        // Apply the phase interpolated filter.
        float32x4_t r4{vdupq_n_f32(0.0f)};
        {
            const float32x4_t pf4{vdupq_n_f32(pf)};
            const auto fil = filter.subspan(2_uz*pi*m);
            const auto phd = fil.subspan(m);
            size_t td{m >> 2};
            size_t j{0u};

            do {
                /* f = fil + pf*phd */
                const float32x4_t f4 = vmlaq_f32(vld1q_f32(&fil[j]), pf4, vld1q_f32(&phd[j]));
                /* r += f*src */
                r4 = vmlaq_f32(r4, f4, vld1q_f32(&src[pos+j]));
                j += 4;
            } while(--td);
        }
        r4 = vaddq_f32(r4, vrev64q_f32(r4));
        const float output{vget_lane_f32(vadd_f32(vget_low_f32(r4), vget_high_f32(r4)), 0)};

        frac += increment;
        pos += frac>>MixerFracBits;
        frac &= MixerFracMask;
        return output;
    });
}
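/* HRTF mixers. These forward to the common base implementations,
 * parameterized with the NEON ApplyCoeffs above.
 */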
template<>
void MixHrtf_<NEONTag>(const al::span<const float> InSamples, const al::span<float2> AccumSamples,
    const uint IrSize, const MixHrtfFilter *hrtfparams, const size_t SamplesToDo)
{ MixHrtfBase<ApplyCoeffs>(InSamples, AccumSamples, IrSize, hrtfparams, SamplesToDo); }

template<>
void MixHrtfBlend_<NEONTag>(const al::span<const float> InSamples,
    const al::span<float2> AccumSamples, const uint IrSize, const HrtfFilter *oldparams,
    const MixHrtfFilter *newparams, const size_t SamplesToDo)
{
    MixHrtfBlendBase<ApplyCoeffs>(InSamples, AccumSamples, IrSize, oldparams, newparams,
        SamplesToDo);
}

template<>
void MixDirectHrtf_<NEONTag>(const FloatBufferSpan LeftOut, const FloatBufferSpan RightOut,
    const al::span<const FloatBufferLine> InSamples, const al::span<float2> AccumSamples,
    const al::span<float,BufferLineSize> TempBuf, const al::span<HrtfChannelState> ChanState,
    const size_t IrSize, const size_t SamplesToDo)
{
    MixDirectHrtfBase<ApplyCoeffs>(LeftOut, RightOut, InSamples, AccumSamples, TempBuf, ChanState,
        IrSize, SamplesToDo);
}
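/* Mixes the input into each output buffer line with its own gain fade.
 * Falls back to the C mixer when OutPos is not a multiple of 4, so the
 * vector path always starts on a 4-sample boundary.
 */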
template<>
void Mix_<NEONTag>(const al::span<const float> InSamples,const al::span<FloatBufferLine> OutBuffer,
    const al::span<float> CurrentGains, const al::span<const float> TargetGains,
    const size_t Counter, const size_t OutPos)
{
    if((OutPos&3) != 0) UNLIKELY
        return Mix_<CTag>(InSamples, OutBuffer, CurrentGains, TargetGains, Counter, OutPos);

    const float delta{(Counter > 0) ? 1.0f / static_cast<float>(Counter) : 0.0f};
    const auto fade_len = std::min(Counter, InSamples.size());
    const auto realign_len = std::min((fade_len+3_uz) & ~3_uz, InSamples.size()) - fade_len;

    auto curgains = CurrentGains.begin();
    auto targetgains = TargetGains.cbegin();
    for(FloatBufferLine &output : OutBuffer)
        MixLine(InSamples, al::span{output}.subspan(OutPos), *curgains++, *targetgains++, delta,
            fade_len, realign_len, Counter);
}
<NEONTag
>(const al::span
<const float> InSamples
, const al::span
<float> OutBuffer
,
484 float &CurrentGain
, const float TargetGain
, const size_t Counter
)
486 if((reinterpret_cast<uintptr_t>(OutBuffer
.data())&15) != 0) UNLIKELY
487 return Mix_
<CTag
>(InSamples
, OutBuffer
, CurrentGain
, TargetGain
, Counter
);
489 const float delta
{(Counter
> 0) ? 1.0f
/ static_cast<float>(Counter
) : 0.0f
};
490 const auto fade_len
= std::min(Counter
, InSamples
.size());
491 const auto realign_len
= std::min((fade_len
+3_uz
) & ~3_uz
, InSamples
.size()) - fade_len
;
493 MixLine(InSamples
, OutBuffer
, CurrentGain
, TargetGain
, delta
, fade_len
, realign_len
, Counter
);