[openal-soft.git] / core/mixer/mixer_neon.cpp
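/* NEON (ARM SIMD) specializations of the core mixer routines: HRTF
 * coefficient application, linear and band-limited sinc resampling, and
 * gain-stepped channel mixing.
 */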
#include "config.h"

#include <arm_neon.h>

#include <cmath>
#include <limits>

#include "alnumeric.h"
#include "core/bsinc_defs.h"
#include "defs.h"
#include "hrtfbase.h"

struct NEONTag;
struct LerpTag;
struct BSincTag;
struct FastBSincTag;

#if defined(__GNUC__) && !defined(__clang__) && !defined(__ARM_NEON)
#pragma GCC target("fpu=neon")
#endif

namespace {

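/* Builds a float32x4_t from four scalars. NEON has no intrinsic to load
 * four separate scalar values at once, so lane 0 is splatted and lanes 1-3
 * are set individually.
 */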
inline float32x4_t set_f4(float l0, float l1, float l2, float l3)
{
    float32x4_t ret{vmovq_n_f32(l0)};
    ret = vsetq_lane_f32(l1, ret, 1);
    ret = vsetq_lane_f32(l2, ret, 2);
    ret = vsetq_lane_f32(l3, ret, 3);
    return ret;
}

constexpr uint FracPhaseBitDiff{MixerFracBits - BSincPhaseBits};
constexpr uint FracPhaseDiffOne{1 << FracPhaseBitDiff};

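/* Accumulates one input sample's HRIR contribution into the stereo
 * accumulation buffer: Values[i] += Coeffs[i] * {left, right}. The
 * leftright4 vector holds {left, right, left, right}, so each 4-float NEON
 * op handles two coefficient pairs; the loop assumes an even IrSize.
 */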
inline void ApplyCoeffs(float2 *RESTRICT Values, const size_t IrSize, const ConstHrirSpan Coeffs,
    const float left, const float right)
{
    float32x4_t leftright4;
    {
        float32x2_t leftright2{vmov_n_f32(left)};
        leftright2 = vset_lane_f32(right, leftright2, 1);
        leftright4 = vcombine_f32(leftright2, leftright2);
    }

    ASSUME(IrSize >= MinIrLength);
    for(size_t c{0};c < IrSize;c += 2)
    {
        float32x4_t vals = vld1q_f32(&Values[c][0]);
        float32x4_t coefs = vld1q_f32(&Coeffs[c][0]);

        vals = vmlaq_f32(vals, coefs, leftright4);

        vst1q_f32(&Values[c][0], vals);
    }
}

} // namespace

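/* 4-wide linear-interpolation resampler. Sample positions are fixed point
 * (an integer index plus a MixerFracBits fraction); four consecutive output
 * positions are tracked in the lanes of pos4/frac4 and advanced together by
 * increment*4 each iteration.
 */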
template<>
float *Resample_<LerpTag,NEONTag>(const InterpState*, float *RESTRICT src, uint frac,
    uint increment, const al::span<float> dst)
{
    const int32x4_t increment4 = vdupq_n_s32(static_cast<int>(increment*4));
    const float32x4_t fracOne4 = vdupq_n_f32(1.0f/MixerFracOne);
    const int32x4_t fracMask4 = vdupq_n_s32(MixerFracMask);
    alignas(16) uint pos_[4], frac_[4];
    int32x4_t pos4, frac4;

    /* Compute the positions and fractions for the first four samples, then
     * load them into the vector lanes.
     */
    InitPosArrays(frac, increment, frac_, pos_);
    frac4 = vld1q_s32(reinterpret_cast<int*>(frac_));
    pos4 = vld1q_s32(reinterpret_cast<int*>(pos_));

    auto dst_iter = dst.begin();
    for(size_t todo{dst.size()>>2};todo;--todo)
    {
        const int pos0{vgetq_lane_s32(pos4, 0)};
        const int pos1{vgetq_lane_s32(pos4, 1)};
        const int pos2{vgetq_lane_s32(pos4, 2)};
        const int pos3{vgetq_lane_s32(pos4, 3)};
        const float32x4_t val1{set_f4(src[pos0], src[pos1], src[pos2], src[pos3])};
        const float32x4_t val2{set_f4(src[pos0+1], src[pos1+1], src[pos2+1], src[pos3+1])};

        /* val1 + (val2-val1)*mu */
        const float32x4_t r0{vsubq_f32(val2, val1)};
        const float32x4_t mu{vmulq_f32(vcvtq_f32_s32(frac4), fracOne4)};
        const float32x4_t out{vmlaq_f32(val1, mu, r0)};

        vst1q_f32(dst_iter, out);
        dst_iter += 4;

        frac4 = vaddq_s32(frac4, increment4);
        pos4 = vaddq_s32(pos4, vshrq_n_s32(frac4, MixerFracBits));
        frac4 = vandq_s32(frac4, fracMask4);
    }

    /* Handle any leftover samples (fewer than four) one at a time, resuming
     * from the position and fraction in lane 0.
     */
    if(size_t todo{dst.size()&3})
    {
        src += static_cast<uint>(vgetq_lane_s32(pos4, 0));
        frac = static_cast<uint>(vgetq_lane_s32(frac4, 0));

        do {
            *(dst_iter++) = lerpf(src[0], src[1], static_cast<float>(frac) * (1.0f/MixerFracOne));

            frac += increment;
            src += frac>>MixerFracBits;
            frac &= MixerFracMask;
        } while(--todo);
    }
    return dst.data();
}

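/* Band-limited sinc resampler. Each output sample applies an m-tap filter
 * whose coefficients are interpolated from the bsinc table: fil/phd point
 * at the base coefficients and phase deltas for the current phase index,
 * and scd/spd at the scale deltas used to adapt the filter to the scale
 * factor sf.
 */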
template<>
float *Resample_<BSincTag,NEONTag>(const InterpState *state, float *RESTRICT src, uint frac,
    uint increment, const al::span<float> dst)
{
    const float *const filter{state->bsinc.filter};
    const float32x4_t sf4{vdupq_n_f32(state->bsinc.sf)};
    const size_t m{state->bsinc.m};
    ASSUME(m > 0);

    src -= state->bsinc.l;
    for(float &out_sample : dst)
    {
        // Calculate the phase index and factor.
        const uint pi{frac >> FracPhaseBitDiff};
        const float pf{static_cast<float>(frac & (FracPhaseDiffOne-1)) * (1.0f/FracPhaseDiffOne)};

        // Apply the scale and phase interpolated filter.
        float32x4_t r4{vdupq_n_f32(0.0f)};
        {
            const float32x4_t pf4{vdupq_n_f32(pf)};
            const float *RESTRICT fil{filter + m*pi*2};
            const float *RESTRICT phd{fil + m};
            const float *RESTRICT scd{fil + BSincPhaseCount*2*m};
            const float *RESTRICT spd{scd + m};
            size_t td{m >> 2};
            size_t j{0u};

            do {
                /* f = ((fil + sf*scd) + pf*(phd + sf*spd)) */
                const float32x4_t f4 = vmlaq_f32(
                    vmlaq_f32(vld1q_f32(&fil[j]), sf4, vld1q_f32(&scd[j])),
                    pf4, vmlaq_f32(vld1q_f32(&phd[j]), sf4, vld1q_f32(&spd[j])));
                /* r += f*src */
                r4 = vmlaq_f32(r4, f4, vld1q_f32(&src[j]));
                j += 4;
            } while(--td);
        }
        /* Horizontal sum of r4's lanes: the vrev64q/vadd pair reduces
         * [a,b,c,d] to a+b+c+d in lane 0.
         */
        r4 = vaddq_f32(r4, vrev64q_f32(r4));
        out_sample = vget_lane_f32(vadd_f32(vget_low_f32(r4), vget_high_f32(r4)), 0);

        frac += increment;
        src += frac>>MixerFracBits;
        frac &= MixerFracMask;
    }
    return dst.data();
}

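/* Same as the above, minus the scale interpolation (no sf/scd/spd terms),
 * for when the filter's scale factor doesn't need to vary.
 */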
template<>
float *Resample_<FastBSincTag,NEONTag>(const InterpState *state, float *RESTRICT src, uint frac,
    uint increment, const al::span<float> dst)
{
    const float *const filter{state->bsinc.filter};
    const size_t m{state->bsinc.m};
    ASSUME(m > 0);

    src -= state->bsinc.l;
    for(float &out_sample : dst)
    {
        // Calculate the phase index and factor.
        const uint pi{frac >> FracPhaseBitDiff};
        const float pf{static_cast<float>(frac & (FracPhaseDiffOne-1)) * (1.0f/FracPhaseDiffOne)};

        // Apply the phase interpolated filter.
        float32x4_t r4{vdupq_n_f32(0.0f)};
        {
            const float32x4_t pf4{vdupq_n_f32(pf)};
            const float *RESTRICT fil{filter + m*pi*2};
            const float *RESTRICT phd{fil + m};
            size_t td{m >> 2};
            size_t j{0u};

            do {
                /* f = fil + pf*phd */
                const float32x4_t f4 = vmlaq_f32(vld1q_f32(&fil[j]), pf4, vld1q_f32(&phd[j]));
                /* r += f*src */
                r4 = vmlaq_f32(r4, f4, vld1q_f32(&src[j]));
                j += 4;
            } while(--td);
        }
        r4 = vaddq_f32(r4, vrev64q_f32(r4));
        out_sample = vget_lane_f32(vadd_f32(vget_low_f32(r4), vget_high_f32(r4)), 0);

        frac += increment;
        src += frac>>MixerFracBits;
        frac &= MixerFracMask;
    }
    return dst.data();
}

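/* The HRTF mixers are thin wrappers: the shared Mix*HrtfBase routines
 * handle the buffer walking and parameter stepping, instantiated here with
 * the NEON ApplyCoeffs for the per-sample coefficient accumulation.
 */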
template<>
void MixHrtf_<NEONTag>(const float *InSamples, float2 *AccumSamples, const uint IrSize,
    const MixHrtfFilter *hrtfparams, const size_t BufferSize)
{ MixHrtfBase<ApplyCoeffs>(InSamples, AccumSamples, IrSize, hrtfparams, BufferSize); }

template<>
void MixHrtfBlend_<NEONTag>(const float *InSamples, float2 *AccumSamples, const uint IrSize,
    const HrtfFilter *oldparams, const MixHrtfFilter *newparams, const size_t BufferSize)
{
    MixHrtfBlendBase<ApplyCoeffs>(InSamples, AccumSamples, IrSize, oldparams, newparams,
        BufferSize);
}

template<>
void MixDirectHrtf_<NEONTag>(const FloatBufferSpan LeftOut, const FloatBufferSpan RightOut,
    const al::span<const FloatBufferLine> InSamples, float2 *AccumSamples,
    float *TempBuf, HrtfChannelState *ChanState, const size_t IrSize, const size_t BufferSize)
{
    MixDirectHrtfBase<ApplyCoeffs>(LeftOut, RightOut, InSamples, AccumSamples, TempBuf, ChanState,
        IrSize, BufferSize);
}

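/* General mixer. For each output channel, the gain is ramped from
 * *CurrentGains toward *TargetGains over the first Counter samples (sample
 * i gets gain + step*i), then the remainder is mixed at the settled gain;
 * channels whose settled gain is below the silence threshold are skipped.
 */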
template<>
void Mix_<NEONTag>(const al::span<const float> InSamples, const al::span<FloatBufferLine> OutBuffer,
    float *CurrentGains, const float *TargetGains, const size_t Counter, const size_t OutPos)
{
    const float delta{(Counter > 0) ? 1.0f / static_cast<float>(Counter) : 0.0f};
    const auto min_len = minz(Counter, InSamples.size());
    const auto aligned_len = minz((min_len+3) & ~size_t{3}, InSamples.size()) - min_len;
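    /* min_len is the number of samples that get a stepped gain; aligned_len
     * is how many extra samples after that are needed to bring the write
     * position back to a multiple of 4 (clamped to the input size) before
     * the vectorized constant-gain loop runs.
     */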

    for(FloatBufferLine &output : OutBuffer)
    {
        float *RESTRICT dst{al::assume_aligned<16>(output.data()+OutPos)};
        float gain{*CurrentGains};
        const float step{(*TargetGains-gain) * delta};

        size_t pos{0};
        if(!(std::abs(step) > std::numeric_limits<float>::epsilon()))
            gain = *TargetGains;
        else
        {
            float step_count{0.0f};
            /* Mix with applying gain steps in aligned multiples of 4. */
            if(size_t todo{min_len >> 2})
            {
                const float32x4_t four4{vdupq_n_f32(4.0f)};
                const float32x4_t step4{vdupq_n_f32(step)};
                const float32x4_t gain4{vdupq_n_f32(gain)};
                float32x4_t step_count4{vdupq_n_f32(0.0f)};
                step_count4 = vsetq_lane_f32(1.0f, step_count4, 1);
                step_count4 = vsetq_lane_f32(2.0f, step_count4, 2);
                step_count4 = vsetq_lane_f32(3.0f, step_count4, 3);

                do {
                    const float32x4_t val4 = vld1q_f32(&InSamples[pos]);
                    float32x4_t dry4 = vld1q_f32(&dst[pos]);
                    dry4 = vmlaq_f32(dry4, val4, vmlaq_f32(gain4, step4, step_count4));
                    step_count4 = vaddq_f32(step_count4, four4);
                    vst1q_f32(&dst[pos], dry4);
                    pos += 4;
                } while(--todo);
                /* NOTE: step_count4 now represents the next four counts after
                 * the last four mixed samples, so the lowest element
                 * represents the next step count to apply.
                 */
                step_count = vgetq_lane_f32(step_count4, 0);
            }
            /* Mix with applying left over gain steps that aren't aligned multiples of 4. */
            for(size_t leftover{min_len&3};leftover;++pos,--leftover)
            {
                dst[pos] += InSamples[pos] * (gain + step*step_count);
                step_count += 1.0f;
            }
            if(pos == Counter)
                gain = *TargetGains;
            else
                gain += step*step_count;

            /* Mix until pos is aligned with 4 or the mix is done. */
            for(size_t leftover{aligned_len&3};leftover;++pos,--leftover)
                dst[pos] += InSamples[pos] * gain;
        }
        *CurrentGains = gain;
        ++CurrentGains;
        ++TargetGains;

        /* Skip the rest of this channel if the settled gain is silent. */
        if(!(std::abs(gain) > GainSilenceThreshold))
            continue;

        /* Mix the remaining samples at the constant gain, four at a time,
         * with a scalar loop for any trailing samples.
         */
        if(size_t todo{(InSamples.size()-pos) >> 2})
        {
            const float32x4_t gain4 = vdupq_n_f32(gain);
            do {
                const float32x4_t val4 = vld1q_f32(&InSamples[pos]);
                float32x4_t dry4 = vld1q_f32(&dst[pos]);
                dry4 = vmlaq_f32(dry4, val4, gain4);
                vst1q_f32(&dst[pos], dry4);
                pos += 4;
            } while(--todo);
        }
        for(size_t leftover{(InSamples.size()-pos)&3};leftover;++pos,--leftover)
            dst[pos] += InSamples[pos] * gain;
    }
}