template<>
const ALfloat *Resample_<LerpTag,NEONTag>(const InterpState*, const ALfloat *RESTRICT src,
    ALuint frac, ALuint increment, const al::span<float> dst)
{
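    /* Linear interpolation, vectorized to produce four output samples per
     * iteration. Sample positions and fractional offsets are tracked in
     * fixed point (FRACTIONBITS bits of fraction) in integer vectors. A
     * rough scalar equivalent for one output sample (illustrative only):
     *
     *     const float mu{static_cast<float>(frac) * (1.0f/FRACTIONONE)};
     *     *(dst++) = src[0] + (src[1]-src[0])*mu;
     *     frac += increment;
     *     src  += frac>>FRACTIONBITS;
     *     frac &= FRACTIONMASK;
     */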
    const int32x4_t increment4 = vdupq_n_s32(static_cast<int>(increment*4));
    const float32x4_t fracOne4 = vdupq_n_f32(1.0f/FRACTIONONE);
    const int32x4_t fracMask4 = vdupq_n_s32(FRACTIONMASK);
    alignas(16) ALuint pos_[4], frac_[4];
    int32x4_t pos4, frac4;

    InitPosArrays(frac, increment, frac_, pos_, 4);
    frac4 = vld1q_s32(reinterpret_cast<int*>(frac_));
    pos4 = vld1q_s32(reinterpret_cast<int*>(pos_));

    auto dst_iter = dst.begin();
    const auto aligned_end = (dst.size()&~3u) + dst_iter;
    while(dst_iter != aligned_end)
    {
        const int pos0{vgetq_lane_s32(pos4, 0)};
        const int pos1{vgetq_lane_s32(pos4, 1)};
        const int pos2{vgetq_lane_s32(pos4, 2)};
        const int pos3{vgetq_lane_s32(pos4, 3)};
        const float32x4_t val1{src[pos0], src[pos1], src[pos2], src[pos3]};
        const float32x4_t val2{src[pos0+1], src[pos1+1], src[pos2+1], src[pos3+1]};

        /* val1 + (val2-val1)*mu */
        const float32x4_t r0{vsubq_f32(val2, val1)};
        const float32x4_t mu{vmulq_f32(vcvtq_f32_s32(frac4), fracOne4)};
        const float32x4_t out{vmlaq_f32(val1, mu, r0)};

        vst1q_f32(dst_iter, out);
        dst_iter += 4;

        frac4 = vaddq_s32(frac4, increment4);
        pos4 = vaddq_s32(pos4, vshrq_n_s32(frac4, FRACTIONBITS));
        frac4 = vandq_s32(frac4, fracMask4);
    }

    /* Handle any leftover samples one at a time. */
    if(dst_iter != dst.end())
    {
        src += static_cast<ALuint>(vgetq_lane_s32(pos4, 0));
        frac = static_cast<ALuint>(vgetq_lane_s32(frac4, 0));

        do {
            *(dst_iter++) = lerp(src[0], src[1], static_cast<float>(frac) * (1.0f/FRACTIONONE));

            frac += increment;
            src  += frac>>FRACTIONBITS;
            frac &= FRACTIONMASK;
        } while(dst_iter != dst.end());
    }
    return dst.begin();
}

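/* Band-limited sinc resampling. For each phase index, the filter table
 * stores four m-length coefficient rows laid out back to back: the base
 * filter (fil), the phase-delta row (phd), the scale-delta row (scd), and
 * the scale+phase-delta row (spd). Each output sample interpolates the
 * filter by the scale factor sf and phase factor pf, then convolves it with
 * the source. A rough scalar equivalent of the inner loop (illustrative
 * only; the NEON code below processes four coefficients per iteration):
 *
 *     float out{0.0f};
 *     for(size_t j{0};j < m;++j)
 *         out += ((fil[j] + sf*scd[j]) + pf*(phd[j] + sf*spd[j])) * src[j];
 */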
template<>
const ALfloat *Resample_<BSincTag,NEONTag>(const InterpState *state, const ALfloat *RESTRICT src,
    ALuint frac, ALuint increment, const al::span<float> dst)
{
    const float *const filter{state->bsinc.filter};
    const float32x4_t sf4{vdupq_n_f32(state->bsinc.sf)};
    const size_t m{state->bsinc.m};

    src -= state->bsinc.l;
    for(float &out_sample : dst)
    {
        // Calculate the phase index and factor.
#define FRAC_PHASE_BITDIFF (FRACTIONBITS-BSINC_PHASE_BITS)
        const ALuint pi{frac >> FRAC_PHASE_BITDIFF};
        const float pf{static_cast<float>(frac & ((1<<FRAC_PHASE_BITDIFF)-1)) *
            (1.0f/(1<<FRAC_PHASE_BITDIFF))};
#undef FRAC_PHASE_BITDIFF

        // Apply the scale and phase interpolated filter.
        float32x4_t r4{vdupq_n_f32(0.0f)};
        {
            const float32x4_t pf4{vdupq_n_f32(pf)};
            const float *fil{filter + m*pi*4};
            const float *phd{fil + m};
            const float *scd{phd + m};
            const float *spd{scd + m};
            size_t td{m >> 2};
            size_t j{0u};

            do {
                /* f = ((fil + sf*scd) + pf*(phd + sf*spd)) */
                const float32x4_t f4 = vmlaq_f32(
                    vmlaq_f32(vld1q_f32(fil), sf4, vld1q_f32(scd)),
                    pf4, vmlaq_f32(vld1q_f32(phd), sf4, vld1q_f32(spd)));
                fil += 4; scd += 4; phd += 4; spd += 4;
                /* r += f*src */
                r4 = vmlaq_f32(r4, f4, vld1q_f32(&src[j]));
                j += 4;
            } while(--td);
        }
        /* Sum the lanes of r4 to get the final sample value. */
        r4 = vaddq_f32(r4, vrev64q_f32(r4));
        out_sample = vget_lane_f32(vadd_f32(vget_low_f32(r4), vget_high_f32(r4)), 0);

        frac += increment;
        src  += frac>>FRACTIONBITS;
        frac &= FRACTIONMASK;
    }
    return dst.begin();
}

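/* Fast variant of the above: the filter table is generated for a fixed
 * scale factor, so only the phase interpolation (f = fil + pf*phd) remains
 * in the inner loop. */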
template<>
const ALfloat *Resample_<FastBSincTag,NEONTag>(const InterpState *state,
    const ALfloat *RESTRICT src, ALuint frac, ALuint increment, const al::span<float> dst)
{
    const float *const filter{state->bsinc.filter};
    const size_t m{state->bsinc.m};

    src -= state->bsinc.l;
    for(float &out_sample : dst)
    {
        // Calculate the phase index and factor.
#define FRAC_PHASE_BITDIFF (FRACTIONBITS-BSINC_PHASE_BITS)
        const ALuint pi{frac >> FRAC_PHASE_BITDIFF};
        const float pf{static_cast<float>(frac & ((1<<FRAC_PHASE_BITDIFF)-1)) *
            (1.0f/(1<<FRAC_PHASE_BITDIFF))};
#undef FRAC_PHASE_BITDIFF

        // Apply the phase interpolated filter.
        float32x4_t r4{vdupq_n_f32(0.0f)};
        {
            const float32x4_t pf4{vdupq_n_f32(pf)};
            const float *fil{filter + m*pi*4};
            const float *phd{fil + m};
            size_t td{m >> 2};
            size_t j{0u};

            do {
                /* f = fil + pf*phd */
                const float32x4_t f4 = vmlaq_f32(vld1q_f32(fil), pf4, vld1q_f32(phd));
                /* r += f*src */
                r4 = vmlaq_f32(r4, f4, vld1q_f32(&src[j]));
                fil += 4; phd += 4; j += 4;
            } while(--td);
        }
        /* Sum the lanes of r4 to get the final sample value. */
        r4 = vaddq_f32(r4, vrev64q_f32(r4));
        out_sample = vget_lane_f32(vadd_f32(vget_low_f32(r4), vget_high_f32(r4)), 0);

        frac += increment;
        src  += frac>>FRACTIONBITS;
        frac &= FRACTIONMASK;
    }
    return dst.begin();
}

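/* Accumulates HRIR contributions for one input sample: for each stereo
 * coefficient pair, Values[c] += Coeffs[c] * {left, right}. The left/right
 * pair is duplicated into both halves of a 4-lane vector, so two {l,r}
 * pairs are processed per loop iteration (hence the step of 2). */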
static inline void ApplyCoeffs(size_t /*Offset*/, float2 *RESTRICT Values, const ALuint IrSize,
    const HrirArray &Coeffs, const float left, const float right)
{
    float32x4_t leftright4;
    {
        float32x2_t leftright2 = vdup_n_f32(0.0);
        leftright2 = vset_lane_f32(left, leftright2, 0);
        leftright2 = vset_lane_f32(right, leftright2, 1);
        leftright4 = vcombine_f32(leftright2, leftright2);
    }

    for(ALuint c{0};c < IrSize;c += 2)
    {
        float32x4_t vals = vld1q_f32(&Values[c][0]);
        float32x4_t coefs = vld1q_f32(&Coeffs[c][0]);

        vals = vmlaq_f32(vals, coefs, leftright4);

        vst1q_f32(&Values[c][0], vals);
    }
}

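/* The HRTF mixers below share their logic with the common base
 * implementations; these NEONTag specializations just instantiate them with
 * the NEON ApplyCoeffs above. */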
template<>
void MixHrtf_<NEONTag>(FloatBufferLine &LeftOut, FloatBufferLine &RightOut,
    const float *InSamples, float2 *AccumSamples, const size_t OutPos, const ALuint IrSize,
    MixHrtfFilter *hrtfparams, const size_t BufferSize)
{
    MixHrtfBase<ApplyCoeffs>(LeftOut, RightOut, InSamples, AccumSamples, OutPos, IrSize,
        hrtfparams, BufferSize);
}

template<>
void MixHrtfBlend_<NEONTag>(FloatBufferLine &LeftOut, FloatBufferLine &RightOut,
    const float *InSamples, float2 *AccumSamples, const size_t OutPos, const ALuint IrSize,
    const HrtfFilter *oldparams, MixHrtfFilter *newparams, const size_t BufferSize)
{
    MixHrtfBlendBase<ApplyCoeffs>(LeftOut, RightOut, InSamples, AccumSamples, OutPos, IrSize,
        oldparams, newparams, BufferSize);
}

template<>
void MixDirectHrtf_<NEONTag>(FloatBufferLine &LeftOut, FloatBufferLine &RightOut,
    const al::span<const FloatBufferLine> InSamples, float2 *AccumSamples, DirectHrtfState *State,
    const size_t BufferSize)
{ MixDirectHrtfBase<ApplyCoeffs>(LeftOut, RightOut, InSamples, AccumSamples, State, BufferSize); }

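/* Mixes input samples into multiple output lines, ramping each line's gain
 * linearly from *CurrentGains toward *TargetGains over the first Counter
 * samples. The vectorized ramp applies (gain + step*step_count) to four
 * samples at a time, with step_count4 holding the counts {n, n+1, n+2, n+3}.
 * A rough scalar equivalent for one output line, ignoring the alignment
 * handling (illustrative only):
 *
 *     for(size_t i{0};i < InSamples.size();++i)
 *     {
 *         const float g{(i < Counter) ? gain + step*static_cast<float>(i)
 *             : *TargetGains};
 *         dst[i] += InSamples[i] * g;
 *     }
 */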
template<>
void Mix_<NEONTag>(const al::span<const float> InSamples, const al::span<FloatBufferLine> OutBuffer,
    float *CurrentGains, const float *TargetGains, const size_t Counter, const size_t OutPos)
{
    const ALfloat delta{(Counter > 0) ? 1.0f / static_cast<ALfloat>(Counter) : 0.0f};
    const bool reached_target{InSamples.size() >= Counter};
    const auto min_end = reached_target ? InSamples.begin() + Counter : InSamples.end();
    const auto aligned_end = minz(static_cast<uintptr_t>(min_end-InSamples.begin()+3) & ~3u,
        InSamples.size()) + InSamples.begin();
    for(FloatBufferLine &output : OutBuffer)
    {
        ALfloat *RESTRICT dst{al::assume_aligned<16>(output.data()+OutPos)};
        ALfloat gain{*CurrentGains};
        const ALfloat diff{*TargetGains - gain};

        auto in_iter = InSamples.begin();
        if(std::fabs(diff) > std::numeric_limits<float>::epsilon())
        {
            const ALfloat step{diff * delta};
            ALfloat step_count{0.0f};
            /* Mix with applying gain steps in aligned multiples of 4. */
            if(ptrdiff_t todo{(min_end-in_iter) >> 2})
            {
                const float32x4_t four4{vdupq_n_f32(4.0f)};
                const float32x4_t step4{vdupq_n_f32(step)};
                const float32x4_t gain4{vdupq_n_f32(gain)};
                /* Load the step counts {0, 1, 2, 3} for the first block. */
                float32x4_t step_count4{vsetq_lane_f32(0.0f,
                    vsetq_lane_f32(1.0f,
                    vsetq_lane_f32(2.0f,
                    vsetq_lane_f32(3.0f, vdupq_n_f32(0.0f), 3),
                    2), 1), 0)};
                do {
                    const float32x4_t val4 = vld1q_f32(in_iter);
                    float32x4_t dry4 = vld1q_f32(dst);
                    dry4 = vmlaq_f32(dry4, val4, vmlaq_f32(gain4, step4, step_count4));
                    step_count4 = vaddq_f32(step_count4, four4);
                    vst1q_f32(dst, dry4);
                    in_iter += 4; dst += 4;
                } while(--todo);
                /* NOTE: step_count4 now represents the next four counts after
                 * the last four mixed samples, so the lowest element
                 * represents the next step count to apply.
                 */
                step_count = vgetq_lane_f32(step_count4, 0);
            }
            /* Mix with applying left over gain steps that aren't aligned multiples of 4. */
            while(in_iter != min_end)
            {
                *(dst++) += *(in_iter++) * (gain + step*step_count);
                step_count += 1.0f;
            }
            if(reached_target)
                gain = *TargetGains;
            else
                gain += step*step_count;
            *CurrentGains = gain;

            /* Mix until pos is aligned with 4 or the mix is done. */
            while(in_iter != aligned_end)
                *(dst++) += *(in_iter++) * gain;
        }
        ++CurrentGains;
        ++TargetGains;

        if(!(std::fabs(gain) > GAIN_SILENCE_THRESHOLD))
            continue;
        if(ptrdiff_t todo{(InSamples.end()-in_iter) >> 2})
        {
            const float32x4_t gain4 = vdupq_n_f32(gain);
            do {
                const float32x4_t val4 = vld1q_f32(in_iter);
                float32x4_t dry4 = vld1q_f32(dst);
                dry4 = vmlaq_f32(dry4, val4, gain4);
                vst1q_f32(dst, dry4);
                in_iter += 4; dst += 4;
            } while(--todo);
        }
        while(in_iter != InSamples.end())
            *(dst++) += *(in_iter++) * gain;
    }
}

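/* Sums rows of a strided input into a single output line, one gain per row.
 * Rows with a silent gain are skipped entirely; leftover samples after the
 * 4-wide loop are handled by std::transform. */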
template<>
void MixRow_<NEONTag>(const al::span<float> OutBuffer, const al::span<const float> Gains,
    const float *InSamples, const size_t InStride)
{
    for(const ALfloat gain : Gains)
    {
        const ALfloat *RESTRICT input{InSamples};
        InSamples += InStride;

        if(!(std::fabs(gain) > GAIN_SILENCE_THRESHOLD))
            continue;

        auto out_iter = OutBuffer.begin();
        if(size_t todo{OutBuffer.size() >> 2})
        {
            const float32x4_t gain4{vdupq_n_f32(gain)};
            do {
                const float32x4_t val4 = vld1q_f32(input);
                float32x4_t dry4 = vld1q_f32(out_iter);
                dry4 = vmlaq_f32(dry4, val4, gain4);
                vst1q_f32(out_iter, dry4);
                out_iter += 4; input += 4;
            } while(--todo);
        }

        auto do_mix = [gain](const float cur, const float src) noexcept -> float
        { return cur + src*gain; };
        std::transform(out_iter, OutBuffer.end(), input, out_iter, do_mix);
    }
}