13 #include "bsinc_defs.h"
/* Number of fraction bits below the bsinc phase-table resolution, and the
 * value of one full phase step at that reduced resolution. Used to split a
 * sample fraction into a phase index and an interpolation factor.
 */
#define FRAC_PHASE_BITDIFF (FRACTIONBITS - BSINC_PHASE_BITS)
#define FRAC_PHASE_DIFFONE (1<<FRAC_PHASE_BITDIFF)
/* Accumulates an impulse response into the stereo accumulation buffer.
 *
 * Values: stereo accumulation buffer (left/right pairs), updated in place.
 * IrSize: number of coefficient pairs to process (asserted >= MIN_IR_LENGTH;
 *         processed two pairs per iteration, i.e. four floats per NEON op).
 * Coeffs: HRIR filter coefficients, stored as left/right pairs.
 * left/right: per-channel gains applied to every coefficient pair.
 */
inline void ApplyCoeffs(float2 *RESTRICT Values, const uint_fast32_t IrSize,
    const HrirArray &Coeffs, const float left, const float right)
{
    float32x4_t leftright4;
    {
        /* Build a (left, right, left, right) vector so one multiply-accumulate
         * handles two stereo coefficient pairs. Use a float literal (0.0f) to
         * avoid an implicit double->float conversion.
         */
        float32x2_t leftright2 = vdup_n_f32(0.0f);
        leftright2 = vset_lane_f32(left, leftright2, 0);
        leftright2 = vset_lane_f32(right, leftright2, 1);
        leftright4 = vcombine_f32(leftright2, leftright2);
    }

    ASSUME(IrSize >= MIN_IR_LENGTH);
    for(size_t c{0};c < IrSize;c += 2)
    {
        /* Two float2 pairs = four contiguous floats per load/store. */
        float32x4_t vals = vld1q_f32(&Values[c][0]);
        float32x4_t coefs = vld1q_f32(&Coeffs[c][0]);

        vals = vmlaq_f32(vals, coefs, leftright4);

        vst1q_f32(&Values[c][0], vals);
    }
}
/* Linear-interpolation resampler, NEON variant.
 *
 * Generates dst.size() output samples from src, starting at fractional
 * position frac and advancing by increment (in FRACTIONONE units) per output
 * sample. Four outputs are produced per iteration using vectorized position/
 * fraction tracking; any remaining 1-3 samples fall back to scalar lerp.
 * Returns a pointer to the generated samples (dst.data()).
 */
template<>
const float *Resample_<LerpTag,NEONTag>(const InterpState*, const float *RESTRICT src, ALuint frac,
    ALuint increment, const al::span<float> dst)
{
    const int32x4_t increment4 = vdupq_n_s32(static_cast<int>(increment*4));
    const float32x4_t fracOne4 = vdupq_n_f32(1.0f/FRACTIONONE);
    const int32x4_t fracMask4 = vdupq_n_s32(FRACTIONMASK);
    alignas(16) ALuint pos_[4], frac_[4];
    int32x4_t pos4, frac4;

    /* Seed four lanes with the positions/fractions of outputs 0..3. */
    InitPosArrays(frac, increment, frac_, pos_, 4);
    frac4 = vld1q_s32(reinterpret_cast<int*>(frac_));
    pos4 = vld1q_s32(reinterpret_cast<int*>(pos_));

    auto dst_iter = dst.begin();
    for(size_t todo{dst.size()>>2};todo;--todo)
    {
        const int pos0{vgetq_lane_s32(pos4, 0)};
        const int pos1{vgetq_lane_s32(pos4, 1)};
        const int pos2{vgetq_lane_s32(pos4, 2)};
        const int pos3{vgetq_lane_s32(pos4, 3)};
        const float32x4_t val1{src[pos0], src[pos1], src[pos2], src[pos3]};
        const float32x4_t val2{src[pos0+1], src[pos1+1], src[pos2+1], src[pos3+1]};

        /* val1 + (val2-val1)*mu */
        const float32x4_t r0{vsubq_f32(val2, val1)};
        const float32x4_t mu{vmulq_f32(vcvtq_f32_s32(frac4), fracOne4)};
        const float32x4_t out{vmlaq_f32(val1, mu, r0)};

        vst1q_f32(dst_iter, out);
        dst_iter += 4;

        /* Advance all four lanes by four steps, carrying whole-sample
         * overflow from the fraction into the position.
         */
        frac4 = vaddq_s32(frac4, increment4);
        pos4 = vaddq_s32(pos4, vshrq_n_s32(frac4, FRACTIONBITS));
        frac4 = vandq_s32(frac4, fracMask4);
    }

    if(size_t todo{dst.size()&3})
    {
        /* Handle the leftover 1-3 samples with scalar interpolation, resuming
         * from lane 0's position/fraction.
         */
        src += static_cast<ALuint>(vgetq_lane_s32(pos4, 0));
        frac = static_cast<ALuint>(vgetq_lane_s32(frac4, 0));

        do {
            *(dst_iter++) = lerp(src[0], src[1], static_cast<float>(frac) * (1.0f/FRACTIONONE));

            frac += increment;
            src += frac>>FRACTIONBITS;
            frac &= FRACTIONMASK;
        } while(--todo);
    }
    return dst.data();
}
/* Band-limited sinc resampler, NEON variant.
 *
 * For each output sample, interpolates between adjacent phase filters (factor
 * pf) and applies the bandwidth scale-delta coefficients (factor sf) before
 * convolving m filter taps with the source. The filter table stores, per
 * phase: m main coefficients, m phase deltas, m scale deltas, and m
 * scale-phase deltas, contiguously. Returns dst.data().
 */
template<>
const float *Resample_<BSincTag,NEONTag>(const InterpState *state, const float *RESTRICT src,
    ALuint frac, ALuint increment, const al::span<float> dst)
{
    const float *const filter{state->bsinc.filter};
    const float32x4_t sf4{vdupq_n_f32(state->bsinc.sf)};
    const size_t m{state->bsinc.m};

    /* Offset the source so the filter is centered on the current position. */
    src -= state->bsinc.l;
    for(float &out_sample : dst)
    {
        // Calculate the phase index and factor.
        const ALuint pi{frac >> FRAC_PHASE_BITDIFF};
        const float pf{static_cast<float>(frac & (FRAC_PHASE_DIFFONE-1)) *
            (1.0f/FRAC_PHASE_DIFFONE)};

        // Apply the scale and phase interpolated filter.
        float32x4_t r4{vdupq_n_f32(0.0f)};
        {
            const float32x4_t pf4{vdupq_n_f32(pf)};
            const float *fil{filter + m*pi*4};
            const float *phd{fil + m};
            const float *scd{phd + m};
            const float *spd{scd + m};
            size_t td{m >> 2};
            size_t j{0u};

            do {
                /* f = ((fil + sf*scd) + pf*(phd + sf*spd)) */
                const float32x4_t f4 = vmlaq_f32(
                    vmlaq_f32(vld1q_f32(&fil[j]), sf4, vld1q_f32(&scd[j])),
                    pf4, vmlaq_f32(vld1q_f32(&phd[j]), sf4, vld1q_f32(&spd[j])));
                /* r += f*src */
                r4 = vmlaq_f32(r4, f4, vld1q_f32(&src[j]));
                j += 4;
            } while(--td);
        }
        /* Horizontal sum of the four accumulator lanes. */
        r4 = vaddq_f32(r4, vrev64q_f32(r4));
        out_sample = vget_lane_f32(vadd_f32(vget_low_f32(r4), vget_high_f32(r4)), 0);

        frac += increment;
        src += frac>>FRACTIONBITS;
        frac &= FRACTIONMASK;
    }
    return dst.data();
}
/* Fast band-limited sinc resampler, NEON variant.
 *
 * Like the BSinc resampler but without the bandwidth scale-delta
 * coefficients: only the phase filters are interpolated (factor pf), so the
 * filter table holds m main coefficients followed by m phase deltas per
 * phase. Returns dst.data().
 */
template<>
const float *Resample_<FastBSincTag,NEONTag>(const InterpState *state,
    const float *RESTRICT src, ALuint frac, ALuint increment, const al::span<float> dst)
{
    const float *const filter{state->bsinc.filter};
    const size_t m{state->bsinc.m};

    /* Offset the source so the filter is centered on the current position. */
    src -= state->bsinc.l;
    for(float &out_sample : dst)
    {
        // Calculate the phase index and factor.
        const ALuint pi{frac >> FRAC_PHASE_BITDIFF};
        const float pf{static_cast<float>(frac & (FRAC_PHASE_DIFFONE-1)) *
            (1.0f/FRAC_PHASE_DIFFONE)};

        // Apply the phase interpolated filter.
        float32x4_t r4{vdupq_n_f32(0.0f)};
        {
            const float32x4_t pf4{vdupq_n_f32(pf)};
            const float *fil{filter + m*pi*4};
            const float *phd{fil + m};
            size_t td{m >> 2};
            size_t j{0u};

            do {
                /* f = fil + pf*phd */
                const float32x4_t f4 = vmlaq_f32(vld1q_f32(&fil[j]), pf4, vld1q_f32(&phd[j]));
                /* r += f*src */
                r4 = vmlaq_f32(r4, f4, vld1q_f32(&src[j]));
                j += 4;
            } while(--td);
        }
        /* Horizontal sum of the four accumulator lanes. */
        r4 = vaddq_f32(r4, vrev64q_f32(r4));
        out_sample = vget_lane_f32(vadd_f32(vget_low_f32(r4), vget_high_f32(r4)), 0);

        frac += increment;
        src += frac>>FRACTIONBITS;
        frac &= FRACTIONMASK;
    }
    return dst.data();
}
/* HRTF mixer, NEON variant: forwards to the common implementation with the
 * NEON ApplyCoeffs as the coefficient-application step.
 */
template<>
void MixHrtf_<NEONTag>(const float *InSamples, float2 *AccumSamples, const ALuint IrSize,
    const MixHrtfFilter *hrtfparams, const size_t BufferSize)
{ MixHrtfBase<ApplyCoeffs>(InSamples, AccumSamples, IrSize, hrtfparams, BufferSize); }
/* Blending HRTF mixer, NEON variant: forwards to the common implementation
 * (blending from oldparams to newparams) with the NEON ApplyCoeffs.
 */
template<>
void MixHrtfBlend_<NEONTag>(const float *InSamples, float2 *AccumSamples, const ALuint IrSize,
    const HrtfFilter *oldparams, const MixHrtfFilter *newparams, const size_t BufferSize)
{
    MixHrtfBlendBase<ApplyCoeffs>(InSamples, AccumSamples, IrSize, oldparams, newparams,
        BufferSize);
}
/* Direct HRTF mixer, NEON variant: forwards to the common implementation
 * with the NEON ApplyCoeffs as the coefficient-application step.
 */
template<>
void MixDirectHrtf_<NEONTag>(FloatBufferLine &LeftOut, FloatBufferLine &RightOut,
    const al::span<const FloatBufferLine> InSamples, float2 *AccumSamples, DirectHrtfState *State,
    const size_t BufferSize)
{ MixDirectHrtfBase<ApplyCoeffs>(LeftOut, RightOut, InSamples, AccumSamples, State, BufferSize); }
/* Gain-stepped mixer, NEON variant.
 *
 * Accumulates InSamples into each OutBuffer line (at offset OutPos), fading
 * each channel's gain from *CurrentGains toward *TargetGains over Counter
 * samples, then mixing the remainder at the (updated) current gain.
 * CurrentGains is updated in place; CurrentGains/TargetGains are advanced one
 * entry per output line.
 */
template<>
void Mix_<NEONTag>(const al::span<const float> InSamples, const al::span<FloatBufferLine> OutBuffer,
    float *CurrentGains, const float *TargetGains, const size_t Counter, const size_t OutPos)
{
    /* Per-sample gain step; 0 when no fade is in progress. */
    const float delta{(Counter > 0) ? 1.0f / static_cast<float>(Counter) : 0.0f};
    const auto min_len = minz(Counter, InSamples.size());
    /* Samples needed after the fade to reach 4-sample alignment. */
    const auto aligned_len = minz((min_len+3) & ~size_t{3}, InSamples.size()) - min_len;

    for(FloatBufferLine &output : OutBuffer)
    {
        float *RESTRICT dst{al::assume_aligned<16>(output.data()+OutPos)};
        float gain{*CurrentGains};
        const float step{(*TargetGains-gain) * delta};

        size_t pos{0};
        if(!(std::fabs(step) > std::numeric_limits<float>::epsilon()))
            gain = *TargetGains;
        else
        {
            float step_count{0.0f};
            /* Mix with applying gain steps in aligned multiples of 4. */
            if(size_t todo{(min_len-pos) >> 2})
            {
                const float32x4_t four4{vdupq_n_f32(4.0f)};
                const float32x4_t step4{vdupq_n_f32(step)};
                const float32x4_t gain4{vdupq_n_f32(gain)};
                /* step_count4 holds the per-lane step counts {0,1,2,3}. */
                float32x4_t step_count4{vdupq_n_f32(0.0f)};
                step_count4 = vsetq_lane_f32(1.0f, step_count4, 1);
                step_count4 = vsetq_lane_f32(2.0f, step_count4, 2);
                step_count4 = vsetq_lane_f32(3.0f, step_count4, 3);

                do {
                    const float32x4_t val4 = vld1q_f32(&InSamples[pos]);
                    float32x4_t dry4 = vld1q_f32(&dst[pos]);
                    /* dry += val * (gain + step*step_count) */
                    dry4 = vmlaq_f32(dry4, val4, vmlaq_f32(gain4, step4, step_count4));
                    step_count4 = vaddq_f32(step_count4, four4);
                    vst1q_f32(&dst[pos], dry4);
                    pos += 4;
                } while(--todo);
                /* NOTE: step_count4 now represents the next four counts after
                 * the last four mixed samples, so the lowest element
                 * represents the next step count to apply.
                 */
                step_count = vgetq_lane_f32(step_count4, 0);
            }
            /* Mix with applying left over gain steps that aren't aligned multiples of 4. */
            for(size_t leftover{min_len&3};leftover;++pos,--leftover)
            {
                dst[pos] += InSamples[pos] * (gain + step*step_count);
                step_count += 1.0f;
            }
            if(pos == Counter)
                gain = *TargetGains;
            else
                gain += step*step_count;

            /* Mix until pos is aligned with 4 or the mix is done. */
            for(size_t leftover{aligned_len&3};leftover;++pos,--leftover)
                dst[pos] += InSamples[pos] * gain;
        }
        *CurrentGains = gain;
        ++CurrentGains;
        ++TargetGains;

        /* Skip the remainder of this line if the settled gain is silent. */
        if(!(std::fabs(gain) > GAIN_SILENCE_THRESHOLD))
            continue;

        /* Mix the remaining samples at the constant gain, vectorized... */
        if(size_t todo{(InSamples.size()-pos) >> 2})
        {
            const float32x4_t gain4 = vdupq_n_f32(gain);
            do {
                const float32x4_t val4 = vld1q_f32(&InSamples[pos]);
                float32x4_t dry4 = vld1q_f32(&dst[pos]);
                dry4 = vmlaq_f32(dry4, val4, gain4);
                vst1q_f32(&dst[pos], dry4);
                pos += 4;
            } while(--todo);
        }
        /* ...then any leftover 1-3 samples scalar. */
        for(size_t leftover{(InSamples.size()-pos)&3};leftover;++pos,--leftover)
            dst[pos] += InSamples[pos] * gain;
    }
}