#include "core/bsinc_defs.h"

#if defined(__GNUC__) && !defined(__clang__) && !defined(__ARM_NEON)
#pragma GCC target("fpu=neon")
#endif
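
/* Builds a float32x4_t from four independent scalars. NEON has no single
 * intrinsic for this, so the vector is filled lane by lane.
 */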
inline float32x4_t set_f4(float l0, float l1, float l2, float l3)
{
    float32x4_t ret{vmovq_n_f32(l0)};
    ret = vsetq_lane_f32(l1, ret, 1);
    ret = vsetq_lane_f32(l2, ret, 2);
    ret = vsetq_lane_f32(l3, ret, 3);
    return ret;
}
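
/* The fixed-point sample fraction carries MixerFracBits bits. The top
 * BSincPhaseBits select the bsinc filter phase, and the remaining
 * FracPhaseBitDiff bits become the interpolation factor between adjacent
 * phases.
 */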
constexpr uint FracPhaseBitDiff{MixerFracBits - BSincPhaseBits};
constexpr uint FracPhaseDiffOne{1 << FracPhaseBitDiff};
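
/* Accumulates a stereo HRIR into the Values history. leftright4 holds
 * {left, right, left, right}, so each iteration scales two consecutive
 * coefficient pairs by the current panning gains and adds them to two
 * accumulator frames at once.
 */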
inline void ApplyCoeffs(float2 *RESTRICT Values, const size_t IrSize, const ConstHrirSpan Coeffs,
    const float left, const float right)
{
    float32x4_t leftright4;
    {
        float32x2_t leftright2{vmov_n_f32(left)};
        leftright2 = vset_lane_f32(right, leftright2, 1);
        leftright4 = vcombine_f32(leftright2, leftright2);
    }

    ASSUME(IrSize >= MinIrLength);
    for(size_t c{0};c < IrSize;c += 2)
    {
        float32x4_t vals = vld1q_f32(&Values[c][0]);
        float32x4_t coefs = vld1q_f32(&Coeffs[c][0]);

        vals = vmlaq_f32(vals, coefs, leftright4);

        vst1q_f32(&Values[c][0], vals);
    }
}
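
/* Vectorized linear-interpolation resampler. Each loop iteration produces
 * four output samples, tracking a fixed-point read position and fraction per
 * lane. The per-lane math is roughly this scalar sketch (illustrative, not
 * code from this file):
 *
 *   out   = src[pos] + (src[pos+1]-src[pos]) * (frac * (1.0f/MixerFracOne));
 *   frac += increment;
 *   pos  += frac>>MixerFracBits;
 *   frac &= MixerFracMask;
 */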
template<>
float *Resample_<LerpTag,NEONTag>(const InterpState*, float *RESTRICT src, uint frac,
    uint increment, const al::span<float> dst)
{
    const int32x4_t increment4 = vdupq_n_s32(static_cast<int>(increment*4));
    const float32x4_t fracOne4 = vdupq_n_f32(1.0f/MixerFracOne);
    const int32x4_t fracMask4 = vdupq_n_s32(MixerFracMask);
    alignas(16) uint pos_[4], frac_[4];
    int32x4_t pos4, frac4;

    InitPosArrays(frac, increment, frac_, pos_);
    frac4 = vld1q_s32(reinterpret_cast<int*>(frac_));
    pos4 = vld1q_s32(reinterpret_cast<int*>(pos_));

    auto dst_iter = dst.begin();
    for(size_t todo{dst.size()>>2};todo;--todo)
    {
        const int pos0{vgetq_lane_s32(pos4, 0)};
        const int pos1{vgetq_lane_s32(pos4, 1)};
        const int pos2{vgetq_lane_s32(pos4, 2)};
        const int pos3{vgetq_lane_s32(pos4, 3)};
        const float32x4_t val1{set_f4(src[pos0], src[pos1], src[pos2], src[pos3])};
        const float32x4_t val2{set_f4(src[pos0+1], src[pos1+1], src[pos2+1], src[pos3+1])};

        /* val1 + (val2-val1)*mu */
        const float32x4_t r0{vsubq_f32(val2, val1)};
        const float32x4_t mu{vmulq_f32(vcvtq_f32_s32(frac4), fracOne4)};
        const float32x4_t out{vmlaq_f32(val1, mu, r0)};

        vst1q_f32(dst_iter, out);
        dst_iter += 4;

        frac4 = vaddq_s32(frac4, increment4);
        pos4 = vaddq_s32(pos4, vshrq_n_s32(frac4, MixerFracBits));
        frac4 = vandq_s32(frac4, fracMask4);
    }
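
    /* Any leftover (dst.size()&3) samples resume from lane 0 of pos4/frac4
     * and are interpolated one at a time.
     */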
    if(size_t todo{dst.size()&3})
    {
        src += static_cast<uint>(vgetq_lane_s32(pos4, 0));
        frac = static_cast<uint>(vgetq_lane_s32(frac4, 0));

        do {
            *(dst_iter++) = lerpf(src[0], src[1], static_cast<float>(frac) * (1.0f/MixerFracOne));

            frac += increment;
            src  += frac>>MixerFracBits;
            frac &= MixerFracMask;
        } while(--todo);
    }
    return dst.data();
}
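
/* Band-limited sinc resampler. For each output sample, the m filter taps are
 * interpolated on the fly from four table rows per phase: the base
 * coefficients (fil), their phase deltas (phd), scale deltas (scd), and
 * scale+phase deltas (spd), combined as f = (fil + sf*scd) + pf*(phd + sf*spd).
 * The tap/sample products accumulate four at a time in r4, which is then
 * reduced with a horizontal add.
 */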
template<>
float *Resample_<BSincTag,NEONTag>(const InterpState *state, float *RESTRICT src, uint frac,
    uint increment, const al::span<float> dst)
{
    const float *const filter{state->bsinc.filter};
    const float32x4_t sf4{vdupq_n_f32(state->bsinc.sf)};
    const size_t m{state->bsinc.m};

    src -= state->bsinc.l;
    for(float &out_sample : dst)
    {
        // Calculate the phase index and factor.
        const uint pi{frac >> FracPhaseBitDiff};
        const float pf{static_cast<float>(frac & (FracPhaseDiffOne-1)) * (1.0f/FracPhaseDiffOne)};

        // Apply the scale and phase interpolated filter.
        float32x4_t r4{vdupq_n_f32(0.0f)};
        {
            const float32x4_t pf4{vdupq_n_f32(pf)};
            const float *RESTRICT fil{filter + m*pi*2};
            const float *RESTRICT phd{fil + m};
            const float *RESTRICT scd{fil + BSincPhaseCount*2*m};
            const float *RESTRICT spd{scd + m};
            size_t td{m >> 2};
            size_t j{0u};

            do {
                /* f = ((fil + sf*scd) + pf*(phd + sf*spd)) */
                const float32x4_t f4 = vmlaq_f32(
                    vmlaq_f32(vld1q_f32(&fil[j]), sf4, vld1q_f32(&scd[j])),
                    pf4, vmlaq_f32(vld1q_f32(&phd[j]), sf4, vld1q_f32(&spd[j])));
                /* r += f*src */
                r4 = vmlaq_f32(r4, f4, vld1q_f32(&src[j]));
                j += 4;
            } while(--td);
        }
        r4 = vaddq_f32(r4, vrev64q_f32(r4));
        out_sample = vget_lane_f32(vadd_f32(vget_low_f32(r4), vget_high_f32(r4)), 0);

        frac += increment;
        src  += frac>>MixerFracBits;
        frac &= MixerFracMask;
    }
    return dst.data();
}
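
/* Same as the BSincTag resampler, except no scale interpolation is needed;
 * only the phase interpolation (f = fil + pf*phd) is applied per tap.
 */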
template<>
float *Resample_<FastBSincTag,NEONTag>(const InterpState *state, float *RESTRICT src, uint frac,
    uint increment, const al::span<float> dst)
{
    const float *const filter{state->bsinc.filter};
    const size_t m{state->bsinc.m};

    src -= state->bsinc.l;
    for(float &out_sample : dst)
    {
        // Calculate the phase index and factor.
        const uint pi{frac >> FracPhaseBitDiff};
        const float pf{static_cast<float>(frac & (FracPhaseDiffOne-1)) * (1.0f/FracPhaseDiffOne)};

        // Apply the phase interpolated filter.
        float32x4_t r4{vdupq_n_f32(0.0f)};
        {
            const float32x4_t pf4{vdupq_n_f32(pf)};
            const float *RESTRICT fil{filter + m*pi*2};
            const float *RESTRICT phd{fil + m};
            size_t td{m >> 2};
            size_t j{0u};

            do {
                /* f = fil + pf*phd */
                const float32x4_t f4 = vmlaq_f32(vld1q_f32(&fil[j]), pf4, vld1q_f32(&phd[j]));
                /* r += f*src */
                r4 = vmlaq_f32(r4, f4, vld1q_f32(&src[j]));
                j += 4;
            } while(--td);
        }
        r4 = vaddq_f32(r4, vrev64q_f32(r4));
        out_sample = vget_lane_f32(vadd_f32(vget_low_f32(r4), vget_high_f32(r4)), 0);

        frac += increment;
        src  += frac>>MixerFracBits;
        frac &= MixerFracMask;
    }
    return dst.data();
}
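
/* The HRTF mixers below forward to the generic *Base helpers, instantiated
 * with the NEON ApplyCoeffs defined above.
 */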
template<>
void MixHrtf_<NEONTag>(const float *InSamples, float2 *AccumSamples, const uint IrSize,
    const MixHrtfFilter *hrtfparams, const size_t BufferSize)
{ MixHrtfBase<ApplyCoeffs>(InSamples, AccumSamples, IrSize, hrtfparams, BufferSize); }

template<>
void MixHrtfBlend_<NEONTag>(const float *InSamples, float2 *AccumSamples, const uint IrSize,
    const HrtfFilter *oldparams, const MixHrtfFilter *newparams, const size_t BufferSize)
{
    MixHrtfBlendBase<ApplyCoeffs>(InSamples, AccumSamples, IrSize, oldparams, newparams,
        BufferSize);
}

template<>
void MixDirectHrtf_<NEONTag>(const FloatBufferSpan LeftOut, const FloatBufferSpan RightOut,
    const al::span<const FloatBufferLine> InSamples, float2 *AccumSamples,
    float *TempBuf, HrtfChannelState *ChanState, const size_t IrSize, const size_t BufferSize)
{
    MixDirectHrtfBase<ApplyCoeffs>(LeftOut, RightOut, InSamples, AccumSamples, TempBuf, ChanState,
        IrSize, BufferSize);
}
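
/* Mixes the input samples into each output buffer line. While Counter is
 * non-zero, the gain ramps linearly toward its target in 1/Counter steps,
 * vectorized four samples at a time. The rest of the buffer is then mixed at
 * a constant gain, and skipped entirely if that gain is below the silence
 * threshold.
 */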
template<>
void Mix_<NEONTag>(const al::span<const float> InSamples, const al::span<FloatBufferLine> OutBuffer,
    float *CurrentGains, const float *TargetGains, const size_t Counter, const size_t OutPos)
{
    const float delta{(Counter > 0) ? 1.0f / static_cast<float>(Counter) : 0.0f};
    const auto min_len = minz(Counter, InSamples.size());
    const auto aligned_len = minz((min_len+3) & ~size_t{3}, InSamples.size()) - min_len;

    for(FloatBufferLine &output : OutBuffer)
    {
        float *RESTRICT dst{al::assume_aligned<16>(output.data()+OutPos)};
        float gain{*CurrentGains};
        const float step{(*TargetGains-gain) * delta};

        size_t pos{0};
        if(!(std::abs(step) > std::numeric_limits<float>::epsilon()))
            gain = *TargetGains;
        else
        {
            float step_count{0.0f};
            /* Mix with applying gain steps in aligned multiples of 4. */
            if(size_t todo{min_len >> 2})
            {
                const float32x4_t four4{vdupq_n_f32(4.0f)};
                const float32x4_t step4{vdupq_n_f32(step)};
                const float32x4_t gain4{vdupq_n_f32(gain)};
                float32x4_t step_count4{vdupq_n_f32(0.0f)};
                step_count4 = vsetq_lane_f32(1.0f, step_count4, 1);
                step_count4 = vsetq_lane_f32(2.0f, step_count4, 2);
                step_count4 = vsetq_lane_f32(3.0f, step_count4, 3);
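                /* step_count4 now holds {0, 1, 2, 3}, so each lane applies
                 * its own step offset; it advances by four per iteration.
                 */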

                do {
                    const float32x4_t val4 = vld1q_f32(&InSamples[pos]);
                    float32x4_t dry4 = vld1q_f32(&dst[pos]);
                    dry4 = vmlaq_f32(dry4, val4, vmlaq_f32(gain4, step4, step_count4));
                    step_count4 = vaddq_f32(step_count4, four4);
                    vst1q_f32(&dst[pos], dry4);
                    pos += 4;
                } while(--todo);
                /* NOTE: step_count4 now represents the next four counts after
                 * the last four mixed samples, so the lowest element
                 * represents the next step count to apply.
                 */
                step_count = vgetq_lane_f32(step_count4, 0);
            }
            /* Mix with applying left over gain steps that aren't aligned multiples of 4. */
            for(size_t leftover{min_len&3};leftover;++pos,--leftover)
            {
                dst[pos] += InSamples[pos] * (gain + step*step_count);
                step_count += 1.0f;
            }
            if(pos == Counter)
                gain = *TargetGains;
            else
                gain += step*step_count;

            /* Mix until pos is aligned with 4 or the mix is done. */
            for(size_t leftover{aligned_len&3};leftover;++pos,--leftover)
                dst[pos] += InSamples[pos] * gain;
        }
        *CurrentGains = gain;
        ++CurrentGains;
        ++TargetGains;
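
        /* Past the gain ramp, mix the rest of the buffer at the constant
         * gain, unless it's effectively silent.
         */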
        if(!(std::abs(gain) > GainSilenceThreshold))
            continue;
        if(size_t todo{(InSamples.size()-pos) >> 2})
        {
            const float32x4_t gain4 = vdupq_n_f32(gain);
            do {
                const float32x4_t val4 = vld1q_f32(&InSamples[pos]);
                float32x4_t dry4 = vld1q_f32(&dst[pos]);
                dry4 = vmlaq_f32(dry4, val4, gain4);
                vst1q_f32(&dst[pos], dry4);
                pos += 4;
            } while(--todo);
        }
        for(size_t leftover{(InSamples.size()-pos)&3};leftover;++pos,--leftover)
            dst[pos] += InSamples[pos] * gain;
    }
}