#include <xmmintrin.h>

#include <algorithm>
#include <cmath>
#include <limits>
#include <variant>

#include "alnumeric.h"
#include "alspan.h"
#include "core/bsinc_defs.h"
#include "core/bufferline.h"
#include "core/cubic_defs.h"
#include "core/mixer/hrtfdefs.h"
#include "core/resampler_limits.h"
#include "defs.h"
#include "hrtfbase.h"
#include "opthelpers.h"

struct SSETag;
struct CTag;
struct BSincTag;
struct FastBSincTag;
struct CubicTag;

#if defined(__GNUC__) && !defined(__clang__) && !defined(__SSE__)
#pragma GCC target("sse")
#endif

namespace {
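
/* The mixer tracks fractional sample positions as MixerFracBits fixed-point
 * values. The top bits of the fraction select a precomputed filter phase and
 * the remaining low bits become a 0..1 blend factor between adjacent phases.
 * For illustration, if MixerFracBits is 16 and BSincPhaseBits is 5 (the
 * exact widths depend on the build-time constants), BSincPhaseDiffBits is
 * 11: 32 selectable phases with an 11-bit interpolation factor.
 */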
constexpr uint BSincPhaseDiffBits{MixerFracBits - BSincPhaseBits};
constexpr uint BSincPhaseDiffOne{1 << BSincPhaseDiffBits};
constexpr uint BSincPhaseDiffMask{BSincPhaseDiffOne - 1u};

constexpr uint CubicPhaseDiffBits{MixerFracBits - CubicPhaseBits};
constexpr uint CubicPhaseDiffOne{1 << CubicPhaseDiffBits};
constexpr uint CubicPhaseDiffMask{CubicPhaseDiffOne - 1u};
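
/* SSE has no fused multiply-add, so vmadd is a plain multiply followed by an
 * add, x + y*z. The result can differ from a true FMA in the last bit.
 */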
force_inline __m128 vmadd(const __m128 x, const __m128 y, const __m128 z) noexcept
{ return _mm_add_ps(x, _mm_mul_ps(y, z)); }

inline void ApplyCoeffs(const al::span<float2> Values, const size_t IrSize,
    const ConstHrirSpan Coeffs, const float left, const float right)
{
    ASSUME(IrSize >= MinIrLength);
    ASSUME(IrSize <= HrirLength);
    const auto lrlr = _mm_setr_ps(left, right, left, right);
    /* Round up the IR size to a multiple of 2 for SIMD (2 IRs for 2 channels
     * is 4 floats), to avoid cutting the last sample for odd IR counts. The
     * underlying HRIR is a fixed-size multiple of 2, any extra samples are
     * either 0 (silence) or more IR samples that get applied for "free".
     */
    const auto count4 = size_t{(IrSize+1) >> 1};

    /* This isn't technically correct to test alignment, but it's true for
     * systems that support SSE, which is the only one that needs to know the
     * alignment of Values (which alternates between 8- and 16-byte aligned).
     */
    if(!(reinterpret_cast<uintptr_t>(Values.data())&15))
    {
        const auto vals4 = al::span{reinterpret_cast<__m128*>(Values[0].data()), count4};
        const auto coeffs4 = al::span{reinterpret_cast<const __m128*>(Coeffs[0].data()), count4};

        std::transform(vals4.cbegin(), vals4.cend(), coeffs4.cbegin(), vals4.begin(),
            [lrlr](const __m128 &val, const __m128 &coeff) -> __m128
            { return vmadd(val, coeff, lrlr); });
    }
    else
    {
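        /* Values is only 8-byte aligned here; mix the first left/right pair
         * with a 64-bit load/store so the remaining pairs land on 16-byte
         * boundaries, carrying half of each coefficient product into the
         * next iteration with a shuffle.
         */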
        auto coeffs = _mm_load_ps(Coeffs[0].data());
        auto vals = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64*>(Values[0].data()));
        auto imp0 = _mm_mul_ps(lrlr, coeffs);
        vals = _mm_add_ps(imp0, vals);
        _mm_storel_pi(reinterpret_cast<__m64*>(Values[0].data()), vals);
        size_t td{count4 - 1};
        size_t i{1};
        do {
            coeffs = _mm_load_ps(Coeffs[i+1].data());
            vals = _mm_load_ps(Values[i].data());
            const auto imp1 = _mm_mul_ps(lrlr, coeffs);
            imp0 = _mm_shuffle_ps(imp0, imp1, _MM_SHUFFLE(1, 0, 3, 2));
            vals = _mm_add_ps(imp0, vals);
            _mm_store_ps(Values[i].data(), vals);
            imp0 = imp1;
            i += 2;
        } while(--td);
        vals = _mm_loadl_pi(vals, reinterpret_cast<__m64*>(Values[i].data()));
        imp0 = _mm_movehl_ps(imp0, imp0);
        vals = _mm_add_ps(imp0, vals);
        _mm_storel_pi(reinterpret_cast<__m64*>(Values[i].data()), vals);
    }
}
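
/* Mixes InSamples into dst, ramping the gain from CurrentGain toward
 * TargetGain in per-sample steps over the fade, then mixing the remainder at
 * TargetGain (or skipping it entirely if TargetGain is silent). fade_len and
 * realign_len are precomputed by the Mix_ callers below.
 */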
force_inline void MixLine(const al::span<const float> InSamples, const al::span<float> dst,
    float &CurrentGain, const float TargetGain, const float delta, const size_t fade_len,
    const size_t realign_len, size_t Counter)
{
    const auto step = float{(TargetGain-CurrentGain) * delta};

    size_t pos{0};
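    /* The fade below is equivalent to this scalar loop (illustrative only;
     * the SSE path handles four samples per iteration):
     *
     *   for(size_t i{0};i < fade_len;++i)
     *       dst[i] += InSamples[i] * (gain + step*static_cast<float>(i));
     */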
    if(std::abs(step) > std::numeric_limits<float>::epsilon())
    {
        const auto gain = float{CurrentGain};
        auto step_count = float{0.0f};
        /* Mix with applying gain steps in aligned multiples of 4. */
        if(const size_t todo{fade_len >> 2})
        {
            const auto four4 = _mm_set1_ps(4.0f);
            const auto step4 = _mm_set1_ps(step);
            const auto gain4 = _mm_set1_ps(gain);
            auto step_count4 = _mm_setr_ps(0.0f, 1.0f, 2.0f, 3.0f);

            const auto in4 = al::span{reinterpret_cast<const __m128*>(InSamples.data()),
                InSamples.size()/4}.first(todo);
            const auto out4 = al::span{reinterpret_cast<__m128*>(dst.data()), dst.size()/4};
            std::transform(in4.begin(), in4.end(), out4.begin(), out4.begin(),
                [gain4,step4,four4,&step_count4](const __m128 val4, __m128 dry4) -> __m128
                {
                    /* dry += val * (gain + step*step_count) */
                    dry4 = vmadd(dry4, val4, vmadd(gain4, step4, step_count4));
                    step_count4 = _mm_add_ps(step_count4, four4);
                    return dry4;
                });
            pos += in4.size()*4;

            /* NOTE: step_count4 now represents the next four counts after the
             * last four mixed samples, so the lowest element represents the
             * next step count to apply.
             */
            step_count = _mm_cvtss_f32(step_count4);
        }
        /* Mix with applying left over gain steps that aren't aligned multiples of 4. */
        if(const size_t leftover{fade_len&3})
        {
            const auto in = InSamples.subspan(pos, leftover);
            const auto out = dst.subspan(pos);

            std::transform(in.begin(), in.end(), out.begin(), out.begin(),
                [gain,step,&step_count](const float val, float dry) noexcept -> float
                {
                    dry += val * (gain + step*step_count);
                    step_count += 1.0f;
                    return dry;
                });
            pos += leftover;
        }
        if(pos < Counter)
        {
            CurrentGain = gain + step*step_count;
            return;
        }

        /* Mix until pos is aligned with 4 or the mix is done. */
        if(const size_t leftover{realign_len&3})
        {
            const auto in = InSamples.subspan(pos, leftover);
            const auto out = dst.subspan(pos);

            std::transform(in.begin(), in.end(), out.begin(), out.begin(),
                [TargetGain](const float val, const float dry) noexcept -> float
                { return dry + val*TargetGain; });
            pos += leftover;
        }
    }
    CurrentGain = TargetGain;

    if(!(std::abs(TargetGain) > GainSilenceThreshold))
        return;
    if(size_t todo{(InSamples.size()-pos) >> 2})
    {
        const auto in4 = al::span{reinterpret_cast<const __m128*>(InSamples.data()),
            InSamples.size()/4}.last(todo);
        const auto out = dst.subspan(pos);
        const auto out4 = al::span{reinterpret_cast<__m128*>(out.data()), out.size()/4};

        const auto gain4 = _mm_set1_ps(TargetGain);
        std::transform(in4.begin(), in4.end(), out4.begin(), out4.begin(),
            [gain4](const __m128 val4, const __m128 dry4) -> __m128
            { return vmadd(dry4, val4, gain4); });
        pos += in4.size()*4;
    }
    if(const size_t leftover{(InSamples.size()-pos)&3})
    {
        const auto in = InSamples.last(leftover);
        const auto out = dst.subspan(pos);

        std::transform(in.begin(), in.end(), out.begin(), out.begin(),
            [TargetGain](const float val, const float dry) noexcept -> float
            { return dry + val*TargetGain; });
    }
}

} // namespace
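
/* 4-tap cubic resampler: the top bits of frac select one of the precomputed
 * cubic filter phases, and the low bits blend between that phase's
 * coefficients and its deltas to the next phase.
 */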
template<>
void Resample_<CubicTag,SSETag>(const InterpState *state, const al::span<const float> src,
    uint frac, const uint increment, const al::span<float> dst)
{
    ASSUME(frac < MixerFracOne);

    const auto filter = std::get<CubicState>(*state).filter;

    size_t pos{MaxResamplerEdge-1};
    std::generate(dst.begin(), dst.end(), [&pos,&frac,src,increment,filter]() -> float
    {
        const uint pi{frac >> CubicPhaseDiffBits}; ASSUME(pi < CubicPhaseCount);
        const float pf{static_cast<float>(frac&CubicPhaseDiffMask) * (1.0f/CubicPhaseDiffOne)};
        const __m128 pf4{_mm_set1_ps(pf)};

        /* Apply the phase interpolated filter. */

        /* f = fil + pf*phd */
        const __m128 f4 = vmadd(_mm_load_ps(filter[pi].mCoeffs.data()), pf4,
            _mm_load_ps(filter[pi].mDeltas.data()));
        /* r = f*src */
        __m128 r4{_mm_mul_ps(f4, _mm_loadu_ps(&src[pos]))};

        r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3)));
        r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
        const float output{_mm_cvtss_f32(r4)};

        frac += increment;
        pos  += frac>>MixerFracBits;
        frac &= MixerFracMask;
        return output;
    });
}
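
/* Band-limited sinc resampler: each output applies an m-tap filter that is
 * interpolated between neighboring phases (via the phd deltas) and, using
 * bsinc.sf, between band-limiting scales (via the scd/spd deltas).
 */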
template<>
void Resample_<BSincTag,SSETag>(const InterpState *state, const al::span<const float> src,
    uint frac, const uint increment, const al::span<float> dst)
{
    const auto &bsinc = std::get<BsincState>(*state);
    const auto sf4 = _mm_set1_ps(bsinc.sf);
    const auto m = size_t{bsinc.m};
    ASSUME(m > 0);
    ASSUME(m <= MaxResamplerPadding);
    ASSUME(frac < MixerFracOne);

    const auto filter = bsinc.filter.first(4_uz*BSincPhaseCount*m);

    ASSUME(bsinc.l <= MaxResamplerEdge);
    auto pos = size_t{MaxResamplerEdge-bsinc.l};
    std::generate(dst.begin(), dst.end(), [&pos,&frac,src,increment,sf4,m,filter]() -> float
    {
        // Calculate the phase index and factor.
        const size_t pi{frac >> BSincPhaseDiffBits}; ASSUME(pi < BSincPhaseCount);
        const float pf{static_cast<float>(frac&BSincPhaseDiffMask) * (1.0f/BSincPhaseDiffOne)};

        // Apply the scale and phase interpolated filter.
        auto r4 = _mm_setzero_ps();
        {
            const auto pf4 = _mm_set1_ps(pf);
            const auto fil = filter.subspan(2_uz*pi*m);
            const auto phd = fil.subspan(m);
            const auto scd = fil.subspan(2_uz*BSincPhaseCount*m);
            const auto spd = scd.subspan(m);
            auto td = size_t{m >> 2};
            auto j = size_t{0};

            do {
                /* f = ((fil + sf*scd) + pf*(phd + sf*spd)) */
                const __m128 f4 = vmadd(
                    vmadd(_mm_load_ps(&fil[j]), sf4, _mm_load_ps(&scd[j])),
                    pf4, vmadd(_mm_load_ps(&phd[j]), sf4, _mm_load_ps(&spd[j])));
                /* r += f*src */
                r4 = vmadd(r4, f4, _mm_loadu_ps(&src[pos+j]));
                j += 4;
            } while(--td);
        }
        r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3)));
        r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
        const auto output = _mm_cvtss_f32(r4);

        frac += increment;
        pos  += frac>>MixerFracBits;
        frac &= MixerFracMask;
        return output;
    });
}
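
/* Same as the BSinc resampler above, but without the band-limiting scale
 * interpolation (no scd/spd terms), for when the filter scale is fixed.
 */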
template<>
void Resample_<FastBSincTag,SSETag>(const InterpState *state, const al::span<const float> src,
    uint frac, const uint increment, const al::span<float> dst)
{
    const auto &bsinc = std::get<BsincState>(*state);
    const auto m = size_t{bsinc.m};
    ASSUME(m > 0);
    ASSUME(m <= MaxResamplerPadding);
    ASSUME(frac < MixerFracOne);

    const auto filter = bsinc.filter.first(2_uz*m*BSincPhaseCount);

    ASSUME(bsinc.l <= MaxResamplerEdge);
    size_t pos{MaxResamplerEdge-bsinc.l};
    std::generate(dst.begin(), dst.end(), [&pos,&frac,src,increment,filter,m]() -> float
    {
        // Calculate the phase index and factor.
        const size_t pi{frac >> BSincPhaseDiffBits}; ASSUME(pi < BSincPhaseCount);
        const float pf{static_cast<float>(frac&BSincPhaseDiffMask) * (1.0f/BSincPhaseDiffOne)};

        // Apply the phase interpolated filter.
        auto r4 = _mm_setzero_ps();
        {
            const auto pf4 = _mm_set1_ps(pf);
            const auto fil = filter.subspan(2_uz*m*pi);
            const auto phd = fil.subspan(m);
            auto td = size_t{m >> 2};
            auto j = size_t{0};

            do {
                /* f = fil + pf*phd */
                const auto f4 = vmadd(_mm_load_ps(&fil[j]), pf4, _mm_load_ps(&phd[j]));
                /* r += f*src */
                r4 = vmadd(r4, f4, _mm_loadu_ps(&src[pos+j]));
                j += 4;
            } while(--td);
        }
        r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3)));
        r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
        const auto output = _mm_cvtss_f32(r4);

        frac += increment;
        pos  += frac>>MixerFracBits;
        frac &= MixerFracMask;
        return output;
    });
}
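
/* HRTF mixers: these just instantiate the shared base implementations with
 * the SSE ApplyCoeffs defined above.
 */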
template<>
void MixHrtf_<SSETag>(const al::span<const float> InSamples, const al::span<float2> AccumSamples,
    const uint IrSize, const MixHrtfFilter *hrtfparams, const size_t SamplesToDo)
{ MixHrtfBase<ApplyCoeffs>(InSamples, AccumSamples, IrSize, hrtfparams, SamplesToDo); }

template<>
void MixHrtfBlend_<SSETag>(const al::span<const float> InSamples,
    const al::span<float2> AccumSamples, const uint IrSize, const HrtfFilter *oldparams,
    const MixHrtfFilter *newparams, const size_t SamplesToDo)
{
    MixHrtfBlendBase<ApplyCoeffs>(InSamples, AccumSamples, IrSize, oldparams, newparams,
        SamplesToDo);
}

template<>
void MixDirectHrtf_<SSETag>(const FloatBufferSpan LeftOut, const FloatBufferSpan RightOut,
    const al::span<const FloatBufferLine> InSamples, const al::span<float2> AccumSamples,
    const al::span<float,BufferLineSize> TempBuf, const al::span<HrtfChannelState> ChanState,
    const size_t IrSize, const size_t SamplesToDo)
{
    MixDirectHrtfBase<ApplyCoeffs>(LeftOut, RightOut, InSamples, AccumSamples, TempBuf, ChanState,
        IrSize, SamplesToDo);
}
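
/* Basic gain mixers. If the output can't be accessed with 16-byte-aligned
 * SSE loads/stores from the given start position, fall back to the C mixer.
 */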
template<>
void Mix_<SSETag>(const al::span<const float> InSamples, const al::span<FloatBufferLine> OutBuffer,
    const al::span<float> CurrentGains, const al::span<const float> TargetGains,
    const size_t Counter, const size_t OutPos)
{
    if((OutPos&3) != 0) UNLIKELY
        return Mix_<CTag>(InSamples, OutBuffer, CurrentGains, TargetGains, Counter, OutPos);

    const float delta{(Counter > 0) ? 1.0f / static_cast<float>(Counter) : 0.0f};
    const auto fade_len = std::min(Counter, InSamples.size());
    const auto realign_len = std::min((fade_len+3_uz) & ~3_uz, InSamples.size()) - fade_len;

    auto curgains = CurrentGains.begin();
    auto targetgains = TargetGains.cbegin();
    for(FloatBufferLine &output : OutBuffer)
        MixLine(InSamples, al::span{output}.subspan(OutPos), *curgains++, *targetgains++, delta,
            fade_len, realign_len, Counter);
}

template<>
void Mix_<SSETag>(const al::span<const float> InSamples, const al::span<float> OutBuffer,
    float &CurrentGain, const float TargetGain, const size_t Counter)
{
    if((reinterpret_cast<uintptr_t>(OutBuffer.data())&15) != 0) UNLIKELY
        return Mix_<CTag>(InSamples, OutBuffer, CurrentGain, TargetGain, Counter);

    const float delta{(Counter > 0) ? 1.0f / static_cast<float>(Counter) : 0.0f};
    const auto fade_len = std::min(Counter, InSamples.size());
    const auto realign_len = std::min((fade_len+3_uz) & ~3_uz, InSamples.size()) - fade_len;

    MixLine(InSamples, OutBuffer, CurrentGain, TargetGain, delta, fade_len, realign_len, Counter);
}