/* openal-soft: core/mixer/mixer_sse.cpp
 * From commit "Apply the source's AL_AIR_ABSORPTION_FACTOR to send paths"
 * (blob 622ccac8d3144eeedd33be6d3db1d270764225e8)
 */
#include "config.h"

#include <mmintrin.h>
#include <xmmintrin.h>

#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdint>
#include <limits>
#include <variant>

#include "alnumeric.h"
#include "alspan.h"
#include "core/bsinc_defs.h"
#include "core/bufferline.h"
#include "core/cubic_defs.h"
#include "core/mixer/hrtfdefs.h"
#include "core/resampler_limits.h"
#include "defs.h"
#include "hrtfbase.h"
#include "opthelpers.h"
24 struct CTag;
25 struct SSETag;
26 struct CubicTag;
27 struct BSincTag;
28 struct FastBSincTag;
31 #if defined(__GNUC__) && !defined(__clang__) && !defined(__SSE__)
32 #pragma GCC target("sse")
33 #endif
35 namespace {
37 constexpr uint BSincPhaseDiffBits{MixerFracBits - BSincPhaseBits};
38 constexpr uint BSincPhaseDiffOne{1 << BSincPhaseDiffBits};
39 constexpr uint BSincPhaseDiffMask{BSincPhaseDiffOne - 1u};
41 constexpr uint CubicPhaseDiffBits{MixerFracBits - CubicPhaseBits};
42 constexpr uint CubicPhaseDiffOne{1 << CubicPhaseDiffBits};
43 constexpr uint CubicPhaseDiffMask{CubicPhaseDiffOne - 1u};
45 force_inline __m128 vmadd(const __m128 x, const __m128 y, const __m128 z) noexcept
46 { return _mm_add_ps(x, _mm_mul_ps(y, z)); }
48 inline void ApplyCoeffs(const al::span<float2> Values, const size_t IrSize,
49 const ConstHrirSpan Coeffs, const float left, const float right)
51 ASSUME(IrSize >= MinIrLength);
52 ASSUME(IrSize <= HrirLength);
53 const auto lrlr = _mm_setr_ps(left, right, left, right);
54 /* Round up the IR size to a multiple of 2 for SIMD (2 IRs for 2 channels
55 * is 4 floats), to avoid cutting the last sample for odd IR counts. The
56 * underlying HRIR is a fixed-size multiple of 2, any extra samples are
57 * either 0 (silence) or more IR samples that get applied for "free".
59 const auto count4 = size_t{(IrSize+1) >> 1};
61 /* This isn't technically correct to test alignment, but it's true for
62 * systems that support SSE, which is the only one that needs to know the
63 * alignment of Values (which alternates between 8- and 16-byte aligned).
65 if(!(reinterpret_cast<uintptr_t>(Values.data())&15))
67 const auto vals4 = al::span{reinterpret_cast<__m128*>(Values[0].data()), count4};
68 const auto coeffs4 = al::span{reinterpret_cast<const __m128*>(Coeffs[0].data()), count4};
70 std::transform(vals4.cbegin(), vals4.cend(), coeffs4.cbegin(), vals4.begin(),
71 [lrlr](const __m128 &val, const __m128 &coeff) -> __m128
72 { return vmadd(val, coeff, lrlr); });
74 else
76 auto coeffs = _mm_load_ps(Coeffs[0].data());
77 auto vals = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64*>(Values[0].data()));
78 auto imp0 = _mm_mul_ps(lrlr, coeffs);
79 vals = _mm_add_ps(imp0, vals);
80 _mm_storel_pi(reinterpret_cast<__m64*>(Values[0].data()), vals);
81 size_t td{count4 - 1};
82 size_t i{1};
83 do {
84 coeffs = _mm_load_ps(Coeffs[i+1].data());
85 vals = _mm_load_ps(Values[i].data());
86 const auto imp1 = _mm_mul_ps(lrlr, coeffs);
87 imp0 = _mm_shuffle_ps(imp0, imp1, _MM_SHUFFLE(1, 0, 3, 2));
88 vals = _mm_add_ps(imp0, vals);
89 _mm_store_ps(Values[i].data(), vals);
90 imp0 = imp1;
91 i += 2;
92 } while(--td);
93 vals = _mm_loadl_pi(vals, reinterpret_cast<__m64*>(Values[i].data()));
94 imp0 = _mm_movehl_ps(imp0, imp0);
95 vals = _mm_add_ps(imp0, vals);
96 _mm_storel_pi(reinterpret_cast<__m64*>(Values[i].data()), vals);
100 force_inline void MixLine(const al::span<const float> InSamples, const al::span<float> dst,
101 float &CurrentGain, const float TargetGain, const float delta, const size_t fade_len,
102 const size_t realign_len, size_t Counter)
104 const auto step = float{(TargetGain-CurrentGain) * delta};
106 size_t pos{0};
107 if(std::abs(step) > std::numeric_limits<float>::epsilon())
109 const auto gain = float{CurrentGain};
110 auto step_count = float{0.0f};
111 /* Mix with applying gain steps in aligned multiples of 4. */
112 if(const size_t todo{fade_len >> 2})
114 const auto four4 = _mm_set1_ps(4.0f);
115 const auto step4 = _mm_set1_ps(step);
116 const auto gain4 = _mm_set1_ps(gain);
117 auto step_count4 = _mm_setr_ps(0.0f, 1.0f, 2.0f, 3.0f);
119 const auto in4 = al::span{reinterpret_cast<const __m128*>(InSamples.data()),
120 InSamples.size()/4}.first(todo);
121 const auto out4 = al::span{reinterpret_cast<__m128*>(dst.data()), dst.size()/4};
122 std::transform(in4.begin(), in4.end(), out4.begin(), out4.begin(),
123 [gain4,step4,four4,&step_count4](const __m128 val4, __m128 dry4) -> __m128
125 /* dry += val * (gain + step*step_count) */
126 dry4 = vmadd(dry4, val4, vmadd(gain4, step4, step_count4));
127 step_count4 = _mm_add_ps(step_count4, four4);
128 return dry4;
130 pos += in4.size()*4;
132 /* NOTE: step_count4 now represents the next four counts after the
133 * last four mixed samples, so the lowest element represents the
134 * next step count to apply.
136 step_count = _mm_cvtss_f32(step_count4);
138 /* Mix with applying left over gain steps that aren't aligned multiples of 4. */
139 if(const size_t leftover{fade_len&3})
141 const auto in = InSamples.subspan(pos, leftover);
142 const auto out = dst.subspan(pos);
144 std::transform(in.begin(), in.end(), out.begin(), out.begin(),
145 [gain,step,&step_count](const float val, float dry) noexcept -> float
147 dry += val * (gain + step*step_count);
148 step_count += 1.0f;
149 return dry;
151 pos += leftover;
153 if(pos < Counter)
155 CurrentGain = gain + step*step_count;
156 return;
159 /* Mix until pos is aligned with 4 or the mix is done. */
160 if(const size_t leftover{realign_len&3})
162 const auto in = InSamples.subspan(pos, leftover);
163 const auto out = dst.subspan(pos);
165 std::transform(in.begin(), in.end(), out.begin(), out.begin(),
166 [TargetGain](const float val, const float dry) noexcept -> float
167 { return dry + val*TargetGain; });
168 pos += leftover;
171 CurrentGain = TargetGain;
173 if(!(std::abs(TargetGain) > GainSilenceThreshold))
174 return;
175 if(size_t todo{(InSamples.size()-pos) >> 2})
177 const auto in4 = al::span{reinterpret_cast<const __m128*>(InSamples.data()),
178 InSamples.size()/4}.last(todo);
179 const auto out = dst.subspan(pos);
180 const auto out4 = al::span{reinterpret_cast<__m128*>(out.data()), out.size()/4};
182 const auto gain4 = _mm_set1_ps(TargetGain);
183 std::transform(in4.begin(), in4.end(), out4.begin(), out4.begin(),
184 [gain4](const __m128 val4, const __m128 dry4) -> __m128
185 { return vmadd(dry4, val4, gain4); });
186 pos += in4.size()*4;
188 if(const size_t leftover{(InSamples.size()-pos)&3})
190 const auto in = InSamples.last(leftover);
191 const auto out = dst.subspan(pos);
193 std::transform(in.begin(), in.end(), out.begin(), out.begin(),
194 [TargetGain](const float val, const float dry) noexcept -> float
195 { return dry + val*TargetGain; });
199 } // namespace
201 template<>
202 void Resample_<CubicTag,SSETag>(const InterpState *state, const al::span<const float> src,
203 uint frac, const uint increment, const al::span<float> dst)
205 ASSUME(frac < MixerFracOne);
207 const auto filter = std::get<CubicState>(*state).filter;
209 size_t pos{MaxResamplerEdge-1};
210 std::generate(dst.begin(), dst.end(), [&pos,&frac,src,increment,filter]() -> float
212 const uint pi{frac >> CubicPhaseDiffBits}; ASSUME(pi < CubicPhaseCount);
213 const float pf{static_cast<float>(frac&CubicPhaseDiffMask) * (1.0f/CubicPhaseDiffOne)};
214 const __m128 pf4{_mm_set1_ps(pf)};
216 /* Apply the phase interpolated filter. */
218 /* f = fil + pf*phd */
219 const __m128 f4 = vmadd(_mm_load_ps(filter[pi].mCoeffs.data()), pf4,
220 _mm_load_ps(filter[pi].mDeltas.data()));
221 /* r = f*src */
222 __m128 r4{_mm_mul_ps(f4, _mm_loadu_ps(&src[pos]))};
224 r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3)));
225 r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
226 const float output{_mm_cvtss_f32(r4)};
228 frac += increment;
229 pos += frac>>MixerFracBits;
230 frac &= MixerFracMask;
231 return output;
235 template<>
236 void Resample_<BSincTag,SSETag>(const InterpState *state, const al::span<const float> src,
237 uint frac, const uint increment, const al::span<float> dst)
239 const auto &bsinc = std::get<BsincState>(*state);
240 const auto sf4 = _mm_set1_ps(bsinc.sf);
241 const auto m = size_t{bsinc.m};
242 ASSUME(m > 0);
243 ASSUME(m <= MaxResamplerPadding);
244 ASSUME(frac < MixerFracOne);
246 const auto filter = bsinc.filter.first(4_uz*BSincPhaseCount*m);
248 ASSUME(bsinc.l <= MaxResamplerEdge);
249 auto pos = size_t{MaxResamplerEdge-bsinc.l};
250 std::generate(dst.begin(), dst.end(), [&pos,&frac,src,increment,sf4,m,filter]() -> float
252 // Calculate the phase index and factor.
253 const size_t pi{frac >> BSincPhaseDiffBits}; ASSUME(pi < BSincPhaseCount);
254 const float pf{static_cast<float>(frac&BSincPhaseDiffMask) * (1.0f/BSincPhaseDiffOne)};
256 // Apply the scale and phase interpolated filter.
257 auto r4 = _mm_setzero_ps();
259 const auto pf4 = _mm_set1_ps(pf);
260 const auto fil = filter.subspan(2_uz*pi*m);
261 const auto phd = fil.subspan(m);
262 const auto scd = fil.subspan(2_uz*BSincPhaseCount*m);
263 const auto spd = scd.subspan(m);
264 auto td = size_t{m >> 2};
265 auto j = size_t{0};
267 do {
268 /* f = ((fil + sf*scd) + pf*(phd + sf*spd)) */
269 const __m128 f4 = vmadd(
270 vmadd(_mm_load_ps(&fil[j]), sf4, _mm_load_ps(&scd[j])),
271 pf4, vmadd(_mm_load_ps(&phd[j]), sf4, _mm_load_ps(&spd[j])));
272 /* r += f*src */
273 r4 = vmadd(r4, f4, _mm_loadu_ps(&src[pos+j]));
274 j += 4;
275 } while(--td);
277 r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3)));
278 r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
279 const auto output = _mm_cvtss_f32(r4);
281 frac += increment;
282 pos += frac>>MixerFracBits;
283 frac &= MixerFracMask;
284 return output;
288 template<>
289 void Resample_<FastBSincTag,SSETag>(const InterpState *state, const al::span<const float> src,
290 uint frac, const uint increment, const al::span<float> dst)
292 const auto &bsinc = std::get<BsincState>(*state);
293 const auto m = size_t{bsinc.m};
294 ASSUME(m > 0);
295 ASSUME(m <= MaxResamplerPadding);
296 ASSUME(frac < MixerFracOne);
298 const auto filter = bsinc.filter.first(2_uz*m*BSincPhaseCount);
300 ASSUME(bsinc.l <= MaxResamplerEdge);
301 size_t pos{MaxResamplerEdge-bsinc.l};
302 std::generate(dst.begin(), dst.end(), [&pos,&frac,src,increment,filter,m]() -> float
304 // Calculate the phase index and factor.
305 const size_t pi{frac >> BSincPhaseDiffBits}; ASSUME(pi < BSincPhaseCount);
306 const float pf{static_cast<float>(frac&BSincPhaseDiffMask) * (1.0f/BSincPhaseDiffOne)};
308 // Apply the phase interpolated filter.
309 auto r4 = _mm_setzero_ps();
311 const auto pf4 = _mm_set1_ps(pf);
312 const auto fil = filter.subspan(2_uz*m*pi);
313 const auto phd = fil.subspan(m);
314 auto td = size_t{m >> 2};
315 auto j = size_t{0};
317 do {
318 /* f = fil + pf*phd */
319 const auto f4 = vmadd(_mm_load_ps(&fil[j]), pf4, _mm_load_ps(&phd[j]));
320 /* r += f*src */
321 r4 = vmadd(r4, f4, _mm_loadu_ps(&src[pos+j]));
322 j += 4;
323 } while(--td);
325 r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3)));
326 r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
327 const auto output = _mm_cvtss_f32(r4);
329 frac += increment;
330 pos += frac>>MixerFracBits;
331 frac &= MixerFracMask;
332 return output;
337 template<>
338 void MixHrtf_<SSETag>(const al::span<const float> InSamples, const al::span<float2> AccumSamples,
339 const uint IrSize, const MixHrtfFilter *hrtfparams, const size_t SamplesToDo)
340 { MixHrtfBase<ApplyCoeffs>(InSamples, AccumSamples, IrSize, hrtfparams, SamplesToDo); }
342 template<>
343 void MixHrtfBlend_<SSETag>(const al::span<const float> InSamples,
344 const al::span<float2> AccumSamples, const uint IrSize, const HrtfFilter *oldparams,
345 const MixHrtfFilter *newparams, const size_t SamplesToDo)
347 MixHrtfBlendBase<ApplyCoeffs>(InSamples, AccumSamples, IrSize, oldparams, newparams,
348 SamplesToDo);
351 template<>
352 void MixDirectHrtf_<SSETag>(const FloatBufferSpan LeftOut, const FloatBufferSpan RightOut,
353 const al::span<const FloatBufferLine> InSamples, const al::span<float2> AccumSamples,
354 const al::span<float,BufferLineSize> TempBuf, const al::span<HrtfChannelState> ChanState,
355 const size_t IrSize, const size_t SamplesToDo)
357 MixDirectHrtfBase<ApplyCoeffs>(LeftOut, RightOut, InSamples, AccumSamples, TempBuf, ChanState,
358 IrSize, SamplesToDo);
362 template<>
363 void Mix_<SSETag>(const al::span<const float> InSamples, const al::span<FloatBufferLine> OutBuffer,
364 const al::span<float> CurrentGains, const al::span<const float> TargetGains,
365 const size_t Counter, const size_t OutPos)
367 if((OutPos&3) != 0) UNLIKELY
368 return Mix_<CTag>(InSamples, OutBuffer, CurrentGains, TargetGains, Counter, OutPos);
370 const float delta{(Counter > 0) ? 1.0f / static_cast<float>(Counter) : 0.0f};
371 const auto fade_len = std::min(Counter, InSamples.size());
372 const auto realign_len = std::min((fade_len+3_uz) & ~3_uz, InSamples.size()) - fade_len;
374 auto curgains = CurrentGains.begin();
375 auto targetgains = TargetGains.cbegin();
376 for(FloatBufferLine &output : OutBuffer)
377 MixLine(InSamples, al::span{output}.subspan(OutPos), *curgains++, *targetgains++, delta,
378 fade_len, realign_len, Counter);
381 template<>
382 void Mix_<SSETag>(const al::span<const float> InSamples, const al::span<float> OutBuffer,
383 float &CurrentGain, const float TargetGain, const size_t Counter)
385 if((reinterpret_cast<uintptr_t>(OutBuffer.data())&15) != 0) UNLIKELY
386 return Mix_<CTag>(InSamples, OutBuffer, CurrentGain, TargetGain, Counter);
388 const float delta{(Counter > 0) ? 1.0f / static_cast<float>(Counter) : 0.0f};
389 const auto fade_len = std::min(Counter, InSamples.size());
390 const auto realign_len = std::min((fade_len+3_uz) & ~3_uz, InSamples.size()) - fade_len;
392 MixLine(InSamples, OutBuffer, CurrentGain, TargetGain, delta, fade_len, realign_len, Counter);