9 #include "core/bsinc_defs.h"
18 #if defined(__GNUC__) && !defined(__clang__) && !defined(__SSE__)
19 #pragma GCC target("sse")
/* Difference between the mixer's fractional-sample bit width (MixerFracBits)
 * and the bsinc filter's phase-table bit width (BSincPhaseBits). The resample
 * loops shift `frac` down by this amount to get a phase-table index.
 */
constexpr uint FracPhaseBitDiff{MixerFracBits - BSincPhaseBits};
/* One whole step at the truncated precision; `frac & (FracPhaseDiffOne-1)`
 * is the sub-phase remainder, normalized to [0,1) by 1/FracPhaseDiffOne.
 */
constexpr uint FracPhaseDiffOne{1 << FracPhaseBitDiff};

/* Element-wise x + (y * z), emulating a fused multiply-add with two SSE ops. */
#define MLA4(x, y, z) _mm_add_ps(x, _mm_mul_ps(y, z))
29 inline void ApplyCoeffs(float2
*RESTRICT Values
, const size_t IrSize
, const ConstHrirSpan Coeffs
,
30 const float left
, const float right
)
32 const __m128 lrlr
{_mm_setr_ps(left
, right
, left
, right
)};
34 ASSUME(IrSize
>= MinIrLength
);
35 /* This isn't technically correct to test alignment, but it's true for
36 * systems that support SSE, which is the only one that needs to know the
37 * alignment of Values (which alternates between 8- and 16-byte aligned).
39 if(!(reinterpret_cast<uintptr_t>(Values
)&15))
41 for(size_t i
{0};i
< IrSize
;i
+= 2)
43 const __m128 coeffs
{_mm_load_ps(Coeffs
[i
].data())};
44 __m128 vals
{_mm_load_ps(Values
[i
].data())};
45 vals
= MLA4(vals
, lrlr
, coeffs
);
46 _mm_store_ps(Values
[i
].data(), vals
);
52 __m128 coeffs
{_mm_load_ps(Coeffs
[0].data())};
53 __m128 vals
{_mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64
*>(Values
[0].data()))};
54 imp0
= _mm_mul_ps(lrlr
, coeffs
);
55 vals
= _mm_add_ps(imp0
, vals
);
56 _mm_storel_pi(reinterpret_cast<__m64
*>(Values
[0].data()), vals
);
57 size_t td
{((IrSize
+1)>>1) - 1};
60 coeffs
= _mm_load_ps(Coeffs
[i
+1].data());
61 vals
= _mm_load_ps(Values
[i
].data());
62 imp1
= _mm_mul_ps(lrlr
, coeffs
);
63 imp0
= _mm_shuffle_ps(imp0
, imp1
, _MM_SHUFFLE(1, 0, 3, 2));
64 vals
= _mm_add_ps(imp0
, vals
);
65 _mm_store_ps(Values
[i
].data(), vals
);
69 vals
= _mm_loadl_pi(vals
, reinterpret_cast<__m64
*>(Values
[i
].data()));
70 imp0
= _mm_movehl_ps(imp0
, imp0
);
71 vals
= _mm_add_ps(imp0
, vals
);
72 _mm_storel_pi(reinterpret_cast<__m64
*>(Values
[i
].data()), vals
);
76 force_inline
void MixLine(const al::span
<const float> InSamples
, float *RESTRICT dst
,
77 float &CurrentGain
, const float TargetGain
, const float delta
, const size_t min_len
,
78 const size_t aligned_len
, size_t Counter
)
80 float gain
{CurrentGain
};
81 const float step
{(TargetGain
-gain
) * delta
};
84 if(!(std::abs(step
) > std::numeric_limits
<float>::epsilon()))
88 float step_count
{0.0f
};
89 /* Mix with applying gain steps in aligned multiples of 4. */
90 if(size_t todo
{min_len
>> 2})
92 const __m128 four4
{_mm_set1_ps(4.0f
)};
93 const __m128 step4
{_mm_set1_ps(step
)};
94 const __m128 gain4
{_mm_set1_ps(gain
)};
95 __m128 step_count4
{_mm_setr_ps(0.0f
, 1.0f
, 2.0f
, 3.0f
)};
97 const __m128 val4
{_mm_load_ps(&InSamples
[pos
])};
98 __m128 dry4
{_mm_load_ps(&dst
[pos
])};
100 /* dry += val * (gain + step*step_count) */
101 dry4
= MLA4(dry4
, val4
, MLA4(gain4
, step4
, step_count4
));
103 _mm_store_ps(&dst
[pos
], dry4
);
104 step_count4
= _mm_add_ps(step_count4
, four4
);
107 /* NOTE: step_count4 now represents the next four counts after the
108 * last four mixed samples, so the lowest element represents the
109 * next step count to apply.
111 step_count
= _mm_cvtss_f32(step_count4
);
113 /* Mix with applying left over gain steps that aren't aligned multiples of 4. */
114 for(size_t leftover
{min_len
&3};leftover
;++pos
,--leftover
)
116 dst
[pos
] += InSamples
[pos
] * (gain
+ step
*step_count
);
122 gain
+= step
*step_count
;
124 /* Mix until pos is aligned with 4 or the mix is done. */
125 for(size_t leftover
{aligned_len
&3};leftover
;++pos
,--leftover
)
126 dst
[pos
] += InSamples
[pos
] * gain
;
130 if(!(std::abs(gain
) > GainSilenceThreshold
))
132 if(size_t todo
{(InSamples
.size()-pos
) >> 2})
134 const __m128 gain4
{_mm_set1_ps(gain
)};
136 const __m128 val4
{_mm_load_ps(&InSamples
[pos
])};
137 __m128 dry4
{_mm_load_ps(&dst
[pos
])};
138 dry4
= _mm_add_ps(dry4
, _mm_mul_ps(val4
, gain4
));
139 _mm_store_ps(&dst
[pos
], dry4
);
143 for(size_t leftover
{(InSamples
.size()-pos
)&3};leftover
;++pos
,--leftover
)
144 dst
[pos
] += InSamples
[pos
] * gain
;
150 float *Resample_
<BSincTag
,SSETag
>(const InterpState
*state
, float *RESTRICT src
, uint frac
,
151 uint increment
, const al::span
<float> dst
)
153 const float *const filter
{state
->bsinc
.filter
};
154 const __m128 sf4
{_mm_set1_ps(state
->bsinc
.sf
)};
155 const size_t m
{state
->bsinc
.m
};
158 src
-= state
->bsinc
.l
;
159 for(float &out_sample
: dst
)
161 // Calculate the phase index and factor.
162 const uint pi
{frac
>> FracPhaseBitDiff
};
163 const float pf
{static_cast<float>(frac
& (FracPhaseDiffOne
-1)) * (1.0f
/FracPhaseDiffOne
)};
165 // Apply the scale and phase interpolated filter.
166 __m128 r4
{_mm_setzero_ps()};
168 const __m128 pf4
{_mm_set1_ps(pf
)};
169 const float *RESTRICT fil
{filter
+ m
*pi
*2};
170 const float *RESTRICT phd
{fil
+ m
};
171 const float *RESTRICT scd
{fil
+ BSincPhaseCount
*2*m
};
172 const float *RESTRICT spd
{scd
+ m
};
177 /* f = ((fil + sf*scd) + pf*(phd + sf*spd)) */
178 const __m128 f4
= MLA4(
179 MLA4(_mm_load_ps(&fil
[j
]), sf4
, _mm_load_ps(&scd
[j
])),
180 pf4
, MLA4(_mm_load_ps(&phd
[j
]), sf4
, _mm_load_ps(&spd
[j
])));
182 r4
= MLA4(r4
, f4
, _mm_loadu_ps(&src
[j
]));
186 r4
= _mm_add_ps(r4
, _mm_shuffle_ps(r4
, r4
, _MM_SHUFFLE(0, 1, 2, 3)));
187 r4
= _mm_add_ps(r4
, _mm_movehl_ps(r4
, r4
));
188 out_sample
= _mm_cvtss_f32(r4
);
191 src
+= frac
>>MixerFracBits
;
192 frac
&= MixerFracMask
;
198 float *Resample_
<FastBSincTag
,SSETag
>(const InterpState
*state
, float *RESTRICT src
, uint frac
,
199 uint increment
, const al::span
<float> dst
)
201 const float *const filter
{state
->bsinc
.filter
};
202 const size_t m
{state
->bsinc
.m
};
205 src
-= state
->bsinc
.l
;
206 for(float &out_sample
: dst
)
208 // Calculate the phase index and factor.
209 const uint pi
{frac
>> FracPhaseBitDiff
};
210 const float pf
{static_cast<float>(frac
& (FracPhaseDiffOne
-1)) * (1.0f
/FracPhaseDiffOne
)};
212 // Apply the phase interpolated filter.
213 __m128 r4
{_mm_setzero_ps()};
215 const __m128 pf4
{_mm_set1_ps(pf
)};
216 const float *RESTRICT fil
{filter
+ m
*pi
*2};
217 const float *RESTRICT phd
{fil
+ m
};
222 /* f = fil + pf*phd */
223 const __m128 f4
= MLA4(_mm_load_ps(&fil
[j
]), pf4
, _mm_load_ps(&phd
[j
]));
225 r4
= MLA4(r4
, f4
, _mm_loadu_ps(&src
[j
]));
229 r4
= _mm_add_ps(r4
, _mm_shuffle_ps(r4
, r4
, _MM_SHUFFLE(0, 1, 2, 3)));
230 r4
= _mm_add_ps(r4
, _mm_movehl_ps(r4
, r4
));
231 out_sample
= _mm_cvtss_f32(r4
);
234 src
+= frac
>>MixerFracBits
;
235 frac
&= MixerFracMask
;
242 void MixHrtf_
<SSETag
>(const float *InSamples
, float2
*AccumSamples
, const uint IrSize
,
243 const MixHrtfFilter
*hrtfparams
, const size_t BufferSize
)
244 { MixHrtfBase
<ApplyCoeffs
>(InSamples
, AccumSamples
, IrSize
, hrtfparams
, BufferSize
); }
247 void MixHrtfBlend_
<SSETag
>(const float *InSamples
, float2
*AccumSamples
, const uint IrSize
,
248 const HrtfFilter
*oldparams
, const MixHrtfFilter
*newparams
, const size_t BufferSize
)
250 MixHrtfBlendBase
<ApplyCoeffs
>(InSamples
, AccumSamples
, IrSize
, oldparams
, newparams
,
255 void MixDirectHrtf_
<SSETag
>(const FloatBufferSpan LeftOut
, const FloatBufferSpan RightOut
,
256 const al::span
<const FloatBufferLine
> InSamples
, float2
*AccumSamples
,
257 float *TempBuf
, HrtfChannelState
*ChanState
, const size_t IrSize
, const size_t BufferSize
)
259 MixDirectHrtfBase
<ApplyCoeffs
>(LeftOut
, RightOut
, InSamples
, AccumSamples
, TempBuf
, ChanState
,
265 void Mix_
<SSETag
>(const al::span
<const float> InSamples
, const al::span
<FloatBufferLine
> OutBuffer
,
266 float *CurrentGains
, const float *TargetGains
, const size_t Counter
, const size_t OutPos
)
268 const float delta
{(Counter
> 0) ? 1.0f
/ static_cast<float>(Counter
) : 0.0f
};
269 const auto min_len
= minz(Counter
, InSamples
.size());
270 const auto aligned_len
= minz((min_len
+3) & ~size_t{3}, InSamples
.size()) - min_len
;
272 for(FloatBufferLine
&output
: OutBuffer
)
273 MixLine(InSamples
, al::assume_aligned
<16>(output
.data()+OutPos
), *CurrentGains
++,
274 *TargetGains
++, delta
, min_len
, aligned_len
, Counter
);
278 void Mix_
<SSETag
>(const al::span
<const float> InSamples
, float *OutBuffer
, float &CurrentGain
,
279 const float TargetGain
, const size_t Counter
)
281 const float delta
{(Counter
> 0) ? 1.0f
/ static_cast<float>(Counter
) : 0.0f
};
282 const auto min_len
= minz(Counter
, InSamples
.size());
283 const auto aligned_len
= minz((min_len
+3) & ~size_t{3}, InSamples
.size()) - min_len
;
285 MixLine(InSamples
, al::assume_aligned
<16>(OutBuffer
), CurrentGain
, TargetGain
, delta
, min_len
,
286 aligned_len
, Counter
);