#include <xmmintrin.h>

#include <cmath>
#include <limits>

#include "alnumeric.h"
#include "core/bsinc_defs.h"
#include "defs.h"
#include "hrtfbase.h"

struct SSETag;
struct BSincTag;
struct FastBSincTag;

#if defined(__GNUC__) && !defined(__clang__) && !defined(__SSE__)
#pragma GCC target("sse")
#endif

namespace {
constexpr uint FracPhaseBitDiff{MixerFracBits - BSincPhaseBits};
constexpr uint FracPhaseDiffOne{1 << FracPhaseBitDiff};
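
/* The fixed-point fraction splits into a filter phase index (the high bits)
 * and an interpolation factor between adjacent phases (the low bits). For
 * illustration only: if MixerFracBits were 16 and BSincPhaseBits 5, then
 * pi = frac>>11 would select one of 32 phases and pf would map the low 11
 * bits onto [0, 1). The real values come from the mixer and bsinc headers. */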

#define MLA4(x, y, z) _mm_add_ps(x, _mm_mul_ps(y, z))
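/* SSE(1) has no fused multiply-add instruction, so MLA4 expands to a
 * separate multiply and add: x + y*z across four float lanes. */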

inline void ApplyCoeffs(float2 *RESTRICT Values, const size_t IrSize, const ConstHrirSpan Coeffs,
    const float left, const float right)
{
    const __m128 lrlr{_mm_setr_ps(left, right, left, right)};

    ASSUME(IrSize >= MinIrLength);
    /* This isn't technically correct to test alignment, but it's true for
     * systems that support SSE, which is the only one that needs to know the
     * alignment of Values (which alternates between 8- and 16-byte aligned).
     */
    if(reinterpret_cast<intptr_t>(Values)&0x8)
    {
        __m128 imp0, imp1;
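        /* Values is only 8-byte aligned here, so handle the first float2 by
         * itself, then process taps in pairs with 16-byte aligned loads and
         * stores, carrying the unused half of each coefficient vector into
         * the next iteration via the shuffle below. */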
        __m128 coeffs{_mm_load_ps(&Coeffs[0][0])};
        __m128 vals{_mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64*>(&Values[0][0]))};
        imp0 = _mm_mul_ps(lrlr, coeffs);
        vals = _mm_add_ps(imp0, vals);
        _mm_storel_pi(reinterpret_cast<__m64*>(&Values[0][0]), vals);
        size_t td{((IrSize+1)>>1) - 1};
        size_t i{1};
        do {
            coeffs = _mm_load_ps(&Coeffs[i+1][0]);
            vals = _mm_load_ps(&Values[i][0]);
            imp1 = _mm_mul_ps(lrlr, coeffs);
            imp0 = _mm_shuffle_ps(imp0, imp1, _MM_SHUFFLE(1, 0, 3, 2));
            vals = _mm_add_ps(imp0, vals);
            _mm_store_ps(&Values[i][0], vals);
            imp0 = imp1;
            i += 2;
        } while(--td);
        vals = _mm_loadl_pi(vals, reinterpret_cast<__m64*>(&Values[i][0]));
        imp0 = _mm_movehl_ps(imp0, imp0);
        vals = _mm_add_ps(imp0, vals);
        _mm_storel_pi(reinterpret_cast<__m64*>(&Values[i][0]), vals);
    }
    else
    {
        for(size_t i{0};i < IrSize;i += 2)
        {
            const __m128 coeffs{_mm_load_ps(&Coeffs[i][0])};
            __m128 vals{_mm_load_ps(&Values[i][0])};
            vals = MLA4(vals, lrlr, coeffs);
            _mm_store_ps(&Values[i][0], vals);
        }
    }
}
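
/* For reference, the scalar equivalent of the accumulation ApplyCoeffs
 * performs would be:
 *
 *     for(size_t i{0};i < IrSize;++i)
 *     {
 *         Values[i][0] += Coeffs[i][0] * left;
 *         Values[i][1] += Coeffs[i][1] * right;
 *     }
 *
 * The SSE version above computes the same sums two float2 pairs at a time. */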

} // namespace

template<>
float *Resample_<BSincTag,SSETag>(const InterpState *state, float *RESTRICT src, uint frac,
    uint increment, const al::span<float> dst)
{
    const float *const filter{state->bsinc.filter};
    const __m128 sf4{_mm_set1_ps(state->bsinc.sf)};
    const size_t m{state->bsinc.m};
    ASSUME(m > 0);

    src -= state->bsinc.l;
    for(float &out_sample : dst)
    {
        // Calculate the phase index and factor.
        const uint pi{frac >> FracPhaseBitDiff};
        const float pf{static_cast<float>(frac & (FracPhaseDiffOne-1)) * (1.0f/FracPhaseDiffOne)};

        // Apply the scale and phase interpolated filter.
        __m128 r4{_mm_setzero_ps()};
        {
            const __m128 pf4{_mm_set1_ps(pf)};
            const float *RESTRICT fil{filter + m*pi*2};
            const float *RESTRICT phd{fil + m};
            const float *RESTRICT scd{fil + BSincPhaseCount*2*m};
            const float *RESTRICT spd{scd + m};
            size_t td{m >> 2};
            size_t j{0u};
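            /* Filter layout (as used here): each phase block holds m
             * coefficients followed by m phase-delta values, and a matching
             * block of scale-delta/scale-phase-delta pairs sits
             * BSincPhaseCount*2*m floats later. */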

            do {
                /* f = ((fil + sf*scd) + pf*(phd + sf*spd)) */
                const __m128 f4 = MLA4(
                    MLA4(_mm_load_ps(&fil[j]), sf4, _mm_load_ps(&scd[j])),
                    pf4, MLA4(_mm_load_ps(&phd[j]), sf4, _mm_load_ps(&spd[j])));
                /* r += f*src */
                r4 = MLA4(r4, f4, _mm_loadu_ps(&src[j]));
                j += 4;
            } while(--td);
        }
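        /* Sum the four lanes: adding the reversed vector gives lanes
         * (0+3, 1+2, 2+1, 3+0), and folding the high pair onto the low pair
         * leaves the total of all four lanes in lane 0. */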
        r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3)));
        r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
        out_sample = _mm_cvtss_f32(r4);

        frac += increment;
        src  += frac>>MixerFracBits;
        frac &= MixerFracMask;
    }
    return dst.data();
}
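
/* The Fast variant's filter table has the band-limiting scale baked into the
 * coefficients, so only the phase interpolation remains per sample. */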
template<>
float *Resample_<FastBSincTag,SSETag>(const InterpState *state, float *RESTRICT src, uint frac,
    uint increment, const al::span<float> dst)
{
    const float *const filter{state->bsinc.filter};
    const size_t m{state->bsinc.m};
    ASSUME(m > 0);

    src -= state->bsinc.l;
    for(float &out_sample : dst)
    {
        // Calculate the phase index and factor.
        const uint pi{frac >> FracPhaseBitDiff};
        const float pf{static_cast<float>(frac & (FracPhaseDiffOne-1)) * (1.0f/FracPhaseDiffOne)};

        // Apply the phase interpolated filter.
        __m128 r4{_mm_setzero_ps()};
        {
            const __m128 pf4{_mm_set1_ps(pf)};
            const float *RESTRICT fil{filter + m*pi*2};
            const float *RESTRICT phd{fil + m};
            size_t td{m >> 2};
            size_t j{0u};

            do {
                /* f = fil + pf*phd */
                const __m128 f4 = MLA4(_mm_load_ps(&fil[j]), pf4, _mm_load_ps(&phd[j]));
                /* r += f*src */
                r4 = MLA4(r4, f4, _mm_loadu_ps(&src[j]));
                j += 4;
            } while(--td);
        }
        r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3)));
        r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
        out_sample = _mm_cvtss_f32(r4);

        frac += increment;
        src  += frac>>MixerFracBits;
        frac &= MixerFracMask;
    }
    return dst.data();
}
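
/* The HRTF mixers below only plug the SSE ApplyCoeffs into the shared base
 * implementations, which handle the buffering and filter-state logic for
 * every instruction-set variant. */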
template<>
void MixHrtf_<SSETag>(const float *InSamples, float2 *AccumSamples, const uint IrSize,
    const MixHrtfFilter *hrtfparams, const size_t BufferSize)
{ MixHrtfBase<ApplyCoeffs>(InSamples, AccumSamples, IrSize, hrtfparams, BufferSize); }

template<>
void MixHrtfBlend_<SSETag>(const float *InSamples, float2 *AccumSamples, const uint IrSize,
    const HrtfFilter *oldparams, const MixHrtfFilter *newparams, const size_t BufferSize)
{
    MixHrtfBlendBase<ApplyCoeffs>(InSamples, AccumSamples, IrSize, oldparams, newparams,
        BufferSize);
}

template<>
void MixDirectHrtf_<SSETag>(const FloatBufferSpan LeftOut, const FloatBufferSpan RightOut,
    const al::span<const FloatBufferLine> InSamples, float2 *AccumSamples,
    float *TempBuf, HrtfChannelState *ChanState, const size_t IrSize, const size_t BufferSize)
{
    MixDirectHrtfBase<ApplyCoeffs>(LeftOut, RightOut, InSamples, AccumSamples, TempBuf, ChanState,
        IrSize, BufferSize);
}
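
/* Mix_ ramps each channel's gain linearly from *CurrentGains toward
 * *TargetGains over Counter samples: sample n is scaled by gain + step*n,
 * where step = (target - gain)/Counter. The vector loop below evaluates four
 * consecutive ramp positions at once. */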
template<>
void Mix_<SSETag>(const al::span<const float> InSamples, const al::span<FloatBufferLine> OutBuffer,
    float *CurrentGains, const float *TargetGains, const size_t Counter, const size_t OutPos)
{
    const float delta{(Counter > 0) ? 1.0f / static_cast<float>(Counter) : 0.0f};
    const auto min_len = minz(Counter, InSamples.size());
    const auto aligned_len = minz((min_len+3) & ~size_t{3}, InSamples.size()) - min_len;
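    /* aligned_len is the number of post-ramp samples needed to bring the
     * write position back to a multiple of 4, so the constant-gain loop can
     * use aligned vector access. */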

    for(FloatBufferLine &output : OutBuffer)
    {
        float *RESTRICT dst{al::assume_aligned<16>(output.data()+OutPos)};
        float gain{*CurrentGains};
        const float step{(*TargetGains-gain) * delta};

        size_t pos{0};
        if(!(std::abs(step) > std::numeric_limits<float>::epsilon()))
            gain = *TargetGains;
        else
        {
            float step_count{0.0f};
            /* Mix with applying gain steps in aligned multiples of 4. */
            if(size_t todo{(min_len-pos) >> 2})
            {
                const __m128 four4{_mm_set1_ps(4.0f)};
                const __m128 step4{_mm_set1_ps(step)};
                const __m128 gain4{_mm_set1_ps(gain)};
                __m128 step_count4{_mm_setr_ps(0.0f, 1.0f, 2.0f, 3.0f)};
                do {
                    const __m128 val4{_mm_load_ps(&InSamples[pos])};
                    __m128 dry4{_mm_load_ps(&dst[pos])};

                    /* dry += val * (gain + step*step_count) */
                    dry4 = MLA4(dry4, val4, MLA4(gain4, step4, step_count4));

                    _mm_store_ps(&dst[pos], dry4);
                    step_count4 = _mm_add_ps(step_count4, four4);
                    pos += 4;
                } while(--todo);
                /* NOTE: step_count4 now represents the next four counts after
                 * the last four mixed samples, so the lowest element
                 * represents the next step count to apply.
                 */
                step_count = _mm_cvtss_f32(step_count4);
            }
            /* Mix with applying left over gain steps that aren't aligned multiples of 4. */
            for(size_t leftover{min_len&3};leftover;++pos,--leftover)
                dst[pos] += InSamples[pos] * (gain + step*step_count);
            if(pos == Counter)
                gain = *TargetGains;
            else
                gain += step*step_count;

            /* Mix until pos is aligned with 4 or the mix is done. */
            for(size_t leftover{aligned_len&3};leftover;++pos,--leftover)
                dst[pos] += InSamples[pos] * gain;
        }
        *CurrentGains = gain;
        ++CurrentGains;
        ++TargetGains;

        if(!(std::abs(gain) > GainSilenceThreshold))
            continue;
        if(size_t todo{(InSamples.size()-pos) >> 2})
        {
            const __m128 gain4{_mm_set1_ps(gain)};
            do {
                const __m128 val4{_mm_load_ps(&InSamples[pos])};
                __m128 dry4{_mm_load_ps(&dst[pos])};
                dry4 = _mm_add_ps(dry4, _mm_mul_ps(val4, gain4));
                _mm_store_ps(&dst[pos], dry4);
                pos += 4;
            } while(--todo);
        }
        for(size_t leftover{(InSamples.size()-pos)&3};leftover;++pos,--leftover)
            dst[pos] += InSamples[pos] * gain;
    }
}