17 const ALfloat
*Resample_
<BSincTag
,SSETag
>(const InterpState
*state
, const ALfloat
*RESTRICT src
,
18 ALuint frac
, ALuint increment
, const al::span
<float> dst
)
20 const ALfloat
*const filter
{state
->bsinc
.filter
};
21 const __m128 sf4
{_mm_set1_ps(state
->bsinc
.sf
)};
22 const size_t m
{state
->bsinc
.m
};
24 src
-= state
->bsinc
.l
;
25 for(float &out_sample
: dst
)
27 // Calculate the phase index and factor.
28 #define FRAC_PHASE_BITDIFF (FRACTIONBITS-BSINC_PHASE_BITS)
29 const ALuint pi
{frac
>> FRAC_PHASE_BITDIFF
};
30 const ALfloat pf
{static_cast<float>(frac
& ((1<<FRAC_PHASE_BITDIFF
)-1)) *
31 (1.0f
/(1<<FRAC_PHASE_BITDIFF
))};
32 #undef FRAC_PHASE_BITDIFF
34 // Apply the scale and phase interpolated filter.
35 __m128 r4
{_mm_setzero_ps()};
37 const __m128 pf4
{_mm_set1_ps(pf
)};
38 const float *fil
{filter
+ m
*pi
*4};
39 const float *scd
{fil
+ m
};
40 const float *phd
{scd
+ m
};
41 const float *spd
{phd
+ m
};
45 #define MLA4(x, y, z) _mm_add_ps(x, _mm_mul_ps(y, z))
47 /* f = ((fil + sf*scd) + pf*(phd + sf*spd)) */
48 const __m128 f4
= MLA4(
49 MLA4(_mm_load_ps(fil
), sf4
, _mm_load_ps(scd
)),
50 pf4
, MLA4(_mm_load_ps(phd
), sf4
, _mm_load_ps(spd
)));
51 fil
+= 4; scd
+= 4; phd
+= 4; spd
+= 4;
53 r4
= MLA4(r4
, f4
, _mm_loadu_ps(&src
[j
]));
58 r4
= _mm_add_ps(r4
, _mm_shuffle_ps(r4
, r4
, _MM_SHUFFLE(0, 1, 2, 3)));
59 r4
= _mm_add_ps(r4
, _mm_movehl_ps(r4
, r4
));
60 out_sample
= _mm_cvtss_f32(r4
);
63 src
+= frac
>>FRACTIONBITS
;
70 const ALfloat
*Resample_
<FastBSincTag
,SSETag
>(const InterpState
*state
,
71 const ALfloat
*RESTRICT src
, ALuint frac
, ALuint increment
, const al::span
<float> dst
)
73 const ALfloat
*const filter
{state
->bsinc
.filter
};
74 const size_t m
{state
->bsinc
.m
};
76 src
-= state
->bsinc
.l
;
77 for(float &out_sample
: dst
)
79 // Calculate the phase index and factor.
80 #define FRAC_PHASE_BITDIFF (FRACTIONBITS-BSINC_PHASE_BITS)
81 const ALuint pi
{frac
>> FRAC_PHASE_BITDIFF
};
82 const ALfloat pf
{static_cast<float>(frac
& ((1<<FRAC_PHASE_BITDIFF
)-1)) *
83 (1.0f
/(1<<FRAC_PHASE_BITDIFF
))};
84 #undef FRAC_PHASE_BITDIFF
86 // Apply the phase interpolated filter.
87 __m128 r4
{_mm_setzero_ps()};
89 const __m128 pf4
{_mm_set1_ps(pf
)};
90 const float *fil
{filter
+ m
*pi
*4};
91 const float *phd
{fil
+ m
*2};
95 #define MLA4(x, y, z) _mm_add_ps(x, _mm_mul_ps(y, z))
97 /* f = fil + pf*phd */
98 const __m128 f4
= MLA4(_mm_load_ps(fil
), pf4
, _mm_load_ps(phd
));
100 r4
= MLA4(r4
, f4
, _mm_loadu_ps(&src
[j
]));
101 fil
+= 4; phd
+= 4; j
+= 4;
105 r4
= _mm_add_ps(r4
, _mm_shuffle_ps(r4
, r4
, _MM_SHUFFLE(0, 1, 2, 3)));
106 r4
= _mm_add_ps(r4
, _mm_movehl_ps(r4
, r4
));
107 out_sample
= _mm_cvtss_f32(r4
);
110 src
+= frac
>>FRACTIONBITS
;
111 frac
&= FRACTIONMASK
;
117 static inline void ApplyCoeffs(size_t Offset
, float2
*RESTRICT Values
, const ALuint IrSize
,
118 const HrirArray
&Coeffs
, const ALfloat left
, const ALfloat right
)
120 const __m128 lrlr
{_mm_setr_ps(left
, right
, left
, right
)};
127 __m128 coeffs
{_mm_load_ps(&Coeffs
[0][0])};
128 __m128 vals
{_mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64
*>(&Values
[0][0]))};
129 imp0
= _mm_mul_ps(lrlr
, coeffs
);
130 vals
= _mm_add_ps(imp0
, vals
);
131 _mm_storel_pi(reinterpret_cast<__m64
*>(&Values
[0][0]), vals
);
133 for(;i
< IrSize
-1;i
+= 2)
135 coeffs
= _mm_load_ps(&Coeffs
[i
+1][0]);
136 vals
= _mm_load_ps(&Values
[i
][0]);
137 imp1
= _mm_mul_ps(lrlr
, coeffs
);
138 imp0
= _mm_shuffle_ps(imp0
, imp1
, _MM_SHUFFLE(1, 0, 3, 2));
139 vals
= _mm_add_ps(imp0
, vals
);
140 _mm_store_ps(&Values
[i
][0], vals
);
143 vals
= _mm_loadl_pi(vals
, reinterpret_cast<__m64
*>(&Values
[i
][0]));
144 imp0
= _mm_movehl_ps(imp0
, imp0
);
145 vals
= _mm_add_ps(imp0
, vals
);
146 _mm_storel_pi(reinterpret_cast<__m64
*>(&Values
[i
][0]), vals
);
150 for(ALuint i
{0};i
< IrSize
;i
+= 2)
152 __m128 coeffs
{_mm_load_ps(&Coeffs
[i
][0])};
153 __m128 vals
{_mm_load_ps(&Values
[i
][0])};
154 vals
= _mm_add_ps(vals
, _mm_mul_ps(lrlr
, coeffs
));
155 _mm_store_ps(&Values
[i
][0], vals
);
161 void MixHrtf_
<SSETag
>(FloatBufferLine
&LeftOut
, FloatBufferLine
&RightOut
,
162 const ALfloat
*InSamples
, float2
*AccumSamples
, const size_t OutPos
, const ALuint IrSize
,
163 MixHrtfFilter
*hrtfparams
, const size_t BufferSize
)
165 MixHrtfBase
<ApplyCoeffs
>(LeftOut
, RightOut
, InSamples
, AccumSamples
, OutPos
, IrSize
,
166 hrtfparams
, BufferSize
);
170 void MixHrtfBlend_
<SSETag
>(FloatBufferLine
&LeftOut
, FloatBufferLine
&RightOut
,
171 const ALfloat
*InSamples
, float2
*AccumSamples
, const size_t OutPos
, const ALuint IrSize
,
172 const HrtfFilter
*oldparams
, MixHrtfFilter
*newparams
, const size_t BufferSize
)
174 MixHrtfBlendBase
<ApplyCoeffs
>(LeftOut
, RightOut
, InSamples
, AccumSamples
, OutPos
, IrSize
,
175 oldparams
, newparams
, BufferSize
);
179 void MixDirectHrtf_
<SSETag
>(FloatBufferLine
&LeftOut
, FloatBufferLine
&RightOut
,
180 const al::span
<const FloatBufferLine
> InSamples
, float2
*AccumSamples
, DirectHrtfState
*State
,
181 const size_t BufferSize
)
183 MixDirectHrtfBase
<ApplyCoeffs
>(LeftOut
, RightOut
, InSamples
, AccumSamples
, State
, BufferSize
);
188 void Mix_
<SSETag
>(const al::span
<const float> InSamples
, const al::span
<FloatBufferLine
> OutBuffer
,
189 float *CurrentGains
, const float *TargetGains
, const size_t Counter
, const size_t OutPos
)
191 const ALfloat delta
{(Counter
> 0) ? 1.0f
/ static_cast<ALfloat
>(Counter
) : 0.0f
};
192 const bool reached_target
{InSamples
.size() >= Counter
};
193 const auto min_end
= reached_target
? InSamples
.begin() + Counter
: InSamples
.end();
194 const auto aligned_end
= minz(static_cast<uintptr_t>(min_end
-InSamples
.begin()+3) & ~3u,
195 InSamples
.size()) + InSamples
.begin();
196 for(FloatBufferLine
&output
: OutBuffer
)
198 ALfloat
*RESTRICT dst
{al::assume_aligned
<16>(output
.data()+OutPos
)};
199 ALfloat gain
{*CurrentGains
};
200 const ALfloat diff
{*TargetGains
- gain
};
202 auto in_iter
= InSamples
.begin();
203 if(std::fabs(diff
) > std::numeric_limits
<float>::epsilon())
205 const ALfloat step
{diff
* delta
};
206 ALfloat step_count
{0.0f
};
207 /* Mix with applying gain steps in aligned multiples of 4. */
208 if(ptrdiff_t todo
{(min_end
-in_iter
) >> 2})
210 const __m128 four4
{_mm_set1_ps(4.0f
)};
211 const __m128 step4
{_mm_set1_ps(step
)};
212 const __m128 gain4
{_mm_set1_ps(gain
)};
213 __m128 step_count4
{_mm_setr_ps(0.0f
, 1.0f
, 2.0f
, 3.0f
)};
215 const __m128 val4
{_mm_load_ps(in_iter
)};
216 __m128 dry4
{_mm_load_ps(dst
)};
217 #define MLA4(x, y, z) _mm_add_ps(x, _mm_mul_ps(y, z))
218 /* dry += val * (gain + step*step_count) */
219 dry4
= MLA4(dry4
, val4
, MLA4(gain4
, step4
, step_count4
));
221 _mm_store_ps(dst
, dry4
);
222 step_count4
= _mm_add_ps(step_count4
, four4
);
223 in_iter
+= 4; dst
+= 4;
225 /* NOTE: step_count4 now represents the next four counts after
226 * the last four mixed samples, so the lowest element
227 * represents the next step count to apply.
229 step_count
= _mm_cvtss_f32(step_count4
);
231 /* Mix with applying left over gain steps that aren't aligned multiples of 4. */
232 while(in_iter
!= min_end
)
234 *(dst
++) += *(in_iter
++) * (gain
+ step
*step_count
);
240 gain
+= step
*step_count
;
241 *CurrentGains
= gain
;
243 /* Mix until pos is aligned with 4 or the mix is done. */
244 while(in_iter
!= aligned_end
)
245 *(dst
++) += *(in_iter
++) * gain
;
250 if(!(std::fabs(gain
) > GAIN_SILENCE_THRESHOLD
))
252 if(ptrdiff_t todo
{(InSamples
.end()-in_iter
) >> 2})
254 const __m128 gain4
{_mm_set1_ps(gain
)};
256 const __m128 val4
{_mm_load_ps(in_iter
)};
257 __m128 dry4
{_mm_load_ps(dst
)};
258 dry4
= _mm_add_ps(dry4
, _mm_mul_ps(val4
, gain4
));
259 _mm_store_ps(dst
, dry4
);
260 in_iter
+= 4; dst
+= 4;
263 while(in_iter
!= InSamples
.end())
264 *(dst
++) += *(in_iter
++) * gain
;
269 void MixRow_
<SSETag
>(const al::span
<float> OutBuffer
, const al::span
<const float> Gains
,
270 const float *InSamples
, const size_t InStride
)
272 for(const float gain
: Gains
)
274 const float *RESTRICT input
{InSamples
};
275 InSamples
+= InStride
;
277 if(!(std::fabs(gain
) > GAIN_SILENCE_THRESHOLD
))
280 auto out_iter
= OutBuffer
.begin();
281 if(size_t todo
{OutBuffer
.size() >> 2})
283 const __m128 gain4
= _mm_set1_ps(gain
);
285 const __m128 val4
{_mm_load_ps(input
)};
286 __m128 dry4
{_mm_load_ps(out_iter
)};
287 dry4
= _mm_add_ps(dry4
, _mm_mul_ps(val4
, gain4
));
288 _mm_store_ps(out_iter
, dry4
);
289 out_iter
+= 4; input
+= 4;
293 auto do_mix
= [gain
](const float cur
, const float src
) noexcept
-> float
294 { return cur
+ src
*gain
; };
295 std::transform(out_iter
, OutBuffer
.end(), input
, out_iter
, do_mix
);