inline void ApplyCoeffs(float2 *RESTRICT Values, const ALuint IrSize, const HrirArray &Coeffs,
    const float left, const float right)
{
    const __m128 lrlr{_mm_setr_ps(left, right, left, right)};
    /* This isn't technically correct to test alignment, but it's true for
     * systems that support SSE, which is the only one that needs to know the
     * alignment of Values (which alternates between 8- and 16-byte aligned).
     */
    if(reinterpret_cast<intptr_t>(Values)&0x8)
    {
        __m128 imp0, imp1;
        /* The first entry is only 8-byte aligned, so accumulate it by itself
         * with a 64-bit load/store.
         */
        __m128 coeffs{_mm_load_ps(&Coeffs[0][0])};
        __m128 vals{_mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64*>(&Values[0][0]))};
        imp0 = _mm_mul_ps(lrlr, coeffs);
        vals = _mm_add_ps(imp0, vals);
        _mm_storel_pi(reinterpret_cast<__m64*>(&Values[0][0]), vals);
        ALuint i{1};
        for(;i < IrSize-1;i += 2)
        {
            coeffs = _mm_load_ps(&Coeffs[i+1][0]);
            vals = _mm_load_ps(&Values[i][0]);
            imp1 = _mm_mul_ps(lrlr, coeffs);
            /* Pair the high half of the previous product (entry i) with the
             * low half of the new one (entry i+1) to match Values' offset.
             */
            imp0 = _mm_shuffle_ps(imp0, imp1, _MM_SHUFFLE(1, 0, 3, 2));
            vals = _mm_add_ps(imp0, vals);
            _mm_store_ps(&Values[i][0], vals);
            imp0 = imp1;
        }
        /* Accumulate the final (partial) entry from the high half of the last
         * product.
         */
        vals = _mm_loadl_pi(vals, reinterpret_cast<__m64*>(&Values[i][0]));
        imp0 = _mm_movehl_ps(imp0, imp0);
        vals = _mm_add_ps(imp0, vals);
        _mm_storel_pi(reinterpret_cast<__m64*>(&Values[i][0]), vals);
    }
    else
    {
        /* Values is 16-byte aligned, so two entries can be accumulated at a
         * time.
         */
        for(ALuint i{0};i < IrSize;i += 2)
        {
            __m128 coeffs{_mm_load_ps(&Coeffs[i][0])};
            __m128 vals{_mm_load_ps(&Values[i][0])};
            vals = _mm_add_ps(vals, _mm_mul_ps(lrlr, coeffs));
            _mm_store_ps(&Values[i][0], vals);
        }
    }
}
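/* For reference, both branches above compute the same accumulation as this
 * roughly equivalent scalar loop, just two HRIR entries at a time:
 *
 *     for(ALuint i{0};i < IrSize;++i)
 *     {
 *         Values[i][0] += Coeffs[i][0] * left;
 *         Values[i][1] += Coeffs[i][1] * right;
 *     }
 */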
template<>
const ALfloat *Resample_<BSincTag,SSETag>(const InterpState *state, const ALfloat *RESTRICT src,
    ALuint frac, ALuint increment, const al::span<float> dst)
{
    const float *const filter{state->bsinc.filter};
    const __m128 sf4{_mm_set1_ps(state->bsinc.sf)};
    const size_t m{state->bsinc.m};

    src -= state->bsinc.l;
    for(float &out_sample : dst)
    {
        // Calculate the phase index and factor.
#define FRAC_PHASE_BITDIFF (FRACTIONBITS-BSINC_PHASE_BITS)
        const ALuint pi{frac >> FRAC_PHASE_BITDIFF};
        const float pf{static_cast<float>(frac & ((1<<FRAC_PHASE_BITDIFF)-1)) *
            (1.0f/(1<<FRAC_PHASE_BITDIFF))};
#undef FRAC_PHASE_BITDIFF

        // Apply the scale and phase interpolated filter.
        __m128 r4{_mm_setzero_ps()};
        {
            const __m128 pf4{_mm_set1_ps(pf)};
            const float *fil{filter + m*pi*4};
            const float *phd{fil + m};
            const float *scd{phd + m};
            const float *spd{scd + m};
            size_t td{m >> 2};
            size_t j{0u};

            do {
#define MLA4(x, y, z) _mm_add_ps(x, _mm_mul_ps(y, z))
                /* f = ((fil + sf*scd) + pf*(phd + sf*spd)) */
                const __m128 f4 = MLA4(
                    MLA4(_mm_load_ps(fil), sf4, _mm_load_ps(scd)),
                    pf4, MLA4(_mm_load_ps(phd), sf4, _mm_load_ps(spd)));
                fil += 4; scd += 4; phd += 4; spd += 4;
                /* r += f*src */
                r4 = MLA4(r4, f4, _mm_loadu_ps(&src[j]));
#undef MLA4
                j += 4;
            } while(--td);
        }
        // Sum the four partial results; the total ends up in the low lane.
        r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3)));
        r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
        out_sample = _mm_cvtss_f32(r4);

        frac += increment;
        src  += frac>>FRACTIONBITS;
        frac &= FRACTIONMASK;
    }
    return dst.begin();
}
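/* For reference, each output sample above is roughly the scalar dot product
 *
 *     for(size_t j{0};j < m;++j)
 *         out_sample += (fil[j] + sf*scd[j] + pf*(phd[j] + sf*spd[j])) * src[j];
 *
 * evaluated four taps at a time, with the final shuffle/movehl adds folding
 * the four partial sums into the low lane.
 */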
template<>
const ALfloat *Resample_<FastBSincTag,SSETag>(const InterpState *state,
    const ALfloat *RESTRICT src, ALuint frac, ALuint increment, const al::span<float> dst)
{
    const float *const filter{state->bsinc.filter};
    const size_t m{state->bsinc.m};

    src -= state->bsinc.l;
    for(float &out_sample : dst)
    {
        // Calculate the phase index and factor.
#define FRAC_PHASE_BITDIFF (FRACTIONBITS-BSINC_PHASE_BITS)
        const ALuint pi{frac >> FRAC_PHASE_BITDIFF};
        const float pf{static_cast<float>(frac & ((1<<FRAC_PHASE_BITDIFF)-1)) *
            (1.0f/(1<<FRAC_PHASE_BITDIFF))};
#undef FRAC_PHASE_BITDIFF

        // Apply the phase interpolated filter.
        __m128 r4{_mm_setzero_ps()};
        {
            const __m128 pf4{_mm_set1_ps(pf)};
            const float *fil{filter + m*pi*4};
            const float *phd{fil + m};
            size_t td{m >> 2};
            size_t j{0u};

            do {
#define MLA4(x, y, z) _mm_add_ps(x, _mm_mul_ps(y, z))
                /* f = fil + pf*phd */
                const __m128 f4 = MLA4(_mm_load_ps(fil), pf4, _mm_load_ps(phd));
                /* r += f*src */
                r4 = MLA4(r4, f4, _mm_loadu_ps(&src[j]));
#undef MLA4
                fil += 4; phd += 4; j += 4;
            } while(--td);
        }
        // Sum the four partial results; the total ends up in the low lane.
        r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3)));
        r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
        out_sample = _mm_cvtss_f32(r4);

        frac += increment;
        src  += frac>>FRACTIONBITS;
        frac &= FRACTIONMASK;
    }
    return dst.begin();
}
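/* For reference, both resamplers split the fixed-point fractional offset the
 * same way: the top BSINC_PHASE_BITS of frac select the precomputed filter
 * phase (pi), and the remaining low bits, scaled to [0, 1), give the factor
 * (pf) used to interpolate between adjacent phases.
 */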
template<>
void MixHrtf_<SSETag>(FloatBufferLine &LeftOut, FloatBufferLine &RightOut,
    const float *InSamples, float2 *AccumSamples, const size_t OutPos, const ALuint IrSize,
    MixHrtfFilter *hrtfparams, const size_t BufferSize)
{
    MixHrtfBase<ApplyCoeffs>(LeftOut, RightOut, InSamples, AccumSamples, OutPos, IrSize,
        hrtfparams, BufferSize);
}

template<>
void MixHrtfBlend_<SSETag>(FloatBufferLine &LeftOut, FloatBufferLine &RightOut,
    const float *InSamples, float2 *AccumSamples, const size_t OutPos, const ALuint IrSize,
    const HrtfFilter *oldparams, MixHrtfFilter *newparams, const size_t BufferSize)
{
    MixHrtfBlendBase<ApplyCoeffs>(LeftOut, RightOut, InSamples, AccumSamples, OutPos, IrSize,
        oldparams, newparams, BufferSize);
}

template<>
void MixDirectHrtf_<SSETag>(FloatBufferLine &LeftOut, FloatBufferLine &RightOut,
    const al::span<const FloatBufferLine> InSamples, float2 *AccumSamples, DirectHrtfState *State,
    const size_t BufferSize)
{ MixDirectHrtfBase<ApplyCoeffs>(LeftOut, RightOut, InSamples, AccumSamples, State, BufferSize); }
template<>
void Mix_<SSETag>(const al::span<const float> InSamples, const al::span<FloatBufferLine> OutBuffer,
    float *CurrentGains, const float *TargetGains, const size_t Counter, const size_t OutPos)
{
    const ALfloat delta{(Counter > 0) ? 1.0f / static_cast<ALfloat>(Counter) : 0.0f};
    const bool reached_target{InSamples.size() >= Counter};
    const auto min_end = reached_target ? InSamples.begin() + Counter : InSamples.end();
    const auto aligned_end = minz(static_cast<uintptr_t>(min_end-InSamples.begin()+3) & ~3u,
        InSamples.size()) + InSamples.begin();
    for(FloatBufferLine &output : OutBuffer)
    {
        ALfloat *RESTRICT dst{al::assume_aligned<16>(output.data()+OutPos)};
        ALfloat gain{*CurrentGains};
        const ALfloat diff{*TargetGains - gain};

        auto in_iter = InSamples.begin();
        if(std::fabs(diff) > std::numeric_limits<float>::epsilon())
        {
            const ALfloat step{diff * delta};
            ALfloat step_count{0.0f};
            /* Mix with applying gain steps in aligned multiples of 4. */
            if(ptrdiff_t todo{(min_end-in_iter) >> 2})
            {
                const __m128 four4{_mm_set1_ps(4.0f)};
                const __m128 step4{_mm_set1_ps(step)};
                const __m128 gain4{_mm_set1_ps(gain)};
                __m128 step_count4{_mm_setr_ps(0.0f, 1.0f, 2.0f, 3.0f)};
                do {
                    const __m128 val4{_mm_load_ps(in_iter)};
                    __m128 dry4{_mm_load_ps(dst)};
#define MLA4(x, y, z) _mm_add_ps(x, _mm_mul_ps(y, z))
                    /* dry += val * (gain + step*step_count) */
                    dry4 = MLA4(dry4, val4, MLA4(gain4, step4, step_count4));
#undef MLA4
                    _mm_store_ps(dst, dry4);
                    step_count4 = _mm_add_ps(step_count4, four4);
                    in_iter += 4; dst += 4;
                } while(--todo);
                /* NOTE: step_count4 now represents the next four counts after
                 * the last four mixed samples, so the lowest element
                 * represents the next step count to apply.
                 */
                step_count = _mm_cvtss_f32(step_count4);
            }
            /* Mix with applying left over gain steps that aren't aligned multiples of 4. */
            while(in_iter != min_end)
            {
                *(dst++) += *(in_iter++) * (gain + step*step_count);
                step_count += 1.0f;
            }
            if(reached_target)
                gain = *TargetGains;
            else
                gain += step*step_count;
            *CurrentGains = gain;

            /* Mix until pos is aligned with 4 or the mix is done. */
            while(in_iter != aligned_end)
                *(dst++) += *(in_iter++) * gain;
        }
        ++CurrentGains;
        ++TargetGains;

        if(!(std::fabs(gain) > GAIN_SILENCE_THRESHOLD))
            continue;
        if(ptrdiff_t todo{(InSamples.end()-in_iter) >> 2})
        {
            const __m128 gain4{_mm_set1_ps(gain)};
            do {
                const __m128 val4{_mm_load_ps(in_iter)};
                __m128 dry4{_mm_load_ps(dst)};
                dry4 = _mm_add_ps(dry4, _mm_mul_ps(val4, gain4));
                _mm_store_ps(dst, dry4);
                in_iter += 4; dst += 4;
            } while(--todo);
        }
        while(in_iter != InSamples.end())
            *(dst++) += *(in_iter++) * gain;
    }
}
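/* For reference, while the gain is still fading toward its target, sample n
 * above is mixed roughly as
 *
 *     dst[n] += src[n] * (gain + step*n);
 *
 * with step = (target - gain)/Counter; once the target (or a negligible
 * difference) is reached, the remaining samples use the constant gain.
 */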
template<>
void MixRow_<SSETag>(const al::span<float> OutBuffer, const al::span<const float> Gains,
    const float *InSamples, const size_t InStride)
{
    for(const float gain : Gains)
    {
        const float *RESTRICT input{InSamples};
        InSamples += InStride;

        if(!(std::fabs(gain) > GAIN_SILENCE_THRESHOLD))
            continue;

        auto out_iter = OutBuffer.begin();
        if(size_t todo{OutBuffer.size() >> 2})
        {
            const __m128 gain4 = _mm_set1_ps(gain);
            do {
                const __m128 val4{_mm_load_ps(input)};
                __m128 dry4{_mm_load_ps(out_iter)};
                dry4 = _mm_add_ps(dry4, _mm_mul_ps(val4, gain4));
                _mm_store_ps(out_iter, dry4);
                out_iter += 4; input += 4;
            } while(--todo);
        }

        auto do_mix = [gain](const float cur, const float src) noexcept -> float
        { return cur + src*gain; };
        std::transform(out_iter, OutBuffer.end(), input, out_iter, do_mix);
    }
}