12 #include "bsinc_defs.h"
/* Number of low bits of the fixed-point fraction that remain after the top
 * BSINC_PHASE_BITS bits have been taken as the filter phase index.
 */
#define FRAC_PHASE_BITDIFF (FRACTIONBITS - BSINC_PHASE_BITS)
/* One full phase step expressed in those remaining fraction bits. */
#define FRAC_PHASE_DIFFONE (1<<FRAC_PHASE_BITDIFF)

/* Four-wide multiply-add: x + y*z (separate multiply and add, not fused). */
#define MLA4(x, y, z) _mm_add_ps(x, _mm_mul_ps(y, z))
28 inline void ApplyCoeffs(float2
*RESTRICT Values
, const uint_fast32_t IrSize
,
29 const HrirArray
&Coeffs
, const float left
, const float right
)
31 const __m128 lrlr
{_mm_setr_ps(left
, right
, left
, right
)};
33 ASSUME(IrSize
>= MIN_IR_LENGTH
);
34 /* This isn't technically correct to test alignment, but it's true for
35 * systems that support SSE, which is the only one that needs to know the
36 * alignment of Values (which alternates between 8- and 16-byte aligned).
38 if(reinterpret_cast<intptr_t>(Values
)&0x8)
41 __m128 coeffs
{_mm_load_ps(&Coeffs
[0][0])};
42 __m128 vals
{_mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64
*>(&Values
[0][0]))};
43 imp0
= _mm_mul_ps(lrlr
, coeffs
);
44 vals
= _mm_add_ps(imp0
, vals
);
45 _mm_storel_pi(reinterpret_cast<__m64
*>(&Values
[0][0]), vals
);
46 uint_fast32_t td
{((IrSize
+1)>>1) - 1};
49 coeffs
= _mm_load_ps(&Coeffs
[i
+1][0]);
50 vals
= _mm_load_ps(&Values
[i
][0]);
51 imp1
= _mm_mul_ps(lrlr
, coeffs
);
52 imp0
= _mm_shuffle_ps(imp0
, imp1
, _MM_SHUFFLE(1, 0, 3, 2));
53 vals
= _mm_add_ps(imp0
, vals
);
54 _mm_store_ps(&Values
[i
][0], vals
);
58 vals
= _mm_loadl_pi(vals
, reinterpret_cast<__m64
*>(&Values
[i
][0]));
59 imp0
= _mm_movehl_ps(imp0
, imp0
);
60 vals
= _mm_add_ps(imp0
, vals
);
61 _mm_storel_pi(reinterpret_cast<__m64
*>(&Values
[i
][0]), vals
);
65 for(size_t i
{0};i
< IrSize
;i
+= 2)
67 const __m128 coeffs
{_mm_load_ps(&Coeffs
[i
][0])};
68 __m128 vals
{_mm_load_ps(&Values
[i
][0])};
69 vals
= MLA4(vals
, lrlr
, coeffs
);
70 _mm_store_ps(&Values
[i
][0], vals
);
78 const float *Resample_
<BSincTag
,SSETag
>(const InterpState
*state
, const float *RESTRICT src
,
79 ALuint frac
, ALuint increment
, const al::span
<float> dst
)
81 const float *const filter
{state
->bsinc
.filter
};
82 const __m128 sf4
{_mm_set1_ps(state
->bsinc
.sf
)};
83 const size_t m
{state
->bsinc
.m
};
85 src
-= state
->bsinc
.l
;
86 for(float &out_sample
: dst
)
88 // Calculate the phase index and factor.
89 const ALuint pi
{frac
>> FRAC_PHASE_BITDIFF
};
90 const float pf
{static_cast<float>(frac
& (FRAC_PHASE_DIFFONE
-1)) *
91 (1.0f
/FRAC_PHASE_DIFFONE
)};
93 // Apply the scale and phase interpolated filter.
94 __m128 r4
{_mm_setzero_ps()};
96 const __m128 pf4
{_mm_set1_ps(pf
)};
97 const float *fil
{filter
+ m
*pi
*4};
98 const float *phd
{fil
+ m
};
99 const float *scd
{phd
+ m
};
100 const float *spd
{scd
+ m
};
105 /* f = ((fil + sf*scd) + pf*(phd + sf*spd)) */
106 const __m128 f4
= MLA4(
107 MLA4(_mm_load_ps(&fil
[j
]), sf4
, _mm_load_ps(&scd
[j
])),
108 pf4
, MLA4(_mm_load_ps(&phd
[j
]), sf4
, _mm_load_ps(&spd
[j
])));
110 r4
= MLA4(r4
, f4
, _mm_loadu_ps(&src
[j
]));
114 r4
= _mm_add_ps(r4
, _mm_shuffle_ps(r4
, r4
, _MM_SHUFFLE(0, 1, 2, 3)));
115 r4
= _mm_add_ps(r4
, _mm_movehl_ps(r4
, r4
));
116 out_sample
= _mm_cvtss_f32(r4
);
119 src
+= frac
>>FRACTIONBITS
;
120 frac
&= FRACTIONMASK
;
126 const float *Resample_
<FastBSincTag
,SSETag
>(const InterpState
*state
, const float *RESTRICT src
,
127 ALuint frac
, ALuint increment
, const al::span
<float> dst
)
129 const float *const filter
{state
->bsinc
.filter
};
130 const size_t m
{state
->bsinc
.m
};
132 src
-= state
->bsinc
.l
;
133 for(float &out_sample
: dst
)
135 // Calculate the phase index and factor.
136 const ALuint pi
{frac
>> FRAC_PHASE_BITDIFF
};
137 const float pf
{static_cast<float>(frac
& (FRAC_PHASE_DIFFONE
-1)) *
138 (1.0f
/FRAC_PHASE_DIFFONE
)};
140 // Apply the phase interpolated filter.
141 __m128 r4
{_mm_setzero_ps()};
143 const __m128 pf4
{_mm_set1_ps(pf
)};
144 const float *fil
{filter
+ m
*pi
*4};
145 const float *phd
{fil
+ m
};
150 /* f = fil + pf*phd */
151 const __m128 f4
= MLA4(_mm_load_ps(&fil
[j
]), pf4
, _mm_load_ps(&phd
[j
]));
153 r4
= MLA4(r4
, f4
, _mm_loadu_ps(&src
[j
]));
157 r4
= _mm_add_ps(r4
, _mm_shuffle_ps(r4
, r4
, _MM_SHUFFLE(0, 1, 2, 3)));
158 r4
= _mm_add_ps(r4
, _mm_movehl_ps(r4
, r4
));
159 out_sample
= _mm_cvtss_f32(r4
);
162 src
+= frac
>>FRACTIONBITS
;
163 frac
&= FRACTIONMASK
;
170 void MixHrtf_
<SSETag
>(const float *InSamples
, float2
*AccumSamples
, const ALuint IrSize
,
171 const MixHrtfFilter
*hrtfparams
, const size_t BufferSize
)
172 { MixHrtfBase
<ApplyCoeffs
>(InSamples
, AccumSamples
, IrSize
, hrtfparams
, BufferSize
); }
175 void MixHrtfBlend_
<SSETag
>(const float *InSamples
, float2
*AccumSamples
, const ALuint IrSize
,
176 const HrtfFilter
*oldparams
, const MixHrtfFilter
*newparams
, const size_t BufferSize
)
178 MixHrtfBlendBase
<ApplyCoeffs
>(InSamples
, AccumSamples
, IrSize
, oldparams
, newparams
,
183 void MixDirectHrtf_
<SSETag
>(FloatBufferLine
&LeftOut
, FloatBufferLine
&RightOut
,
184 const al::span
<const FloatBufferLine
> InSamples
, float2
*AccumSamples
, DirectHrtfState
*State
,
185 const size_t BufferSize
)
186 { MixDirectHrtfBase
<ApplyCoeffs
>(LeftOut
, RightOut
, InSamples
, AccumSamples
, State
, BufferSize
); }
190 void Mix_
<SSETag
>(const al::span
<const float> InSamples
, const al::span
<FloatBufferLine
> OutBuffer
,
191 float *CurrentGains
, const float *TargetGains
, const size_t Counter
, const size_t OutPos
)
193 const float delta
{(Counter
> 0) ? 1.0f
/ static_cast<float>(Counter
) : 0.0f
};
194 const auto min_len
= minz(Counter
, InSamples
.size());
195 const auto aligned_len
= minz((min_len
+3) & ~size_t{3}, InSamples
.size()) - min_len
;
197 for(FloatBufferLine
&output
: OutBuffer
)
199 float *RESTRICT dst
{al::assume_aligned
<16>(output
.data()+OutPos
)};
200 float gain
{*CurrentGains
};
201 const float step
{(*TargetGains
-gain
) * delta
};
204 if(!(std::fabs(step
) > std::numeric_limits
<float>::epsilon()))
208 float step_count
{0.0f
};
209 /* Mix with applying gain steps in aligned multiples of 4. */
210 if(size_t todo
{(min_len
-pos
) >> 2})
212 const __m128 four4
{_mm_set1_ps(4.0f
)};
213 const __m128 step4
{_mm_set1_ps(step
)};
214 const __m128 gain4
{_mm_set1_ps(gain
)};
215 __m128 step_count4
{_mm_setr_ps(0.0f
, 1.0f
, 2.0f
, 3.0f
)};
217 const __m128 val4
{_mm_load_ps(&InSamples
[pos
])};
218 __m128 dry4
{_mm_load_ps(&dst
[pos
])};
220 /* dry += val * (gain + step*step_count) */
221 dry4
= MLA4(dry4
, val4
, MLA4(gain4
, step4
, step_count4
));
223 _mm_store_ps(&dst
[pos
], dry4
);
224 step_count4
= _mm_add_ps(step_count4
, four4
);
227 /* NOTE: step_count4 now represents the next four counts after
228 * the last four mixed samples, so the lowest element
229 * represents the next step count to apply.
231 step_count
= _mm_cvtss_f32(step_count4
);
233 /* Mix with applying left over gain steps that aren't aligned multiples of 4. */
234 for(size_t leftover
{min_len
&3};leftover
;++pos
,--leftover
)
236 dst
[pos
] += InSamples
[pos
] * (gain
+ step
*step_count
);
242 gain
+= step
*step_count
;
244 /* Mix until pos is aligned with 4 or the mix is done. */
245 for(size_t leftover
{aligned_len
&3};leftover
;++pos
,--leftover
)
246 dst
[pos
] += InSamples
[pos
] * gain
;
248 *CurrentGains
= gain
;
252 if(!(std::fabs(gain
) > GAIN_SILENCE_THRESHOLD
))
254 if(size_t todo
{(InSamples
.size()-pos
) >> 2})
256 const __m128 gain4
{_mm_set1_ps(gain
)};
258 const __m128 val4
{_mm_load_ps(&InSamples
[pos
])};
259 __m128 dry4
{_mm_load_ps(&dst
[pos
])};
260 dry4
= _mm_add_ps(dry4
, _mm_mul_ps(val4
, gain4
));
261 _mm_store_ps(&dst
[pos
], dry4
);
265 for(size_t leftover
{(InSamples
.size()-pos
)&3};leftover
;++pos
,--leftover
)
266 dst
[pos
] += InSamples
[pos
] * gain
;