6 #ifdef HAVE_SSE_INTRINSICS
15 #include "alcomplex.h"
16 #include "alnumeric.h"
17 #include "opthelpers.h"
22 using complex_d
= std::complex<double>;
24 std::array
<float,Uhj2Encoder::sFilterSize
> GenerateFilter()
26 /* Some notes on this filter construction.
28 * An impulse in the frequency domain is represented by a continuous series
29 * of +1,-1 values, with a 0 imaginary term. Consequently, that impulse
30 * with a +90 degree phase offset would be represented by 0s with imaginary
31 * terms that alternate between +1,-1. Converting that to the time domain
32 * results in a FIR filter that can be convolved with the incoming signal
33 * to apply a wide-band 90-degree phase shift.
35 * A particularly notable aspect of the time-domain filter response is that
36 * every other coefficient is 0. This allows doubling the effective size of
37 * the filter, by only storing the non-0 coefficients and double-stepping
38 * over the input to apply it.
40 * Additionally, the resulting filter is independent of the sample rate.
41 * The same filter can be applied regardless of the device's sample rate
42 * and achieve the same effect, although a lower rate allows the filter to
43 * cover more time and improve the results.
45 constexpr complex_d c0
{0.0, 1.0};
46 constexpr complex_d c1
{0.0, -1.0};
47 constexpr size_t half_size
{32768};
49 /* Generate a frequency domain impulse with a +90 degree phase offset. Keep
50 * the mirrored frequencies clear for converting to the time domain.
52 auto fftBuffer
= std::vector
<complex_d
>(half_size
*2, complex_d
{});
53 for(size_t i
{0};i
< half_size
;i
+= 2)
58 fftBuffer
[half_size
] = c0
;
59 complex_fft(fftBuffer
, 1.0);
61 /* Reverse and truncate the filter to a usable size, and store only the
62 * non-0 terms. Should this be windowed?
64 std::array
<float,Uhj2Encoder::sFilterSize
> ret
;
65 auto fftiter
= fftBuffer
.data() + half_size
+ (Uhj2Encoder::sFilterSize
-1);
66 for(float &coeff
: ret
)
68 coeff
= static_cast<float>(fftiter
->real() / (half_size
+1));
73 alignas(16) const auto PShiftCoeffs
= GenerateFilter();
76 void allpass_process(al::span
<float> dst
, const float *RESTRICT src
)
78 #ifdef HAVE_SSE_INTRINSICS
80 if(size_t todo
{dst
.size()>>1})
83 __m128 r04
{_mm_setzero_ps()};
84 __m128 r14
{_mm_setzero_ps()};
85 for(size_t j
{0};j
< PShiftCoeffs
.size();j
+=4)
87 const __m128 coeffs
{_mm_load_ps(&PShiftCoeffs
[j
])};
88 const __m128 s0
{_mm_loadu_ps(&src
[j
*2])};
89 const __m128 s1
{_mm_loadu_ps(&src
[j
*2 + 4])};
91 __m128 s
{_mm_shuffle_ps(s0
, s1
, _MM_SHUFFLE(2, 0, 2, 0))};
92 r04
= _mm_add_ps(r04
, _mm_mul_ps(s
, coeffs
));
94 s
= _mm_shuffle_ps(s0
, s1
, _MM_SHUFFLE(3, 1, 3, 1));
95 r14
= _mm_add_ps(r14
, _mm_mul_ps(s
, coeffs
));
97 r04
= _mm_add_ps(r04
, _mm_shuffle_ps(r04
, r04
, _MM_SHUFFLE(0, 1, 2, 3)));
98 r04
= _mm_add_ps(r04
, _mm_movehl_ps(r04
, r04
));
99 dst
[pos
++] += _mm_cvtss_f32(r04
);
101 r14
= _mm_add_ps(r14
, _mm_shuffle_ps(r14
, r14
, _MM_SHUFFLE(0, 1, 2, 3)));
102 r14
= _mm_add_ps(r14
, _mm_movehl_ps(r14
, r14
));
103 dst
[pos
++] += _mm_cvtss_f32(r14
);
110 __m128 r4
{_mm_setzero_ps()};
111 for(size_t j
{0};j
< PShiftCoeffs
.size();j
+=4)
113 const __m128 coeffs
{_mm_load_ps(&PShiftCoeffs
[j
])};
114 /* NOTE: This could alternatively be done with two unaligned loads
115 * and a shuffle. Which would be better?
117 const __m128 s
{_mm_setr_ps(src
[j
*2], src
[j
*2 + 2], src
[j
*2 + 4], src
[j
*2 + 6])};
118 r4
= _mm_add_ps(r4
, _mm_mul_ps(s
, coeffs
));
120 r4
= _mm_add_ps(r4
, _mm_shuffle_ps(r4
, r4
, _MM_SHUFFLE(0, 1, 2, 3)));
121 r4
= _mm_add_ps(r4
, _mm_movehl_ps(r4
, r4
));
123 dst
[pos
] += _mm_cvtss_f32(r4
);
128 for(float &output
: dst
)
131 for(size_t j
{0};j
< PShiftCoeffs
.size();++j
)
132 ret
+= src
[j
*2] * PShiftCoeffs
[j
];
143 /* NOTE: There seems to be a bit of an inconsistency in how this encoding is
144 * supposed to work. Some references, such as
146 * http://members.tripod.com/martin_leese/Ambisonic/UHJ_file_format.html
148 * specify a pre-scaling of sqrt(2) on the W channel input, while other
149 * references, such as
151 * https://en.wikipedia.org/wiki/Ambisonic_UHJ_format#Encoding.5B1.5D
153 * https://wiki.xiph.org/Ambisonics#UHJ_format
155 * do not. The sqrt(2) scaling is in line with B-Format decoder coefficients
156 * which include such a scaling for the W channel input, however the original
157 * source for this equation is a 1985 paper by Michael Gerzon, which does not
158 * apparently include the scaling. Applying the extra scaling creates a louder
159 * result with a narrower stereo image compared to not scaling, and I don't
160 * know which is the intended result.
163 void Uhj2Encoder::encode(FloatBufferLine
&LeftOut
, FloatBufferLine
&RightOut
,
164 const FloatBufferLine
*InSamples
, const size_t SamplesToDo
)
166 ASSUME(SamplesToDo
> 0);
168 float *RESTRICT left
{al::assume_aligned
<16>(LeftOut
.data())};
169 float *RESTRICT right
{al::assume_aligned
<16>(RightOut
.data())};
171 const float *RESTRICT winput
{al::assume_aligned
<16>(InSamples
[0].data())};
172 const float *RESTRICT xinput
{al::assume_aligned
<16>(InSamples
[1].data())};
173 const float *RESTRICT yinput
{al::assume_aligned
<16>(InSamples
[2].data())};
175 /* Combine the previously delayed mid/side signal with the input. */
177 /* S = 0.9396926*W + 0.1855740*X */
178 auto miditer
= std::copy(mMidDelay
.cbegin(), mMidDelay
.cend(), mMid
.begin());
179 std::transform(winput
, winput
+SamplesToDo
, xinput
, miditer
,
180 [](const float w
, const float x
) noexcept
-> float
181 { return 0.9396926f
*w
+ 0.1855740f
*x
; });
183 /* D = 0.6554516*Y */
184 auto sideiter
= std::copy(mSideDelay
.cbegin(), mSideDelay
.cend(), mSide
.begin());
185 std::transform(yinput
, yinput
+SamplesToDo
, sideiter
,
186 [](const float y
) noexcept
-> float { return 0.6554516f
*y
; });
188 /* Include any existing direct signal in the mid/side buffers. */
189 for(size_t i
{0};i
< SamplesToDo
;++i
,++miditer
)
190 *miditer
+= left
[i
] + right
[i
];
191 for(size_t i
{0};i
< SamplesToDo
;++i
,++sideiter
)
192 *sideiter
+= left
[i
] - right
[i
];
194 /* Copy the future samples back to the delay buffers for next time. */
195 std::copy_n(mMid
.cbegin()+SamplesToDo
, mMidDelay
.size(), mMidDelay
.begin());
196 std::copy_n(mSide
.cbegin()+SamplesToDo
, mSideDelay
.size(), mSideDelay
.begin());
198 /* Now add the all-passed signal into the side signal. */
200 /* D += j(-0.3420201*W + 0.5098604*X) */
201 auto tmpiter
= std::copy(mSideHistory
.cbegin(), mSideHistory
.cend(), mTemp
.begin());
202 std::transform(winput
, winput
+SamplesToDo
, xinput
, tmpiter
,
203 [](const float w
, const float x
) noexcept
-> float
204 { return -0.3420201f
*w
+ 0.5098604f
*x
; });
205 std::copy_n(mTemp
.cbegin()+SamplesToDo
, mSideHistory
.size(), mSideHistory
.begin());
206 allpass_process({mSide
.data(), SamplesToDo
}, mTemp
.data());
208 /* Left = (S + D)/2.0 */
209 for(size_t i
{0};i
< SamplesToDo
;i
++)
210 left
[i
] = (mMid
[i
] + mSide
[i
]) * 0.5f
;
211 /* Right = (S - D)/2.0 */
212 for(size_t i
{0};i
< SamplesToDo
;i
++)
213 right
[i
] = (mMid
[i
] - mSide
[i
]) * 0.5f
;