6 #ifdef HAVE_SSE_INTRINSICS
15 #include "alcomplex.h"
16 #include "alnumeric.h"
17 #include "opthelpers.h"
22 using complex_d
= std::complex<double>;
24 std::array
<float,Uhj2Encoder::sFilterSize
> GenerateFilter()
26 /* Some notes on this filter construction.
28 * A wide-band phase-shift filter needs a delay to maintain linearity. A
29 * dirac impulse in the center of a time-domain buffer represents a filter
30 * passing all frequencies through as-is with a pure delay. Converting that
31 * to the frequency domain, adjusting the phase of each frequency bin by
32 * +90 degrees, then converting back to the time domain, results in a FIR
33 * filter that applies a +90 degree wide-band phase-shift.
35 * A particularly notable aspect of the time-domain filter response is that
36 * every other coefficient is 0. This allows doubling the effective size of
37 * the filter, by storing only the non-0 coefficients and double-stepping
38 * over the input to apply it.
40 * Additionally, the resulting filter is independent of the sample rate.
41 * The same filter can be applied regardless of the device's sample rate
42 * and achieve the same effect.
44 constexpr size_t fft_size
{Uhj2Encoder::sFilterSize
* 2};
45 constexpr size_t half_size
{fft_size
/ 2};
47 /* Generate a frequency domain impulse with a +90 degree phase offset.
48 * Reconstruct the mirrored frequencies to convert to the time domain.
50 auto fftBuffer
= std::make_unique
<complex_d
[]>(fft_size
);
51 std::fill_n(fftBuffer
.get(), fft_size
, complex_d
{});
52 fftBuffer
[half_size
] = 1.0;
54 forward_fft({fftBuffer
.get(), fft_size
});
55 for(size_t i
{0};i
< half_size
+1;++i
)
56 fftBuffer
[i
] = complex_d
{-fftBuffer
[i
].imag(), fftBuffer
[i
].real()};
57 for(size_t i
{half_size
+1};i
< fft_size
;++i
)
58 fftBuffer
[i
] = std::conj(fftBuffer
[fft_size
- i
]);
59 inverse_fft({fftBuffer
.get(), fft_size
});
61 /* Reverse the filter for simpler processing, and store only the non-0
64 auto ret
= std::make_unique
<std::array
<float,Uhj2Encoder::sFilterSize
>>();
65 auto fftiter
= fftBuffer
.get() + half_size
+ (Uhj2Encoder::sFilterSize
-1);
66 for(float &coeff
: *ret
)
68 coeff
= static_cast<float>(fftiter
->real() / double{fft_size
});
73 alignas(16) const auto PShiftCoeffs
= GenerateFilter();
76 void allpass_process(al::span
<float> dst
, const float *RESTRICT src
)
78 #ifdef HAVE_SSE_INTRINSICS
80 if(size_t todo
{dst
.size()>>1})
83 __m128 r04
{_mm_setzero_ps()};
84 __m128 r14
{_mm_setzero_ps()};
85 for(size_t j
{0};j
< PShiftCoeffs
.size();j
+=4)
87 const __m128 coeffs
{_mm_load_ps(&PShiftCoeffs
[j
])};
88 const __m128 s0
{_mm_loadu_ps(&src
[j
*2])};
89 const __m128 s1
{_mm_loadu_ps(&src
[j
*2 + 4])};
91 __m128 s
{_mm_shuffle_ps(s0
, s1
, _MM_SHUFFLE(2, 0, 2, 0))};
92 r04
= _mm_add_ps(r04
, _mm_mul_ps(s
, coeffs
));
94 s
= _mm_shuffle_ps(s0
, s1
, _MM_SHUFFLE(3, 1, 3, 1));
95 r14
= _mm_add_ps(r14
, _mm_mul_ps(s
, coeffs
));
97 r04
= _mm_add_ps(r04
, _mm_shuffle_ps(r04
, r04
, _MM_SHUFFLE(0, 1, 2, 3)));
98 r04
= _mm_add_ps(r04
, _mm_movehl_ps(r04
, r04
));
99 dst
[pos
++] += _mm_cvtss_f32(r04
);
101 r14
= _mm_add_ps(r14
, _mm_shuffle_ps(r14
, r14
, _MM_SHUFFLE(0, 1, 2, 3)));
102 r14
= _mm_add_ps(r14
, _mm_movehl_ps(r14
, r14
));
103 dst
[pos
++] += _mm_cvtss_f32(r14
);
110 __m128 r4
{_mm_setzero_ps()};
111 for(size_t j
{0};j
< PShiftCoeffs
.size();j
+=4)
113 const __m128 coeffs
{_mm_load_ps(&PShiftCoeffs
[j
])};
114 /* NOTE: This could alternatively be done with two unaligned loads
115 * and a shuffle. Which would be better?
117 const __m128 s
{_mm_setr_ps(src
[j
*2], src
[j
*2 + 2], src
[j
*2 + 4], src
[j
*2 + 6])};
118 r4
= _mm_add_ps(r4
, _mm_mul_ps(s
, coeffs
));
120 r4
= _mm_add_ps(r4
, _mm_shuffle_ps(r4
, r4
, _MM_SHUFFLE(0, 1, 2, 3)));
121 r4
= _mm_add_ps(r4
, _mm_movehl_ps(r4
, r4
));
123 dst
[pos
] += _mm_cvtss_f32(r4
);
128 for(float &output
: dst
)
131 for(size_t j
{0};j
< PShiftCoeffs
.size();++j
)
132 ret
+= src
[j
*2] * PShiftCoeffs
[j
];
143 /* Encoding 2-channel UHJ from B-Format is done as:
145 * S = 0.9396926*W + 0.1855740*X
146 * D = j(-0.3420201*W + 0.5098604*X) + 0.6554516*Y
149 * Right = (S - D)/2.0
151 * where j is a wide-band +90 degree phase shift.
153 * The phase shift is done using a FIR filter derived from an FFT'd impulse
154 * with the desired shift.
157 void Uhj2Encoder::encode(FloatBufferLine
&LeftOut
, FloatBufferLine
&RightOut
,
158 const FloatBufferLine
*InSamples
, const size_t SamplesToDo
)
160 ASSUME(SamplesToDo
> 0);
162 float *RESTRICT left
{al::assume_aligned
<16>(LeftOut
.data())};
163 float *RESTRICT right
{al::assume_aligned
<16>(RightOut
.data())};
165 const float *RESTRICT winput
{al::assume_aligned
<16>(InSamples
[0].data())};
166 const float *RESTRICT xinput
{al::assume_aligned
<16>(InSamples
[1].data())};
167 const float *RESTRICT yinput
{al::assume_aligned
<16>(InSamples
[2].data())};
169 /* Combine the previously delayed mid/side signal with the input. */
171 /* S = 0.9396926*W + 0.1855740*X */
172 auto miditer
= std::copy(mMidDelay
.cbegin(), mMidDelay
.cend(), mMid
.begin());
173 std::transform(winput
, winput
+SamplesToDo
, xinput
, miditer
,
174 [](const float w
, const float x
) noexcept
-> float
175 { return 0.9396926f
*w
+ 0.1855740f
*x
; });
177 /* D = 0.6554516*Y */
178 auto sideiter
= std::copy(mSideDelay
.cbegin(), mSideDelay
.cend(), mSide
.begin());
179 std::transform(yinput
, yinput
+SamplesToDo
, sideiter
,
180 [](const float y
) noexcept
-> float { return 0.6554516f
*y
; });
182 /* Include any existing direct signal in the mid/side buffers. */
183 for(size_t i
{0};i
< SamplesToDo
;++i
,++miditer
)
184 *miditer
+= left
[i
] + right
[i
];
185 for(size_t i
{0};i
< SamplesToDo
;++i
,++sideiter
)
186 *sideiter
+= left
[i
] - right
[i
];
188 /* Copy the future samples back to the delay buffers for next time. */
189 std::copy_n(mMid
.cbegin()+SamplesToDo
, mMidDelay
.size(), mMidDelay
.begin());
190 std::copy_n(mSide
.cbegin()+SamplesToDo
, mSideDelay
.size(), mSideDelay
.begin());
192 /* Now add the all-passed signal into the side signal. */
194 /* D += j(-0.3420201*W + 0.5098604*X) */
195 auto tmpiter
= std::copy(mSideHistory
.cbegin(), mSideHistory
.cend(), mTemp
.begin());
196 std::transform(winput
, winput
+SamplesToDo
, xinput
, tmpiter
,
197 [](const float w
, const float x
) noexcept
-> float
198 { return -0.3420201f
*w
+ 0.5098604f
*x
; });
199 std::copy_n(mTemp
.cbegin()+SamplesToDo
, mSideHistory
.size(), mSideHistory
.begin());
200 allpass_process({mSide
.data(), SamplesToDo
}, mTemp
.data());
202 /* Left = (S + D)/2.0 */
203 for(size_t i
{0};i
< SamplesToDo
;i
++)
204 left
[i
] = (mMid
[i
] + mSide
[i
]) * 0.5f
;
205 /* Right = (S - D)/2.0 */
206 for(size_t i
{0};i
< SamplesToDo
;i
++)
207 right
[i
] = (mMid
[i
] - mSide
[i
]) * 0.5f
;