#include "config.h"

#include "uhjfilter.h"

#include <algorithm>
#include <cmath>
#include <complex>
#include <functional>
#include <vector>

#include "alcomplex.h"
#include "almalloc.h"
#include "alnumbers.h"
#include "core/bufferline.h"
#include "opthelpers.h"
#include "pffft.h"
#include "phase_shifter.h"
#include "vector.h"


namespace {

template<std::size_t A, typename T, std::size_t N>
constexpr auto assume_aligned_span(const al::span<T,N> s) noexcept -> al::span<T,N>
{ return al::span<T,N>{al::assume_aligned<A>(s.data()), s.size()}; }

/* Convolution is implemented using a segmented overlap-add method. The filter
 * response is broken up into multiple segments of 128 samples, and each
 * segment has an FFT applied with a 256-sample buffer (the latter half left
 * silent) to get its frequency-domain response.
 *
 * Input samples are similarly broken up into 128-sample segments, with a 256-
 * sample FFT applied to each new incoming segment to get its frequency-domain
 * response. A history of FFT'd input segments is maintained, equal to the
 * number of filter response segments.
 *
 * To apply the convolution, each filter response segment is convolved with its
 * paired input segment (using complex multiplies, far cheaper than time-domain
 * FIRs), accumulating into an FFT buffer. The input history is then shifted to
 * align with later filter response segments for the next input segment.
 *
 * An inverse FFT is then applied to the accumulated FFT buffer to get a 256-
 * sample time-domain response for output, which is split in two halves. The
 * first half is the 128-sample output, and the second half is a 128-sample
 * (really, 127) delayed extension, which gets added to the output next time.
 * Convolving two time-domain responses of length N results in a time-domain
 * signal of length N*2 - 1, and this holds true regardless of the convolution
 * being applied in the frequency domain, so these "overflow" samples need to
 * be accounted for.
 */
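/* As a rough illustration (assuming UhjLength256 and UhjLength512, which are
 * instantiated at the end of this file, name 256- and 512-tap filter lengths):
 * a 256-tap filter gives 256/128 = 2 segments and a 512-tap filter gives 4,
 * each handled with a 256-point real FFT. Each completed 128-sample input
 * segment then costs one forward FFT, sNumSegments complex multiply-
 * accumulate passes, and one inverse FFT.
 */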
template<size_t N>
struct SegmentedFilter {
    static constexpr size_t sFftLength{256};
    static constexpr size_t sSampleLength{sFftLength / 2};
    static constexpr size_t sNumSegments{N/sSampleLength};
    static_assert(N >= sFftLength);
    static_assert((N % sSampleLength) == 0);

    PFFFTSetup mFft;
    alignas(16) std::array<float,sFftLength*sNumSegments> mFilterData;

    SegmentedFilter() : mFft{sFftLength, PFFFT_REAL}
    {
        static constexpr size_t fft_size{N};

        /* To set up the filter, we first need to generate the desired
         * response (not reversed).
         */
        auto tmpBuffer = std::vector<double>(fft_size, 0.0);
        for(std::size_t i{0};i < fft_size/2;++i)
        {
            const int k{int{fft_size/2} - static_cast<int>(i*2 + 1)};

            const double w{2.0*al::numbers::pi * static_cast<double>(i*2 + 1)
                / double{fft_size}};
            const double window{0.3635819 - 0.4891775*std::cos(w) + 0.1365995*std::cos(2.0*w)
                - 0.0106411*std::cos(3.0*w)};

            const double pk{al::numbers::pi * static_cast<double>(k)};
            tmpBuffer[i*2 + 1] = window * (1.0-std::cos(pk)) / pk;
        }
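
        /* Only taps an odd distance k from the filter center (fft_size/2) are
         * written, following the ideal (1 - cos(pi*k))/(pi*k) = 2/(pi*k)
         * response of a wide-band +90 degree phase shifter, tapered by a
         * four-term cosine window (the coefficients appear to match the
         * Blackman-Nuttall window).
         */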

        /* The segments of the filter are converted back to the frequency
         * domain, each on their own (0 stuffed).
         */
        using complex_d = std::complex<double>;
        auto fftBuffer = std::vector<complex_d>(sFftLength);
        auto fftTmp = al::vector<float,16>(sFftLength);
        auto filter = mFilterData.begin();
        for(size_t s{0};s < sNumSegments;++s)
        {
            const auto tmpspan = al::span{tmpBuffer}.subspan(sSampleLength*s, sSampleLength);
            auto iter = std::copy_n(tmpspan.cbegin(), tmpspan.size(), fftBuffer.begin());
            std::fill(iter, fftBuffer.end(), complex_d{});
            forward_fft(fftBuffer);

            /* Convert to zdomain data for PFFFT, scaled by the FFT length so
             * the iFFT result will be normalized.
             */
            for(size_t i{0};i < sSampleLength;++i)
            {
                fftTmp[i*2 + 0] = static_cast<float>(fftBuffer[i].real()) / float{sFftLength};
                fftTmp[i*2 + 1] = static_cast<float>((i == 0) ? fftBuffer[sSampleLength].real()
                    : fftBuffer[i].imag()) / float{sFftLength};
            }
            mFft.zreorder(fftTmp.data(), al::to_address(filter), PFFFT_BACKWARD);
            filter += sFftLength;
        }
    }
};

template<size_t N>
const SegmentedFilter<N> gSegmentedFilter;

template<size_t N>
const PhaseShifterT<N> PShifter;


/* Filter coefficients for the 'base' all-pass IIR, which applies a frequency-
 * dependent phase-shift of N degrees. The output of the filter requires a 1-
 * sample delay.
 */
constexpr std::array<float,4> Filter1Coeff{{
    0.479400865589f, 0.876218493539f, 0.976597589508f, 0.997499255936f
}};

/* Filter coefficients for the offset all-pass IIR, which applies a frequency-
 * dependent phase-shift of N+90 degrees.
 */
constexpr std::array<float,4> Filter2Coeff{{
    0.161758498368f, 0.733028932341f, 0.945349700329f, 0.990599156684f
}};

} // namespace
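
/* Each of the four cascaded all-pass sections used below updates two delay
 * elements per sample:
 *
 *     y[n] = c*x[n] + z0;  z0 = z1;  z1 = c*y[n] - x[n];
 *
 * which unrolls to y[n] = c*x[n] + c*y[n-2] - x[n-2], i.e. the second-order
 * all-pass H(z) = (c - z^-2) / (1 - c*z^-2). Chaining four such sections with
 * the coefficient sets above yields the phase responses described for
 * Filter1Coeff and Filter2Coeff.
 */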

void UhjAllPassFilter::processOne(const al::span<const float, 4> coeffs, float x)
{
    auto state = mState;
    for(size_t i{0};i < 4;++i)
    {
        const float y{x*coeffs[i] + state[i].z[0]};
        state[i].z[0] = state[i].z[1];
        state[i].z[1] = y*coeffs[i] - x;
        x = y;
    }
    mState = state;
}

void UhjAllPassFilter::process(const al::span<const float,4> coeffs,
    const al::span<const float> src, const bool updateState, const al::span<float> dst)
{
    auto state = mState;

    auto proc_sample = [&state,coeffs](float x) noexcept -> float
    {
        for(size_t i{0};i < 4;++i)
        {
            const float y{x*coeffs[i] + state[i].z[0]};
            state[i].z[0] = state[i].z[1];
            state[i].z[1] = y*coeffs[i] - x;
            x = y;
        }
        return x;
    };
    std::transform(src.begin(), src.end(), dst.begin(), proc_sample);
    if(updateState) LIKELY mState = state;
}


/* Encoding UHJ from B-Format is done as:
 *
 * S = 0.9396926*W + 0.1855740*X
 * D = j(-0.3420201*W + 0.5098604*X) + 0.6554516*Y
 *
 * Left = (S + D)/2.0
 * Right = (S - D)/2.0
 * T = j(-0.1432*W + 0.6512*X) - 0.7071068*Y
 * Q = 0.9772*Z
 *
 * where j is a wide-band +90 degree phase shift. 3-channel UHJ excludes Q,
 * while 2-channel excludes Q and T.
 *
 * The phase shift is done using a linear FIR filter derived from an FFT'd
 * impulse with the desired shift.
 */
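/* Note that with Left = (S + D)/2 and Right = (S - D)/2, the sum Left + Right
 * recovers S and the difference Left - Right recovers D, which is exactly what
 * the decoders further below start from.
 */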

template<size_t N>
void UhjEncoder<N>::encode(float *LeftOut, float *RightOut,
    const al::span<const float*const,3> InSamples, const size_t SamplesToDo)
{
    static constexpr auto &Filter = gSegmentedFilter<N>;
    static_assert(sFftLength == Filter.sFftLength);
    static_assert(sSegmentSize == Filter.sSampleLength);
    static_assert(sNumSegments == Filter.sNumSegments);

    ASSUME(SamplesToDo > 0);
    ASSUME(SamplesToDo <= BufferLineSize);

    const auto winput = al::span{al::assume_aligned<16>(InSamples[0]), SamplesToDo};
    const auto xinput = al::span{al::assume_aligned<16>(InSamples[1]), SamplesToDo};
    const auto yinput = al::span{al::assume_aligned<16>(InSamples[2]), SamplesToDo};

    std::copy_n(winput.begin(), SamplesToDo, mW.begin()+sFilterDelay);
    std::copy_n(xinput.begin(), SamplesToDo, mX.begin()+sFilterDelay);
    std::copy_n(yinput.begin(), SamplesToDo, mY.begin()+sFilterDelay);

    /* S = 0.9396926*W + 0.1855740*X */
    std::transform(mW.begin(), mW.begin()+SamplesToDo, mX.begin(), mS.begin(),
        [](const float w, const float x) noexcept { return 0.9396926f*w + 0.1855740f*x; });

    /* Precompute j(-0.3420201*W + 0.5098604*X) and store in mD. */
    auto dstore = mD.begin();
    size_t curseg{mCurrentSegment};
    for(size_t base{0};base < SamplesToDo;)
    {
        const size_t todo{std::min(sSegmentSize-mFifoPos, SamplesToDo-base)};
        auto wseg = winput.subspan(base, todo);
        auto xseg = xinput.subspan(base, todo);
        auto wxio = al::span{mWXInOut}.subspan(mFifoPos, todo);

        /* Copy out the samples that were previously processed by the FFT. */
        dstore = std::copy_n(wxio.begin(), todo, dstore);

        /* Transform the non-delayed input and store in the front half of the
         * filter input.
         */
        std::transform(wseg.begin(), wseg.end(), xseg.begin(), wxio.begin(),
            [](const float w, const float x) noexcept -> float
            { return -0.3420201f*w + 0.5098604f*x; });

        mFifoPos += todo;
        base += todo;

        /* Check whether the input buffer is filled with new samples. */
        if(mFifoPos < sSegmentSize) break;
        mFifoPos = 0;

        /* Copy the new input to the next history segment, clearing the back
         * half of the segment, and convert to the frequency domain.
         */
        auto input = mWXHistory.begin() + curseg*sFftLength;
        std::copy_n(mWXInOut.begin(), sSegmentSize, input);
        std::fill_n(input+sSegmentSize, sSegmentSize, 0.0f);

        Filter.mFft.transform(al::to_address(input), al::to_address(input), mWorkData.data(),
            PFFFT_FORWARD);

        /* Convolve each input segment with its IR filter counterpart (aligned
         * in time, from newest to oldest).
         */
        mFftBuffer.fill(0.0f);
        auto filter = Filter.mFilterData.begin();
        for(size_t s{curseg};s < sNumSegments;++s)
        {
            Filter.mFft.zconvolve_accumulate(al::to_address(input), al::to_address(filter),
                mFftBuffer.data());
            input += sFftLength;
            filter += sFftLength;
        }
        input = mWXHistory.begin();
        for(size_t s{0};s < curseg;++s)
        {
            Filter.mFft.zconvolve_accumulate(al::to_address(input), al::to_address(filter),
                mFftBuffer.data());
            input += sFftLength;
            filter += sFftLength;
        }

        /* Convert back to samples, writing to the output and storing the extra
         * for next time.
         */
        Filter.mFft.transform(mFftBuffer.data(), mFftBuffer.data(), mWorkData.data(),
            PFFFT_BACKWARD);

        std::transform(mFftBuffer.begin(), mFftBuffer.begin()+sSegmentSize,
            mWXInOut.begin()+sSegmentSize, mWXInOut.begin(), std::plus{});
        std::copy_n(mFftBuffer.begin()+sSegmentSize, sSegmentSize, mWXInOut.begin()+sSegmentSize);

        /* Shift the input history. */
        curseg = curseg ? (curseg-1) : (sNumSegments-1);
    }
    mCurrentSegment = curseg;

    /* D = j(-0.3420201*W + 0.5098604*X) + 0.6554516*Y */
    std::transform(mD.begin(), mD.begin()+SamplesToDo, mY.begin(), mD.begin(),
        [](const float jwx, const float y) noexcept { return jwx + 0.6554516f*y; });

    /* Copy the future samples to the front for next time. */
    std::copy(mW.cbegin()+SamplesToDo, mW.cbegin()+SamplesToDo+sFilterDelay, mW.begin());
    std::copy(mX.cbegin()+SamplesToDo, mX.cbegin()+SamplesToDo+sFilterDelay, mX.begin());
    std::copy(mY.cbegin()+SamplesToDo, mY.cbegin()+SamplesToDo+sFilterDelay, mY.begin());

    /* Apply a delay to the existing output to align with the input delay. */
    auto delayBuffer = mDirectDelay.begin();
    for(float *buffer : {LeftOut, RightOut})
    {
        const auto distbuf = assume_aligned_span<16>(al::span{*delayBuffer});
        ++delayBuffer;

        const auto inout = al::span{al::assume_aligned<16>(buffer), SamplesToDo};
        if(SamplesToDo >= sFilterDelay)
        {
            auto delay_end = std::rotate(inout.begin(), inout.end() - sFilterDelay, inout.end());
            std::swap_ranges(inout.begin(), delay_end, distbuf.begin());
        }
        else
        {
            auto delay_start = std::swap_ranges(inout.begin(), inout.end(), distbuf.begin());
            std::rotate(distbuf.begin(), delay_start, distbuf.begin() + sFilterDelay);
        }
    }

    /* Combine the direct signal with the produced output. */

    /* Left = (S + D)/2.0 */
    const auto left = al::span{al::assume_aligned<16>(LeftOut), SamplesToDo};
    for(size_t i{0};i < SamplesToDo;++i)
        left[i] += (mS[i] + mD[i]) * 0.5f;

    /* Right = (S - D)/2.0 */
    const auto right = al::span{al::assume_aligned<16>(RightOut), SamplesToDo};
    for(size_t i{0};i < SamplesToDo;++i)
        right[i] += (mS[i] - mD[i]) * 0.5f;
}

/* This encoding implementation uses two sets of four chained IIR filters to
 * produce the desired relative phase shift. The first filter chain produces a
 * phase shift of varying degrees over a wide range of frequencies, while the
 * second filter chain produces a phase shift 90 degrees ahead of the first
 * over the same range. Further details are described here:
 *
 * https://web.archive.org/web/20060708031958/http://www.biochem.oulu.fi/~oniemita/dsp/hilbert/
 *
 * 2-channel UHJ output requires the use of three filter chains. The S channel
 * output uses a Filter1 chain on the W and X channel mix, while the D channel
 * output uses a Filter1 chain on the Y channel plus a Filter2 chain on the W
 * and X channel mix. This results in the W and X input mix on the D channel
 * output having the required +90 degree phase shift relative to the other
 * inputs.
 */
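/* Since a Filter1 chain's output requires a 1-sample delay (see Filter1Coeff
 * above), each Filter1 result below is written starting at index 1 of its
 * destination, with index 0 patched from the last sample of the previous call
 * (mDelayWX, mDelayY, mDirectDelay). The Filter2 output needs no such delay,
 * which preserves the relative +90 degree shift between the two chains.
 */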
void UhjEncoderIIR::encode(float *LeftOut, float *RightOut,
    const al::span<const float *const, 3> InSamples, const size_t SamplesToDo)
{
    ASSUME(SamplesToDo > 0);
    ASSUME(SamplesToDo <= BufferLineSize);

    const auto winput = al::span{al::assume_aligned<16>(InSamples[0]), SamplesToDo};
    const auto xinput = al::span{al::assume_aligned<16>(InSamples[1]), SamplesToDo};
    const auto yinput = al::span{al::assume_aligned<16>(InSamples[2]), SamplesToDo};

    /* S = 0.9396926*W + 0.1855740*X */
    std::transform(winput.begin(), winput.end(), xinput.begin(), mTemp.begin(),
        [](const float w, const float x) noexcept { return 0.9396926f*w + 0.1855740f*x; });
    mFilter1WX.process(Filter1Coeff, al::span{mTemp}.first(SamplesToDo), true,
        al::span{mS}.subspan(1));
    mS[0] = mDelayWX; mDelayWX = mS[SamplesToDo];

    /* Precompute j(-0.3420201*W + 0.5098604*X) and store in mWX. */
    std::transform(winput.begin(), winput.end(), xinput.begin(), mTemp.begin(),
        [](const float w, const float x) noexcept { return -0.3420201f*w + 0.5098604f*x; });
    mFilter2WX.process(Filter2Coeff, al::span{mTemp}.first(SamplesToDo), true, mWX);

    /* Apply filter1 to Y and store in mD. */
    mFilter1Y.process(Filter1Coeff, yinput, true, al::span{mD}.subspan(1));
    mD[0] = mDelayY; mDelayY = mD[SamplesToDo];

    /* D = j(-0.3420201*W + 0.5098604*X) + 0.6554516*Y */
    std::transform(mWX.begin(), mWX.begin()+SamplesToDo, mD.begin(), mD.begin(),
        [](const float jwx, const float y) noexcept { return jwx + 0.6554516f*y; });

    /* Apply the base filter to the existing output to align with the processed
     * signal.
     */
    const auto left = al::span{al::assume_aligned<16>(LeftOut), SamplesToDo};
    mFilter1Direct[0].process(Filter1Coeff, left, true, al::span{mTemp}.subspan(1));
    mTemp[0] = mDirectDelay[0]; mDirectDelay[0] = mTemp[SamplesToDo];

    /* Left = (S + D)/2.0 */
    for(size_t i{0};i < SamplesToDo;++i)
        left[i] = (mS[i] + mD[i])*0.5f + mTemp[i];

    const auto right = al::span{al::assume_aligned<16>(RightOut), SamplesToDo};
    mFilter1Direct[1].process(Filter1Coeff, right, true, al::span{mTemp}.subspan(1));
    mTemp[0] = mDirectDelay[1]; mDirectDelay[1] = mTemp[SamplesToDo];

    /* Right = (S - D)/2.0 */
    for(size_t i{0};i < SamplesToDo;++i)
        right[i] = (mS[i] - mD[i])*0.5f + mTemp[i];
}


/* Decoding UHJ is done as:
 *
 * S = Left + Right
 * D = Left - Right
 *
 * W = 0.981532*S + 0.197484*j(0.828331*D + 0.767820*T)
 * X = 0.418496*S - j(0.828331*D + 0.767820*T)
 * Y = 0.795968*D - 0.676392*T + j(0.186633*S)
 * Z = 1.023332*Q
 *
 * where j is a +90 degree phase shift. 3-channel UHJ excludes Q, while 2-
 * channel excludes Q and T.
 */
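/* For the FIR (phase-shifter) variant below, the caller provides
 * sInputPadding extra samples past samplesToDo on each input line. Those
 * "future" samples, together with the per-channel history buffers
 * (mDTHistory, mSHistory), feed the linear FIR phase shifter so its delay is
 * compensated in the output.
 */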
template<size_t N>
void UhjDecoder<N>::decode(const al::span<float*> samples, const size_t samplesToDo,
    const bool updateState)
{
    static_assert(sInputPadding <= sMaxPadding, "Filter padding is too large");

    constexpr auto &PShift = PShifter<N>;

    ASSUME(samplesToDo > 0);
    ASSUME(samplesToDo <= BufferLineSize);

    {
        const auto left = al::span{al::assume_aligned<16>(samples[0]), samplesToDo+sInputPadding};
        const auto right = al::span{al::assume_aligned<16>(samples[1]), samplesToDo+sInputPadding};
        const auto t = al::span{al::assume_aligned<16>(samples[2]), samplesToDo+sInputPadding};

        /* S = Left + Right */
        std::transform(left.begin(), left.end(), right.begin(), mS.begin(), std::plus{});

        /* D = Left - Right */
        std::transform(left.begin(), left.end(), right.begin(), mD.begin(), std::minus{});

        /* T */
        std::copy(t.begin(), t.end(), mT.begin());
    }

    const auto woutput = al::span{al::assume_aligned<16>(samples[0]), samplesToDo};
    const auto xoutput = al::span{al::assume_aligned<16>(samples[1]), samplesToDo};
    const auto youtput = al::span{al::assume_aligned<16>(samples[2]), samplesToDo};

    /* Precompute j(0.828331*D + 0.767820*T) and store in xoutput. */
    auto tmpiter = std::copy(mDTHistory.cbegin(), mDTHistory.cend(), mTemp.begin());
    std::transform(mD.cbegin(), mD.cbegin()+samplesToDo+sInputPadding, mT.cbegin(), tmpiter,
        [](const float d, const float t) noexcept { return 0.828331f*d + 0.767820f*t; });
    if(updateState) LIKELY
        std::copy_n(mTemp.cbegin()+samplesToDo, mDTHistory.size(), mDTHistory.begin());
    PShift.process(xoutput, mTemp);

    /* W = 0.981532*S + 0.197484*j(0.828331*D + 0.767820*T) */
    std::transform(mS.begin(), mS.begin()+samplesToDo, xoutput.begin(), woutput.begin(),
        [](const float s, const float jdt) noexcept { return 0.981532f*s + 0.197484f*jdt; });

    /* X = 0.418496*S - j(0.828331*D + 0.767820*T) */
    std::transform(mS.begin(), mS.begin()+samplesToDo, xoutput.begin(), xoutput.begin(),
        [](const float s, const float jdt) noexcept { return 0.418496f*s - jdt; });

    /* Precompute j*S and store in youtput. */
    tmpiter = std::copy(mSHistory.cbegin(), mSHistory.cend(), mTemp.begin());
    std::copy_n(mS.cbegin(), samplesToDo+sInputPadding, tmpiter);
    if(updateState) LIKELY
        std::copy_n(mTemp.cbegin()+samplesToDo, mSHistory.size(), mSHistory.begin());
    PShift.process(youtput, mTemp);

    /* Y = 0.795968*D - 0.676392*T + j(0.186633*S) */
    for(size_t i{0};i < samplesToDo;++i)
        youtput[i] = 0.795968f*mD[i] - 0.676392f*mT[i] + 0.186633f*youtput[i];

    if(samples.size() > 3)
    {
        const auto zoutput = al::span{al::assume_aligned<16>(samples[3]), samplesToDo};
        /* Z = 1.023332*Q */
        std::transform(zoutput.begin(), zoutput.end(), zoutput.begin(),
            [](const float q) noexcept { return 1.023332f*q; });
    }
}

void UhjDecoderIIR::decode(const al::span<float*> samples, const size_t samplesToDo,
    const bool updateState)
{
    static_assert(sInputPadding <= sMaxPadding, "Filter padding is too large");

    ASSUME(samplesToDo > 0);
    ASSUME(samplesToDo <= BufferLineSize);

    {
        const auto left = al::span{al::assume_aligned<16>(samples[0]), samplesToDo+sInputPadding};
        const auto right = al::span{al::assume_aligned<16>(samples[1]), samplesToDo+sInputPadding};

        /* S = Left + Right */
        std::transform(left.begin(), left.end(), right.begin(), mS.begin(), std::plus{});

        /* D = Left - Right */
        std::transform(left.begin(), left.end(), right.begin(), mD.begin(), std::minus{});
    }

    const auto woutput = al::span{al::assume_aligned<16>(samples[0]), samplesToDo};
    const auto xoutput = al::span{al::assume_aligned<16>(samples[1]), samplesToDo};
    const auto youtput = al::span{al::assume_aligned<16>(samples[2]), samplesToDo+sInputPadding};

    /* Precompute j(0.828331*D + 0.767820*T) and store in xoutput. */
    std::transform(mD.cbegin(), mD.cbegin()+sInputPadding+samplesToDo, youtput.begin(),
        mTemp.begin(),
        [](const float d, const float t) noexcept { return 0.828331f*d + 0.767820f*t; });
    if(mFirstRun) mFilter2DT.processOne(Filter2Coeff, mTemp[0]);
    mFilter2DT.process(Filter2Coeff, al::span{mTemp}.subspan(1,samplesToDo), updateState, xoutput);

    /* Apply filter1 to S and store in mTemp. */
    mFilter1S.process(Filter1Coeff, al::span{mS}.first(samplesToDo), updateState, mTemp);

    /* W = 0.981532*S + 0.197484*j(0.828331*D + 0.767820*T) */
    std::transform(mTemp.begin(), mTemp.begin()+samplesToDo, xoutput.begin(), woutput.begin(),
        [](const float s, const float jdt) noexcept { return 0.981532f*s + 0.197484f*jdt; });
    /* X = 0.418496*S - j(0.828331*D + 0.767820*T) */
    std::transform(mTemp.begin(), mTemp.begin()+samplesToDo, xoutput.begin(), xoutput.begin(),
        [](const float s, const float jdt) noexcept { return 0.418496f*s - jdt; });

    /* Apply filter1 to (0.795968*D - 0.676392*T) and store in mTemp. */
    std::transform(mD.cbegin(), mD.cbegin()+samplesToDo, youtput.begin(), youtput.begin(),
        [](const float d, const float t) noexcept { return 0.795968f*d - 0.676392f*t; });
    mFilter1DT.process(Filter1Coeff, youtput.first(samplesToDo), updateState, mTemp);

    /* Precompute j*S and store in youtput. */
    if(mFirstRun) mFilter2S.processOne(Filter2Coeff, mS[0]);
    mFilter2S.process(Filter2Coeff, al::span{mS}.subspan(1, samplesToDo), updateState, youtput);

    /* Y = 0.795968*D - 0.676392*T + j(0.186633*S) */
    std::transform(mTemp.begin(), mTemp.begin()+samplesToDo, youtput.begin(), youtput.begin(),
        [](const float dt, const float js) noexcept { return dt + 0.186633f*js; });

    if(samples.size() > 3)
    {
        const auto zoutput = al::span{al::assume_aligned<16>(samples[3]), samplesToDo};

        /* Apply filter1 to Q and store in mTemp. */
        mFilter1Q.process(Filter1Coeff, zoutput, updateState, mTemp);

        /* Z = 1.023332*Q */
        std::transform(mTemp.begin(), mTemp.begin()+samplesToDo, zoutput.begin(),
            [](const float q) noexcept { return 1.023332f*q; });
    }

    mFirstRun = false;
}


/* Super Stereo processing is done as:
 *
 * S = Left + Right
 * D = Left - Right
 *
 * W = 0.6098637*S + 0.6896511*j*w*D
 * X = 0.8624776*S - 0.7626955*j*w*D
 * Y = 1.6822415*w*D + 0.2156194*j*S
 *
 * where j is a +90 degree phase shift. w is a variable control for the
 * resulting stereo width, with the range 0 <= w <= 0.7.
 */
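/* When the width control changes between calls, the decoders below ramp the
 * applied factor linearly from the previous value to the new target across
 * the first samplesToDo samples (wcurrent + wstep*i), then apply the final
 * width to the sInputPadding look-ahead samples.
 */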
template<size_t N>
void UhjStereoDecoder<N>::decode(const al::span<float*> samples, const size_t samplesToDo,
    const bool updateState)
{
    static_assert(sInputPadding <= sMaxPadding, "Filter padding is too large");

    constexpr auto &PShift = PShifter<N>;

    ASSUME(samplesToDo > 0);
    ASSUME(samplesToDo <= BufferLineSize);

    {
        const auto left = al::span{al::assume_aligned<16>(samples[0]), samplesToDo+sInputPadding};
        const auto right = al::span{al::assume_aligned<16>(samples[1]), samplesToDo+sInputPadding};

        std::transform(left.begin(), left.end(), right.begin(), mS.begin(), std::plus{});

        /* Pre-apply the width factor to the difference signal D. Smoothly
         * interpolate when it changes.
         */
        const float wtarget{mWidthControl};
        const float wcurrent{(mCurrentWidth < 0.0f) ? wtarget : mCurrentWidth};
        if(wtarget == wcurrent || !updateState)
        {
            std::transform(left.begin(), left.end(), right.begin(), mD.begin(),
                [wcurrent](const float l, const float r) noexcept { return (l-r) * wcurrent; });
            mCurrentWidth = wcurrent;
        }
        else
        {
            const float wstep{(wtarget - wcurrent) / static_cast<float>(samplesToDo)};
            float fi{0.0f};

            const auto lfade = left.first(samplesToDo);
            auto dstore = std::transform(lfade.begin(), lfade.end(), right.begin(), mD.begin(),
                [wcurrent,wstep,&fi](const float l, const float r) noexcept
                {
                    const float ret{(l-r) * (wcurrent + wstep*fi)};
                    fi += 1.0f;
                    return ret;
                });

            const auto lend = left.subspan(samplesToDo);
            const auto rend = right.subspan(samplesToDo);
            std::transform(lend.begin(), lend.end(), rend.begin(), dstore,
                [wtarget](const float l, const float r) noexcept { return (l-r) * wtarget; });
            mCurrentWidth = wtarget;
        }
    }

    const auto woutput = al::span{al::assume_aligned<16>(samples[0]), samplesToDo};
    const auto xoutput = al::span{al::assume_aligned<16>(samples[1]), samplesToDo};
    const auto youtput = al::span{al::assume_aligned<16>(samples[2]), samplesToDo};

    /* Precompute j*D and store in xoutput. */
    auto tmpiter = std::copy(mDTHistory.cbegin(), mDTHistory.cend(), mTemp.begin());
    std::copy_n(mD.cbegin(), samplesToDo+sInputPadding, tmpiter);
    if(updateState) LIKELY
        std::copy_n(mTemp.cbegin()+samplesToDo, mDTHistory.size(), mDTHistory.begin());
    PShift.process(xoutput, mTemp);

    /* W = 0.6098637*S + 0.6896511*j*w*D */
    std::transform(mS.begin(), mS.begin()+samplesToDo, xoutput.begin(), woutput.begin(),
        [](const float s, const float jd) noexcept { return 0.6098637f*s + 0.6896511f*jd; });
    /* X = 0.8624776*S - 0.7626955*j*w*D */
    std::transform(mS.begin(), mS.begin()+samplesToDo, xoutput.begin(), xoutput.begin(),
        [](const float s, const float jd) noexcept { return 0.8624776f*s - 0.7626955f*jd; });

    /* Precompute j*S and store in youtput. */
    tmpiter = std::copy(mSHistory.cbegin(), mSHistory.cend(), mTemp.begin());
    std::copy_n(mS.cbegin(), samplesToDo+sInputPadding, tmpiter);
    if(updateState) LIKELY
        std::copy_n(mTemp.cbegin()+samplesToDo, mSHistory.size(), mSHistory.begin());
    PShift.process(youtput, mTemp);

    /* Y = 1.6822415*w*D + 0.2156194*j*S */
    std::transform(mD.begin(), mD.begin()+samplesToDo, youtput.begin(), youtput.begin(),
        [](const float d, const float js) noexcept { return 1.6822415f*d + 0.2156194f*js; });
}

void UhjStereoDecoderIIR::decode(const al::span<float*> samples, const size_t samplesToDo,
    const bool updateState)
{
    static_assert(sInputPadding <= sMaxPadding, "Filter padding is too large");

    ASSUME(samplesToDo > 0);
    ASSUME(samplesToDo <= BufferLineSize);

    {
        const auto left = al::span{al::assume_aligned<16>(samples[0]), samplesToDo+sInputPadding};
        const auto right = al::span{al::assume_aligned<16>(samples[1]), samplesToDo+sInputPadding};

        std::transform(left.begin(), left.end(), right.begin(), mS.begin(), std::plus{});

        /* Pre-apply the width factor to the difference signal D. Smoothly
         * interpolate when it changes.
         */
        const float wtarget{mWidthControl};
        const float wcurrent{(mCurrentWidth < 0.0f) ? wtarget : mCurrentWidth};
        if(wtarget == wcurrent || !updateState)
        {
            std::transform(left.begin(), left.end(), right.begin(), mD.begin(),
                [wcurrent](const float l, const float r) noexcept
                { return (l-r) * wcurrent; });
            mCurrentWidth = wcurrent;
        }
        else
        {
            const float wstep{(wtarget - wcurrent) / static_cast<float>(samplesToDo)};
            float fi{0.0f};

            const auto lfade = left.first(samplesToDo);
            auto dstore = std::transform(lfade.begin(), lfade.end(), right.begin(), mD.begin(),
                [wcurrent,wstep,&fi](const float l, const float r) noexcept
                {
                    const float ret{(l-r) * (wcurrent + wstep*fi)};
                    fi += 1.0f;
                    return ret;
                });

            const auto lend = left.subspan(samplesToDo);
            const auto rend = right.subspan(samplesToDo);
            std::transform(lend.begin(), lend.end(), rend.begin(), dstore,
                [wtarget](const float l, const float r) noexcept { return (l-r) * wtarget; });
            mCurrentWidth = wtarget;
        }
    }

    const auto woutput = al::span{al::assume_aligned<16>(samples[0]), samplesToDo};
    const auto xoutput = al::span{al::assume_aligned<16>(samples[1]), samplesToDo};
    const auto youtput = al::span{al::assume_aligned<16>(samples[2]), samplesToDo};

    /* Apply filter1 to S and store in mTemp. */
    mFilter1S.process(Filter1Coeff, al::span{mS}.first(samplesToDo), updateState, mTemp);

    /* Precompute j*D and store in xoutput. */
    if(mFirstRun) mFilter2D.processOne(Filter2Coeff, mD[0]);
    mFilter2D.process(Filter2Coeff, al::span{mD}.subspan(1, samplesToDo), updateState, xoutput);

    /* W = 0.6098637*S + 0.6896511*j*w*D */
    std::transform(mTemp.begin(), mTemp.begin()+samplesToDo, xoutput.begin(), woutput.begin(),
        [](const float s, const float jd) noexcept { return 0.6098637f*s + 0.6896511f*jd; });
    /* X = 0.8624776*S - 0.7626955*j*w*D */
    std::transform(mTemp.begin(), mTemp.begin()+samplesToDo, xoutput.begin(), xoutput.begin(),
        [](const float s, const float jd) noexcept { return 0.8624776f*s - 0.7626955f*jd; });

    /* Precompute j*S and store in youtput. */
    if(mFirstRun) mFilter2S.processOne(Filter2Coeff, mS[0]);
    mFilter2S.process(Filter2Coeff, al::span{mS}.subspan(1, samplesToDo), updateState, youtput);

    /* Apply filter1 to D and store in mTemp. */
    mFilter1D.process(Filter1Coeff, al::span{mD}.first(samplesToDo), updateState, mTemp);

    /* Y = 1.6822415*w*D + 0.2156194*j*S */
    std::transform(mTemp.begin(), mTemp.begin()+samplesToDo, youtput.begin(), youtput.begin(),
        [](const float d, const float js) noexcept { return 1.6822415f*d + 0.2156194f*js; });

    mFirstRun = false;
}


template struct UhjEncoder<UhjLength256>;
template struct UhjDecoder<UhjLength256>;
template struct UhjStereoDecoder<UhjLength256>;

template struct UhjEncoder<UhjLength512>;
template struct UhjDecoder<UhjLength512>;
template struct UhjStereoDecoder<UhjLength512>;