core/mixer/mixer_neon.cpp

   1 #include "config.h"
   2
   3 #include <arm_neon.h>
   4
   5 #include <algorithm>
   6 #include <array>
   7 #include <cstddef>
   8 #include <limits>
   9 #include <variant>
  10
  11 #include "alnumeric.h"
  12 #include "alspan.h"
  13 #include "core/bsinc_defs.h"
  14 #include "core/bufferline.h"
  15 #include "core/cubic_defs.h"
  16 #include "core/mixer/hrtfdefs.h"
  17 #include "core/resampler_limits.h"
  18 #include "defs.h"
  19 #include "hrtfbase.h"
  20 #include "opthelpers.h"
  21
  22 struct CTag;
  23 struct NEONTag;
  24 struct LerpTag;
  25 struct CubicTag;
  26 struct BSincTag;
  27 struct FastBSincTag;
  28
  29
  30 #if defined(__GNUC__) && !defined(__clang__) && !defined(__ARM_NEON)
  31 #pragma GCC target("fpu=neon")
  32 #endif
  33
  34 using uint = unsigned int;
  35
  36 namespace {
  37
  38 constexpr uint BSincPhaseDiffBits{MixerFracBits - BSincPhaseBits};
  39 constexpr uint BSincPhaseDiffOne{1 << BSincPhaseDiffBits};
  40 constexpr uint BSincPhaseDiffMask{BSincPhaseDiffOne - 1u};
  41
  42 constexpr uint CubicPhaseDiffBits{MixerFracBits - CubicPhaseBits};
  43 constexpr uint CubicPhaseDiffOne{1 << CubicPhaseDiffBits};
  44 constexpr uint CubicPhaseDiffMask{CubicPhaseDiffOne - 1u};
  45
  46 force_inline
  47 void vtranspose4(float32x4_t &x0, float32x4_t &x1, float32x4_t &x2, float32x4_t &x3) noexcept
  48 {
  49     float32x4x2_t t0_{vzipq_f32(x0, x2)};
  50     float32x4x2_t t1_{vzipq_f32(x1, x3)};
  51     float32x4x2_t u0_{vzipq_f32(t0_.val[0], t1_.val[0])};
  52     float32x4x2_t u1_{vzipq_f32(t0_.val[1], t1_.val[1])};
  53     x0 = u0_.val[0];
  54     x1 = u0_.val[1];
  55     x2 = u1_.val[0];
  56     x3 = u1_.val[1];
  57 }
  58
  59 inline float32x4_t set_f4(float l0, float l1, float l2, float l3)
  60 {
  61     float32x4_t ret{vmovq_n_f32(l0)};
  62     ret = vsetq_lane_f32(l1, ret, 1);
  63     ret = vsetq_lane_f32(l2, ret, 2);
  64     ret = vsetq_lane_f32(l3, ret, 3);
  65     return ret;
  66 }
  67
  68 inline void ApplyCoeffs(const al::span<float2> Values, const size_t IrSize,
  69     const ConstHrirSpan Coeffs, const float left, const float right)
  70 {
  71     ASSUME(IrSize >= MinIrLength);
  72     ASSUME(IrSize <= HrirLength);
  73
  74     auto dup_samples = [left,right]() -> float32x4_t
  75     {
  76         float32x2_t leftright2{vset_lane_f32(right, vmov_n_f32(left), 1)};
  77         return vcombine_f32(leftright2, leftright2);
  78     };
  79     const auto leftright4 = dup_samples();
  80
  81     /* Using a loop here instead of std::transform since some builds seem to
  82      * have an issue with accessing an array/span of float32x4_t.
  83      */
  84     for(size_t c{0};c < IrSize;c += 2)
  85     {
  86         auto vals = vld1q_f32(&Values[c][0]);
  87         vals = vmlaq_f32(vals, vld1q_f32(&Coeffs[c][0]), leftright4);
  88         vst1q_f32(&Values[c][0], vals);
  89     }
  90 }
  91
  92 force_inline void MixLine(const al::span<const float> InSamples, const al::span<float> dst,
  93     float &CurrentGain, const float TargetGain, const float delta, const size_t fade_len,
  94     const size_t realign_len, size_t Counter)
  95 {
  96     const auto step = float{(TargetGain-CurrentGain) * delta};
  97
  98     auto pos = size_t{0};
  99     if(std::abs(step) > std::numeric_limits<float>::epsilon())
 100     {
 101         const auto gain = float{CurrentGain};
 102         auto step_count = float{0.0f};
 103         /* Mix with applying gain steps in aligned multiples of 4. */
 104         if(const size_t todo{fade_len >> 2})
 105         {
 106             const auto four4 = vdupq_n_f32(4.0f);
 107             const auto step4 = vdupq_n_f32(step);
 108             const auto gain4 = vdupq_n_f32(gain);
 109             auto step_count4 = set_f4(0.0f, 1.0f, 2.0f, 3.0f);
 110
 111             const auto in4 = al::span{reinterpret_cast<const float32x4_t*>(InSamples.data()),
 112                 InSamples.size()/4}.first(todo);
 113             const auto out4 = al::span{reinterpret_cast<float32x4_t*>(dst.data()), dst.size()/4};
 114             std::transform(in4.begin(), in4.end(), out4.begin(), out4.begin(),
 115                 [gain4,step4,four4,&step_count4](const float32x4_t val4, float32x4_t dry4)
 116                 {
 117                     /* dry += val * (gain + step*step_count) */
 118                     dry4 = vmlaq_f32(dry4, val4, vmlaq_f32(gain4, step4, step_count4));
 119                     step_count4 = vaddq_f32(step_count4, four4);
 120                     return dry4;
 121                 });
 122             pos += in4.size()*4;
 123
 124             /* NOTE: step_count4 now represents the next four counts after the
 125              * last four mixed samples, so the lowest element represents the
 126              * next step count to apply.
 127              */
 128             step_count = vgetq_lane_f32(step_count4, 0);
 129         }
 130         /* Mix with applying left over gain steps that aren't aligned multiples of 4. */
 131         if(const size_t leftover{fade_len&3})
 132         {
 133             const auto in = InSamples.subspan(pos, leftover);
 134             const auto out = dst.subspan(pos);
 135
 136             std::transform(in.begin(), in.end(), out.begin(), out.begin(),
 137                 [gain,step,&step_count](const float val, float dry) noexcept -> float
 138                 {
 139                     dry += val * (gain + step*step_count);
 140                     step_count += 1.0f;
 141                     return dry;
 142                 });
 143             pos += leftover;
 144         }
 145         if(pos < Counter)
 146         {
 147             CurrentGain = gain + step*step_count;
 148             return;
 149         }
 150
 151         /* Mix until pos is aligned with 4 or the mix is done. */
 152         if(const size_t leftover{realign_len&3})
 153         {
 154             const auto in = InSamples.subspan(pos, leftover);
 155             const auto out = dst.subspan(pos);
 156
 157             std::transform(in.begin(), in.end(), out.begin(), out.begin(),
 158                 [TargetGain](const float val, const float dry) noexcept -> float
 159                 { return dry + val*TargetGain; });
 160             pos += leftover;
 161         }
 162     }
 163     CurrentGain = TargetGain;
 164
 165     if(!(std::abs(TargetGain) > GainSilenceThreshold))
 166         return;
 167     if(const size_t todo{(InSamples.size()-pos) >> 2})
 168     {
 169         const auto in4 = al::span{reinterpret_cast<const float32x4_t*>(InSamples.data()),
 170             InSamples.size()/4}.last(todo);
 171         const auto out = dst.subspan(pos);
 172         const auto out4 = al::span{reinterpret_cast<float32x4_t*>(out.data()), out.size()/4};
 173
 174         const auto gain4 = vdupq_n_f32(TargetGain);
 175         std::transform(in4.begin(), in4.end(), out4.begin(), out4.begin(),
 176             [gain4](const float32x4_t val4, const float32x4_t dry4) -> float32x4_t
 177             { return vmlaq_f32(dry4, val4, gain4); });
 178         pos += in4.size()*4;
 179     }
 180     if(const size_t leftover{(InSamples.size()-pos)&3})
 181     {
 182         const auto in = InSamples.last(leftover);
 183         const auto out = dst.subspan(pos);
 184
 185         std::transform(in.begin(), in.end(), out.begin(), out.begin(),
 186             [TargetGain](const float val, const float dry) noexcept -> float
 187             { return dry + val*TargetGain; });
 188     }
 189 }
 190
 191 } // namespace
 192
 193 template<>
 194 void Resample_<LerpTag,NEONTag>(const InterpState*, const al::span<const float> src, uint frac,
 195     const uint increment, const al::span<float> dst)
 196 {
 197     ASSUME(frac < MixerFracOne);
 198
 199     const uint32x4_t increment4 = vdupq_n_u32(increment*4u);
 200     const float32x4_t fracOne4 = vdupq_n_f32(1.0f/MixerFracOne);
 201     const uint32x4_t fracMask4 = vdupq_n_u32(MixerFracMask);
 202
 203     alignas(16) std::array<uint,4> pos_{}, frac_{};
 204     InitPosArrays(MaxResamplerEdge, frac, increment, al::span{frac_}, al::span{pos_});
 205     uint32x4_t frac4 = vld1q_u32(frac_.data());
 206     uint32x4_t pos4 = vld1q_u32(pos_.data());
 207
 208     auto vecout = al::span{reinterpret_cast<float32x4_t*>(dst.data()), dst.size()/4};
 209     std::generate(vecout.begin(), vecout.end(), [=,&pos4,&frac4]() -> float32x4_t
 210     {
 211         const uint pos0{vgetq_lane_u32(pos4, 0)};
 212         const uint pos1{vgetq_lane_u32(pos4, 1)};
 213         const uint pos2{vgetq_lane_u32(pos4, 2)};
 214         const uint pos3{vgetq_lane_u32(pos4, 3)};
 215         ASSUME(pos0 <= pos1); ASSUME(pos1 <= pos2); ASSUME(pos2 <= pos3);
 216         const float32x4_t val1{set_f4(src[pos0], src[pos1], src[pos2], src[pos3])};
 217         const float32x4_t val2{set_f4(src[pos0+1_uz], src[pos1+1_uz], src[pos2+1_uz], src[pos3+1_uz])};
 218
 219         /* val1 + (val2-val1)*mu */
 220         const float32x4_t r0{vsubq_f32(val2, val1)};
 221         const float32x4_t mu{vmulq_f32(vcvtq_f32_u32(frac4), fracOne4)};
 222         const float32x4_t out{vmlaq_f32(val1, mu, r0)};
 223
 224         frac4 = vaddq_u32(frac4, increment4);
 225         pos4 = vaddq_u32(pos4, vshrq_n_u32(frac4, MixerFracBits));
 226         frac4 = vandq_u32(frac4, fracMask4);
 227         return out;
 228     });
 229
 230     if(size_t todo{dst.size()&3})
 231     {
 232         auto pos = size_t{vgetq_lane_u32(pos4, 0)};
 233         frac = vgetq_lane_u32(frac4, 0);
 234
 235         const auto out = dst.last(todo);
 236         std::generate(out.begin(), out.end(), [&pos,&frac,src,increment]
 237         {
 238             const float output{lerpf(src[pos+0], src[pos+1],
 239                 static_cast<float>(frac) * (1.0f/MixerFracOne))};
 240
 241             frac += increment;
 242             pos  += frac>>MixerFracBits;
 243             frac &= MixerFracMask;
 244             return output;
 245         });
 246     }
 247 }
 248
 249 template<>
 250 void Resample_<CubicTag,NEONTag>(const InterpState *state, const al::span<const float> src,
 251     uint frac, const uint increment, const al::span<float> dst)
 252 {
 253     ASSUME(frac < MixerFracOne);
 254
 255     const auto filter = std::get<CubicState>(*state).filter;
 256
 257     const uint32x4_t increment4{vdupq_n_u32(increment*4u)};
 258     const uint32x4_t fracMask4{vdupq_n_u32(MixerFracMask)};
 259     const float32x4_t fracDiffOne4{vdupq_n_f32(1.0f/CubicPhaseDiffOne)};
 260     const uint32x4_t fracDiffMask4{vdupq_n_u32(CubicPhaseDiffMask)};
 261
 262     alignas(16) std::array<uint,4> pos_{}, frac_{};
 263     InitPosArrays(MaxResamplerEdge-1, frac, increment, al::span{frac_}, al::span{pos_});
 264     uint32x4_t frac4{vld1q_u32(frac_.data())};
 265     uint32x4_t pos4{vld1q_u32(pos_.data())};
 266
 267     auto vecout = al::span{reinterpret_cast<float32x4_t*>(dst.data()), dst.size()/4};
 268     std::generate(vecout.begin(), vecout.end(), [=,&pos4,&frac4]
 269     {
 270         const uint pos0{vgetq_lane_u32(pos4, 0)};
 271         const uint pos1{vgetq_lane_u32(pos4, 1)};
 272         const uint pos2{vgetq_lane_u32(pos4, 2)};
 273         const uint pos3{vgetq_lane_u32(pos4, 3)};
 274         ASSUME(pos0 <= pos1); ASSUME(pos1 <= pos2); ASSUME(pos2 <= pos3);
 275         const float32x4_t val0{vld1q_f32(&src[pos0])};
 276         const float32x4_t val1{vld1q_f32(&src[pos1])};
 277         const float32x4_t val2{vld1q_f32(&src[pos2])};
 278         const float32x4_t val3{vld1q_f32(&src[pos3])};
 279
 280         const uint32x4_t pi4{vshrq_n_u32(frac4, CubicPhaseDiffBits)};
 281         const uint pi0{vgetq_lane_u32(pi4, 0)}; ASSUME(pi0 < CubicPhaseCount);
 282         const uint pi1{vgetq_lane_u32(pi4, 1)}; ASSUME(pi1 < CubicPhaseCount);
 283         const uint pi2{vgetq_lane_u32(pi4, 2)}; ASSUME(pi2 < CubicPhaseCount);
 284         const uint pi3{vgetq_lane_u32(pi4, 3)}; ASSUME(pi3 < CubicPhaseCount);
 285
 286         const float32x4_t pf4{vmulq_f32(vcvtq_f32_u32(vandq_u32(frac4, fracDiffMask4)),
 287             fracDiffOne4)};
 288
 289         float32x4_t r0{vmulq_f32(val0,
 290             vmlaq_f32(vld1q_f32(filter[pi0].mCoeffs.data()), vdupq_lane_f32(vget_low_f32(pf4), 0),
 291                 vld1q_f32(filter[pi0].mDeltas.data())))};
 292         float32x4_t r1{vmulq_f32(val1,
 293             vmlaq_f32(vld1q_f32(filter[pi1].mCoeffs.data()), vdupq_lane_f32(vget_low_f32(pf4), 1),
 294                 vld1q_f32(filter[pi1].mDeltas.data())))};
 295         float32x4_t r2{vmulq_f32(val2,
 296             vmlaq_f32(vld1q_f32(filter[pi2].mCoeffs.data()), vdupq_lane_f32(vget_high_f32(pf4), 0),
 297                 vld1q_f32(filter[pi2].mDeltas.data())))};
 298         float32x4_t r3{vmulq_f32(val3,
 299             vmlaq_f32(vld1q_f32(filter[pi3].mCoeffs.data()), vdupq_lane_f32(vget_high_f32(pf4), 1),
 300                 vld1q_f32(filter[pi3].mDeltas.data())))};
 301
 302         vtranspose4(r0, r1, r2, r3);
 303         r0 = vaddq_f32(vaddq_f32(r0, r1), vaddq_f32(r2, r3));
 304
 305         frac4 = vaddq_u32(frac4, increment4);
 306         pos4 = vaddq_u32(pos4, vshrq_n_u32(frac4, MixerFracBits));
 307         frac4 = vandq_u32(frac4, fracMask4);
 308         return r0;
 309     });
 310
 311     if(const size_t todo{dst.size()&3})
 312     {
 313         auto pos = size_t{vgetq_lane_u32(pos4, 0)};
 314         frac = vgetq_lane_u32(frac4, 0);
 315
 316         auto out = dst.last(todo);
 317         std::generate(out.begin(), out.end(), [&pos,&frac,src,increment,filter]
 318         {
 319             const uint pi{frac >> CubicPhaseDiffBits}; ASSUME(pi < CubicPhaseCount);
 320             const float pf{static_cast<float>(frac&CubicPhaseDiffMask) * (1.0f/CubicPhaseDiffOne)};
 321             const float32x4_t pf4{vdupq_n_f32(pf)};
 322
 323             const float32x4_t f4{vmlaq_f32(vld1q_f32(filter[pi].mCoeffs.data()), pf4,
 324                 vld1q_f32(filter[pi].mDeltas.data()))};
 325             float32x4_t r4{vmulq_f32(f4, vld1q_f32(&src[pos]))};
 326
 327             r4 = vaddq_f32(r4, vrev64q_f32(r4));
 328             const float output{vget_lane_f32(vadd_f32(vget_low_f32(r4), vget_high_f32(r4)), 0)};
 329
 330             frac += increment;
 331             pos  += frac>>MixerFracBits;
 332             frac &= MixerFracMask;
 333             return output;
 334         });
 335     }
 336 }
 337
 338 template<>
 339 void Resample_<BSincTag,NEONTag>(const InterpState *state, const al::span<const float> src,
 340     uint frac, const uint increment, const al::span<float> dst)
 341 {
 342     const auto &bsinc = std::get<BsincState>(*state);
 343     const auto sf4 = vdupq_n_f32(bsinc.sf);
 344     const auto m = size_t{bsinc.m};
 345     ASSUME(m > 0);
 346     ASSUME(m <= MaxResamplerPadding);
 347     ASSUME(frac < MixerFracOne);
 348
 349     const auto filter = bsinc.filter.first(4_uz*BSincPhaseCount*m);
 350
 351     ASSUME(bsinc.l <= MaxResamplerEdge);
 352     auto pos = size_t{MaxResamplerEdge-bsinc.l};
 353     std::generate(dst.begin(), dst.end(), [&pos,&frac,src,increment,sf4,m,filter]() -> float
 354     {
 355         // Calculate the phase index and factor.
 356         const uint pi{frac >> BSincPhaseDiffBits}; ASSUME(pi < BSincPhaseCount);
 357         const float pf{static_cast<float>(frac&BSincPhaseDiffMask) * (1.0f/BSincPhaseDiffOne)};
 358
 359         // Apply the scale and phase interpolated filter.
 360         float32x4_t r4{vdupq_n_f32(0.0f)};
 361         {
 362             const float32x4_t pf4{vdupq_n_f32(pf)};
 363             const auto fil = filter.subspan(2_uz*pi*m);
 364             const auto phd = fil.subspan(m);
 365             const auto scd = fil.subspan(2_uz*BSincPhaseCount*m);
 366             const auto spd = scd.subspan(m);
 367             size_t td{m >> 2};
 368             size_t j{0u};
 369
 370             do {
 371                 /* f = ((fil + sf*scd) + pf*(phd + sf*spd)) */
 372                 const float32x4_t f4 = vmlaq_f32(
 373                     vmlaq_f32(vld1q_f32(&fil[j]), sf4, vld1q_f32(&scd[j])),
 374                     pf4, vmlaq_f32(vld1q_f32(&phd[j]), sf4, vld1q_f32(&spd[j])));
 375                 /* r += f*src */
 376                 r4 = vmlaq_f32(r4, f4, vld1q_f32(&src[pos+j]));
 377                 j += 4;
 378             } while(--td);
 379         }
 380         r4 = vaddq_f32(r4, vrev64q_f32(r4));
 381         const float output{vget_lane_f32(vadd_f32(vget_low_f32(r4), vget_high_f32(r4)), 0)};
 382
 383         frac += increment;
 384         pos  += frac>>MixerFracBits;
 385         frac &= MixerFracMask;
 386         return output;
 387     });
 388 }
 389
 390 template<>
 391 void Resample_<FastBSincTag,NEONTag>(const InterpState *state, const al::span<const float> src,
 392     uint frac, const uint increment, const al::span<float> dst)
 393 {
 394     const auto &bsinc = std::get<BsincState>(*state);
 395     const auto m = size_t{bsinc.m};
 396     ASSUME(m > 0);
 397     ASSUME(m <= MaxResamplerPadding);
 398     ASSUME(frac < MixerFracOne);
 399
 400     const auto filter = bsinc.filter.first(2_uz*BSincPhaseCount*m);
 401
 402     ASSUME(bsinc.l <= MaxResamplerEdge);
 403     auto pos = size_t{MaxResamplerEdge-bsinc.l};
 404     std::generate(dst.begin(), dst.end(), [&pos,&frac,src,increment,m,filter]() -> float
 405     {
 406         // Calculate the phase index and factor.
 407         const uint pi{frac >> BSincPhaseDiffBits}; ASSUME(pi < BSincPhaseCount);
 408         const float pf{static_cast<float>(frac&BSincPhaseDiffMask) * (1.0f/BSincPhaseDiffOne)};
 409
 410         // Apply the phase interpolated filter.
 411         float32x4_t r4{vdupq_n_f32(0.0f)};
 412         {
 413             const float32x4_t pf4{vdupq_n_f32(pf)};
 414             const auto fil = filter.subspan(2_uz*pi*m);
 415             const auto phd = fil.subspan(m);
 416             size_t td{m >> 2};
 417             size_t j{0u};
 418
 419             do {
 420                 /* f = fil + pf*phd */
 421                 const float32x4_t f4 = vmlaq_f32(vld1q_f32(&fil[j]), pf4, vld1q_f32(&phd[j]));
 422                 /* r += f*src */
 423                 r4 = vmlaq_f32(r4, f4, vld1q_f32(&src[pos+j]));
 424                 j += 4;
 425             } while(--td);
 426         }
 427         r4 = vaddq_f32(r4, vrev64q_f32(r4));
 428         const float output{vget_lane_f32(vadd_f32(vget_low_f32(r4), vget_high_f32(r4)), 0)};
 429
 430         frac += increment;
 431         pos  += frac>>MixerFracBits;
 432         frac &= MixerFracMask;
 433         return output;
 434     });
 435 }
 436
 437
 438 template<>
 439 void MixHrtf_<NEONTag>(const al::span<const float> InSamples, const al::span<float2> AccumSamples,
 440     const uint IrSize, const MixHrtfFilter *hrtfparams, const size_t SamplesToDo)
 441 { MixHrtfBase<ApplyCoeffs>(InSamples, AccumSamples, IrSize, hrtfparams, SamplesToDo); }
 442
 443 template<>
 444 void MixHrtfBlend_<NEONTag>(const al::span<const float> InSamples,
 445     const al::span<float2> AccumSamples, const uint IrSize, const HrtfFilter *oldparams,
 446     const MixHrtfFilter *newparams, const size_t SamplesToDo)
 447 {
 448     MixHrtfBlendBase<ApplyCoeffs>(InSamples, AccumSamples, IrSize, oldparams, newparams,
 449         SamplesToDo);
 450 }
 451
 452 template<>
 453 void MixDirectHrtf_<NEONTag>(const FloatBufferSpan LeftOut, const FloatBufferSpan RightOut,
 454     const al::span<const FloatBufferLine> InSamples, const al::span<float2> AccumSamples,
 455     const al::span<float,BufferLineSize> TempBuf, const al::span<HrtfChannelState> ChanState,
 456     const size_t IrSize, const size_t SamplesToDo)
 457 {
 458     MixDirectHrtfBase<ApplyCoeffs>(LeftOut, RightOut, InSamples, AccumSamples, TempBuf, ChanState,
 459         IrSize, SamplesToDo);
 460 }
 461
 462
 463 template<>
 464 void Mix_<NEONTag>(const al::span<const float> InSamples,const al::span<FloatBufferLine> OutBuffer,
 465     const al::span<float> CurrentGains, const al::span<const float> TargetGains,
 466     const size_t Counter, const size_t OutPos)
 467 {
 468     if((OutPos&3) != 0) UNLIKELY
 469         return Mix_<CTag>(InSamples, OutBuffer, CurrentGains, TargetGains, Counter, OutPos);
 470
 471     const float delta{(Counter > 0) ? 1.0f / static_cast<float>(Counter) : 0.0f};
 472     const auto fade_len = std::min(Counter, InSamples.size());
 473     const auto realign_len = std::min((fade_len+3_uz) & ~3_uz, InSamples.size()) - fade_len;
 474
 475     auto curgains = CurrentGains.begin();
 476     auto targetgains = TargetGains.cbegin();
 477     for(FloatBufferLine &output : OutBuffer)
 478         MixLine(InSamples, al::span{output}.subspan(OutPos), *curgains++, *targetgains++, delta,
 479             fade_len, realign_len, Counter);
 480 }
 481
 482 template<>
 483 void Mix_<NEONTag>(const al::span<const float> InSamples, const al::span<float> OutBuffer,
 484     float &CurrentGain, const float TargetGain, const size_t Counter)
 485 {
 486     if((reinterpret_cast<uintptr_t>(OutBuffer.data())&15) != 0) UNLIKELY
 487         return Mix_<CTag>(InSamples, OutBuffer, CurrentGain, TargetGain, Counter);
 488
 489     const float delta{(Counter > 0) ? 1.0f / static_cast<float>(Counter) : 0.0f};
 490     const auto fade_len = std::min(Counter, InSamples.size());
 491     const auto realign_len = std::min((fade_len+3_uz) & ~3_uz, InSamples.size()) - fade_len;
 492
 493     MixLine(InSamples, OutBuffer, CurrentGain, TargetGain, delta, fade_len, realign_len, Counter);
 494 }