/* openal-soft: core/mixer/mixer_sse.cpp
 * From commit "Apply the source's AL_AIR_ABSORPTION_FACTOR to send paths"
 * (blob 622ccac8d3144eeedd33be6d3db1d270764225e8)
 */
#include "config.h"

#include <mmintrin.h>
#include <xmmintrin.h>

#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdint>
#include <limits>
#include <variant>

#include "alnumeric.h"
#include "alspan.h"
#include "core/bsinc_defs.h"
#include "core/bufferline.h"
#include "core/cubic_defs.h"
#include "core/mixer/hrtfdefs.h"
#include "core/resampler_limits.h"
#include "defs.h"
#include "hrtfbase.h"
#include "opthelpers.h"
24 struct CTag;
25 struct SSETag;
26 struct CubicTag;
27 struct BSincTag;
28 struct FastBSincTag;
31 #if defined(__GNUC__) && !defined(__clang__) && !defined(__SSE__)
32 #pragma GCC target("sse")
33 #endif
35 namespace {
37 constexpr uint BSincPhaseDiffBits{MixerFracBits - BSincPhaseBits};
38 constexpr uint BSincPhaseDiffOne{1 << BSincPhaseDiffBits};
39 constexpr uint BSincPhaseDiffMask{BSincPhaseDiffOne - 1u};
41 constexpr uint CubicPhaseDiffBits{MixerFracBits - CubicPhaseBits};
42 constexpr uint CubicPhaseDiffOne{1 << CubicPhaseDiffBits};
43 constexpr uint CubicPhaseDiffMask{CubicPhaseDiffOne - 1u};
45 force_inline __m128 vmadd(const __m128 x, const __m128 y, const __m128 z) noexcept
46 { return _mm_add_ps(x, _mm_mul_ps(y, z)); }
48 inline void ApplyCoeffs(const al::span<float2> Values, const size_t IrSize,
49 const ConstHrirSpan Coeffs, const float left, const float right)
51 ASSUME(IrSize >= MinIrLength);
52 ASSUME(IrSize <= HrirLength);
53 const auto lrlr = _mm_setr_ps(left, right, left, right);
54 /* Round up the IR size to a multiple of 2 for SIMD (2 IRs for 2 channels
55 * is 4 floats), to avoid cutting the last sample for odd IR counts. The
56 * underlying HRIR is a fixed-size multiple of 2, any extra samples are
57 * either 0 (silence) or more IR samples that get applied for "free".
59 const auto count4 = size_t{(IrSize+1) >> 1};
61 /* This isn't technically correct to test alignment, but it's true for
62 * systems that support SSE, which is the only one that needs to know the
63 * alignment of Values (which alternates between 8- and 16-byte aligned).
65 if(!(reinterpret_cast<uintptr_t>(Values.data())&15))
67 const auto vals4 = al::span{reinterpret_cast<__m128*>(Values[0].data()), count4};
68 const auto coeffs4 = al::span{reinterpret_cast<const __m128*>(Coeffs[0].data()), count4};
70 std::transform(vals4.cbegin(), vals4.cend(), coeffs4.cbegin(), vals4.begin(),
71 [lrlr](const __m128 &val, const __m128 &coeff) -> __m128
72 { return vmadd(val, coeff, lrlr); });
74 else
76 auto coeffs = _mm_load_ps(Coeffs[0].data());
77 auto vals = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64*>(Values[0].data()));
78 auto imp0 = _mm_mul_ps(lrlr, coeffs);
79 vals = _mm_add_ps(imp0, vals);
80 _mm_storel_pi(reinterpret_cast<__m64*>(Values[0].data()), vals);
81 size_t td{count4 - 1};
82 size_t i{1};
83 do {
84 coeffs = _mm_load_ps(Coeffs[i+1].data());
85 vals = _mm_load_ps(Values[i].data());
86 const auto imp1 = _mm_mul_ps(lrlr, coeffs);
87 imp0 = _mm_shuffle_ps(imp0, imp1, _MM_SHUFFLE(1, 0, 3, 2));
88 vals = _mm_add_ps(imp0, vals);
89 _mm_store_ps(Values[i].data(), vals);
90 imp0 = imp1;
91 i += 2;
92 } while(--td);
93 vals = _mm_loadl_pi(vals, reinterpret_cast<__m64*>(Values[i].data()));
94 imp0 = _mm_movehl_ps(imp0, imp0);
95 vals = _mm_add_ps(imp0, vals);
96 _mm_storel_pi(reinterpret_cast<__m64*>(Values[i].data()), vals);
100 force_inline void MixLine(const al::span<const float> InSamples, const al::span<float> dst,
101 float &CurrentGain, const float TargetGain, const float delta, const size_t fade_len,
102 const size_t realign_len, size_t Counter)
104 const auto step = float{(TargetGain-CurrentGain) * delta};
106 size_t pos{0};
107 if(std::abs(step) > std::numeric_limits<float>::epsilon())
109 const auto gain = float{CurrentGain};
110 auto step_count = float{0.0f};
111 /* Mix with applying gain steps in aligned multiples of 4. */
112 if(const size_t todo{fade_len >> 2})
114 const auto four4 = _mm_set1_ps(4.0f);
115 const auto step4 = _mm_set1_ps(step);
116 const auto gain4 = _mm_set1_ps(gain);
117 auto step_count4 = _mm_setr_ps(0.0f, 1.0f, 2.0f, 3.0f);
119 const auto in4 = al::span{reinterpret_cast<const __m128*>(InSamples.data()),
120 InSamples.size()/4}.first(todo);
121 const auto out4 = al::span{reinterpret_cast<__m128*>(dst.data()), dst.size()/4};
122 std::transform(in4.begin(), in4.end(), out4.begin(), out4.begin(),
123 [gain4,step4,four4,&step_count4](const __m128 val4, __m128 dry4) -> __m128
125 /* dry += val * (gain + step*step_count) */
126 dry4 = vmadd(dry4, val4, vmadd(gain4, step4, step_count4));
127 step_count4 = _mm_add_ps(step_count4, four4);
128 return dry4;
130 pos += in4.size()*4;
132 /* NOTE: step_count4 now represents the next four counts after the
133 * last four mixed samples, so the lowest element represents the
134 * next step count to apply.
136 step_count = _mm_cvtss_f32(step_count4);
138 /* Mix with applying left over gain steps that aren't aligned multiples of 4. */
139 if(const size_t leftover{fade_len&3})
141 const auto in = InSamples.subspan(pos, leftover);
142 const auto out = dst.subspan(pos);
144 std::transform(in.begin(), in.end(), out.begin(), out.begin(),
145 [gain,step,&step_count](const float val, float dry) noexcept -> float
147 dry += val * (gain + step*step_count);
148 step_count += 1.0f;
149 return dry;
151 pos += leftover;
153 if(pos < Counter)
155 CurrentGain = gain + step*step_count;
156 return;
159 /* Mix until pos is aligned with 4 or the mix is done. */
160 if(const size_t leftover{realign_len&3})
162 const auto in = InSamples.subspan(pos, leftover);
163 const auto out = dst.subspan(pos);
165 std::transform(in.begin(), in.end(), out.begin(), out.begin(),
166 [TargetGain](const float val, const float dry) noexcept -> float
167 { return dry + val*TargetGain; });
168 pos += leftover;
171 CurrentGain = TargetGain;
173 if(!(std::abs(TargetGain) > GainSilenceThreshold))
174 return;
175 if(size_t todo{(InSamples.size()-pos) >> 2})
177 const auto in4 = al::span{reinterpret_cast<const __m128*>(InSamples.data()),
178 InSamples.size()/4}.last(todo);
179 const auto out = dst.subspan(pos);
180 const auto out4 = al::span{reinterpret_cast<__m128*>(out.data()), out.size()/4};
182 const auto gain4 = _mm_set1_ps(TargetGain);
183 std::transform(in4.begin(), in4.end(), out4.begin(), out4.begin(),
184 [gain4](const __m128 val4, const __m128 dry4) -> __m128
185 { return vmadd(dry4, val4, gain4); });
186 pos += in4.size()*4;
188 if(const size_t leftover{(InSamples.size()-pos)&3})
190 const auto in = InSamples.last(leftover);
191 const auto out = dst.subspan(pos);
193 std::transform(in.begin(), in.end(), out.begin(), out.begin(),
194 [TargetGain](const float val, const float dry) noexcept -> float
195 { return dry + val*TargetGain; });
199 } // namespace
201 template<>
202 void Resample_<CubicTag,SSETag>(const InterpState *state, const al::span<const float> src,
203 uint frac, const uint increment, const al::span<float> dst)
205 ASSUME(frac < MixerFracOne);
207 const auto filter = std::get<CubicState>(*state).filter;
209 size_t pos{MaxResamplerEdge-1};
210 std::generate(dst.begin(), dst.end(), [&pos,&frac,src,increment,filter]() -> float
212 const uint pi{frac >> CubicPhaseDiffBits}; ASSUME(pi < CubicPhaseCount);
213 const float pf{static_cast<float>(frac&CubicPhaseDiffMask) * (1.0f/CubicPhaseDiffOne)};
214 const __m128 pf4{_mm_set1_ps(pf)};
216 /* Apply the phase interpolated filter. */
218 /* f = fil + pf*phd */
219 const __m128 f4 = vmadd(_mm_load_ps(filter[pi].mCoeffs.data()), pf4,
220 _mm_load_ps(filter[pi].mDeltas.data()));
221 /* r = f*src */
222 __m128 r4{_mm_mul_ps(f4, _mm_loadu_ps(&src[pos]))};
224 r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3)));
225 r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
226 const float output{_mm_cvtss_f32(r4)};
228 frac += increment;
229 pos += frac>>MixerFracBits;
230 frac &= MixerFracMask;
231 return output;
235 template<>
236 void Resample_<BSincTag,SSETag>(const InterpState *state, const al::span<const float> src,
237 uint frac, const uint increment, const al::span<float> dst)
239 const auto &bsinc = std::get<BsincState>(*state);
240 const auto sf4 = _mm_set1_ps(bsinc.sf);
241 const auto m = size_t{bsinc.m};
242 ASSUME(m > 0);
243 ASSUME(m <= MaxResamplerPadding);
244 ASSUME(frac < MixerFracOne);
246 const auto filter = bsinc.filter.first(4_uz*BSincPhaseCount*m);
248 ASSUME(bsinc.l <= MaxResamplerEdge);
249 auto pos = size_t{MaxResamplerEdge-bsinc.l};
250 std::generate(dst.begin(), dst.end(), [&pos,&frac,src,increment,sf4,m,filter]() -> float
252 // Calculate the phase index and factor.
253 const size_t pi{frac >> BSincPhaseDiffBits}; ASSUME(pi < BSincPhaseCount);
254 const float pf{static_cast<float>(frac&BSincPhaseDiffMask) * (1.0f/BSincPhaseDiffOne)};
256 // Apply the scale and phase interpolated filter.
257 auto r4 = _mm_setzero_ps();
259 const auto pf4 = _mm_set1_ps(pf);
260 const auto fil = filter.subspan(2_uz*pi*m);
261 const auto phd = fil.subspan(m);
262 const auto scd = fil.subspan(2_uz*BSincPhaseCount*m);
263 const auto spd = scd.subspan(m);
264 auto td = size_t{m >> 2};
265 auto j = size_t{0};
267 do {
268 /* f = ((fil + sf*scd) + pf*(phd + sf*spd)) */
269 const __m128 f4 = vmadd(
270 vmadd(_mm_load_ps(&fil[j]), sf4, _mm_load_ps(&scd[j])),
271 pf4, vmadd(_mm_load_ps(&phd[j]), sf4, _mm_load_ps(&spd[j])));
272 /* r += f*src */
273 r4 = vmadd(r4, f4, _mm_loadu_ps(&src[pos+j]));
274 j += 4;
275 } while(--td);
277 r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3)));
278 r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
279 const auto output = _mm_cvtss_f32(r4);
281 frac += increment;
282 pos += frac>>MixerFracBits;
283 frac &= MixerFracMask;
284 return output;
288 template<>
289 void Resample_<FastBSincTag,SSETag>(const InterpState *state, const al::span<const float> src,
290 uint frac, const uint increment, const al::span<float> dst)
292 const auto &bsinc = std::get<BsincState>(*state);
293 const auto m = size_t{bsinc.m};
294 ASSUME(m > 0);
295 ASSUME(m <= MaxResamplerPadding);
296 ASSUME(frac < MixerFracOne);
298 const auto filter = bsinc.filter.first(2_uz*m*BSincPhaseCount);
300 ASSUME(bsinc.l <= MaxResamplerEdge);
301 size_t pos{MaxResamplerEdge-bsinc.l};
302 std::generate(dst.begin(), dst.end(), [&pos,&frac,src,increment,filter,m]() -> float
304 // Calculate the phase index and factor.
305 const size_t pi{frac >> BSincPhaseDiffBits}; ASSUME(pi < BSincPhaseCount);
306 const float pf{static_cast<float>(frac&BSincPhaseDiffMask) * (1.0f/BSincPhaseDiffOne)};
308 // Apply the phase interpolated filter.
309 auto r4 = _mm_setzero_ps();
311 const auto pf4 = _mm_set1_ps(pf);
312 const auto fil = filter.subspan(2_uz*m*pi);
313 const auto phd = fil.subspan(m);
314 auto td = size_t{m >> 2};
315 auto j = size_t{0};
317 do {
318 /* f = fil + pf*phd */
319 const auto f4 = vmadd(_mm_load_ps(&fil[j]), pf4, _mm_load_ps(&phd[j]));
320 /* r += f*src */
321 r4 = vmadd(r4, f4, _mm_loadu_ps(&src[pos+j]));
322 j += 4;
323 } while(--td);
325 r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3)));
326 r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
327 const auto output = _mm_cvtss_f32(r4);
329 frac += increment;
330 pos += frac>>MixerFracBits;
331 frac &= MixerFracMask;
332 return output;
337 template<>
338 void MixHrtf_<SSETag>(const al::span<const float> InSamples, const al::span<float2> AccumSamples,
339 const uint IrSize, const MixHrtfFilter *hrtfparams, const size_t SamplesToDo)
340 { MixHrtfBase<ApplyCoeffs>(InSamples, AccumSamples, IrSize, hrtfparams, SamplesToDo); }
342 template<>
343 void MixHrtfBlend_<SSETag>(const al::span<const float> InSamples,
344 const al::span<float2> AccumSamples, const uint IrSize, const HrtfFilter *oldparams,
345 const MixHrtfFilter *newparams, const size_t SamplesToDo)
347 MixHrtfBlendBase<ApplyCoeffs>(InSamples, AccumSamples, IrSize, oldparams, newparams,
348 SamplesToDo);
351 template<>
352 void MixDirectHrtf_<SSETag>(const FloatBufferSpan LeftOut, const FloatBufferSpan RightOut,
353 const al::span<const FloatBufferLine> InSamples, const al::span<float2> AccumSamples,
354 const al::span<float,BufferLineSize> TempBuf, const al::span<HrtfChannelState> ChanState,
355 const size_t IrSize, const size_t SamplesToDo)
357 MixDirectHrtfBase<ApplyCoeffs>(LeftOut, RightOut, InSamples, AccumSamples, TempBuf, ChanState,
358 IrSize, SamplesToDo);
362 template<>
363 void Mix_<SSETag>(const al::span<const float> InSamples, const al::span<FloatBufferLine> OutBuffer,
364 const al::span<float> CurrentGains, const al::span<const float> TargetGains,
365 const size_t Counter, const size_t OutPos)
367 if((OutPos&3) != 0) UNLIKELY
368 return Mix_<CTag>(InSamples, OutBuffer, CurrentGains, TargetGains, Counter, OutPos);
370 const float delta{(Counter > 0) ? 1.0f / static_cast<float>(Counter) : 0.0f};
371 const auto fade_len = std::min(Counter, InSamples.size());
372 const auto realign_len = std::min((fade_len+3_uz) & ~3_uz, InSamples.size()) - fade_len;
374 auto curgains = CurrentGains.begin();
375 auto targetgains = TargetGains.cbegin();
376 for(FloatBufferLine &output : OutBuffer)
377 MixLine(InSamples, al::span{output}.subspan(OutPos), *curgains++, *targetgains++, delta,
378 fade_len, realign_len, Counter);
381 template<>
382 void Mix_<SSETag>(const al::span<const float> InSamples, const al::span<float> OutBuffer,
383 float &CurrentGain, const float TargetGain, const size_t Counter)
385 if((reinterpret_cast<uintptr_t>(OutBuffer.data())&15) != 0) UNLIKELY
386 return Mix_<CTag>(InSamples, OutBuffer, CurrentGain, TargetGain, Counter);
388 const float delta{(Counter > 0) ? 1.0f / static_cast<float>(Counter) : 0.0f};
389 const auto fade_len = std::min(Counter, InSamples.size());
390 const auto realign_len = std::min((fade_len+3_uz) & ~3_uz, InSamples.size()) - fade_len;
392 MixLine(InSamples, OutBuffer, CurrentGain, TargetGain, delta, fade_len, realign_len, Counter);