alc/mixer/mixer_sse.cpp

   1 #include "config.h"
   2
   3 #include <xmmintrin.h>
   4
   5 #include <limits>
   6
   7 #include "AL/al.h"
   8 #include "AL/alc.h"
   9 #include "alcmain.h"
  10
  11 #include "alu.h"
  12 #include "defs.h"
  13 #include "hrtfbase.h"
  14
  15
  16 namespace {
  17
  18 inline void ApplyCoeffs(float2 *RESTRICT Values, const ALuint IrSize, const HrirArray &Coeffs,
  19     const float left, const float right)
  20 {
  21     const __m128 lrlr{_mm_setr_ps(left, right, left, right)};
  22
  23     ASSUME(IrSize >= 4);
  24     /* This isn't technically correct to test alignment, but it's true for
  25      * systems that support SSE, which is the only one that needs to know the
  26      * alignment of Values (which alternates between 8- and 16-byte aligned).
  27      */
  28     if(reinterpret_cast<intptr_t>(Values)&0x8)
  29     {
  30         __m128 imp0, imp1;
  31         __m128 coeffs{_mm_load_ps(&Coeffs[0][0])};
  32         __m128 vals{_mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64*>(&Values[0][0]))};
  33         imp0 = _mm_mul_ps(lrlr, coeffs);
  34         vals = _mm_add_ps(imp0, vals);
  35         _mm_storel_pi(reinterpret_cast<__m64*>(&Values[0][0]), vals);
  36         ALuint i{1};
  37         for(;i < IrSize-1;i += 2)
  38         {
  39             coeffs = _mm_load_ps(&Coeffs[i+1][0]);
  40             vals = _mm_load_ps(&Values[i][0]);
  41             imp1 = _mm_mul_ps(lrlr, coeffs);
  42             imp0 = _mm_shuffle_ps(imp0, imp1, _MM_SHUFFLE(1, 0, 3, 2));
  43             vals = _mm_add_ps(imp0, vals);
  44             _mm_store_ps(&Values[i][0], vals);
  45             imp0 = imp1;
  46         }
  47         vals = _mm_loadl_pi(vals, reinterpret_cast<__m64*>(&Values[i][0]));
  48         imp0 = _mm_movehl_ps(imp0, imp0);
  49         vals = _mm_add_ps(imp0, vals);
  50         _mm_storel_pi(reinterpret_cast<__m64*>(&Values[i][0]), vals);
  51     }
  52     else
  53     {
  54         for(ALuint i{0};i < IrSize;i += 2)
  55         {
  56             __m128 coeffs{_mm_load_ps(&Coeffs[i][0])};
  57             __m128 vals{_mm_load_ps(&Values[i][0])};
  58             vals = _mm_add_ps(vals, _mm_mul_ps(lrlr, coeffs));
  59             _mm_store_ps(&Values[i][0], vals);
  60         }
  61     }
  62 }
  63
  64 } // namespace
  65
  66 template<>
  67 const ALfloat *Resample_<BSincTag,SSETag>(const InterpState *state, const ALfloat *RESTRICT src,
  68     ALuint frac, ALuint increment, const al::span<float> dst)
  69 {
  70     const float *const filter{state->bsinc.filter};
  71     const __m128 sf4{_mm_set1_ps(state->bsinc.sf)};
  72     const size_t m{state->bsinc.m};
  73
  74     src -= state->bsinc.l;
  75     for(float &out_sample : dst)
  76     {
  77         // Calculate the phase index and factor.
  78 #define FRAC_PHASE_BITDIFF (FRACTIONBITS-BSINC_PHASE_BITS)
  79         const ALuint pi{frac >> FRAC_PHASE_BITDIFF};
  80         const float pf{static_cast<float>(frac & ((1<<FRAC_PHASE_BITDIFF)-1)) *
  81             (1.0f/(1<<FRAC_PHASE_BITDIFF))};
  82 #undef FRAC_PHASE_BITDIFF
  83
  84         // Apply the scale and phase interpolated filter.
  85         __m128 r4{_mm_setzero_ps()};
  86         {
  87             const __m128 pf4{_mm_set1_ps(pf)};
  88             const float *fil{filter + m*pi*4};
  89             const float *phd{fil + m};
  90             const float *scd{phd + m};
  91             const float *spd{scd + m};
  92             size_t td{m >> 2};
  93             size_t j{0u};
  94
  95 #define MLA4(x, y, z) _mm_add_ps(x, _mm_mul_ps(y, z))
  96             do {
  97                 /* f = ((fil + sf*scd) + pf*(phd + sf*spd)) */
  98                 const __m128 f4 = MLA4(
  99                     MLA4(_mm_load_ps(fil), sf4, _mm_load_ps(scd)),
 100                     pf4, MLA4(_mm_load_ps(phd), sf4, _mm_load_ps(spd)));
 101                 fil += 4; scd += 4; phd += 4; spd += 4;
 102                 /* r += f*src */
 103                 r4 = MLA4(r4, f4, _mm_loadu_ps(&src[j]));
 104                 j += 4;
 105             } while(--td);
 106 #undef MLA4
 107         }
 108         r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3)));
 109         r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
 110         out_sample = _mm_cvtss_f32(r4);
 111
 112         frac += increment;
 113         src  += frac>>FRACTIONBITS;
 114         frac &= FRACTIONMASK;
 115     }
 116     return dst.begin();
 117 }
 118
 119 template<>
 120 const ALfloat *Resample_<FastBSincTag,SSETag>(const InterpState *state,
 121     const ALfloat *RESTRICT src, ALuint frac, ALuint increment, const al::span<float> dst)
 122 {
 123     const float *const filter{state->bsinc.filter};
 124     const size_t m{state->bsinc.m};
 125
 126     src -= state->bsinc.l;
 127     for(float &out_sample : dst)
 128     {
 129         // Calculate the phase index and factor.
 130 #define FRAC_PHASE_BITDIFF (FRACTIONBITS-BSINC_PHASE_BITS)
 131         const ALuint pi{frac >> FRAC_PHASE_BITDIFF};
 132         const float pf{static_cast<float>(frac & ((1<<FRAC_PHASE_BITDIFF)-1)) *
 133             (1.0f/(1<<FRAC_PHASE_BITDIFF))};
 134 #undef FRAC_PHASE_BITDIFF
 135
 136         // Apply the phase interpolated filter.
 137         __m128 r4{_mm_setzero_ps()};
 138         {
 139             const __m128 pf4{_mm_set1_ps(pf)};
 140             const float *fil{filter + m*pi*4};
 141             const float *phd{fil + m};
 142             size_t td{m >> 2};
 143             size_t j{0u};
 144
 145 #define MLA4(x, y, z) _mm_add_ps(x, _mm_mul_ps(y, z))
 146             do {
 147                 /* f = fil + pf*phd */
 148                 const __m128 f4 = MLA4(_mm_load_ps(fil), pf4, _mm_load_ps(phd));
 149                 /* r += f*src */
 150                 r4 = MLA4(r4, f4, _mm_loadu_ps(&src[j]));
 151                 fil += 4; phd += 4; j += 4;
 152             } while(--td);
 153 #undef MLA4
 154         }
 155         r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3)));
 156         r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
 157         out_sample = _mm_cvtss_f32(r4);
 158
 159         frac += increment;
 160         src  += frac>>FRACTIONBITS;
 161         frac &= FRACTIONMASK;
 162     }
 163     return dst.begin();
 164 }
 165
 166
 167 template<>
 168 void MixHrtf_<SSETag>(FloatBufferLine &LeftOut, FloatBufferLine &RightOut,
 169     const float *InSamples, float2 *AccumSamples, const size_t OutPos, const ALuint IrSize,
 170     MixHrtfFilter *hrtfparams, const size_t BufferSize)
 171 {
 172     MixHrtfBase<ApplyCoeffs>(LeftOut, RightOut, InSamples, AccumSamples, OutPos, IrSize,
 173         hrtfparams, BufferSize);
 174 }
 175
 176 template<>
 177 void MixHrtfBlend_<SSETag>(FloatBufferLine &LeftOut, FloatBufferLine &RightOut,
 178     const float *InSamples, float2 *AccumSamples, const size_t OutPos, const ALuint IrSize,
 179     const HrtfFilter *oldparams, MixHrtfFilter *newparams, const size_t BufferSize)
 180 {
 181     MixHrtfBlendBase<ApplyCoeffs>(LeftOut, RightOut, InSamples, AccumSamples, OutPos, IrSize,
 182         oldparams, newparams, BufferSize);
 183 }
 184
 185 template<>
 186 void MixDirectHrtf_<SSETag>(FloatBufferLine &LeftOut, FloatBufferLine &RightOut,
 187     const al::span<const FloatBufferLine> InSamples, float2 *AccumSamples, DirectHrtfState *State,
 188     const size_t BufferSize)
 189 { MixDirectHrtfBase<ApplyCoeffs>(LeftOut, RightOut, InSamples, AccumSamples, State, BufferSize); }
 190
 191
 192 template<>
 193 void Mix_<SSETag>(const al::span<const float> InSamples, const al::span<FloatBufferLine> OutBuffer,
 194     float *CurrentGains, const float *TargetGains, const size_t Counter, const size_t OutPos)
 195 {
 196     const ALfloat delta{(Counter > 0) ? 1.0f / static_cast<ALfloat>(Counter) : 0.0f};
 197     const bool reached_target{InSamples.size() >= Counter};
 198     const auto min_end = reached_target ? InSamples.begin() + Counter : InSamples.end();
 199     const auto aligned_end = minz(static_cast<uintptr_t>(min_end-InSamples.begin()+3) & ~3u,
 200         InSamples.size()) + InSamples.begin();
 201     for(FloatBufferLine &output : OutBuffer)
 202     {
 203         ALfloat *RESTRICT dst{al::assume_aligned<16>(output.data()+OutPos)};
 204         ALfloat gain{*CurrentGains};
 205         const ALfloat diff{*TargetGains - gain};
 206
 207         auto in_iter = InSamples.begin();
 208         if(std::fabs(diff) > std::numeric_limits<float>::epsilon())
 209         {
 210             const ALfloat step{diff * delta};
 211             ALfloat step_count{0.0f};
 212             /* Mix with applying gain steps in aligned multiples of 4. */
 213             if(ptrdiff_t todo{(min_end-in_iter) >> 2})
 214             {
 215                 const __m128 four4{_mm_set1_ps(4.0f)};
 216                 const __m128 step4{_mm_set1_ps(step)};
 217                 const __m128 gain4{_mm_set1_ps(gain)};
 218                 __m128 step_count4{_mm_setr_ps(0.0f, 1.0f, 2.0f, 3.0f)};
 219                 do {
 220                     const __m128 val4{_mm_load_ps(in_iter)};
 221                     __m128 dry4{_mm_load_ps(dst)};
 222 #define MLA4(x, y, z) _mm_add_ps(x, _mm_mul_ps(y, z))
 223                     /* dry += val * (gain + step*step_count) */
 224                     dry4 = MLA4(dry4, val4, MLA4(gain4, step4, step_count4));
 225 #undef MLA4
 226                     _mm_store_ps(dst, dry4);
 227                     step_count4 = _mm_add_ps(step_count4, four4);
 228                     in_iter += 4; dst += 4;
 229                 } while(--todo);
 230                 /* NOTE: step_count4 now represents the next four counts after
 231                  * the last four mixed samples, so the lowest element
 232                  * represents the next step count to apply.
 233                  */
 234                 step_count = _mm_cvtss_f32(step_count4);
 235             }
 236             /* Mix with applying left over gain steps that aren't aligned multiples of 4. */
 237             while(in_iter != min_end)
 238             {
 239                 *(dst++) += *(in_iter++) * (gain + step*step_count);
 240                 step_count += 1.0f;
 241             }
 242             if(reached_target)
 243                 gain = *TargetGains;
 244             else
 245                 gain += step*step_count;
 246             *CurrentGains = gain;
 247
 248             /* Mix until pos is aligned with 4 or the mix is done. */
 249             while(in_iter != aligned_end)
 250                 *(dst++) += *(in_iter++) * gain;
 251         }
 252         ++CurrentGains;
 253         ++TargetGains;
 254
 255         if(!(std::fabs(gain) > GAIN_SILENCE_THRESHOLD))
 256             continue;
 257         if(ptrdiff_t todo{(InSamples.end()-in_iter) >> 2})
 258         {
 259             const __m128 gain4{_mm_set1_ps(gain)};
 260             do {
 261                 const __m128 val4{_mm_load_ps(in_iter)};
 262                 __m128 dry4{_mm_load_ps(dst)};
 263                 dry4 = _mm_add_ps(dry4, _mm_mul_ps(val4, gain4));
 264                 _mm_store_ps(dst, dry4);
 265                 in_iter += 4; dst += 4;
 266             } while(--todo);
 267         }
 268         while(in_iter != InSamples.end())
 269             *(dst++) += *(in_iter++) * gain;
 270     }
 271 }
 272
 273 template<>
 274 void MixRow_<SSETag>(const al::span<float> OutBuffer, const al::span<const float> Gains,
 275     const float *InSamples, const size_t InStride)
 276 {
 277     for(const float gain : Gains)
 278     {
 279         const float *RESTRICT input{InSamples};
 280         InSamples += InStride;
 281
 282         if(!(std::fabs(gain) > GAIN_SILENCE_THRESHOLD))
 283             continue;
 284
 285         auto out_iter = OutBuffer.begin();
 286         if(size_t todo{OutBuffer.size() >> 2})
 287         {
 288             const __m128 gain4 = _mm_set1_ps(gain);
 289             do {
 290                 const __m128 val4{_mm_load_ps(input)};
 291                 __m128 dry4{_mm_load_ps(out_iter)};
 292                 dry4 = _mm_add_ps(dry4, _mm_mul_ps(val4, gain4));
 293                 _mm_store_ps(out_iter, dry4);
 294                 out_iter += 4; input += 4;
 295             } while(--todo);
 296         }
 297
 298         auto do_mix = [gain](const float cur, const float src) noexcept -> float
 299         { return cur + src*gain; };
 300         std::transform(out_iter, OutBuffer.end(), input, out_iter, do_mix);
 301     }
 302 }