alc/mixer/mixer_neon.cpp

   1 #include "config.h"
   2
   3 #include <arm_neon.h>
   4
   5 #include <limits>
   6
   7 #include "AL/al.h"
   8 #include "AL/alc.h"
   9 #include "alcmain.h"
  10 #include "alu.h"
  11 #include "hrtf.h"
  12 #include "defs.h"
  13 #include "hrtfbase.h"
  14
  15
  16
  17 template<>
  18 const ALfloat *Resample_<LerpTag,NEONTag>(const InterpState*, const ALfloat *RESTRICT src,
  19     ALuint frac, ALuint increment, const al::span<float> dst)
  20 {
  21     const int32x4_t increment4 = vdupq_n_s32(static_cast<int>(increment*4));
  22     const float32x4_t fracOne4 = vdupq_n_f32(1.0f/FRACTIONONE);
  23     const int32x4_t fracMask4 = vdupq_n_s32(FRACTIONMASK);
  24     alignas(16) ALuint pos_[4], frac_[4];
  25     int32x4_t pos4, frac4;
  26
  27     InitPosArrays(frac, increment, frac_, pos_, 4);
  28     frac4 = vld1q_s32(reinterpret_cast<int*>(frac_));
  29     pos4 = vld1q_s32(reinterpret_cast<int*>(pos_));
  30
  31     auto dst_iter = dst.begin();
  32     const auto aligned_end = (dst.size()&~3u) + dst_iter;
  33     while(dst_iter != aligned_end)
  34     {
  35         const int pos0{vgetq_lane_s32(pos4, 0)};
  36         const int pos1{vgetq_lane_s32(pos4, 1)};
  37         const int pos2{vgetq_lane_s32(pos4, 2)};
  38         const int pos3{vgetq_lane_s32(pos4, 3)};
  39         const float32x4_t val1{src[pos0], src[pos1], src[pos2], src[pos3]};
  40         const float32x4_t val2{src[pos0+1], src[pos1+1], src[pos2+1], src[pos3+1]};
  41
  42         /* val1 + (val2-val1)*mu */
  43         const float32x4_t r0{vsubq_f32(val2, val1)};
  44         const float32x4_t mu{vmulq_f32(vcvtq_f32_s32(frac4), fracOne4)};
  45         const float32x4_t out{vmlaq_f32(val1, mu, r0)};
  46
  47         vst1q_f32(dst_iter, out);
  48         dst_iter += 4;
  49
  50         frac4 = vaddq_s32(frac4, increment4);
  51         pos4 = vaddq_s32(pos4, vshrq_n_s32(frac4, FRACTIONBITS));
  52         frac4 = vandq_s32(frac4, fracMask4);
  53     }
  54
  55     if(dst_iter != dst.end())
  56     {
  57         src += static_cast<ALuint>(vgetq_lane_s32(pos4, 0));
  58         frac = static_cast<ALuint>(vgetq_lane_s32(frac4, 0));
  59
  60         do {
  61             *(dst_iter++) = lerp(src[0], src[1], static_cast<float>(frac) * (1.0f/FRACTIONONE));
  62
  63             frac += increment;
  64             src  += frac>>FRACTIONBITS;
  65             frac &= FRACTIONMASK;
  66         } while(dst_iter != dst.end());
  67     }
  68     return dst.begin();
  69 }
  70
  71 template<>
  72 const ALfloat *Resample_<BSincTag,NEONTag>(const InterpState *state, const ALfloat *RESTRICT src,
  73     ALuint frac, ALuint increment, const al::span<float> dst)
  74 {
  75     const float *const filter{state->bsinc.filter};
  76     const float32x4_t sf4{vdupq_n_f32(state->bsinc.sf)};
  77     const size_t m{state->bsinc.m};
  78
  79     src -= state->bsinc.l;
  80     for(float &out_sample : dst)
  81     {
  82         // Calculate the phase index and factor.
  83 #define FRAC_PHASE_BITDIFF (FRACTIONBITS-BSINC_PHASE_BITS)
  84         const ALuint pi{frac >> FRAC_PHASE_BITDIFF};
  85         const float pf{static_cast<float>(frac & ((1<<FRAC_PHASE_BITDIFF)-1)) *
  86             (1.0f/(1<<FRAC_PHASE_BITDIFF))};
  87 #undef FRAC_PHASE_BITDIFF
  88
  89         // Apply the scale and phase interpolated filter.
  90         float32x4_t r4{vdupq_n_f32(0.0f)};
  91         {
  92             const float32x4_t pf4{vdupq_n_f32(pf)};
  93             const float *fil{filter + m*pi*4};
  94             const float *phd{fil + m};
  95             const float *scd{phd + m};
  96             const float *spd{scd + m};
  97             size_t td{m >> 2};
  98             size_t j{0u};
  99
 100             do {
 101                 /* f = ((fil + sf*scd) + pf*(phd + sf*spd)) */
 102                 const float32x4_t f4 = vmlaq_f32(
 103                     vmlaq_f32(vld1q_f32(fil), sf4, vld1q_f32(scd)),
 104                     pf4, vmlaq_f32(vld1q_f32(phd), sf4, vld1q_f32(spd)));
 105                 fil += 4; scd += 4; phd += 4; spd += 4;
 106                 /* r += f*src */
 107                 r4 = vmlaq_f32(r4, f4, vld1q_f32(&src[j]));
 108                 j += 4;
 109             } while(--td);
 110         }
 111         r4 = vaddq_f32(r4, vrev64q_f32(r4));
 112         out_sample = vget_lane_f32(vadd_f32(vget_low_f32(r4), vget_high_f32(r4)), 0);
 113
 114         frac += increment;
 115         src  += frac>>FRACTIONBITS;
 116         frac &= FRACTIONMASK;
 117     }
 118     return dst.begin();
 119 }
 120
 121 template<>
 122 const ALfloat *Resample_<FastBSincTag,NEONTag>(const InterpState *state,
 123     const ALfloat *RESTRICT src, ALuint frac, ALuint increment, const al::span<float> dst)
 124 {
 125     const float *const filter{state->bsinc.filter};
 126     const size_t m{state->bsinc.m};
 127
 128     src -= state->bsinc.l;
 129     for(float &out_sample : dst)
 130     {
 131         // Calculate the phase index and factor.
 132 #define FRAC_PHASE_BITDIFF (FRACTIONBITS-BSINC_PHASE_BITS)
 133         const ALuint pi{frac >> FRAC_PHASE_BITDIFF};
 134         const float pf{static_cast<float>(frac & ((1<<FRAC_PHASE_BITDIFF)-1)) *
 135             (1.0f/(1<<FRAC_PHASE_BITDIFF))};
 136 #undef FRAC_PHASE_BITDIFF
 137
 138         // Apply the phase interpolated filter.
 139         float32x4_t r4{vdupq_n_f32(0.0f)};
 140         {
 141             const float32x4_t pf4{vdupq_n_f32(pf)};
 142             const float *fil{filter + m*pi*4};
 143             const float *phd{fil + m};
 144             size_t td{m >> 2};
 145             size_t j{0u};
 146
 147             do {
 148                 /* f = fil + pf*phd */
 149                 const float32x4_t f4 = vmlaq_f32(vld1q_f32(fil), pf4, vld1q_f32(phd));
 150                 /* r += f*src */
 151                 r4 = vmlaq_f32(r4, f4, vld1q_f32(&src[j]));
 152                 fil += 4; phd += 4; j += 4;
 153             } while(--td);
 154         }
 155         r4 = vaddq_f32(r4, vrev64q_f32(r4));
 156         out_sample = vget_lane_f32(vadd_f32(vget_low_f32(r4), vget_high_f32(r4)), 0);
 157
 158         frac += increment;
 159         src  += frac>>FRACTIONBITS;
 160         frac &= FRACTIONMASK;
 161     }
 162     return dst.begin();
 163 }
 164
 165
 166 static inline void ApplyCoeffs(size_t /*Offset*/, float2 *RESTRICT Values, const ALuint IrSize,
 167     const HrirArray &Coeffs, const float left, const float right)
 168 {
 169     ASSUME(IrSize >= 4);
 170
 171     float32x4_t leftright4;
 172     {
 173         float32x2_t leftright2 = vdup_n_f32(0.0);
 174         leftright2 = vset_lane_f32(left, leftright2, 0);
 175         leftright2 = vset_lane_f32(right, leftright2, 1);
 176         leftright4 = vcombine_f32(leftright2, leftright2);
 177     }
 178
 179     for(ALuint c{0};c < IrSize;c += 2)
 180     {
 181         float32x4_t vals = vld1q_f32(&Values[c][0]);
 182         float32x4_t coefs = vld1q_f32(&Coeffs[c][0]);
 183
 184         vals = vmlaq_f32(vals, coefs, leftright4);
 185
 186         vst1q_f32(&Values[c][0], vals);
 187     }
 188 }
 189
 190 template<>
 191 void MixHrtf_<NEONTag>(FloatBufferLine &LeftOut, FloatBufferLine &RightOut,
 192     const float *InSamples, float2 *AccumSamples, const size_t OutPos, const ALuint IrSize,
 193     MixHrtfFilter *hrtfparams, const size_t BufferSize)
 194 {
 195     MixHrtfBase<ApplyCoeffs>(LeftOut, RightOut, InSamples, AccumSamples, OutPos, IrSize,
 196         hrtfparams, BufferSize);
 197 }
 198
 199 template<>
 200 void MixHrtfBlend_<NEONTag>(FloatBufferLine &LeftOut, FloatBufferLine &RightOut,
 201     const float *InSamples, float2 *AccumSamples, const size_t OutPos, const ALuint IrSize,
 202     const HrtfFilter *oldparams, MixHrtfFilter *newparams, const size_t BufferSize)
 203 {
 204     MixHrtfBlendBase<ApplyCoeffs>(LeftOut, RightOut, InSamples, AccumSamples, OutPos, IrSize,
 205         oldparams, newparams, BufferSize);
 206 }
 207
 208 template<>
 209 void MixDirectHrtf_<NEONTag>(FloatBufferLine &LeftOut, FloatBufferLine &RightOut,
 210     const al::span<const FloatBufferLine> InSamples, float2 *AccumSamples, DirectHrtfState *State,
 211     const size_t BufferSize)
 212 { MixDirectHrtfBase<ApplyCoeffs>(LeftOut, RightOut, InSamples, AccumSamples, State, BufferSize); }
 213
 214
 215 template<>
 216 void Mix_<NEONTag>(const al::span<const float> InSamples, const al::span<FloatBufferLine> OutBuffer,
 217     float *CurrentGains, const float *TargetGains, const size_t Counter, const size_t OutPos)
 218 {
 219     const ALfloat delta{(Counter > 0) ? 1.0f / static_cast<ALfloat>(Counter) : 0.0f};
 220     const bool reached_target{InSamples.size() >= Counter};
 221     const auto min_end = reached_target ? InSamples.begin() + Counter : InSamples.end();
 222     const auto aligned_end = minz(static_cast<uintptr_t>(min_end-InSamples.begin()+3) & ~3u,
 223         InSamples.size()) + InSamples.begin();
 224     for(FloatBufferLine &output : OutBuffer)
 225     {
 226         ALfloat *RESTRICT dst{al::assume_aligned<16>(output.data()+OutPos)};
 227         ALfloat gain{*CurrentGains};
 228         const ALfloat diff{*TargetGains - gain};
 229
 230         auto in_iter = InSamples.begin();
 231         if(std::fabs(diff) > std::numeric_limits<float>::epsilon())
 232         {
 233             const ALfloat step{diff * delta};
 234             ALfloat step_count{0.0f};
 235             /* Mix with applying gain steps in aligned multiples of 4. */
 236             if(ptrdiff_t todo{(min_end-in_iter) >> 2})
 237             {
 238                 const float32x4_t four4{vdupq_n_f32(4.0f)};
 239                 const float32x4_t step4{vdupq_n_f32(step)};
 240                 const float32x4_t gain4{vdupq_n_f32(gain)};
 241                 float32x4_t step_count4{vsetq_lane_f32(0.0f,
 242                     vsetq_lane_f32(1.0f,
 243                     vsetq_lane_f32(2.0f,
 244                     vsetq_lane_f32(3.0f, vdupq_n_f32(0.0f), 3),
 245                     2), 1), 0
 246                 )};
 247                 do {
 248                     const float32x4_t val4 = vld1q_f32(in_iter);
 249                     float32x4_t dry4 = vld1q_f32(dst);
 250                     dry4 = vmlaq_f32(dry4, val4, vmlaq_f32(gain4, step4, step_count4));
 251                     step_count4 = vaddq_f32(step_count4, four4);
 252                     vst1q_f32(dst, dry4);
 253                     in_iter += 4; dst += 4;
 254                 } while(--todo);
 255                 /* NOTE: step_count4 now represents the next four counts after
 256                  * the last four mixed samples, so the lowest element
 257                  * represents the next step count to apply.
 258                  */
 259                 step_count = vgetq_lane_f32(step_count4, 0);
 260             }
 261             /* Mix with applying left over gain steps that aren't aligned multiples of 4. */
 262             while(in_iter != min_end)
 263             {
 264                 *(dst++) += *(in_iter++) * (gain + step*step_count);
 265                 step_count += 1.0f;
 266             }
 267             if(reached_target)
 268                 gain = *TargetGains;
 269             else
 270                 gain += step*step_count;
 271             *CurrentGains = gain;
 272
 273             /* Mix until pos is aligned with 4 or the mix is done. */
 274             while(in_iter != aligned_end)
 275                 *(dst++) += *(in_iter++) * gain;
 276         }
 277         ++CurrentGains;
 278         ++TargetGains;
 279
 280         if(!(std::fabs(gain) > GAIN_SILENCE_THRESHOLD))
 281             continue;
 282         if(ptrdiff_t todo{(InSamples.end()-in_iter) >> 2})
 283         {
 284             const float32x4_t gain4 = vdupq_n_f32(gain);
 285             do {
 286                 const float32x4_t val4 = vld1q_f32(in_iter);
 287                 float32x4_t dry4 = vld1q_f32(dst);
 288                 dry4 = vmlaq_f32(dry4, val4, gain4);
 289                 vst1q_f32(dst, dry4);
 290                 in_iter += 4; dst += 4;
 291             } while(--todo);
 292         }
 293         while(in_iter != InSamples.end())
 294             *(dst++) += *(in_iter++) * gain;
 295     }
 296 }
 297
 298 template<>
 299 void MixRow_<NEONTag>(const al::span<float> OutBuffer, const al::span<const float> Gains,
 300     const float *InSamples, const size_t InStride)
 301 {
 302     for(const ALfloat gain : Gains)
 303     {
 304         const ALfloat *RESTRICT input{InSamples};
 305         InSamples += InStride;
 306
 307         if(!(std::fabs(gain) > GAIN_SILENCE_THRESHOLD))
 308             continue;
 309
 310         auto out_iter = OutBuffer.begin();
 311         if(size_t todo{OutBuffer.size() >> 2})
 312         {
 313             const float32x4_t gain4{vdupq_n_f32(gain)};
 314             do {
 315                 const float32x4_t val4 = vld1q_f32(input);
 316                 float32x4_t dry4 = vld1q_f32(out_iter);
 317                 dry4 = vmlaq_f32(dry4, val4, gain4);
 318                 vst1q_f32(out_iter, dry4);
 319                 out_iter += 4; input += 4;
 320             } while(--todo);
 321         }
 322
 323         auto do_mix = [gain](const float cur, const float src) noexcept -> float
 324         { return cur + src*gain; };
 325         std::transform(out_iter, OutBuffer.end(), input, out_iter, do_mix);
 326     }
 327 }