/*
 * Copyright (C) 2010, Google Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1.  Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 * 2.  Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include "config.h"

#if ENABLE(WEB_AUDIO)

#include "platform/audio/VectorMath.h"
#include "wtf/Assertions.h"
#include "wtf/CPU.h"

#if OS(MACOSX)
#include <Accelerate/Accelerate.h>
#endif

#if CPU(X86) || CPU(X86_64)
#include <emmintrin.h>
#endif

#if HAVE(ARM_NEON_INTRINSICS)
#include <arm_neon.h>
#endif

#include <math.h>
#include <stdint.h>
#include <algorithm>

namespace VectorMath {
#if OS(MACOSX)
// On the Mac we use the highly optimized versions in Accelerate.framework.
// In 32-bit mode (__ppc__ or __i386__) <Accelerate/Accelerate.h> includes <vecLib/vDSP_translate.h>, which defines macros with the same names as
// our namespaced functions, so we must handle this case differently. Other architectures (64-bit, ARM, etc.) do not include this header file.
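// dest = source * scale (element-wise, honoring the given strides).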
void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
#if defined(__ppc__) || defined(__i386__)
    ::vsmul(sourceP, sourceStride, scale, destP, destStride, framesToProcess);
#else
    vDSP_vsmul(sourceP, sourceStride, scale, destP, destStride, framesToProcess);
#endif
}
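// dest = source1 + source2 (element-wise, honoring the given strides).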
void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
{
#if defined(__ppc__) || defined(__i386__)
    ::vadd(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
#else
    vDSP_vadd(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
#endif
}
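// dest = source1 * source2 (element-wise, honoring the given strides).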
void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
{
#if defined(__ppc__) || defined(__i386__)
    ::vmul(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
#else
    vDSP_vmul(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
#endif
}
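// Complex multiplication of split-complex (separate real/imaginary) arrays:
//   realDest[i] = real1[i] * real2[i] - imag1[i] * imag2[i]
//   imagDest[i] = real1[i] * imag2[i] + imag1[i] * real2[i]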
void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess)
{
    DSPSplitComplex sc1;
    DSPSplitComplex sc2;
    DSPSplitComplex dest;
    sc1.realp = const_cast<float*>(real1P);
    sc1.imagp = const_cast<float*>(imag1P);
    sc2.realp = const_cast<float*>(real2P);
    sc2.imagp = const_cast<float*>(imag2P);
    dest.realp = realDestP;
    dest.imagp = imagDestP;
#if defined(__ppc__) || defined(__i386__)
    ::zvmul(&sc1, 1, &sc2, 1, &dest, 1, framesToProcess, 1);
#else
    vDSP_zvmul(&sc1, 1, &sc2, 1, &dest, 1, framesToProcess, 1);
#endif
}
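// Scalar multiply-add: dest += source * scale (element-wise, honoring the given strides).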
void vsma(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
    vDSP_vsma(sourceP, sourceStride, scale, destP, destStride, destP, destStride, framesToProcess);
}
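// *maxP = maximum magnitude (absolute value) found in the input.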
void vmaxmgv(const float* sourceP, int sourceStride, float* maxP, size_t framesToProcess)
{
    vDSP_maxmgv(sourceP, sourceStride, maxP, framesToProcess);
}
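// *sumP = sum of the squares of the input values.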
void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesToProcess)
{
    vDSP_svesq(const_cast<float*>(sourceP), sourceStride, sumP, framesToProcess);
}
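// dest = source clamped to [*lowThresholdP, *highThresholdP].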
void vclip(const float* sourceP, int sourceStride, const float* lowThresholdP, const float* highThresholdP, float* destP, int destStride, size_t framesToProcess)
{
    vDSP_vclip(const_cast<float*>(sourceP), sourceStride, const_cast<float*>(lowThresholdP), const_cast<float*>(highThresholdP), destP, destStride, framesToProcess);
}

#else
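// Generic vsma (dest += source * scale) with SSE2/NEON fast paths for stride-1 data and a scalar fallback.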
void vsma(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;

#if CPU(X86) || CPU(X86_64)
    if ((sourceStride == 1) && (destStride == 1)) {
        float k = *scale;

        // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
            *destP += k * *sourceP;
            sourceP++;
            destP++;
            n--;
        }

        // Now the sourceP is aligned, use SSE.
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        __m128 pSource;
        __m128 dest;
        __m128 temp;
        __m128 mScale = _mm_set_ps1(k);

        bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F);

#define SSE2_MULT_ADD(loadInstr, storeInstr)    \
    while (destP < endP) {                      \
        pSource = _mm_load_ps(sourceP);         \
        temp = _mm_mul_ps(pSource, mScale);     \
        dest = _mm_##loadInstr##_ps(destP);     \
        dest = _mm_add_ps(dest, temp);          \
        _mm_##storeInstr##_ps(destP, dest);     \
        sourceP += 4;                           \
        destP += 4;                             \
    }

        if (destAligned)
            SSE2_MULT_ADD(load, store)
        else
            SSE2_MULT_ADD(loadu, storeu)

        n = tailFrames;
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride == 1) && (destStride == 1)) {
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        float32x4_t k = vdupq_n_f32(*scale);
        while (destP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            float32x4_t dest = vld1q_f32(destP);

            dest = vmlaq_f32(dest, source, k);
            vst1q_f32(destP, dest);

            sourceP += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif

    // Scalar handling for non-unit strides and any remaining tail frames.
    while (n) {
        *destP += *sourceP * *scale;
        sourceP += sourceStride;
        destP += destStride;
        n--;
    }
}
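// Generic vsmul (dest = source * scale) with SSE2/NEON fast paths for stride-1 data and a scalar fallback.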
void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;

#if CPU(X86) || CPU(X86_64)
    if ((sourceStride == 1) && (destStride == 1)) {
        float k = *scale;

        // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<size_t>(sourceP) & 0x0F) && n) {
            *destP = k * *sourceP;
            sourceP++;
            destP++;
            n--;
        }

        // Now the sourceP address is aligned; start to apply SSE.
        int group = n / 4;
        __m128 mScale = _mm_set_ps1(k);
        __m128* pSource;
        __m128* pDest;
        __m128 dest;

        if (reinterpret_cast<size_t>(destP) & 0x0F) {
            // destP is not 16-byte aligned, so use unaligned stores.
            while (group--) {
                pSource = reinterpret_cast<__m128*>(const_cast<float*>(sourceP));
                dest = _mm_mul_ps(*pSource, mScale);
                _mm_storeu_ps(destP, dest);

                sourceP += 4;
                destP += 4;
            }
        } else {
            // Both sourceP and destP are aligned.
            while (group--) {
                pSource = reinterpret_cast<__m128*>(const_cast<float*>(sourceP));
                pDest = reinterpret_cast<__m128*>(destP);
                *pDest = _mm_mul_ps(*pSource, mScale);

                sourceP += 4;
                destP += 4;
            }
        }

        // Non-SSE handling for remaining frames, which is less than 4.
        n %= 4;
        while (n) {
            *destP = k * *sourceP;
            sourceP++;
            destP++;
            n--;
        }
    } else { // If strides are not 1, fall back to the normal algorithm.
#elif HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride == 1) && (destStride == 1)) {
        float k = *scale;
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        while (destP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            vst1q_f32(destP, vmulq_n_f32(source, k));

            sourceP += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    float k = *scale;
    while (n--) {
        *destP = k * *sourceP;
        sourceP += sourceStride;
        destP += destStride;
    }
#if CPU(X86) || CPU(X86_64)
    } // Closes the "strides are not 1" branch above.
#endif
}
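// Generic vadd (dest = source1 + source2) with SSE2/NEON fast paths for stride-1 data and a scalar fallback.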
void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;

#if CPU(X86) || CPU(X86_64)
    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
        // If the source1P address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<size_t>(source1P) & 0x0F) && n) {
            *destP = *source1P + *source2P;
            source1P++;
            source2P++;
            destP++;
            n--;
        }

        // Now the source1P address is aligned; start to apply SSE.
        int group = n / 4;
        __m128* pSource1;
        __m128* pSource2;
        __m128* pDest;
        __m128 source2;
        __m128 dest;

        bool source2Aligned = !(reinterpret_cast<size_t>(source2P) & 0x0F);
        bool destAligned = !(reinterpret_cast<size_t>(destP) & 0x0F);

        if (source2Aligned && destAligned) { // All aligned.
            while (group--) {
                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
                pSource2 = reinterpret_cast<__m128*>(const_cast<float*>(source2P));
                pDest = reinterpret_cast<__m128*>(destP);
                *pDest = _mm_add_ps(*pSource1, *pSource2);

                source1P += 4;
                source2P += 4;
                destP += 4;
            }
        } else if (source2Aligned && !destAligned) { // source2 aligned but dest not aligned.
            while (group--) {
                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
                pSource2 = reinterpret_cast<__m128*>(const_cast<float*>(source2P));
                dest = _mm_add_ps(*pSource1, *pSource2);
                _mm_storeu_ps(destP, dest);

                source1P += 4;
                source2P += 4;
                destP += 4;
            }
        } else if (!source2Aligned && destAligned) { // source2 not aligned but dest aligned.
            while (group--) {
                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
                source2 = _mm_loadu_ps(source2P);
                pDest = reinterpret_cast<__m128*>(destP);
                *pDest = _mm_add_ps(*pSource1, source2);

                source1P += 4;
                source2P += 4;
                destP += 4;
            }
        } else { // Neither source2 nor dest is aligned.
            while (group--) {
                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
                source2 = _mm_loadu_ps(source2P);
                dest = _mm_add_ps(*pSource1, source2);
                _mm_storeu_ps(destP, dest);

                source1P += 4;
                source2P += 4;
                destP += 4;
            }
        }

        // Non-SSE handling for remaining frames, which is less than 4.
        n %= 4;
        while (n) {
            *destP = *source1P + *source2P;
            source1P++;
            source2P++;
            destP++;
            n--;
        }
    } else { // If strides are not 1, fall back to the normal algorithm.
#elif HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        while (destP < endP) {
            float32x4_t source1 = vld1q_f32(source1P);
            float32x4_t source2 = vld1q_f32(source2P);
            vst1q_f32(destP, vaddq_f32(source1, source2));

            source1P += 4;
            source2P += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    while (n--) {
        *destP = *source1P + *source2P;
        source1P += sourceStride1;
        source2P += sourceStride2;
        destP += destStride;
    }
#if CPU(X86) || CPU(X86_64)
    } // Closes the "strides are not 1" branch above.
#endif
}
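// Generic vmul (dest = source1 * source2) with SSE2/NEON fast paths for stride-1 data and a scalar fallback.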
void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;

#if CPU(X86) || CPU(X86_64)
    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
        // If the source1P address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(source1P) & 0x0F) && n) {
            *destP = *source1P * *source2P;
            source1P++;
            source2P++;
            destP++;
            n--;
        }

        // Now the source1P address is aligned; start to apply SSE.
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;
        __m128 pSource1;
        __m128 pSource2;
        __m128 dest;

        bool source2Aligned = !(reinterpret_cast<uintptr_t>(source2P) & 0x0F);
        bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F);

#define SSE2_MULT(loadInstr, storeInstr)            \
    while (destP < endP) {                          \
        pSource1 = _mm_load_ps(source1P);           \
        pSource2 = _mm_##loadInstr##_ps(source2P);  \
        dest = _mm_mul_ps(pSource1, pSource2);      \
        _mm_##storeInstr##_ps(destP, dest);         \
        source1P += 4;                              \
        source2P += 4;                              \
        destP += 4;                                 \
    }

        if (source2Aligned && destAligned) // Both aligned.
            SSE2_MULT(load, store)
        else if (source2Aligned && !destAligned) // Source2 is aligned but dest is not.
            SSE2_MULT(load, storeu)
        else if (!source2Aligned && destAligned) // Dest is aligned but source2 is not.
            SSE2_MULT(loadu, store)
        else // Neither aligned.
            SSE2_MULT(loadu, storeu)

        n = tailFrames;
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        while (destP < endP) {
            float32x4_t source1 = vld1q_f32(source1P);
            float32x4_t source2 = vld1q_f32(source2P);
            vst1q_f32(destP, vmulq_f32(source1, source2));

            source1P += 4;
            source2P += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    while (n) {
        *destP = *source1P * *source2P;
        source1P += sourceStride1;
        source2P += sourceStride2;
        destP += destStride;
        n--;
    }
}
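// Generic split-complex multiply; the SSE path requires all six pointers to be 16-byte aligned.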
void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess)
{
    unsigned i = 0;
#if CPU(X86) || CPU(X86_64)
    // Only use the SSE optimization in the very common case that all addresses are 16-byte aligned.
    // Otherwise, fall through to the scalar code below.
    if (!(reinterpret_cast<uintptr_t>(real1P) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(imag1P) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(real2P) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(imag2P) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(realDestP) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(imagDestP) & 0x0F)) {

        unsigned endSize = framesToProcess - framesToProcess % 4;
        while (i < endSize) {
            __m128 real1 = _mm_load_ps(real1P + i);
            __m128 real2 = _mm_load_ps(real2P + i);
            __m128 imag1 = _mm_load_ps(imag1P + i);
            __m128 imag2 = _mm_load_ps(imag2P + i);
            __m128 real = _mm_mul_ps(real1, real2);
            real = _mm_sub_ps(real, _mm_mul_ps(imag1, imag2));
            __m128 imag = _mm_mul_ps(real1, imag2);
            imag = _mm_add_ps(imag, _mm_mul_ps(imag1, real2));
            _mm_store_ps(realDestP + i, real);
            _mm_store_ps(imagDestP + i, imag);
            i += 4;
        }
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    unsigned endSize = framesToProcess - framesToProcess % 4;
    while (i < endSize) {
        float32x4_t real1 = vld1q_f32(real1P + i);
        float32x4_t real2 = vld1q_f32(real2P + i);
        float32x4_t imag1 = vld1q_f32(imag1P + i);
        float32x4_t imag2 = vld1q_f32(imag2P + i);

        float32x4_t realResult = vmlsq_f32(vmulq_f32(real1, real2), imag1, imag2);
        float32x4_t imagResult = vmlaq_f32(vmulq_f32(real1, imag2), imag1, real2);

        vst1q_f32(realDestP + i, realResult);
        vst1q_f32(imagDestP + i, imagResult);

        i += 4;
    }
#endif
    for (; i < framesToProcess; ++i) {
        // Read and compute the result before storing it, in case the
        // destination is the same as one of the sources.
        float realResult = real1P[i] * real2P[i] - imag1P[i] * imag2P[i];
        float imagResult = real1P[i] * imag2P[i] + imag1P[i] * real2P[i];

        realDestP[i] = realResult;
        imagDestP[i] = imagResult;
    }
}
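// Generic sum of squares (*sumP = sum of sourceP[i]^2) with SSE2/NEON fast paths for stride-1 data.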
void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesToProcess)
{
    int n = framesToProcess;
    float sum = 0;

#if CPU(X86) || CPU(X86_64)
    if (sourceStride == 1) {
        // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
            float sample = *sourceP;
            sum += sample * sample;
            sourceP++;
            n--;
        }

        // Now the sourceP is aligned, use SSE.
        int tailFrames = n % 4;
        const float* endP = sourceP + n - tailFrames;
        __m128 source;
        __m128 mSum = _mm_setzero_ps();

        while (sourceP < endP) {
            source = _mm_load_ps(sourceP);
            source = _mm_mul_ps(source, source);
            mSum = _mm_add_ps(mSum, source);
            sourceP += 4;
        }

        // Summarize the SSE results.
        const float* groupSumP = reinterpret_cast<float*>(&mSum);
        sum += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3];

        n = tailFrames;
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    if (sourceStride == 1) {
        int tailFrames = n % 4;
        const float* endP = sourceP + n - tailFrames;

        float32x4_t fourSum = vdupq_n_f32(0);
        while (sourceP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            fourSum = vmlaq_f32(fourSum, source, source);
            sourceP += 4;
        }
        float32x2_t twoSum = vadd_f32(vget_low_f32(fourSum), vget_high_f32(fourSum));

        float groupSum[2];
        vst1_f32(groupSum, twoSum);
        sum += groupSum[0] + groupSum[1];

        n = tailFrames;
    }
#endif

    while (n--) {
        float sample = *sourceP;
        sum += sample * sample;
        sourceP += sourceStride;
    }

    ASSERT(sumP);
    *sumP = sum;
}
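// Generic maximum magnitude (*maxP = max |sourceP[i]|) with SSE2/NEON fast paths for stride-1 data.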
void vmaxmgv(const float* sourceP, int sourceStride, float* maxP, size_t framesToProcess)
{
    int n = framesToProcess;
    float max = 0;

#if CPU(X86) || CPU(X86_64)
    if (sourceStride == 1) {
        // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
            max = std::max(max, fabsf(*sourceP));
            sourceP++;
            n--;
        }

        // Now the sourceP is aligned, use SSE.
        int tailFrames = n % 4;
        const float* endP = sourceP + n - tailFrames;
        __m128 source;
        __m128 mMax = _mm_setzero_ps();
        int mask = 0x7FFFFFFF;
        __m128 mMask = _mm_set1_ps(*reinterpret_cast<float*>(&mask));

        while (sourceP < endP) {
            source = _mm_load_ps(sourceP);
            // Calculate the absolute value by anding source with the mask, which clears the sign bit.
            source = _mm_and_ps(source, mMask);
            mMax = _mm_max_ps(mMax, source);
            sourceP += 4;
        }

        // Get the max from the SSE results.
        const float* groupMaxP = reinterpret_cast<float*>(&mMax);
        max = std::max(max, groupMaxP[0]);
        max = std::max(max, groupMaxP[1]);
        max = std::max(max, groupMaxP[2]);
        max = std::max(max, groupMaxP[3]);

        n = tailFrames;
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    if (sourceStride == 1) {
        int tailFrames = n % 4;
        const float* endP = sourceP + n - tailFrames;

        float32x4_t fourMax = vdupq_n_f32(0);
        while (sourceP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            fourMax = vmaxq_f32(fourMax, vabsq_f32(source));
            sourceP += 4;
        }
        float32x2_t twoMax = vmax_f32(vget_low_f32(fourMax), vget_high_f32(fourMax));

        float groupMax[2];
        vst1_f32(groupMax, twoMax);
        max = std::max(groupMax[0], groupMax[1]);

        n = tailFrames;
    }
#endif

    while (n--) {
        max = std::max(max, fabsf(*sourceP));
        sourceP += sourceStride;
    }

    ASSERT(maxP);
    *maxP = max;
}
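// Generic clip (dest = clamp(source, low, high)); vectorized with NEON only, scalar elsewhere.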
void vclip(const float* sourceP, int sourceStride, const float* lowThresholdP, const float* highThresholdP, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;
    float lowThreshold = *lowThresholdP;
    float highThreshold = *highThresholdP;

    // FIXME: Optimize for SSE2.
#if HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride == 1) && (destStride == 1)) {
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        float32x4_t low = vdupq_n_f32(lowThreshold);
        float32x4_t high = vdupq_n_f32(highThreshold);
        while (destP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            vst1q_f32(destP, vmaxq_f32(vminq_f32(source, high), low));
            sourceP += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    while (n--) {
        *destP = std::max(std::min(*sourceP, highThreshold), lowThreshold);
        sourceP += sourceStride;
        destP += destStride;
    }
}

#endif // OS(MACOSX)
} // namespace VectorMath

#endif // ENABLE(WEB_AUDIO)