/*
 * Copyright (C) 2010, Google Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1.  Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 * 2.  Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include "config.h"

#if ENABLE(WEB_AUDIO)

#include "platform/audio/VectorMath.h"
#include "wtf/Assertions.h"
#include "wtf/CPU.h"

#if OS(MACOSX)
#include <Accelerate/Accelerate.h>
#endif

#if CPU(X86) || CPU(X86_64)
#include <emmintrin.h>
#endif

#if HAVE(ARM_NEON_INTRINSICS)
#include <arm_neon.h>
#endif

#include <math.h>
#include <stdint.h>
#include <algorithm>

namespace VectorMath {
#if OS(MACOSX)
// On the Mac we use the highly optimized versions in Accelerate.framework.
// In 32-bit mode (__ppc__ or __i386__) <Accelerate/Accelerate.h> includes <vecLib/vDSP_translate.h>, which defines macros with the same names as
// our namespaced functions, so we must handle this case differently. Other architectures (64-bit, ARM, etc.) do not include this header file.
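// dest = source * scale (element-wise, honoring the given strides).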
void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
#if defined(__ppc__) || defined(__i386__)
    ::vsmul(sourceP, sourceStride, scale, destP, destStride, framesToProcess);
#else
    vDSP_vsmul(sourceP, sourceStride, scale, destP, destStride, framesToProcess);
#endif
}
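// dest = source1 + source2 (element-wise, honoring the given strides).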
void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
{
#if defined(__ppc__) || defined(__i386__)
    ::vadd(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
#else
    vDSP_vadd(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
#endif
}
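// dest = source1 * source2 (element-wise, honoring the given strides).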
void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
{
#if defined(__ppc__) || defined(__i386__)
    ::vmul(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
#else
    vDSP_vmul(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
#endif
}
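// Complex multiplication of split-complex (separate real/imaginary) arrays:
//   realDest[i] = real1[i] * real2[i] - imag1[i] * imag2[i]
//   imagDest[i] = real1[i] * imag2[i] + imag1[i] * real2[i]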
void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess)
{
    DSPSplitComplex sc1;
    DSPSplitComplex sc2;
    DSPSplitComplex dest;
    sc1.realp = const_cast<float*>(real1P);
    sc1.imagp = const_cast<float*>(imag1P);
    sc2.realp = const_cast<float*>(real2P);
    sc2.imagp = const_cast<float*>(imag2P);
    dest.realp = realDestP;
    dest.imagp = imagDestP;
#if defined(__ppc__) || defined(__i386__)
    ::zvmul(&sc1, 1, &sc2, 1, &dest, 1, framesToProcess, 1);
#else
    vDSP_zvmul(&sc1, 1, &sc2, 1, &dest, 1, framesToProcess, 1);
#endif
}
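// Scalar multiply-add: dest += source * scale (element-wise, honoring the given strides).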
void vsma(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
    vDSP_vsma(sourceP, sourceStride, scale, destP, destStride, destP, destStride, framesToProcess);
}
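// *maxP = maximum magnitude (absolute value) found in the input.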
void vmaxmgv(const float* sourceP, int sourceStride, float* maxP, size_t framesToProcess)
{
    vDSP_maxmgv(sourceP, sourceStride, maxP, framesToProcess);
}
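// *sumP = sum of the squares of the input values.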
void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesToProcess)
{
    vDSP_svesq(const_cast<float*>(sourceP), sourceStride, sumP, framesToProcess);
}
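// dest = source clamped to [*lowThresholdP, *highThresholdP].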
void vclip(const float* sourceP, int sourceStride, const float* lowThresholdP, const float* highThresholdP, float* destP, int destStride, size_t framesToProcess)
{
    vDSP_vclip(const_cast<float*>(sourceP), sourceStride, const_cast<float*>(lowThresholdP), const_cast<float*>(highThresholdP), destP, destStride, framesToProcess);
}

#else
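// Generic vsma (dest += source * scale) with SSE2/NEON fast paths for stride-1 data and a scalar fallback.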
void vsma(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;

#if CPU(X86) || CPU(X86_64)
    if ((sourceStride == 1) && (destStride == 1)) {
        float k = *scale;

        // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
            *destP += k * *sourceP;
            sourceP++;
            destP++;
            n--;
        }

        // Now the sourceP is aligned, use SSE.
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        __m128 pSource;
        __m128 dest;
        __m128 temp;
        __m128 mScale = _mm_set_ps1(k);

        bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F);

#define SSE2_MULT_ADD(loadInstr, storeInstr)    \
    while (destP < endP) {                      \
        pSource = _mm_load_ps(sourceP);         \
        temp = _mm_mul_ps(pSource, mScale);     \
        dest = _mm_##loadInstr##_ps(destP);     \
        dest = _mm_add_ps(dest, temp);          \
        _mm_##storeInstr##_ps(destP, dest);     \
        sourceP += 4;                           \
        destP += 4;                             \
    }

        if (destAligned)
            SSE2_MULT_ADD(load, store)
        else
            SSE2_MULT_ADD(loadu, storeu)

        n = tailFrames;
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride == 1) && (destStride == 1)) {
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        float32x4_t k = vdupq_n_f32(*scale);
        while (destP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            float32x4_t dest = vld1q_f32(destP);

            dest = vmlaq_f32(dest, source, k);
            vst1q_f32(destP, dest);

            sourceP += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif

    // Scalar handling for non-unit strides and any remaining tail frames.
    while (n) {
        *destP += *sourceP * *scale;
        sourceP += sourceStride;
        destP += destStride;
        n--;
    }
}
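// Generic vsmul (dest = source * scale) with SSE2/NEON fast paths for stride-1 data and a scalar fallback.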
void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;

#if CPU(X86) || CPU(X86_64)
    if ((sourceStride == 1) && (destStride == 1)) {
        float k = *scale;

        // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<size_t>(sourceP) & 0x0F) && n) {
            *destP = k * *sourceP;
            sourceP++;
            destP++;
            n--;
        }

        // Now the sourceP address is aligned; start to apply SSE.
        int group = n / 4;
        __m128 mScale = _mm_set_ps1(k);
        __m128* pSource;
        __m128* pDest;
        __m128 dest;

        if (reinterpret_cast<size_t>(destP) & 0x0F) {
            // destP is not 16-byte aligned, so use unaligned stores.
            while (group--) {
                pSource = reinterpret_cast<__m128*>(const_cast<float*>(sourceP));
                dest = _mm_mul_ps(*pSource, mScale);
                _mm_storeu_ps(destP, dest);

                sourceP += 4;
                destP += 4;
            }
        } else {
            // Both sourceP and destP are aligned.
            while (group--) {
                pSource = reinterpret_cast<__m128*>(const_cast<float*>(sourceP));
                pDest = reinterpret_cast<__m128*>(destP);
                *pDest = _mm_mul_ps(*pSource, mScale);

                sourceP += 4;
                destP += 4;
            }
        }

        // Non-SSE handling for remaining frames, which is less than 4.
        n %= 4;
        while (n) {
            *destP = k * *sourceP;
            sourceP++;
            destP++;
            n--;
        }
    } else { // If strides are not 1, fall back to the normal algorithm.
#elif HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride == 1) && (destStride == 1)) {
        float k = *scale;
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        while (destP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            vst1q_f32(destP, vmulq_n_f32(source, k));

            sourceP += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    float k = *scale;
    while (n--) {
        *destP = k * *sourceP;
        sourceP += sourceStride;
        destP += destStride;
    }
#if CPU(X86) || CPU(X86_64)
    } // Closes the "strides are not 1" branch above.
#endif
}
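// Generic vadd (dest = source1 + source2) with SSE2/NEON fast paths for stride-1 data and a scalar fallback.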
void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;

#if CPU(X86) || CPU(X86_64)
    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
        // If the source1P address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<size_t>(source1P) & 0x0F) && n) {
            *destP = *source1P + *source2P;
            source1P++;
            source2P++;
            destP++;
            n--;
        }

        // Now the source1P address is aligned; start to apply SSE.
        int group = n / 4;
        __m128* pSource1;
        __m128* pSource2;
        __m128* pDest;
        __m128 source2;
        __m128 dest;

        bool source2Aligned = !(reinterpret_cast<size_t>(source2P) & 0x0F);
        bool destAligned = !(reinterpret_cast<size_t>(destP) & 0x0F);

        if (source2Aligned && destAligned) { // All aligned.
            while (group--) {
                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
                pSource2 = reinterpret_cast<__m128*>(const_cast<float*>(source2P));
                pDest = reinterpret_cast<__m128*>(destP);
                *pDest = _mm_add_ps(*pSource1, *pSource2);

                source1P += 4;
                source2P += 4;
                destP += 4;
            }
        } else if (source2Aligned && !destAligned) { // source2 aligned but dest not aligned.
            while (group--) {
                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
                pSource2 = reinterpret_cast<__m128*>(const_cast<float*>(source2P));
                dest = _mm_add_ps(*pSource1, *pSource2);
                _mm_storeu_ps(destP, dest);

                source1P += 4;
                source2P += 4;
                destP += 4;
            }
        } else if (!source2Aligned && destAligned) { // source2 not aligned but dest aligned.
            while (group--) {
                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
                source2 = _mm_loadu_ps(source2P);
                pDest = reinterpret_cast<__m128*>(destP);
                *pDest = _mm_add_ps(*pSource1, source2);

                source1P += 4;
                source2P += 4;
                destP += 4;
            }
        } else { // Neither source2 nor dest is aligned.
            while (group--) {
                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
                source2 = _mm_loadu_ps(source2P);
                dest = _mm_add_ps(*pSource1, source2);
                _mm_storeu_ps(destP, dest);

                source1P += 4;
                source2P += 4;
                destP += 4;
            }
        }

        // Non-SSE handling for remaining frames, which is less than 4.
        n %= 4;
        while (n) {
            *destP = *source1P + *source2P;
            source1P++;
            source2P++;
            destP++;
            n--;
        }
    } else { // If strides are not 1, fall back to the normal algorithm.
#elif HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        while (destP < endP) {
            float32x4_t source1 = vld1q_f32(source1P);
            float32x4_t source2 = vld1q_f32(source2P);
            vst1q_f32(destP, vaddq_f32(source1, source2));

            source1P += 4;
            source2P += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    while (n--) {
        *destP = *source1P + *source2P;
        source1P += sourceStride1;
        source2P += sourceStride2;
        destP += destStride;
    }
#if CPU(X86) || CPU(X86_64)
    } // Closes the "strides are not 1" branch above.
#endif
}
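// Generic vmul (dest = source1 * source2) with SSE2/NEON fast paths for stride-1 data and a scalar fallback.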
void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;

#if CPU(X86) || CPU(X86_64)
    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
        // If the source1P address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(source1P) & 0x0F) && n) {
            *destP = *source1P * *source2P;
            source1P++;
            source2P++;
            destP++;
            n--;
        }

        // Now the source1P address is aligned; start to apply SSE.
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;
        __m128 pSource1;
        __m128 pSource2;
        __m128 dest;

        bool source2Aligned = !(reinterpret_cast<uintptr_t>(source2P) & 0x0F);
        bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F);

#define SSE2_MULT(loadInstr, storeInstr)            \
    while (destP < endP) {                          \
        pSource1 = _mm_load_ps(source1P);           \
        pSource2 = _mm_##loadInstr##_ps(source2P);  \
        dest = _mm_mul_ps(pSource1, pSource2);      \
        _mm_##storeInstr##_ps(destP, dest);         \
        source1P += 4;                              \
        source2P += 4;                              \
        destP += 4;                                 \
    }

        if (source2Aligned && destAligned) // Both aligned.
            SSE2_MULT(load, store)
        else if (source2Aligned && !destAligned) // Source2 is aligned but dest is not.
            SSE2_MULT(load, storeu)
        else if (!source2Aligned && destAligned) // Dest is aligned but source2 is not.
            SSE2_MULT(loadu, store)
        else // Neither aligned.
            SSE2_MULT(loadu, storeu)

        n = tailFrames;
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        while (destP < endP) {
            float32x4_t source1 = vld1q_f32(source1P);
            float32x4_t source2 = vld1q_f32(source2P);
            vst1q_f32(destP, vmulq_f32(source1, source2));

            source1P += 4;
            source2P += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    while (n) {
        *destP = *source1P * *source2P;
        source1P += sourceStride1;
        source2P += sourceStride2;
        destP += destStride;
        n--;
    }
}
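// Generic split-complex multiply; the SSE path requires all six pointers to be 16-byte aligned.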
void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess)
{
    unsigned i = 0;
#if CPU(X86) || CPU(X86_64)
    // Only use the SSE optimization in the very common case that all addresses are 16-byte aligned.
    // Otherwise, fall through to the scalar code below.
    if (!(reinterpret_cast<uintptr_t>(real1P) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(imag1P) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(real2P) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(imag2P) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(realDestP) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(imagDestP) & 0x0F)) {

        unsigned endSize = framesToProcess - framesToProcess % 4;
        while (i < endSize) {
            __m128 real1 = _mm_load_ps(real1P + i);
            __m128 real2 = _mm_load_ps(real2P + i);
            __m128 imag1 = _mm_load_ps(imag1P + i);
            __m128 imag2 = _mm_load_ps(imag2P + i);
            __m128 real = _mm_mul_ps(real1, real2);
            real = _mm_sub_ps(real, _mm_mul_ps(imag1, imag2));
            __m128 imag = _mm_mul_ps(real1, imag2);
            imag = _mm_add_ps(imag, _mm_mul_ps(imag1, real2));
            _mm_store_ps(realDestP + i, real);
            _mm_store_ps(imagDestP + i, imag);
            i += 4;
        }
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    unsigned endSize = framesToProcess - framesToProcess % 4;
    while (i < endSize) {
        float32x4_t real1 = vld1q_f32(real1P + i);
        float32x4_t real2 = vld1q_f32(real2P + i);
        float32x4_t imag1 = vld1q_f32(imag1P + i);
        float32x4_t imag2 = vld1q_f32(imag2P + i);

        float32x4_t realResult = vmlsq_f32(vmulq_f32(real1, real2), imag1, imag2);
        float32x4_t imagResult = vmlaq_f32(vmulq_f32(real1, imag2), imag1, real2);

        vst1q_f32(realDestP + i, realResult);
        vst1q_f32(imagDestP + i, imagResult);

        i += 4;
    }
#endif
    for (; i < framesToProcess; ++i) {
        // Read and compute the result before storing it, in case the
        // destination is the same as one of the sources.
        float realResult = real1P[i] * real2P[i] - imag1P[i] * imag2P[i];
        float imagResult = real1P[i] * imag2P[i] + imag1P[i] * real2P[i];

        realDestP[i] = realResult;
        imagDestP[i] = imagResult;
    }
}
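// Generic sum of squares (*sumP = sum of sourceP[i]^2) with SSE2/NEON fast paths for stride-1 data.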
void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesToProcess)
{
    int n = framesToProcess;
    float sum = 0;

#if CPU(X86) || CPU(X86_64)
    if (sourceStride == 1) {
        // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
            float sample = *sourceP;
            sum += sample * sample;
            sourceP++;
            n--;
        }

        // Now the sourceP is aligned, use SSE.
        int tailFrames = n % 4;
        const float* endP = sourceP + n - tailFrames;
        __m128 source;
        __m128 mSum = _mm_setzero_ps();

        while (sourceP < endP) {
            source = _mm_load_ps(sourceP);
            source = _mm_mul_ps(source, source);
            mSum = _mm_add_ps(mSum, source);
            sourceP += 4;
        }

        // Summarize the SSE results.
        const float* groupSumP = reinterpret_cast<float*>(&mSum);
        sum += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3];

        n = tailFrames;
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    if (sourceStride == 1) {
        int tailFrames = n % 4;
        const float* endP = sourceP + n - tailFrames;

        float32x4_t fourSum = vdupq_n_f32(0);
        while (sourceP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            fourSum = vmlaq_f32(fourSum, source, source);
            sourceP += 4;
        }
        float32x2_t twoSum = vadd_f32(vget_low_f32(fourSum), vget_high_f32(fourSum));

        float groupSum[2];
        vst1_f32(groupSum, twoSum);
        sum += groupSum[0] + groupSum[1];

        n = tailFrames;
    }
#endif

    while (n--) {
        float sample = *sourceP;
        sum += sample * sample;
        sourceP += sourceStride;
    }

    ASSERT(sumP);
    *sumP = sum;
}
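// Generic maximum magnitude (*maxP = max |sourceP[i]|) with SSE2/NEON fast paths for stride-1 data.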
void vmaxmgv(const float* sourceP, int sourceStride, float* maxP, size_t framesToProcess)
{
    int n = framesToProcess;
    float max = 0;

#if CPU(X86) || CPU(X86_64)
    if (sourceStride == 1) {
        // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
            max = std::max(max, fabsf(*sourceP));
            sourceP++;
            n--;
        }

        // Now the sourceP is aligned, use SSE.
        int tailFrames = n % 4;
        const float* endP = sourceP + n - tailFrames;
        __m128 source;
        __m128 mMax = _mm_setzero_ps();
        int mask = 0x7FFFFFFF;
        __m128 mMask = _mm_set1_ps(*reinterpret_cast<float*>(&mask));

        while (sourceP < endP) {
            source = _mm_load_ps(sourceP);
            // Calculate the absolute value by anding source with the mask, which clears the sign bit.
            source = _mm_and_ps(source, mMask);
            mMax = _mm_max_ps(mMax, source);
            sourceP += 4;
        }

        // Get the max from the SSE results.
        const float* groupMaxP = reinterpret_cast<float*>(&mMax);
        max = std::max(max, groupMaxP[0]);
        max = std::max(max, groupMaxP[1]);
        max = std::max(max, groupMaxP[2]);
        max = std::max(max, groupMaxP[3]);

        n = tailFrames;
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    if (sourceStride == 1) {
        int tailFrames = n % 4;
        const float* endP = sourceP + n - tailFrames;

        float32x4_t fourMax = vdupq_n_f32(0);
        while (sourceP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            fourMax = vmaxq_f32(fourMax, vabsq_f32(source));
            sourceP += 4;
        }
        float32x2_t twoMax = vmax_f32(vget_low_f32(fourMax), vget_high_f32(fourMax));

        float groupMax[2];
        vst1_f32(groupMax, twoMax);
        max = std::max(groupMax[0], groupMax[1]);

        n = tailFrames;
    }
#endif

    while (n--) {
        max = std::max(max, fabsf(*sourceP));
        sourceP += sourceStride;
    }

    ASSERT(maxP);
    *maxP = max;
}
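// Generic clip (dest = clamp(source, low, high)); vectorized with NEON only, scalar elsewhere.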
void vclip(const float* sourceP, int sourceStride, const float* lowThresholdP, const float* highThresholdP, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;
    float lowThreshold = *lowThresholdP;
    float highThreshold = *highThresholdP;

    // FIXME: Optimize for SSE2.
#if HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride == 1) && (destStride == 1)) {
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        float32x4_t low = vdupq_n_f32(lowThreshold);
        float32x4_t high = vdupq_n_f32(highThreshold);
        while (destP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            vst1q_f32(destP, vmaxq_f32(vminq_f32(source, high), low));
            sourceP += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    while (n--) {
        *destP = std::max(std::min(*sourceP, highThreshold), lowThreshold);
        sourceP += sourceStride;
        destP += destStride;
    }
}

#endif // OS(MACOSX)
} // namespace VectorMath

#endif // ENABLE(WEB_AUDIO)