1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "media/base/vector_math.h"
6 #include "media/base/vector_math_testing.h"
10 #include "base/logging.h"
11 #include "build/build_config.h"
13 // NaCl does not allow intrinsics.
14 #if defined(ARCH_CPU_X86_FAMILY) && !defined(OS_NACL)
15 #include <xmmintrin.h>
16 #define FMAC_FUNC FMAC_SSE
17 #define FMUL_FUNC FMUL_SSE
18 #define EWMAAndMaxPower_FUNC EWMAAndMaxPower_SSE
19 #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
21 #define FMAC_FUNC FMAC_NEON
22 #define FMUL_FUNC FMUL_NEON
23 #define EWMAAndMaxPower_FUNC EWMAAndMaxPower_NEON
25 #define FMAC_FUNC FMAC_C
26 #define FMUL_FUNC FMUL_C
27 #define EWMAAndMaxPower_FUNC EWMAAndMaxPower_C
31 namespace vector_math
{
33 void FMAC(const float src
[], float scale
, int len
, float dest
[]) {
34 // Ensure |src| and |dest| are 16-byte aligned.
35 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(src
) & (kRequiredAlignment
- 1));
36 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(dest
) & (kRequiredAlignment
- 1));
37 return FMAC_FUNC(src
, scale
, len
, dest
);
40 void FMAC_C(const float src
[], float scale
, int len
, float dest
[]) {
41 for (int i
= 0; i
< len
; ++i
)
42 dest
[i
] += src
[i
] * scale
;
45 void FMUL(const float src
[], float scale
, int len
, float dest
[]) {
46 // Ensure |src| and |dest| are 16-byte aligned.
47 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(src
) & (kRequiredAlignment
- 1));
48 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(dest
) & (kRequiredAlignment
- 1));
49 return FMUL_FUNC(src
, scale
, len
, dest
);
52 void FMUL_C(const float src
[], float scale
, int len
, float dest
[]) {
53 for (int i
= 0; i
< len
; ++i
)
54 dest
[i
] = src
[i
] * scale
;
57 void Crossfade(const float src
[], int len
, float dest
[]) {
59 const float cf_increment
= 1.0f
/ len
;
60 for (int i
= 0; i
< len
; ++i
, cf_ratio
+= cf_increment
)
61 dest
[i
] = (1.0f
- cf_ratio
) * src
[i
] + cf_ratio
* dest
[i
];
64 std::pair
<float, float> EWMAAndMaxPower(
65 float initial_value
, const float src
[], int len
, float smoothing_factor
) {
66 // Ensure |src| is 16-byte aligned.
67 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(src
) & (kRequiredAlignment
- 1));
68 return EWMAAndMaxPower_FUNC(initial_value
, src
, len
, smoothing_factor
);
71 std::pair
<float, float> EWMAAndMaxPower_C(
72 float initial_value
, const float src
[], int len
, float smoothing_factor
) {
73 std::pair
<float, float> result(initial_value
, 0.0f
);
74 const float weight_prev
= 1.0f
- smoothing_factor
;
75 for (int i
= 0; i
< len
; ++i
) {
76 result
.first
*= weight_prev
;
77 const float sample
= src
[i
];
78 const float sample_squared
= sample
* sample
;
79 result
.first
+= sample_squared
* smoothing_factor
;
80 result
.second
= std::max(result
.second
, sample_squared
);
85 #if defined(ARCH_CPU_X86_FAMILY) && !defined(OS_NACL)
86 void FMUL_SSE(const float src
[], float scale
, int len
, float dest
[]) {
87 const int rem
= len
% 4;
88 const int last_index
= len
- rem
;
89 __m128 m_scale
= _mm_set_ps1(scale
);
90 for (int i
= 0; i
< last_index
; i
+= 4)
91 _mm_store_ps(dest
+ i
, _mm_mul_ps(_mm_load_ps(src
+ i
), m_scale
));
93 // Handle any remaining values that wouldn't fit in an SSE pass.
94 for (int i
= last_index
; i
< len
; ++i
)
95 dest
[i
] = src
[i
] * scale
;
98 void FMAC_SSE(const float src
[], float scale
, int len
, float dest
[]) {
99 const int rem
= len
% 4;
100 const int last_index
= len
- rem
;
101 __m128 m_scale
= _mm_set_ps1(scale
);
102 for (int i
= 0; i
< last_index
; i
+= 4) {
103 _mm_store_ps(dest
+ i
, _mm_add_ps(_mm_load_ps(dest
+ i
),
104 _mm_mul_ps(_mm_load_ps(src
+ i
), m_scale
)));
107 // Handle any remaining values that wouldn't fit in an SSE pass.
108 for (int i
= last_index
; i
< len
; ++i
)
109 dest
[i
] += src
[i
] * scale
;
112 // Convenience macro to extract float 0 through 3 from the vector |a|. This is
113 // needed because compilers other than clang don't support access via
115 #define EXTRACT_FLOAT(a, i) \
118 _mm_cvtss_f32(_mm_shuffle_ps(a, a, i)))
120 std::pair
<float, float> EWMAAndMaxPower_SSE(
121 float initial_value
, const float src
[], int len
, float smoothing_factor
) {
122 // When the recurrence is unrolled, we see that we can split it into 4
123 // separate lanes of evaluation:
125 // y[n] = a(S[n]^2) + (1-a)(y[n-1])
126 // = a(S[n]^2) + (1-a)^1(aS[n-1]^2) + (1-a)^2(aS[n-2]^2) + ...
127 // = z[n] + (1-a)^1(z[n-1]) + (1-a)^2(z[n-2]) + (1-a)^3(z[n-3])
129 // where z[n] = a(S[n]^2) + (1-a)^4(z[n-4]) + (1-a)^8(z[n-8]) + ...
131 // Thus, the strategy here is to compute z[n], z[n-1], z[n-2], and z[n-3] in
132 // each of the 4 lanes, and then combine them to give y[n].
134 const int rem
= len
% 4;
135 const int last_index
= len
- rem
;
137 const __m128 smoothing_factor_x4
= _mm_set_ps1(smoothing_factor
);
138 const float weight_prev
= 1.0f
- smoothing_factor
;
139 const __m128 weight_prev_x4
= _mm_set_ps1(weight_prev
);
140 const __m128 weight_prev_squared_x4
=
141 _mm_mul_ps(weight_prev_x4
, weight_prev_x4
);
142 const __m128 weight_prev_4th_x4
=
143 _mm_mul_ps(weight_prev_squared_x4
, weight_prev_squared_x4
);
145 // Compute z[n], z[n-1], z[n-2], and z[n-3] in parallel in lanes 3, 2, 1 and
147 __m128 max_x4
= _mm_setzero_ps();
148 __m128 ewma_x4
= _mm_setr_ps(0.0f
, 0.0f
, 0.0f
, initial_value
);
150 for (i
= 0; i
< last_index
; i
+= 4) {
151 ewma_x4
= _mm_mul_ps(ewma_x4
, weight_prev_4th_x4
);
152 const __m128 sample_x4
= _mm_load_ps(src
+ i
);
153 const __m128 sample_squared_x4
= _mm_mul_ps(sample_x4
, sample_x4
);
154 max_x4
= _mm_max_ps(max_x4
, sample_squared_x4
);
155 // Note: The compiler optimizes this to a single multiply-and-accumulate
157 ewma_x4
= _mm_add_ps(ewma_x4
,
158 _mm_mul_ps(sample_squared_x4
, smoothing_factor_x4
));
161 // y[n] = z[n] + (1-a)^1(z[n-1]) + (1-a)^2(z[n-2]) + (1-a)^3(z[n-3])
162 float ewma
= EXTRACT_FLOAT(ewma_x4
, 3);
163 ewma_x4
= _mm_mul_ps(ewma_x4
, weight_prev_x4
);
164 ewma
+= EXTRACT_FLOAT(ewma_x4
, 2);
165 ewma_x4
= _mm_mul_ps(ewma_x4
, weight_prev_x4
);
166 ewma
+= EXTRACT_FLOAT(ewma_x4
, 1);
167 ewma_x4
= _mm_mul_ss(ewma_x4
, weight_prev_x4
);
168 ewma
+= EXTRACT_FLOAT(ewma_x4
, 0);
170 // Fold the maximums together to get the overall maximum.
171 max_x4
= _mm_max_ps(max_x4
,
172 _mm_shuffle_ps(max_x4
, max_x4
, _MM_SHUFFLE(3, 3, 1, 1)));
173 max_x4
= _mm_max_ss(max_x4
, _mm_shuffle_ps(max_x4
, max_x4
, 2));
175 std::pair
<float, float> result(ewma
, EXTRACT_FLOAT(max_x4
, 0));
177 // Handle remaining values at the end of |src|.
178 for (; i
< len
; ++i
) {
179 result
.first
*= weight_prev
;
180 const float sample
= src
[i
];
181 const float sample_squared
= sample
* sample
;
182 result
.first
+= sample_squared
* smoothing_factor
;
183 result
.second
= std::max(result
.second
, sample_squared
);
190 #if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
191 void FMAC_NEON(const float src
[], float scale
, int len
, float dest
[]) {
192 const int rem
= len
% 4;
193 const int last_index
= len
- rem
;
194 float32x4_t m_scale
= vmovq_n_f32(scale
);
195 for (int i
= 0; i
< last_index
; i
+= 4) {
196 vst1q_f32(dest
+ i
, vmlaq_f32(
197 vld1q_f32(dest
+ i
), vld1q_f32(src
+ i
), m_scale
));
200 // Handle any remaining values that wouldn't fit in an NEON pass.
201 for (int i
= last_index
; i
< len
; ++i
)
202 dest
[i
] += src
[i
] * scale
;
205 void FMUL_NEON(const float src
[], float scale
, int len
, float dest
[]) {
206 const int rem
= len
% 4;
207 const int last_index
= len
- rem
;
208 float32x4_t m_scale
= vmovq_n_f32(scale
);
209 for (int i
= 0; i
< last_index
; i
+= 4)
210 vst1q_f32(dest
+ i
, vmulq_f32(vld1q_f32(src
+ i
), m_scale
));
212 // Handle any remaining values that wouldn't fit in an NEON pass.
213 for (int i
= last_index
; i
< len
; ++i
)
214 dest
[i
] = src
[i
] * scale
;
217 std::pair
<float, float> EWMAAndMaxPower_NEON(
218 float initial_value
, const float src
[], int len
, float smoothing_factor
) {
219 // When the recurrence is unrolled, we see that we can split it into 4
220 // separate lanes of evaluation:
222 // y[n] = a(S[n]^2) + (1-a)(y[n-1])
223 // = a(S[n]^2) + (1-a)^1(aS[n-1]^2) + (1-a)^2(aS[n-2]^2) + ...
224 // = z[n] + (1-a)^1(z[n-1]) + (1-a)^2(z[n-2]) + (1-a)^3(z[n-3])
226 // where z[n] = a(S[n]^2) + (1-a)^4(z[n-4]) + (1-a)^8(z[n-8]) + ...
228 // Thus, the strategy here is to compute z[n], z[n-1], z[n-2], and z[n-3] in
229 // each of the 4 lanes, and then combine them to give y[n].
231 const int rem
= len
% 4;
232 const int last_index
= len
- rem
;
234 const float32x4_t smoothing_factor_x4
= vdupq_n_f32(smoothing_factor
);
235 const float weight_prev
= 1.0f
- smoothing_factor
;
236 const float32x4_t weight_prev_x4
= vdupq_n_f32(weight_prev
);
237 const float32x4_t weight_prev_squared_x4
=
238 vmulq_f32(weight_prev_x4
, weight_prev_x4
);
239 const float32x4_t weight_prev_4th_x4
=
240 vmulq_f32(weight_prev_squared_x4
, weight_prev_squared_x4
);
242 // Compute z[n], z[n-1], z[n-2], and z[n-3] in parallel in lanes 3, 2, 1 and
244 float32x4_t max_x4
= vdupq_n_f32(0.0f
);
245 float32x4_t ewma_x4
= vsetq_lane_f32(initial_value
, vdupq_n_f32(0.0f
), 3);
247 for (i
= 0; i
< last_index
; i
+= 4) {
248 ewma_x4
= vmulq_f32(ewma_x4
, weight_prev_4th_x4
);
249 const float32x4_t sample_x4
= vld1q_f32(src
+ i
);
250 const float32x4_t sample_squared_x4
= vmulq_f32(sample_x4
, sample_x4
);
251 max_x4
= vmaxq_f32(max_x4
, sample_squared_x4
);
252 ewma_x4
= vmlaq_f32(ewma_x4
, sample_squared_x4
, smoothing_factor_x4
);
255 // y[n] = z[n] + (1-a)^1(z[n-1]) + (1-a)^2(z[n-2]) + (1-a)^3(z[n-3])
256 float ewma
= vgetq_lane_f32(ewma_x4
, 3);
257 ewma_x4
= vmulq_f32(ewma_x4
, weight_prev_x4
);
258 ewma
+= vgetq_lane_f32(ewma_x4
, 2);
259 ewma_x4
= vmulq_f32(ewma_x4
, weight_prev_x4
);
260 ewma
+= vgetq_lane_f32(ewma_x4
, 1);
261 ewma_x4
= vmulq_f32(ewma_x4
, weight_prev_x4
);
262 ewma
+= vgetq_lane_f32(ewma_x4
, 0);
264 // Fold the maximums together to get the overall maximum.
265 float32x2_t max_x2
= vpmax_f32(vget_low_f32(max_x4
), vget_high_f32(max_x4
));
266 max_x2
= vpmax_f32(max_x2
, max_x2
);
268 std::pair
<float, float> result(ewma
, vget_lane_f32(max_x2
, 0));
270 // Handle remaining values at the end of |src|.
271 for (; i
< len
; ++i
) {
272 result
.first
*= weight_prev
;
273 const float sample
= src
[i
];
274 const float sample_squared
= sample
* sample
;
275 result
.first
+= sample_squared
* smoothing_factor
;
276 result
.second
= std::max(result
.second
, sample_squared
);
283 } // namespace vector_math