1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "media/base/vector_math_testing.h"
9 #include <xmmintrin.h> // NOLINT
12 namespace vector_math
{
14 void FMUL_SSE(const float src
[], float scale
, int len
, float dest
[]) {
15 const int rem
= len
% 4;
16 const int last_index
= len
- rem
;
17 __m128 m_scale
= _mm_set_ps1(scale
);
18 for (int i
= 0; i
< last_index
; i
+= 4)
19 _mm_store_ps(dest
+ i
, _mm_mul_ps(_mm_load_ps(src
+ i
), m_scale
));
21 // Handle any remaining values that wouldn't fit in an SSE pass.
22 for (int i
= last_index
; i
< len
; ++i
)
23 dest
[i
] = src
[i
] * scale
;
26 void FMAC_SSE(const float src
[], float scale
, int len
, float dest
[]) {
27 const int rem
= len
% 4;
28 const int last_index
= len
- rem
;
29 __m128 m_scale
= _mm_set_ps1(scale
);
30 for (int i
= 0; i
< last_index
; i
+= 4) {
31 _mm_store_ps(dest
+ i
, _mm_add_ps(_mm_load_ps(dest
+ i
),
32 _mm_mul_ps(_mm_load_ps(src
+ i
), m_scale
)));
35 // Handle any remaining values that wouldn't fit in an SSE pass.
36 for (int i
= last_index
; i
< len
; ++i
)
37 dest
[i
] += src
[i
] * scale
;
40 // Convenience macro to extract float 0 through 3 from the vector |a|. This is
41 // needed because compilers other than clang don't support access via
43 #define EXTRACT_FLOAT(a, i) \
46 _mm_cvtss_f32(_mm_shuffle_ps(a, a, i)))
48 std::pair
<float, float> EWMAAndMaxPower_SSE(
49 float initial_value
, const float src
[], int len
, float smoothing_factor
) {
50 // When the recurrence is unrolled, we see that we can split it into 4
51 // separate lanes of evaluation:
53 // y[n] = a(S[n]^2) + (1-a)(y[n-1])
54 // = a(S[n]^2) + (1-a)^1(aS[n-1]^2) + (1-a)^2(aS[n-2]^2) + ...
55 // = z[n] + (1-a)^1(z[n-1]) + (1-a)^2(z[n-2]) + (1-a)^3(z[n-3])
57 // where z[n] = a(S[n]^2) + (1-a)^4(z[n-4]) + (1-a)^8(z[n-8]) + ...
59 // Thus, the strategy here is to compute z[n], z[n-1], z[n-2], and z[n-3] in
60 // each of the 4 lanes, and then combine them to give y[n].
62 const int rem
= len
% 4;
63 const int last_index
= len
- rem
;
65 const __m128 smoothing_factor_x4
= _mm_set_ps1(smoothing_factor
);
66 const float weight_prev
= 1.0f
- smoothing_factor
;
67 const __m128 weight_prev_x4
= _mm_set_ps1(weight_prev
);
68 const __m128 weight_prev_squared_x4
=
69 _mm_mul_ps(weight_prev_x4
, weight_prev_x4
);
70 const __m128 weight_prev_4th_x4
=
71 _mm_mul_ps(weight_prev_squared_x4
, weight_prev_squared_x4
);
73 // Compute z[n], z[n-1], z[n-2], and z[n-3] in parallel in lanes 3, 2, 1 and
75 __m128 max_x4
= _mm_setzero_ps();
76 __m128 ewma_x4
= _mm_setr_ps(0.0f
, 0.0f
, 0.0f
, initial_value
);
78 for (i
= 0; i
< last_index
; i
+= 4) {
79 ewma_x4
= _mm_mul_ps(ewma_x4
, weight_prev_4th_x4
);
80 const __m128 sample_x4
= _mm_load_ps(src
+ i
);
81 const __m128 sample_squared_x4
= _mm_mul_ps(sample_x4
, sample_x4
);
82 max_x4
= _mm_max_ps(max_x4
, sample_squared_x4
);
83 // Note: The compiler optimizes this to a single multiply-and-accumulate
85 ewma_x4
= _mm_add_ps(ewma_x4
,
86 _mm_mul_ps(sample_squared_x4
, smoothing_factor_x4
));
89 // y[n] = z[n] + (1-a)^1(z[n-1]) + (1-a)^2(z[n-2]) + (1-a)^3(z[n-3])
90 float ewma
= EXTRACT_FLOAT(ewma_x4
, 3);
91 ewma_x4
= _mm_mul_ps(ewma_x4
, weight_prev_x4
);
92 ewma
+= EXTRACT_FLOAT(ewma_x4
, 2);
93 ewma_x4
= _mm_mul_ps(ewma_x4
, weight_prev_x4
);
94 ewma
+= EXTRACT_FLOAT(ewma_x4
, 1);
95 ewma_x4
= _mm_mul_ss(ewma_x4
, weight_prev_x4
);
96 ewma
+= EXTRACT_FLOAT(ewma_x4
, 0);
98 // Fold the maximums together to get the overall maximum.
99 max_x4
= _mm_max_ps(max_x4
,
100 _mm_shuffle_ps(max_x4
, max_x4
, _MM_SHUFFLE(3, 3, 1, 1)));
101 max_x4
= _mm_max_ss(max_x4
, _mm_shuffle_ps(max_x4
, max_x4
, 2));
103 std::pair
<float, float> result(ewma
, EXTRACT_FLOAT(max_x4
, 0));
105 // Handle remaining values at the end of |src|.
106 for (; i
< len
; ++i
) {
107 result
.first
*= weight_prev
;
108 const float sample
= src
[i
];
109 const float sample_squared
= sample
* sample
;
110 result
.first
+= sample_squared
* smoothing_factor
;
111 result
.second
= std::max(result
.second
, sample_squared
);
117 } // namespace vector_math