// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "build/build_config.h"
#include "media/base/simd/convert_rgb_to_yuv.h"
#if defined(COMPILER_MSVC)
#include <intrin.h>
#else
#include <emmintrin.h>
#endif

#if defined(COMPILER_MSVC)
#define SIMD_ALIGNED(var) __declspec(align(16)) var
#else
#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
#endif
#define FIX_SHIFT 12
#define FIX(x) ((x) * (1 << FIX_SHIFT))

// Define a convenient macro to do static cast.
#define INT16_FIX(x) static_cast<int16>(FIX(x))
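// A quick worked example of the fixed-point encoding used below: with
// FIX_SHIFT == 12, INT16_FIX(0.257) evaluates to roughly 0.257 * 4096 ~= 1052,
// so a product such as INT16_FIX(0.257) * r is 4096 times too large and must
// be shifted right by FIX_SHIFT before use.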
// Android's pixel layout is RGBA, while other platforms use BGRA.
#if defined(OS_ANDROID)
SIMD_ALIGNED(const int16 ConvertRGBAToYUV_kTable[8 * 3]) = {
  INT16_FIX(0.257), INT16_FIX(0.504), INT16_FIX(0.098), 0,
  INT16_FIX(0.257), INT16_FIX(0.504), INT16_FIX(0.098), 0,
  -INT16_FIX(0.148), -INT16_FIX(0.291), INT16_FIX(0.439), 0,
  -INT16_FIX(0.148), -INT16_FIX(0.291), INT16_FIX(0.439), 0,
  INT16_FIX(0.439), -INT16_FIX(0.368), -INT16_FIX(0.071), 0,
  INT16_FIX(0.439), -INT16_FIX(0.368), -INT16_FIX(0.071), 0,
};
#else
SIMD_ALIGNED(const int16 ConvertRGBAToYUV_kTable[8 * 3]) = {
  INT16_FIX(0.098), INT16_FIX(0.504), INT16_FIX(0.257), 0,
  INT16_FIX(0.098), INT16_FIX(0.504), INT16_FIX(0.257), 0,
  INT16_FIX(0.439), -INT16_FIX(0.291), -INT16_FIX(0.148), 0,
  INT16_FIX(0.439), -INT16_FIX(0.291), -INT16_FIX(0.148), 0,
  -INT16_FIX(0.071), -INT16_FIX(0.368), INT16_FIX(0.439), 0,
  -INT16_FIX(0.071), -INT16_FIX(0.368), INT16_FIX(0.439), 0,
};
#endif
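// Each group of eight int16 entries above covers two pixels at a time: the
// weights are laid out in the byte order produced by unpacking a pixel
// (B, G, R, A on BGRA platforms), with a zero weight for the alpha byte, so
// that _mm_madd_epi16 in the SSE2 path below yields the per-pixel partial
// sums directly. The Android table simply swaps the R and B weights to match
// the RGBA byte order.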
// This is the final offset for the conversion from signed YUV values to
// unsigned values. It is arranged so that an offset of 16 is applied to the Y
// components and 128 is added to the UV components for 2 pixels.
SIMD_ALIGNED(const int32 kYOffset[4]) = {16, 16, 16, 16};
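// Note that kYOffset stores only the Y offset of 16; the SSE2 row function
// below derives the 128 chroma offset from the same constant by shifting it
// left by 3 (16 << 3 == 128).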
static inline uint8 Clamp(int value) {
  if (value < 0)
    return 0;
  if (value > 255)
    return 255;
  return static_cast<uint8>(value);
}
static inline uint8 RGBToY(int r, int g, int b) {
  int y = ConvertRGBAToYUV_kTable[0] * b +
          ConvertRGBAToYUV_kTable[1] * g +
          ConvertRGBAToYUV_kTable[2] * r;
  y >>= FIX_SHIFT;
  return Clamp(y + 16);
}
static inline uint8 RGBToU(int r, int g, int b, int shift) {
  int u = ConvertRGBAToYUV_kTable[8] * b +
          ConvertRGBAToYUV_kTable[9] * g +
          ConvertRGBAToYUV_kTable[10] * r;
  u >>= FIX_SHIFT + shift;
  return Clamp(u + 128);
}
static inline uint8 RGBToV(int r, int g, int b, int shift) {
  int v = ConvertRGBAToYUV_kTable[16] * b +
          ConvertRGBAToYUV_kTable[17] * g +
          ConvertRGBAToYUV_kTable[18] * r;
  v >>= FIX_SHIFT + shift;
  return Clamp(v + 128);
}
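// Rough sanity check for the scalar path: for a white pixel (r = g = b = 255),
// RGBToY gives (0.098 + 0.504 + 0.257) * 255 + 16 ~= 235 and RGBToU/RGBToV
// give (0.439 - 0.291 - 0.148) * 255 + 128 == 128, matching the expected
// BT.601 studio-swing white point of Y=235, U=V=128.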
#define CONVERT_Y(rgb_buf, y_buf) \
  int b = *rgb_buf++; int g = *rgb_buf++; int r = *rgb_buf++; int a = *rgb_buf++; \
  sum_b += b; sum_g += g; sum_r += r; \
  *y_buf++ = RGBToY(r, g, b);
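// CONVERT_Y consumes one 4-byte pixel, emits one Y sample, and accumulates the
// per-channel sums (sum_b, sum_g, sum_r) that the callers below fold into a
// single subsampled U/V pair. The helpers are named V<rows>H<columns> after
// the shape of the pixel block each one converts.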
static inline void ConvertRGBToYUV_V2H2(const uint8* rgb_buf_1,
                                        const uint8* rgb_buf_2,
                                        uint8* y_buf_1,
                                        uint8* y_buf_2,
                                        uint8* u_buf,
                                        uint8* v_buf) {
  int sum_b = 0, sum_g = 0, sum_r = 0;
  // Scope each invocation since CONVERT_Y declares locals.
  { CONVERT_Y(rgb_buf_1, y_buf_1); }
  { CONVERT_Y(rgb_buf_1, y_buf_1); }
  { CONVERT_Y(rgb_buf_2, y_buf_2); }
  { CONVERT_Y(rgb_buf_2, y_buf_2); }
  *u_buf++ = RGBToU(sum_r, sum_g, sum_b, 2);
  *v_buf++ = RGBToV(sum_r, sum_g, sum_b, 2);
}
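// The last argument to RGBToU/RGBToV is the extra right shift used to average
// the accumulated channel sums: a 2x2 block sums four pixels (shift 2), the
// two-pixel cases pass 1, and the single-pixel case passes 0.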
static inline void ConvertRGBToYUV_V2H1(const uint8* rgb_buf_1,
                                        const uint8* rgb_buf_2,
                                        uint8* y_buf_1,
                                        uint8* y_buf_2,
                                        uint8* u_buf,
                                        uint8* v_buf) {
  int sum_b = 0, sum_g = 0, sum_r = 0;
  { CONVERT_Y(rgb_buf_1, y_buf_1); }
  { CONVERT_Y(rgb_buf_2, y_buf_2); }
  *u_buf++ = RGBToU(sum_r, sum_g, sum_b, 1);
  *v_buf++ = RGBToV(sum_r, sum_g, sum_b, 1);
}
static inline void ConvertRGBToYUV_V1H2(const uint8* rgb_buf,
                                        uint8* y_buf,
                                        uint8* u_buf,
                                        uint8* v_buf) {
  int sum_b = 0, sum_g = 0, sum_r = 0;
  { CONVERT_Y(rgb_buf, y_buf); }
  { CONVERT_Y(rgb_buf, y_buf); }
  *u_buf++ = RGBToU(sum_r, sum_g, sum_b, 1);
  *v_buf++ = RGBToV(sum_r, sum_g, sum_b, 1);
}
static inline void ConvertRGBToYUV_V1H1(const uint8* rgb_buf,
                                        uint8* y_buf,
                                        uint8* u_buf,
                                        uint8* v_buf) {
  int sum_b = 0, sum_g = 0, sum_r = 0;
  CONVERT_Y(rgb_buf, y_buf);
  *u_buf++ = RGBToU(r, g, b, 0);
  *v_buf++ = RGBToV(r, g, b, 0);
}
static void ConvertRGB32ToYUVRow_SSE2(const uint8* rgb_buf_1,
                                      const uint8* rgb_buf_2,
                                      uint8* y_buf_1,
                                      uint8* y_buf_2,
                                      uint8* u_buf,
                                      uint8* v_buf,
                                      int width) {
  while (width >= 4) {
    // Name for the Y pixels:
    //   Row 1: a b c d
    //   Row 2: e f g h

    // First row 4 pixels.
    __m128i rgb_row_1 = _mm_loadu_si128(
        reinterpret_cast<const __m128i*>(rgb_buf_1));
    __m128i zero_1 = _mm_xor_si128(rgb_row_1, rgb_row_1);

    __m128i y_table = _mm_load_si128(
        reinterpret_cast<const __m128i*>(ConvertRGBAToYUV_kTable));
    __m128i rgb_a_b = _mm_unpackhi_epi8(rgb_row_1, zero_1);
    rgb_a_b = _mm_madd_epi16(rgb_a_b, y_table);

    __m128i rgb_c_d = _mm_unpacklo_epi8(rgb_row_1, zero_1);
    rgb_c_d = _mm_madd_epi16(rgb_c_d, y_table);
    // Do a shuffle so that the two 32-bit halves of each pixel's dot product
    // (the B/G term and the R term produced by _mm_madd_epi16) land in
    // separate registers for pixels a, b, c and d.
    __m128i bg_abcd = _mm_castps_si128(
        _mm_shuffle_ps(_mm_castsi128_ps(rgb_c_d),
                       _mm_castsi128_ps(rgb_a_b),
                       (3 << 6) | (1 << 4) | (3 << 2) | 1));
    __m128i r_abcd = _mm_castps_si128(
        _mm_shuffle_ps(_mm_castsi128_ps(rgb_c_d),
                       _mm_castsi128_ps(rgb_a_b),
                       (2 << 6) | (2 << 2)));
    __m128i y_abcd = _mm_add_epi32(bg_abcd, r_abcd);
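    // In _mm_shuffle_ps terms, the immediate (3 << 6) | (1 << 4) | (3 << 2) | 1
    // picks 32-bit lanes {1, 3} from each source and (2 << 6) | (2 << 2) picks
    // lanes {0, 2}, so the single _mm_add_epi32 above combines both halves of
    // the dot product into a full Y value for each of the four pixels.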
    // Shift back down to the 8-bit range.
    __m128i y_offset = _mm_load_si128(
        reinterpret_cast<const __m128i*>(kYOffset));
    y_abcd = _mm_srai_epi32(y_abcd, FIX_SHIFT);
    y_abcd = _mm_add_epi32(y_abcd, y_offset);
    y_abcd = _mm_packs_epi32(y_abcd, y_abcd);
    y_abcd = _mm_packus_epi16(y_abcd, y_abcd);
    *reinterpret_cast<uint32*>(y_buf_1) = _mm_cvtsi128_si32(y_abcd);
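    // _mm_packs_epi32 narrows to 16 bits with signed saturation and
    // _mm_packus_epi16 narrows to 8 bits with unsigned saturation, so the pack
    // pair plays the same role as Clamp() in the scalar code before the four Y
    // bytes are written with a single 32-bit store.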
    // Second row 4 pixels.
    __m128i rgb_row_2 = _mm_loadu_si128(
        reinterpret_cast<const __m128i*>(rgb_buf_2));
    __m128i zero_2 = _mm_xor_si128(rgb_row_2, rgb_row_2);
    __m128i rgb_e_f = _mm_unpackhi_epi8(rgb_row_2, zero_2);
    __m128i rgb_g_h = _mm_unpacklo_epi8(rgb_row_2, zero_2);
    // Add two rows together.
    __m128i rgb_ae_bf =
        _mm_add_epi16(_mm_unpackhi_epi8(rgb_row_1, zero_2), rgb_e_f);
    __m128i rgb_cg_dh =
        _mm_add_epi16(_mm_unpacklo_epi8(rgb_row_1, zero_2), rgb_g_h);
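    // These 16-bit sums combine each pixel with its vertical neighbour; since
    // every channel is at most 255, the sums stay comfortably within the int16
    // range expected by the multiply-adds that follow.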
    // Multiply add like the previous row.
    rgb_e_f = _mm_madd_epi16(rgb_e_f, y_table);
    rgb_g_h = _mm_madd_epi16(rgb_g_h, y_table);
    __m128i bg_efgh = _mm_castps_si128(
        _mm_shuffle_ps(_mm_castsi128_ps(rgb_g_h),
                       _mm_castsi128_ps(rgb_e_f),
                       (3 << 6) | (1 << 4) | (3 << 2) | 1));
    __m128i r_efgh = _mm_castps_si128(
        _mm_shuffle_ps(_mm_castsi128_ps(rgb_g_h),
                       _mm_castsi128_ps(rgb_e_f),
                       (2 << 6) | (2 << 2)));
    __m128i y_efgh = _mm_add_epi32(bg_efgh, r_efgh);
    y_efgh = _mm_srai_epi32(y_efgh, FIX_SHIFT);
    y_efgh = _mm_add_epi32(y_efgh, y_offset);
    y_efgh = _mm_packs_epi32(y_efgh, y_efgh);
    y_efgh = _mm_packus_epi16(y_efgh, y_efgh);
    *reinterpret_cast<uint32*>(y_buf_2) = _mm_cvtsi128_si32(y_efgh);
    __m128i rgb_ae_cg = _mm_castps_si128(
        _mm_shuffle_ps(_mm_castsi128_ps(rgb_cg_dh),
                       _mm_castsi128_ps(rgb_ae_bf),
                       (3 << 6) | (2 << 4) | (3 << 2) | 2));
    __m128i rgb_bf_dh = _mm_castps_si128(
        _mm_shuffle_ps(_mm_castsi128_ps(rgb_cg_dh),
                       _mm_castsi128_ps(rgb_ae_bf),
                       (1 << 6) | (1 << 2)));

    // This is a 2x2 subsampling for 2 pixels.
    __m128i rgb_abef_cdgh = _mm_add_epi16(rgb_ae_cg, rgb_bf_dh);
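    // Each 16-bit lane of rgb_abef_cdgh now holds a channel summed over a 2x2
    // block (a+b+e+f or c+d+g+h), which is why the chroma results below take
    // an extra right shift of 2 on top of FIX_SHIFT.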
    // Do a multiply-add with the U table.
    __m128i u_a_b = _mm_madd_epi16(
        rgb_abef_cdgh,
        _mm_load_si128(
            reinterpret_cast<const __m128i*>(ConvertRGBAToYUV_kTable + 8)));
    u_a_b = _mm_add_epi32(_mm_shuffle_epi32(u_a_b, ((3 << 2) | 1)),
                          _mm_shuffle_epi32(u_a_b, (2 << 2)));
    // Right shift 14 because of 12 from fixed point and 2 from subsampling.
    u_a_b = _mm_srai_epi32(u_a_b, FIX_SHIFT + 2);
    __m128i uv_offset = _mm_slli_epi32(y_offset, 3);
    u_a_b = _mm_add_epi32(u_a_b, uv_offset);
    u_a_b = _mm_packs_epi32(u_a_b, u_a_b);
    u_a_b = _mm_packus_epi16(u_a_b, u_a_b);
    *reinterpret_cast<uint16*>(u_buf) =
        static_cast<uint16>(_mm_extract_epi16(u_a_b, 0));
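    // The V computation below reuses the same 2x2 sums (rgb_abef_cdgh) and the
    // same uv_offset; only the coefficient block changes, loading from
    // ConvertRGBAToYUV_kTable + 16 instead of + 8.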
    __m128i v_a_b = _mm_madd_epi16(
        rgb_abef_cdgh,
        _mm_load_si128(
            reinterpret_cast<const __m128i*>(ConvertRGBAToYUV_kTable + 16)));
    v_a_b = _mm_add_epi32(_mm_shuffle_epi32(v_a_b, ((3 << 2) | 1)),
                          _mm_shuffle_epi32(v_a_b, (2 << 2)));
    v_a_b = _mm_srai_epi32(v_a_b, FIX_SHIFT + 2);
    v_a_b = _mm_add_epi32(v_a_b, uv_offset);
    v_a_b = _mm_packs_epi32(v_a_b, v_a_b);
    v_a_b = _mm_packus_epi16(v_a_b, v_a_b);
    *reinterpret_cast<uint16*>(v_buf) =
        static_cast<uint16>(_mm_extract_epi16(v_a_b, 0));
    // Move forward by 4 pixels.

  // Just use C code to convert the remaining pixels.
  ConvertRGBToYUV_V2H2(rgb_buf_1, rgb_buf_2, y_buf_1, y_buf_2, u_buf, v_buf);
  ConvertRGBToYUV_V2H1(rgb_buf_1, rgb_buf_2, y_buf_1, y_buf_2, u_buf, v_buf);
extern void ConvertRGB32ToYUV_SSE2(const uint8* rgbframe, uint8* yplane,
                                   uint8* uplane, uint8* vplane,
                                   int width, int height, int rgbstride,
                                   int ystride, int uvstride) {
  while (height >= 2) {
    ConvertRGB32ToYUVRow_SSE2(rgbframe,
                              rgbframe + rgbstride,
                              yplane,
                              yplane + ystride,
                              uplane,
                              vplane,
                              width);
    rgbframe += 2 * rgbstride;
    yplane += 2 * ystride;
  // Handle the last row.
  ConvertRGBToYUV_V1H2(rgbframe, yplane, uplane, vplane);
  ConvertRGBToYUV_V1H1(rgbframe, yplane, uplane, vplane);
}
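// A hypothetical call site, assuming the prototype in
// media/base/simd/convert_rgb_to_yuv.h takes (rgbframe, yplane, uplane,
// vplane, width, height, rgbstride, ystride, uvstride): for a tightly packed
// width x height ARGB frame converted to YV12-style planes, this would be
//   ConvertRGB32ToYUV_SSE2(argb, y, u, v, width, height,
//                          width * 4, width, width / 2);
// with U and V planes of (width / 2) x (height / 2) bytes each.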
void ConvertRGB32ToYUV_SSE2_Reference(const uint8* rgbframe, uint8* yplane,
                                      uint8* uplane, uint8* vplane,
                                      int width, int height, int rgbstride,
                                      int ystride, int uvstride) {
  while (height >= 2) {
    int i = 0;
    // Convert a 2x2 block.
    while (i + 2 <= width) {
      ConvertRGBToYUV_V2H2(rgbframe + i * 4,
                           rgbframe + rgbstride + i * 4,
                           yplane + i,
                           yplane + ystride + i,
                           uplane + i / 2,
                           vplane + i / 2);
      i += 2;
    }
    // Convert the last pixel of two rows.
    if (i < width) {
      ConvertRGBToYUV_V2H1(rgbframe + i * 4,
                           rgbframe + rgbstride + i * 4,
                           yplane + i,
                           yplane + ystride + i,
                           uplane + i / 2,
                           vplane + i / 2);
    }
    rgbframe += 2 * rgbstride;
    yplane += 2 * ystride;
  // Handle the last row.
  ConvertRGBToYUV_V1H2(rgbframe, yplane, uplane, vplane);
  // Handle the last pixel in the last row.
  ConvertRGBToYUV_V1H1(rgbframe, yplane, uplane, vplane);
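// ConvertRGB32ToYUV_SSE2_Reference walks the frame the same way as the SSE2
// version above but uses only the scalar helpers; judging by its name, it is
// presumably kept as a plain-C reference for validating the SIMD output.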