1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 // This webpage shows layout of YV12 and other YUV formats
6 // http://www.fourcc.org/yuv.php
7 // The actual conversion is best described here
8 // http://en.wikipedia.org/wiki/YUV
9 // An article on optimizing YUV conversion using tables instead of multiplies
10 // http://lestourtereaux.free.fr/papers/data/yuvrgb.pdf
12 // YV12 is a full plane of Y plus half-height, half-width chroma planes.
13 // YV16 is a full plane of Y plus full-height, half-width chroma planes.
15 // ARGB pixel format is output, which on little endian is stored as BGRA.
16 // The alpha is set to 255, allowing the application to use RGBA or RGB32.
18 #include "media/base/yuv_convert.h"
21 #include "base/logging.h"
22 #include "base/memory/scoped_ptr.h"
23 #include "build/build_config.h"
24 #include "media/base/simd/convert_rgb_to_yuv.h"
25 #include "media/base/simd/convert_yuv_to_rgb.h"
26 #include "media/base/simd/filter_yuv.h"
28 #if defined(ARCH_CPU_X86_FAMILY)
29 #if defined(COMPILER_MSVC)
36 // Assembly functions are declared without namespace.
// Declaration only: no definition of this routine appears in this view.
// It is called from EmptyRegisterState() inside a
// MEDIA_MMX_INTRINSICS_AVAILABLE preprocessor region.
// NOTE(review): the embedded line numbers skip around this declaration, so
// any enclosing linkage/#if context cannot be confirmed from here.
38 void EmptyRegisterState_MMX();
// Returns a pointer to one of the FilterYUVRows_* row-filter routines.
// FilterYUVRows_SSE2 and FilterYUVRows_MMX are referenced only inside the
// ARCH_CPU_X86_FAMILY region (the MMX one additionally inside
// MEDIA_MMX_INTRINSICS_AVAILABLE); FilterYUVRows_C is returned by the
// statement that follows both #endif lines.
// NOTE(review): the embedded line numbers skip before each SIMD return, so
// the conditions guarding them are not visible here -- presumably runtime
// CPU-feature checks; confirm against the complete file.
43 static FilterYUVRowsProc
ChooseFilterYUVRowsProc() {
44 #if defined(ARCH_CPU_X86_FAMILY)
47 return &FilterYUVRows_SSE2
;
49 #if defined(MEDIA_MMX_INTRINSICS_AVAILABLE)
51 return &FilterYUVRows_MMX
;
52 #endif // defined(MEDIA_MMX_INTRINSICS_AVAILABLE)
53 #endif // defined(ARCH_CPU_X86_FAMILY)
54 return &FilterYUVRows_C
;
// Returns a pointer to one of the ConvertYUVToRGB32Row_* routines
// (_SSE, _MMX or _C). The SIMD variants are referenced after the
// ARCH_CPU_X86_FAMILY check.
// NOTE(review): the guards in front of the _SSE/_MMX returns and the
// matching #endif are on lines missing from this view; presumably the
// selection depends on runtime CPU features -- confirm in the full file.
57 static ConvertYUVToRGB32RowProc
ChooseConvertYUVToRGB32RowProc() {
58 #if defined(ARCH_CPU_X86_FAMILY)
61 return &ConvertYUVToRGB32Row_SSE
;
63 return &ConvertYUVToRGB32Row_MMX
;
65 return &ConvertYUVToRGB32Row_C
;
// Returns a pointer to one of the ScaleYUVToRGB32Row_* routines.
// When ARCH_CPU_X86_64 is defined the first statement is an unguarded
// return of ScaleYUVToRGB32Row_SSE2_X64. Otherwise, when
// ARCH_CPU_X86_FAMILY is defined, the _SSE and _MMX variants are candidates.
// ScaleYUVToRGB32Row_C is returned by the statement after the #endif.
// NOTE(review): the guards in front of the 32-bit _SSE/_MMX returns are on
// lines missing from this view.
68 static ScaleYUVToRGB32RowProc
ChooseScaleYUVToRGB32RowProc() {
69 #if defined(ARCH_CPU_X86_64)
70 // Use 64-bits version if possible.
71 return &ScaleYUVToRGB32Row_SSE2_X64
;
72 #elif defined(ARCH_CPU_X86_FAMILY)
74 // Choose the best one on 32-bits system.
76 return &ScaleYUVToRGB32Row_SSE
;
78 return &ScaleYUVToRGB32Row_MMX
;
79 #endif // defined(ARCH_CPU_X86_64)
80 return &ScaleYUVToRGB32Row_C
;
// Returns a pointer to one of the LinearScaleYUVToRGB32Row_* routines.
// When ARCH_CPU_X86_64 is defined the first statement is an unguarded
// return of LinearScaleYUVToRGB32Row_MMX_X64. Otherwise, when
// ARCH_CPU_X86_FAMILY is defined, the _SSE and _MMX variants are candidates.
// LinearScaleYUVToRGB32Row_C is returned by the statement after the #endif.
// NOTE(review): the guards in front of the 32-bit _SSE/_MMX returns are on
// lines missing from this view.
83 static ScaleYUVToRGB32RowProc
ChooseLinearScaleYUVToRGB32RowProc() {
84 #if defined(ARCH_CPU_X86_64)
85 // Use 64-bits version if possible.
86 return &LinearScaleYUVToRGB32Row_MMX_X64
;
87 #elif defined(ARCH_CPU_X86_FAMILY)
91 return &LinearScaleYUVToRGB32Row_SSE
;
93 return &LinearScaleYUVToRGB32Row_MMX
;
94 #endif // defined(ARCH_CPU_X86_64)
95 return &LinearScaleYUVToRGB32Row_C
;
98 // Empty SIMD registers state after using them.
// All visible work is inside the ARCH_CPU_X86_FAMILY region. The result of
// cpu.has_mmx() is stored in the function-local static has_mmx, so it
// persists between calls.
// NOTE(review): the declaration of `cpu`, every use of `checked`, and the
// test on has_mmx are on lines missing from this view; `checked` presumably
// makes the CPU query run only once -- confirm.
99 void EmptyRegisterState() {
100 #if defined(ARCH_CPU_X86_FAMILY)
101 static bool checked
= false;
102 static bool has_mmx
= false;
105 has_mmx
= cpu
.has_mmx();
// NOTE(review): embedded lines 111-112 are missing, so it cannot be
// determined from this view whether the call below sits in the #if branch
// or in an #else branch of this region.
110 #if defined(MEDIA_MMX_INTRINSICS_AVAILABLE)
113 EmptyRegisterState_MMX();
114 #endif // defined(MEDIA_MMX_INTRINSICS_AVAILABLE)
117 #endif // defined(ARCH_CPU_X86_FAMILY)
120 // 16.16 fixed point arithmetic
// Number of fractional bits in a fixed-point value.
121 const int kFractionBits
= 16;
// 1 << 16 == 65536: the fixed-point representation of 1.0.
122 const int kFractionMax
= 1 << kFractionBits
;
// 0xFFFF: AND-ing a fixed-point value with this keeps only its fraction.
123 const int kFractionMask
= ((1 << kFractionBits
) - 1);
125 // Scale a frame of YUV to 32 bit ARGB.
// For every destination row this picks (and optionally vertically filters)
// a source row, then converts it to ARGB with one of three row routines:
// a plain convert when the horizontal step is exactly 1.0, otherwise a
// bilinear or a point-sampling scaler depending on `filter`.
// NOTE(review): most of the parameter list (embedded lines 127-138) and
// many brace/else lines are missing from this view. u_buf, v_buf, rgb_buf,
// source_width, source_height, width, height, y_pitch, uv_pitch, rgb_pitch,
// yuv_type and view_rotate are used below but their declarations are not
// visible here.
126 void ScaleYUVToRGB32(const uint8
* y_buf
,
139 ScaleFilter filter
) {
// The row routines live in function-local statics (initially NULL) so the
// choice made by the Choose*() helpers is kept across calls. Only the NULL
// guard for linear_scale_proc is visible; the embedded numbering skips one
// line before each of the other three assignments, so they presumably have
// the same guard.
140 static FilterYUVRowsProc filter_proc
= NULL
;
141 static ConvertYUVToRGB32RowProc convert_proc
= NULL
;
142 static ScaleYUVToRGB32RowProc scale_proc
= NULL
;
143 static ScaleYUVToRGB32RowProc linear_scale_proc
= NULL
;
146 filter_proc
= ChooseFilterYUVRowsProc();
148 convert_proc
= ChooseConvertYUVToRGB32RowProc();
150 scale_proc
= ChooseScaleYUVToRGB32RowProc();
151 if (!linear_scale_proc
)
152 linear_scale_proc
= ChooseLinearScaleYUVToRGB32RowProc();
154 // Handle zero sized sources and destinations.
// Matches YV12 sources smaller than 2x2, YV16 sources narrower than 2 or
// shorter than 1, and destinations with zero width or height.
// NOTE(review): the statement executed when this matches is on a missing
// line -- presumably an early return.
155 if ((yuv_type
== YV12
&& (source_width
< 2 || source_height
< 2)) ||
156 (yuv_type
== YV16
&& (source_width
< 2 || source_height
< 1)) ||
157 width
== 0 || height
== 0)
160 // 4096 allows 3 buffers to fit in 12k.
161 // Helps performance on CPU with 16K L1 cache.
162 // Large enough for 3840x2160 and 30" displays which are 2560x1600.
163 const int kFilterBufferSize
= 4096;
164 // Disable filtering if the screen is too big (to avoid buffer overflows).
165 // This should never happen to regular users: they don't have monitors
166 // wider than 4096 pixels.
167 // TODO(fbarchard): Allow rotated videos to filter.
168 if (source_width
> kFilterBufferSize
|| view_rotate
)
169 filter
= FILTER_NONE
;
// y_shift is used below as a right shift that turns a luma row index into a
// chroma row index.
// NOTE(review): this relies on the numeric value of yuv_type -- presumably
// YV12 == 1 (half-height chroma) and YV16 == 0; confirm in the header.
171 unsigned int y_shift
= yuv_type
;
172 // Diagram showing origin and direction of source sampling.
178 // Rotations that start at right side of image.
// Point each plane at the last sample of its row and negate source_width,
// which makes the horizontal step computed from it below negative.
179 if ((view_rotate
== ROTATE_180
) ||
180 (view_rotate
== ROTATE_270
) ||
181 (view_rotate
== MIRROR_ROTATE_0
) ||
182 (view_rotate
== MIRROR_ROTATE_90
)) {
183 y_buf
+= source_width
- 1;
184 u_buf
+= source_width
/ 2 - 1;
185 v_buf
+= source_width
/ 2 - 1;
186 source_width
= -source_width
;
188 // Rotations that start at bottom of image.
// Point each plane at its last row (chroma row count is the luma count
// shifted by y_shift) and negate source_height.
189 if ((view_rotate
== ROTATE_90
) ||
190 (view_rotate
== ROTATE_180
) ||
191 (view_rotate
== MIRROR_ROTATE_90
) ||
192 (view_rotate
== MIRROR_ROTATE_180
)) {
193 y_buf
+= (source_height
- 1) * y_pitch
;
194 u_buf
+= ((source_height
>> y_shift
) - 1) * uv_pitch
;
195 v_buf
+= ((source_height
>> y_shift
) - 1) * uv_pitch
;
196 source_height
= -source_height
;
// Horizontal step in 16.16 fixed point: source pixels advanced per
// destination pixel.
199 int source_dx
= source_width
* kFractionMax
/ width
;
// 90/270-degree rotations: source_dx is rebuilt from the integer part of the
// vertical step multiplied by y_pitch, i.e. one destination pixel to the
// right moves by whole source rows.
// NOTE(review): several lines of this branch (embedded 203-206, 208,
// 212-213, 215 onward) are missing, so the full set of swaps and pitch
// changes cannot be documented from this view.
201 if ((view_rotate
== ROTATE_90
) ||
202 (view_rotate
== ROTATE_270
)) {
207 source_height
= source_width
;
209 int source_dy
= source_height
* kFractionMax
/ height
;
210 source_dx
= ((source_dy
>> kFractionBits
) * y_pitch
) << kFractionBits
;
211 if (view_rotate
== ROTATE_90
) {
214 source_height
= -source_height
;
221 // Need padding because FilterRows() will write 1 to 16 extra pixels
222 // after the end for SSE2 version.
// Three kFilterBufferSize-byte rows (Y, U, V) plus 16 bytes of slack at
// each end.
223 uint8 yuvbuf
[16 + kFilterBufferSize
* 3 + 16];
// Rounds the buffer address up to the next multiple of 16.
// NOTE(review): the left-hand side of this initialiser is on a missing
// line; it is presumably the declaration of ybuf used just below.
225 reinterpret_cast<uint8
*>(reinterpret_cast<uintptr_t>(yuvbuf
+ 15) & ~15);
226 uint8
* ubuf
= ybuf
+ kFilterBufferSize
;
227 uint8
* vbuf
= ubuf
+ kFilterBufferSize
;
229 // TODO(fbarchard): Fixed point math is off by 1 on negatives.
231 // We take a y-coordinate in [0,1] space in the source image space, and
232 // transform to a y-coordinate in [0,1] space in the destination image space.
233 // Note that the coordinate endpoints lie on pixel boundaries, not on pixel
234 // centers: e.g. a two-pixel-high image will have pixel centers at 0.25 and
235 // 0.75. The formula is as follows (in fixed-point arithmetic):
236 // y_dst = dst_height * ((y_src + 0.5) / src_height)
237 // dst_pixel = clamp([0, dst_height - 1], floor(y_dst - 0.5))
238 // Implement this here as an accumulator + delta, to avoid expensive math
// Starting position: half a vertical step minus half a source pixel.
240 int source_y_subpixel_accum
=
241 ((kFractionMax
/ 2) * source_height
) / height
- (kFractionMax
/ 2);
// Vertical step in 16.16 fixed point: source rows per destination row.
242 int source_y_subpixel_delta
= ((1 << kFractionBits
) * source_height
) / height
;
244 // TODO(fbarchard): Split this into separate function for better efficiency.
245 for (int y
= 0; y
< height
; ++y
) {
246 uint8
* dest_pixel
= rgb_buf
+ y
* rgb_pitch
;
// Take the accumulator value for this row, advance it for the next one,
// then clamp this row's position to [0, source_height - 1] in fixed point.
247 int source_y_subpixel
= source_y_subpixel_accum
;
248 source_y_subpixel_accum
+= source_y_subpixel_delta
;
249 if (source_y_subpixel
< 0)
250 source_y_subpixel
= 0;
251 else if (source_y_subpixel
> ((source_height
- 1) << kFractionBits
))
252 source_y_subpixel
= (source_height
- 1) << kFractionBits
;
254 const uint8
* y_ptr
= NULL
;
255 const uint8
* u_ptr
= NULL
;
256 const uint8
* v_ptr
= NULL
;
257 // Apply vertical filtering if necessary.
258 // TODO(fbarchard): Remove memcpy when not necessary.
259 if (filter
& media::FILTER_BILINEAR_V
) {
// Integer source row; the chroma row index is that value shifted by
// y_shift.
260 int source_y
= source_y_subpixel
>> kFractionBits
;
261 y_ptr
= y_buf
+ source_y
* y_pitch
;
262 u_ptr
= u_buf
+ (source_y
>> y_shift
) * uv_pitch
;
263 v_ptr
= v_buf
+ (source_y
>> y_shift
) * uv_pitch
;
265 // Vertical scaler uses 16.8 fixed point.
// Keep the top 8 bits of the 16-bit fraction.
266 int source_y_fraction
=
267 (source_y_subpixel
& kFractionMask
) >> 8;
// Non-zero fraction: blend this row with the next one into ybuf.
// The memcpy below copies the row into ybuf unblended.
// NOTE(review): the trailing arguments of the filter_proc call and the
// else line between the two statements are missing from this view.
268 if (source_y_fraction
!= 0) {
269 filter_proc(ybuf
, y_ptr
, y_ptr
+ y_pitch
, source_width
,
272 memcpy(ybuf
, y_ptr
, source_width
);
// Duplicate the last luma sample into the slot just past the row --
// presumably so horizontal interpolation may read one sample beyond the end.
275 ybuf
[source_width
] = ybuf
[source_width
-1];
// Chroma row width: half the luma width, rounded up.
277 int uv_source_width
= (source_width
+ 1) / 2;
278 int source_uv_fraction
;
280 // For formats with half-height UV planes, each even-numbered pixel row
281 // should not interpolate, since the next row to interpolate from should
282 // be a duplicate of the current row.
283 if (y_shift
&& (source_y
& 0x1) == 0)
284 source_uv_fraction
= 0;
286 source_uv_fraction
= source_y_fraction
;
// Same blend-or-copy choice as for luma, applied to the U and V rows.
288 if (source_uv_fraction
!= 0) {
289 filter_proc(ubuf
, u_ptr
, u_ptr
+ uv_pitch
, uv_source_width
,
291 filter_proc(vbuf
, v_ptr
, v_ptr
+ uv_pitch
, uv_source_width
,
294 memcpy(ubuf
, u_ptr
, uv_source_width
);
295 memcpy(vbuf
, v_ptr
, uv_source_width
);
// Duplicate the last chroma sample into the slot just past each row.
299 ubuf
[uv_source_width
] = ubuf
[uv_source_width
- 1];
300 vbuf
[uv_source_width
] = vbuf
[uv_source_width
- 1];
// NOTE(review): the lines between here and the next comment (presumably
// re-pointing y_ptr/u_ptr/v_ptr at the temp rows and opening the unfiltered
// branch) are missing from this view.
302 // Offset by 1/2 pixel for center sampling.
303 int source_y
= (source_y_subpixel
+ (kFractionMax
/ 2)) >> kFractionBits
;
304 y_ptr
= y_buf
+ source_y
* y_pitch
;
305 u_ptr
= u_buf
+ (source_y
>> y_shift
) * uv_pitch
;
306 v_ptr
= v_buf
+ (source_y
>> y_shift
) * uv_pitch
;
// Row output: a horizontal step of exactly 1.0 uses the plain converter;
// otherwise FILTER_BILINEAR_H selects the bilinear scaler and the remaining
// call is the point-sampling scaler (the else lines are missing here).
308 if (source_dx
== kFractionMax
) { // Not scaled
309 convert_proc(y_ptr
, u_ptr
, v_ptr
, dest_pixel
, width
);
311 if (filter
& FILTER_BILINEAR_H
) {
312 linear_scale_proc(y_ptr
, u_ptr
, v_ptr
, dest_pixel
, width
, source_dx
);
314 scale_proc(y_ptr
, u_ptr
, v_ptr
, dest_pixel
, width
, source_dx
);
// Reset SIMD register state (see EmptyRegisterState()).
319 EmptyRegisterState();
322 // Scale a frame of YV12 to 32 bit ARGB for a specific rectangle.
// Only the destination rows and columns inside
// [dest_rect_left, dest_rect_right) x [dest_rect_top, dest_rect_bottom) are
// produced. Each output row is built by vertically filtering two source rows
// into a temporary buffer and then horizontally scaling and converting that
// buffer with LinearScaleYUVToRGB32RowWithRange_C.
// NOTE(review): most of the parameter list (embedded lines 324-333 and
// 335-337) and many brace/else lines are missing from this view. u_buf,
// v_buf, rgb_buf, the source/dest sizes, the pitches and the other rect
// edges are used below but their declarations are not visible here.
323 void ScaleYUVToRGB32WithRect(const uint8
* y_buf
,
334 int dest_rect_bottom
,
// Function-local static, so the chosen routine is kept across calls.
// NOTE(review): a line is missing before the assignment -- presumably a
// NULL guard.
338 static FilterYUVRowsProc filter_proc
= NULL
;
340 filter_proc
= ChooseFilterYUVRowsProc();
342 // This routine doesn't currently support up-scaling.
343 CHECK_LE(dest_width
, source_width
);
344 CHECK_LE(dest_height
, source_height
);
346 // Sanity-check the destination rectangle.
347 DCHECK(dest_rect_left
>= 0 && dest_rect_right
<= dest_width
);
348 DCHECK(dest_rect_top
>= 0 && dest_rect_bottom
<= dest_height
);
349 DCHECK(dest_rect_right
> dest_rect_left
);
350 DCHECK(dest_rect_bottom
> dest_rect_top
);
352 // Fixed-point value of vertical and horizontal scale down factor.
353 // Values are in the format 16.16.
354 int y_step
= kFractionMax
* source_height
/ dest_height
;
355 int x_step
= kFractionMax
* source_width
/ dest_width
;
357 // Determine the coordinates of the rectangle in 16.16 coords.
358 // NB: Our origin is the *center* of the top/left pixel, NOT its top/left.
359 // If we're down-scaling by more than a factor of two, we start with a 50%
360 // fraction to avoid degenerating to point-sampling - we should really just
361 // fix the fraction at 50% for all pixels in that case.
// source_right is the position of the last destination column
// (dest_rect_right - 1), not one past it.
362 int source_left
= dest_rect_left
* x_step
;
363 int source_right
= (dest_rect_right
- 1) * x_step
;
// Scale factor below 2: offset by half of (step - 1.0). The second pair of
// additions applies a fixed half-pixel offset instead.
// NOTE(review): the else line between the two pairs is missing here.
364 if (x_step
< kFractionMax
* 2) {
365 source_left
+= ((x_step
- kFractionMax
) / 2);
366 source_right
+= ((x_step
- kFractionMax
) / 2);
368 source_left
+= kFractionMax
/ 2;
369 source_right
+= kFractionMax
/ 2;
// The same offset rule applied vertically.
371 int source_top
= dest_rect_top
* y_step
;
372 if (y_step
< kFractionMax
* 2) {
373 source_top
+= ((y_step
- kFractionMax
) / 2);
375 source_top
+= kFractionMax
/ 2;
378 // Determine the parts of the Y, U and V buffers to interpolate.
379 int source_y_left
= source_left
>> kFractionBits
;
// Right edge is the integer position plus 2, capped by std::min.
// NOTE(review): the second argument of this std::min is on a missing line
// -- presumably source_width.
380 int source_y_right
= std::min(
381 (source_right
>> kFractionBits
) + 2,
// Chroma columns are at half the luma resolution; the right edge is capped
// at the rounded-up chroma width.
384 int source_uv_left
= source_y_left
/ 2;
385 int source_uv_right
= std::min(
386 (source_right
>> (kFractionBits
+ 1)) + 2,
387 (source_width
+ 1) / 2);
389 int source_y_width
= source_y_right
- source_y_left
;
390 int source_uv_width
= source_uv_right
- source_uv_left
;
392 // Determine number of pixels in each output row.
393 int dest_rect_width
= dest_rect_right
- dest_rect_left
;
395 // Intermediate buffer for vertical interpolation.
396 // 4096 bytes allows 3 buffers to fit in 12k, which fits in a 16K L1 cache,
397 // and is bigger than most users will generally need.
398 // The buffer is 16-byte aligned and padded with 16 extra bytes; some of the
399 // FilterYUVRowProcs have alignment requirements, and the SSE version can
400 // write up to 16 bytes past the end of the buffer.
401 const int kFilterBufferSize
= 4096;
// NOTE(review): the statement controlled by this check is on a missing
// line. The comment near the end of the row loop indicates that frames too
// large for the buffer skip vertical filtering.
402 if (source_width
> kFilterBufferSize
)
404 uint8 yuv_temp
[16 + kFilterBufferSize
* 3 + 16];
// Rounds the buffer address up to the next multiple of 16.
// NOTE(review): the left-hand side of this initialiser is on a missing
// line; it is presumably the declaration of y_temp used just below.
406 reinterpret_cast<uint8
*>(
407 reinterpret_cast<uintptr_t>(yuv_temp
+ 15) & ~15);
408 uint8
* u_temp
= y_temp
+ kFilterBufferSize
;
409 uint8
* v_temp
= u_temp
+ kFilterBufferSize
;
411 // Move to the top-left pixel of output.
// Each output pixel occupies 4 bytes, hence dest_rect_left * 4.
412 rgb_buf
+= dest_rect_top
* rgb_pitch
;
413 rgb_buf
+= dest_rect_left
* 4;
415 // For each destination row perform interpolation and color space
416 // conversion to produce the output.
417 for (int row
= dest_rect_top
; row
< dest_rect_bottom
; ++row
) {
418 // Round the fixed-point y position to get the current row.
// The chroma row index is half the luma row index.
419 int source_row
= source_top
>> kFractionBits
;
420 int source_uv_row
= source_row
/ 2;
421 DCHECK(source_row
< source_height
);
423 // Locate the first row for each plane for interpolation.
// Pointers already include the left-edge column offset for their plane.
424 const uint8
* y0_ptr
= y_buf
+ y_pitch
* source_row
+ source_y_left
;
425 const uint8
* u0_ptr
= u_buf
+ uv_pitch
* source_uv_row
+ source_uv_left
;
426 const uint8
* v0_ptr
= v_buf
+ uv_pitch
* source_uv_row
+ source_uv_left
;
427 const uint8
* y1_ptr
= NULL
;
428 const uint8
* u1_ptr
= NULL
;
429 const uint8
* v1_ptr
= NULL
;
431 // Locate the second row for interpolation, being careful not to overrun.
// NOTE(review): the statements taken when the next row would lie past the
// end of a plane are on missing lines -- presumably they reuse the current
// row. The visible assignments select the following row.
432 if (source_row
+ 1 >= source_height
) {
435 y1_ptr
= y0_ptr
+ y_pitch
;
437 if (source_uv_row
+ 1 >= (source_height
+ 1) / 2) {
441 u1_ptr
= u0_ptr
+ uv_pitch
;
442 v1_ptr
= v0_ptr
+ uv_pitch
;
446 // Vertical scaler uses 16.8 fixed point.
// Keep the top 8 bits of the 16-bit fraction as the blend weight. Output is
// written at the same column offsets in the temp rows as in the source.
447 int fraction
= (source_top
& kFractionMask
) >> 8;
448 filter_proc(y_temp
+ source_y_left
, y0_ptr
, y1_ptr
,
449 source_y_width
, fraction
);
450 filter_proc(u_temp
+ source_uv_left
, u0_ptr
, u1_ptr
,
451 source_uv_width
, fraction
);
452 filter_proc(v_temp
+ source_uv_left
, v0_ptr
, v1_ptr
,
453 source_uv_width
, fraction
);
455 // Perform horizontal interpolation and color space conversion.
456 // TODO(hclam): Use the MMX version after more testing.
457 LinearScaleYUVToRGB32RowWithRange_C(
458 y_temp
, u_temp
, v_temp
, rgb_buf
,
459 dest_rect_width
, source_left
, x_step
);
461 // If the frame is too large then we linear scale a single row.
462 LinearScaleYUVToRGB32RowWithRange_C(
463 y0_ptr
, u0_ptr
, v0_ptr
, rgb_buf
,
464 dest_rect_width
, source_left
, x_step
);
467 // Advance vertically in the source and destination image.
468 source_top
+= y_step
;
469 rgb_buf
+= rgb_pitch
;
// Reset SIMD register state (see EmptyRegisterState()).
472 EmptyRegisterState();
// Converts a 32-bit RGB frame into separate Y, U and V planes by forwarding
// every argument to a conversion routine held in a function-local static
// pointer.
// NOTE(review): the rest of the parameter list (embedded lines 476-483) and
// the conditions around the non-ARM/MIPS assignments are missing from this
// view; yplane, uplane, vplane, width, height and the three strides are
// passed through below.
475 void ConvertRGB32ToYUV(const uint8
* rgbframe
,
// Signature: (source, y, u, v, five ints); see the call at the end for the
// order in which the int arguments are supplied.
484 static void (*convert_proc
)(const uint8
*, uint8
*, uint8
*, uint8
*,
485 int, int, int, int, int) = NULL
;
487 #if defined(ARCH_CPU_ARM_FAMILY) || defined(ARCH_CPU_MIPS_FAMILY)
488 // For ARM and MIPS processors, always use C version.
489 // TODO(hclam): Implement a NEON version.
490 convert_proc
= &ConvertRGB32ToYUV_C
;
492 // TODO(hclam): Switch to SSSE3 version when the cyan problem is solved.
493 // See: crbug.com/100462
// NOTE(review): the guards choosing between the SSE2 and C assignments
// below are on missing lines -- presumably a runtime CPU check.
496 convert_proc
= &ConvertRGB32ToYUV_SSE2
;
498 convert_proc
= &ConvertRGB32ToYUV_C
;
502 convert_proc(rgbframe
, yplane
, uplane
, vplane
, width
, height
,
503 rgbstride
, ystride
, uvstride
);
// Converts a 24-bit RGB frame into separate Y, U and V planes.
// On ARM and MIPS builds it calls ConvertRGB24ToYUV_C directly. On other
// builds it calls through a function-local static pointer that is set to
// either ConvertRGB24ToYUV_SSSE3 or ConvertRGB24ToYUV_C.
// NOTE(review): the rest of the parameter list, the #else/#endif lines and
// the conditions choosing between SSSE3 and C are missing from this view.
506 void ConvertRGB24ToYUV(const uint8
* rgbframe
,
515 #if defined(ARCH_CPU_ARM_FAMILY) || defined(ARCH_CPU_MIPS_FAMILY)
516 ConvertRGB24ToYUV_C(rgbframe
, yplane
, uplane
, vplane
, width
, height
,
517 rgbstride
, ystride
, uvstride
);
519 static void (*convert_proc
)(const uint8
*, uint8
*, uint8
*, uint8
*,
520 int, int, int, int, int) = NULL
;
524 convert_proc
= &ConvertRGB24ToYUV_SSSE3
;
526 convert_proc
= &ConvertRGB24ToYUV_C
;
528 convert_proc(rgbframe
, yplane
, uplane
, vplane
, width
, height
,
529 rgbstride
, ystride
, uvstride
);
// Converts a YUY2 buffer into separate planes.
// Only the loop headers are visible: the outer loop runs height / 2 times
// and the two loops after it each run width / 2 times.
// NOTE(review): the parameter list and all loop bodies are missing from
// this view. Presumably each outer iteration handles a pair of rows, with
// one inner loop per row -- confirm against the full file.
533 void ConvertYUY2ToYUV(const uint8
* src
,
539 for (int i
= 0; i
< height
/ 2; ++i
) {
540 for (int j
= 0; j
< (width
/ 2); ++j
) {
550 for (int j
= 0; j
< (width
/ 2); ++j
) {
// Converts an NV21 buffer into separate planes.
// NOTE(review): the rest of the parameter list and the body of the chroma
// loop are missing from this view.
559 void ConvertNV21ToYUV(const uint8
* src
,
// The first width * height bytes of src are copied unchanged into yplane;
// no row stride is applied to either side of the copy.
565 int y_plane_size
= width
* height
;
566 memcpy(yplane
, src
, y_plane_size
);
// A quarter of the luma size: the number of iterations of the loop below.
569 int u_plane_size
= y_plane_size
>> 2;
570 for (int i
= 0; i
< u_plane_size
; ++i
) {
// Converts separate Y, U and V planes into a 32-bit RGB frame.
// On ARM and MIPS builds it calls ConvertYUVToRGB32_C directly. On other
// builds it calls through a function-local static pointer that is set to
// the _SSE, _MMX or _C variant; the MMX variant is chosen when
// cpu.has_mmx() is true and the preceding condition was false.
// NOTE(review): the rest of the parameter list, the #else/#endif lines, the
// declaration of `cpu` and the condition guarding the SSE assignment are
// missing from this view.
576 void ConvertYUVToRGB32(const uint8
* yplane
,
586 #if defined(ARCH_CPU_ARM_FAMILY) || defined(ARCH_CPU_MIPS_FAMILY)
587 ConvertYUVToRGB32_C(yplane
, uplane
, vplane
, rgbframe
,
588 width
, height
, ystride
, uvstride
, rgbstride
, yuv_type
);
590 static ConvertYUVToRGB32Proc convert_proc
= NULL
;
594 convert_proc
= &ConvertYUVToRGB32_SSE
;
595 else if (cpu
.has_mmx())
596 convert_proc
= &ConvertYUVToRGB32_MMX
;
598 convert_proc
= &ConvertYUVToRGB32_C
;
601 convert_proc(yplane
, uplane
, vplane
, rgbframe
,
602 width
, height
, ystride
, uvstride
, rgbstride
, yuv_type
);
// Converts separate Y, U, V and alpha planes into an ARGB frame.
// On ARM and MIPS builds it calls ConvertYUVAToARGB_C directly. On other
// builds it calls through a function-local static pointer that is set to
// either ConvertYUVAToARGB_MMX or ConvertYUVAToARGB_C.
// NOTE(review): the rest of the parameter list, the final argument line of
// the direct _C call, the #else/#endif lines and the condition choosing
// between MMX and C are missing from this view.
606 void ConvertYUVAToARGB(const uint8
* yplane
,
618 #if defined(ARCH_CPU_ARM_FAMILY) || defined(ARCH_CPU_MIPS_FAMILY)
619 ConvertYUVAToARGB_C(yplane
, uplane
, vplane
, aplane
, rgbframe
,
620 width
, height
, ystride
, uvstride
, astride
, rgbstride
,
623 static ConvertYUVAToARGBProc convert_proc
= NULL
;
627 convert_proc
= &ConvertYUVAToARGB_MMX
;
629 convert_proc
= &ConvertYUVAToARGB_C
;
631 convert_proc(yplane
, uplane
, vplane
, aplane
, rgbframe
,
632 width
, height
, ystride
, uvstride
, astride
, rgbstride
, yuv_type
);