1 diff --git a/gfx/ycbcr/yuv_convert.cpp b/gfx/ycbcr/yuv_convert.cpp
2 --- a/gfx/ycbcr/yuv_convert.cpp
3 +++ b/gfx/ycbcr/yuv_convert.cpp
5 // http://www.fourcc.org/yuv.php
6 // The actual conversion is best described here
7 // http://en.wikipedia.org/wiki/YUV
8 // An article on optimizing YUV conversion using tables instead of multiplies
9 // http://lestourtereaux.free.fr/papers/data/yuvrgb.pdf
11 // YV12 is a full plane of Y and a half height, half width chroma planes
12 // YV16 is a full plane of Y and a full height, half width chroma planes
13 +// YV24 is a full plane of Y and a full height, full width chroma planes
15 // ARGB pixel format is output, which on little endian is stored as BGRA.
16 // The alpha is set to 255, allowing the application to use RGBA or RGB32.
18 -#include "media/base/yuv_convert.h"
19 +#include "yuv_convert.h"
21 // Header for low level row functions.
22 -#include "media/base/yuv_row.h"
25 -#if defined(_MSC_VER)
28 -#include <mmintrin.h>
33 -#include <emmintrin.h>
39 +#include "mozilla/SSE.h"
41 +#ifdef HAVE_YCBCR_TO_RGB565
42 +void __attribute((noinline)) yv12_to_rgb565_neon(uint16 *dst, const uint8 *y, const uint8 *u, const uint8 *v, int n, int oddflag);
49 // 16.16 fixed point arithmetic
50 const int kFractionBits = 16;
51 const int kFractionMax = 1 << kFractionBits;
52 const int kFractionMask = ((1 << kFractionBits) - 1);
55 +// Convert a frame of YUV to 16 bit RGB565.
56 +NS_GFX_(void) ConvertYCbCrToRGB565(const uint8* y_buf,
69 +#ifdef HAVE_YCBCR_TO_RGB565
70 + for (int i = 0; i < pic_height; i++) {
71 + yv12_to_rgb565_neon((uint16*)rgb_buf + pic_width * i,
72 + y_buf + y_pitch * i,
73 + u_buf + uv_pitch * (i / 2),
74 + v_buf + uv_pitch * (i / 2),
81 // Convert a frame of YUV to 32 bit ARGB.
82 -void ConvertYUVToRGB32(const uint8* y_buf,
92 - unsigned int y_shift = yuv_type;
93 - for (int y = 0; y < height; ++y) {
94 - uint8* rgb_row = rgb_buf + y * rgb_pitch;
95 - const uint8* y_ptr = y_buf + y * y_pitch;
96 - const uint8* u_ptr = u_buf + (y >> y_shift) * uv_pitch;
97 - const uint8* v_ptr = v_buf + (y >> y_shift) * uv_pitch;
99 - FastConvertYUVToRGB32Row(y_ptr,
105 +NS_GFX_(void) ConvertYCbCrToRGB32(const uint8* y_buf,
106 + const uint8* u_buf,
107 + const uint8* v_buf,
116 + YUVType yuv_type) {
117 + unsigned int y_shift = yuv_type == YV12 ? 1 : 0;
118 + unsigned int x_shift = yuv_type == YV24 ? 0 : 1;
119 + // Test for SSE because the optimized code uses movntq, which is not part of MMX.
120 + bool has_sse = supports_mmx() && supports_sse();
121 + // There is no optimized YV24 SSE routine so we check for this and
122 + // fall back to the C code.
123 + has_sse &= yuv_type != YV24;
124 + bool odd_pic_x = yuv_type != YV24 && pic_x % 2 != 0;
125 + int x_width = odd_pic_x ? pic_width - 1 : pic_width;
127 + for (int y = pic_y; y < pic_height + pic_y; ++y) {
128 + uint8* rgb_row = rgb_buf + (y - pic_y) * rgb_pitch;
129 + const uint8* y_ptr = y_buf + y * y_pitch + pic_x;
130 + const uint8* u_ptr = u_buf + (y >> y_shift) * uv_pitch + (pic_x >> x_shift);
131 + const uint8* v_ptr = v_buf + (y >> y_shift) * uv_pitch + (pic_x >> x_shift);
134 + // Handle the single odd pixel manually and use the
135 + // fast routines for the remaining pixels.
136 + FastConvertYUVToRGB32Row_C(y_ptr++,
146 + FastConvertYUVToRGB32Row(y_ptr,
153 + FastConvertYUVToRGB32Row_C(y_ptr,
162 // MMX used for FastConvertYUVToRGB32Row requires emms instruction.
167 -// FilterRows combines two rows of the image using linear interpolation.
168 -// SSE2 version does 16 pixels at a time
170 -static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
171 - int source_width, int source_y_fraction) {
172 - __m128i zero = _mm_setzero_si128();
173 - __m128i y1_fraction = _mm_set1_epi16(source_y_fraction);
174 - __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction);
176 - const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr);
177 - const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr);
178 - __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf);
179 - __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width);
182 - __m128i y0 = _mm_loadu_si128(y0_ptr128);
183 - __m128i y1 = _mm_loadu_si128(y1_ptr128);
184 - __m128i y2 = _mm_unpackhi_epi8(y0, zero);
185 - __m128i y3 = _mm_unpackhi_epi8(y1, zero);
186 - y0 = _mm_unpacklo_epi8(y0, zero);
187 - y1 = _mm_unpacklo_epi8(y1, zero);
188 - y0 = _mm_mullo_epi16(y0, y0_fraction);
189 - y1 = _mm_mullo_epi16(y1, y1_fraction);
190 - y2 = _mm_mullo_epi16(y2, y0_fraction);
191 - y3 = _mm_mullo_epi16(y3, y1_fraction);
192 - y0 = _mm_add_epi16(y0, y1);
193 - y2 = _mm_add_epi16(y2, y3);
194 - y0 = _mm_srli_epi16(y0, 8);
195 - y2 = _mm_srli_epi16(y2, 8);
196 - y0 = _mm_packus_epi16(y0, y2);
200 - } while (dest128 < end128);
203 -// MMX version does 8 pixels at a time
204 -static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
205 - int source_width, int source_y_fraction) {
206 - __m64 zero = _mm_setzero_si64();
207 - __m64 y1_fraction = _mm_set1_pi16(source_y_fraction);
208 - __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction);
210 - const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr);
211 - const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr);
212 - __m64* dest64 = reinterpret_cast<__m64*>(ybuf);
213 - __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width);
216 - __m64 y0 = *y0_ptr64++;
217 - __m64 y1 = *y1_ptr64++;
218 - __m64 y2 = _mm_unpackhi_pi8(y0, zero);
219 - __m64 y3 = _mm_unpackhi_pi8(y1, zero);
220 - y0 = _mm_unpacklo_pi8(y0, zero);
221 - y1 = _mm_unpacklo_pi8(y1, zero);
222 - y0 = _mm_mullo_pi16(y0, y0_fraction);
223 - y1 = _mm_mullo_pi16(y1, y1_fraction);
224 - y2 = _mm_mullo_pi16(y2, y0_fraction);
225 - y3 = _mm_mullo_pi16(y3, y1_fraction);
226 - y0 = _mm_add_pi16(y0, y1);
227 - y2 = _mm_add_pi16(y2, y3);
228 - y0 = _mm_srli_pi16(y0, 8);
229 - y2 = _mm_srli_pi16(y2, 8);
230 - y0 = _mm_packs_pu16(y0, y2);
232 - } while (dest64 < end64);
234 -#else // no MMX or SSE2
239 // C version does 8 at a time to mimic MMX code
240 -static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
241 - int source_width, int source_y_fraction) {
242 +static void FilterRows_C(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
243 + int source_width, int source_y_fraction) {
244 int y1_fraction = source_y_fraction;
245 int y0_fraction = 256 - y1_fraction;
246 uint8* end = ybuf + source_width;
248 ybuf[0] = (y0_ptr[0] * y0_fraction + y1_ptr[0] * y1_fraction) >> 8;
249 ybuf[1] = (y0_ptr[1] * y0_fraction + y1_ptr[1] * y1_fraction) >> 8;
250 ybuf[2] = (y0_ptr[2] * y0_fraction + y1_ptr[2] * y1_fraction) >> 8;
251 ybuf[3] = (y0_ptr[3] * y0_fraction + y1_ptr[3] * y1_fraction) >> 8;
252 @@ -152,46 +140,77 @@ static void FilterRows(uint8* ybuf, cons
253 ybuf[5] = (y0_ptr[5] * y0_fraction + y1_ptr[5] * y1_fraction) >> 8;
254 ybuf[6] = (y0_ptr[6] * y0_fraction + y1_ptr[6] * y1_fraction) >> 8;
255 ybuf[7] = (y0_ptr[7] * y0_fraction + y1_ptr[7] * y1_fraction) >> 8;
259 } while (ybuf < end);
263 +#ifdef MOZILLA_MAY_SUPPORT_MMX
264 +void FilterRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
265 + int source_width, int source_y_fraction);
268 +#ifdef MOZILLA_MAY_SUPPORT_SSE2
269 +void FilterRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
270 + int source_width, int source_y_fraction);
273 +static inline void FilterRows(uint8* ybuf, const uint8* y0_ptr,
274 + const uint8* y1_ptr, int source_width,
275 + int source_y_fraction) {
276 +#ifdef MOZILLA_MAY_SUPPORT_SSE2
277 + if (mozilla::supports_sse2()) {
278 + FilterRows_SSE2(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
283 +#ifdef MOZILLA_MAY_SUPPORT_MMX
284 + if (mozilla::supports_mmx()) {
285 + FilterRows_MMX(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
290 + FilterRows_C(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
294 // Scale a frame of YUV to 32 bit ARGB.
295 -void ScaleYUVToRGB32(const uint8* y_buf,
296 - const uint8* u_buf,
297 - const uint8* v_buf,
307 - Rotate view_rotate,
308 - ScaleFilter filter) {
309 +NS_GFX_(void) ScaleYCbCrToRGB32(const uint8* y_buf,
310 + const uint8* u_buf,
311 + const uint8* v_buf,
321 + Rotate view_rotate,
322 + ScaleFilter filter) {
323 + bool has_mmx = supports_mmx();
325 // 4096 allows 3 buffers to fit in 12k.
326 // Helps performance on CPU with 16K L1 cache.
327 // Large enough for 3830x2160 and 30" displays which are 2560x1600.
328 const int kFilterBufferSize = 4096;
329 // Disable filtering if the screen is too big (to avoid buffer overflows).
330 // This should never happen to regular users: they don't have monitors
331 // wider than 4096 pixels.
332 // TODO(fbarchard): Allow rotated videos to filter.
333 if (source_width > kFilterBufferSize || view_rotate)
334 filter = FILTER_NONE;
336 - unsigned int y_shift = yuv_type;
337 + unsigned int y_shift = yuv_type == YV12 ? 1 : 0;
338 // Diagram showing origin and direction of source sampling.
344 // Rotations that start at right side of image.
345 if ((view_rotate == ROTATE_180) ||
346 @@ -243,17 +262,17 @@ void ScaleYUVToRGB32(const uint8* y_buf,
351 // Need padding because FilterRows() will write 1 to 16 extra pixels
352 // after the end for SSE2 version.
353 uint8 yuvbuf[16 + kFilterBufferSize * 3 + 16];
355 - reinterpret_cast<uint8*>(reinterpret_cast<uintptr_t>(yuvbuf + 15) & ~15);
356 + reinterpret_cast<uint8*>(reinterpret_cast<PRUptrdiff>(yuvbuf + 15) & ~15);
357 uint8* ubuf = ybuf + kFilterBufferSize;
358 uint8* vbuf = ubuf + kFilterBufferSize;
359 // TODO(fbarchard): Fixed point math is off by 1 on negatives.
360 int yscale_fixed = (source_height << kFractionBits) / height;
362 // TODO(fbarchard): Split this into separate function for better efficiency.
363 for (int y = 0; y < height; ++y) {
364 uint8* dest_pixel = rgb_buf + y * rgb_pitch;
365 @@ -276,17 +295,17 @@ void ScaleYUVToRGB32(const uint8* y_buf,
366 int source_uv_fraction =
367 ((source_y_subpixel >> y_shift) & kFractionMask) >> 8;
369 const uint8* y_ptr = y0_ptr;
370 const uint8* u_ptr = u0_ptr;
371 const uint8* v_ptr = v0_ptr;
372 // Apply vertical filtering if necessary.
373 // TODO(fbarchard): Remove memcpy when not necessary.
374 - if (filter & media::FILTER_BILINEAR_V) {
375 + if (filter & mozilla::gfx::FILTER_BILINEAR_V) {
376 if (yscale_fixed != kFractionMax &&
377 source_y_fraction && ((source_y + 1) < source_height)) {
378 FilterRows(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
380 memcpy(ybuf, y0_ptr, source_width);
383 ybuf[source_width] = ybuf[source_width-1];
384 @@ -303,44 +322,50 @@ void ScaleYUVToRGB32(const uint8* y_buf,
387 ubuf[uv_source_width] = ubuf[uv_source_width - 1];
388 vbuf[uv_source_width] = vbuf[uv_source_width - 1];
390 if (source_dx == kFractionMax) { // Not scaled
391 FastConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
394 - if (filter & FILTER_BILINEAR_H) {
395 + } else if (filter & FILTER_BILINEAR_H) {
396 LinearScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
397 dest_pixel, width, source_dx);
399 // Specialized scalers and rotation.
400 -#if USE_MMX && defined(_MSC_VER)
401 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_MSC_VER) && defined(_M_IX86)
402 + if(mozilla::supports_sse()) {
403 if (width == (source_width * 2)) {
404 - DoubleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
405 - dest_pixel, width);
406 + DoubleYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,
407 + dest_pixel, width);
408 } else if ((source_dx & kFractionMask) == 0) {
409 // Scaling by integer scale factor. ie half.
410 - ConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
412 - source_dx >> kFractionBits);
413 + ConvertYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,
415 + source_dx >> kFractionBits);
416 } else if (source_dx_uv == source_dx) { // Not rotated.
417 ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
418 dest_pixel, width, source_dx);
420 - RotateConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
422 - source_dx >> kFractionBits,
423 - source_dx_uv >> kFractionBits);
424 + RotateConvertYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,
426 + source_dx >> kFractionBits,
427 + source_dx_uv >> kFractionBits);
431 + ScaleYUVToRGB32Row_C(y_ptr, u_ptr, v_ptr,
432 + dest_pixel, width, source_dx);
435 - ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
436 - dest_pixel, width, source_dx);
439 + ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
440 + dest_pixel, width, source_dx);
444 // MMX used for FastConvertYUVToRGB32Row and FilterRows requires emms.
448 -} // namespace media
454 +} // namespace mozilla
455 diff --git a/gfx/ycbcr/yuv_convert.h b/gfx/ycbcr/yuv_convert.h
456 --- a/gfx/ycbcr/yuv_convert.h
457 +++ b/gfx/ycbcr/yuv_convert.h
459 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
460 // Use of this source code is governed by a BSD-style license that can be
461 // found in the LICENSE file.
463 #ifndef MEDIA_BASE_YUV_CONVERT_H_
464 #define MEDIA_BASE_YUV_CONVERT_H_
466 -#include "base/basictypes.h"
470 +#include "chromium_types.h"
471 +#include "gfxCore.h"
474 +#define HAVE_YCBCR_TO_RGB565 1
481 // Type of YUV surface.
482 // The value of these enums matter as they are used to shift vertical indices.
484 - YV16 = 0, // YV16 is half width and full height chroma channels.
485 - YV12 = 1, // YV12 is half width and half height chroma channels.
486 + YV12 = 0, // YV12 is half width and half height chroma channels.
487 + YV16 = 1, // YV16 is half width and full height chroma channels.
488 + YV24 = 2 // YV24 is full width and full height chroma channels.
491 // Mirror means flip the image horizontally, as in looking in a mirror.
492 // Rotate happens after mirroring.
494 ROTATE_0, // Rotation off.
495 ROTATE_90, // Rotate clockwise.
496 ROTATE_180, // Rotate upside down.
497 ROTATE_270, // Rotate counter clockwise.
498 MIRROR_ROTATE_0, // Mirror horizontally.
499 MIRROR_ROTATE_90, // Mirror then Rotate clockwise.
500 MIRROR_ROTATE_180, // Mirror vertically.
501 - MIRROR_ROTATE_270, // Transpose.
502 + MIRROR_ROTATE_270 // Transpose.
505 // Filter affects how scaling looks.
507 FILTER_NONE = 0, // No filter (point sampled).
508 FILTER_BILINEAR_H = 1, // Bilinear horizontal filter.
509 FILTER_BILINEAR_V = 2, // Bilinear vertical filter.
510 - FILTER_BILINEAR = 3, // Bilinear filter.
511 + FILTER_BILINEAR = 3 // Bilinear filter.
514 +// Convert a frame of YUV to 16 bit RGB565.
515 +// Pass in YV12 formats
516 +NS_GFX_(void) ConvertYCbCrToRGB565(const uint8* yplane,
517 + const uint8* uplane,
518 + const uint8* vplane,
529 // Convert a frame of YUV to 32 bit ARGB.
530 // Pass in YV16/YV12 depending on source format
531 -void ConvertYUVToRGB32(const uint8* yplane,
532 - const uint8* uplane,
533 - const uint8* vplane,
541 +NS_GFX_(void) ConvertYCbCrToRGB32(const uint8* yplane,
542 + const uint8* uplane,
543 + const uint8* vplane,
554 // Scale a frame of YUV to 32 bit ARGB.
555 // Supports rotation and mirroring.
556 -void ScaleYUVToRGB32(const uint8* yplane,
557 - const uint8* uplane,
558 - const uint8* vplane,
568 - Rotate view_rotate,
569 - ScaleFilter filter);
571 -} // namespace media
573 +NS_GFX_(void) ScaleYCbCrToRGB32(const uint8* yplane,
574 + const uint8* uplane,
575 + const uint8* vplane,
585 + Rotate view_rotate,
586 + ScaleFilter filter);
589 +} // namespace mozilla
591 #endif // MEDIA_BASE_YUV_CONVERT_H_
592 diff --git a/gfx/ycbcr/yuv_convert_mmx.cpp b/gfx/ycbcr/yuv_convert_mmx.cpp
595 +++ b/gfx/ycbcr/yuv_convert_mmx.cpp
597 +// Copyright (c) 2010 The Chromium Authors. All rights reserved.
598 +// Use of this source code is governed by a BSD-style license that can be
599 +// found in the LICENSE file.
601 +#include <mmintrin.h>
602 +#include "yuv_row.h"
607 +// FilterRows combines two rows of the image using linear interpolation.
608 +// MMX version does 8 pixels at a time.
609 +void FilterRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
610 + int source_width, int source_y_fraction) {
611 + __m64 zero = _mm_setzero_si64();
612 + __m64 y1_fraction = _mm_set1_pi16(source_y_fraction);
613 + __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction);
615 + const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr);
616 + const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr);
617 + __m64* dest64 = reinterpret_cast<__m64*>(ybuf);
618 + __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width);
621 + __m64 y0 = *y0_ptr64++;
622 + __m64 y1 = *y1_ptr64++;
623 + __m64 y2 = _mm_unpackhi_pi8(y0, zero);
624 + __m64 y3 = _mm_unpackhi_pi8(y1, zero);
625 + y0 = _mm_unpacklo_pi8(y0, zero);
626 + y1 = _mm_unpacklo_pi8(y1, zero);
627 + y0 = _mm_mullo_pi16(y0, y0_fraction);
628 + y1 = _mm_mullo_pi16(y1, y1_fraction);
629 + y2 = _mm_mullo_pi16(y2, y0_fraction);
630 + y3 = _mm_mullo_pi16(y3, y1_fraction);
631 + y0 = _mm_add_pi16(y0, y1);
632 + y2 = _mm_add_pi16(y2, y3);
633 + y0 = _mm_srli_pi16(y0, 8);
634 + y2 = _mm_srli_pi16(y2, 8);
635 + y0 = _mm_packs_pu16(y0, y2);
637 + } while (dest64 < end64);
642 diff --git a/gfx/ycbcr/yuv_convert_sse2.cpp b/gfx/ycbcr/yuv_convert_sse2.cpp
645 +++ b/gfx/ycbcr/yuv_convert_sse2.cpp
647 +// Copyright (c) 2010 The Chromium Authors. All rights reserved.
648 +// Use of this source code is governed by a BSD-style license that can be
649 +// found in the LICENSE file.
651 +#include <emmintrin.h>
652 +#include "yuv_row.h"
657 +// FilterRows combines two rows of the image using linear interpolation.
658 +// SSE2 version does 16 pixels at a time.
659 +void FilterRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
660 + int source_width, int source_y_fraction) {
661 + __m128i zero = _mm_setzero_si128();
662 + __m128i y1_fraction = _mm_set1_epi16(source_y_fraction);
663 + __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction);
665 + const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr);
666 + const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr);
667 + __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf);
668 + __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width);
671 + __m128i y0 = _mm_loadu_si128(y0_ptr128);
672 + __m128i y1 = _mm_loadu_si128(y1_ptr128);
673 + __m128i y2 = _mm_unpackhi_epi8(y0, zero);
674 + __m128i y3 = _mm_unpackhi_epi8(y1, zero);
675 + y0 = _mm_unpacklo_epi8(y0, zero);
676 + y1 = _mm_unpacklo_epi8(y1, zero);
677 + y0 = _mm_mullo_epi16(y0, y0_fraction);
678 + y1 = _mm_mullo_epi16(y1, y1_fraction);
679 + y2 = _mm_mullo_epi16(y2, y0_fraction);
680 + y3 = _mm_mullo_epi16(y3, y1_fraction);
681 + y0 = _mm_add_epi16(y0, y1);
682 + y2 = _mm_add_epi16(y2, y3);
683 + y0 = _mm_srli_epi16(y0, 8);
684 + y2 = _mm_srli_epi16(y2, 8);
685 + y0 = _mm_packus_epi16(y0, y2);
689 + } while (dest128 < end128);
694 diff --git a/gfx/ycbcr/yuv_row.h b/gfx/ycbcr/yuv_row.h
695 --- a/gfx/ycbcr/yuv_row.h
696 +++ b/gfx/ycbcr/yuv_row.h
698 // yuv_row internal functions to handle YUV conversion and scaling to RGB.
699 // These functions are used from both yuv_convert.cc and yuv_scale.cc.
701 // TODO(fbarchard): Write function that can handle rotation and scaling.
703 #ifndef MEDIA_BASE_YUV_ROW_H_
704 #define MEDIA_BASE_YUV_ROW_H_
706 -#include "base/basictypes.h"
707 +#include "chromium_types.h"
711 // This is the second fastest of the scalers.
712 void FastConvertYUVToRGB32Row(const uint8* y_buf,
718 -// Can do 1x, half size or any scale down by an integer amount.
719 -// Step can be negative (mirroring, rotate 180).
720 -// This is the third fastest of the scalers.
721 -void ConvertYUVToRGB32Row(const uint8* y_buf,
722 - const uint8* u_buf,
723 - const uint8* v_buf,
728 -// Rotate is like Convert, but applies different step to Y versus U and V.
729 -// This allows rotation by 90 or 270, by stepping by stride.
730 -// This is the forth fastest of the scalers.
731 -void RotateConvertYUVToRGB32Row(const uint8* y_buf,
732 +void FastConvertYUVToRGB32Row_C(const uint8* y_buf,
739 + unsigned int x_shift);
741 +void FastConvertYUVToRGB32Row(const uint8* y_buf,
742 + const uint8* u_buf,
743 + const uint8* v_buf,
747 +// Can do 1x, half size or any scale down by an integer amount.
748 +// Step can be negative (mirroring, rotate 180).
749 +// This is the third fastest of the scalers.
750 +// Only defined on Windows x86-32.
751 +void ConvertYUVToRGB32Row_SSE(const uint8* y_buf,
752 + const uint8* u_buf,
753 + const uint8* v_buf,
758 +// Rotate is like Convert, but applies different step to Y versus U and V.
759 +// This allows rotation by 90 or 270, by stepping by stride.
760 +// This is the fourth fastest of the scalers.
761 +// Only defined on Windows x86-32.
762 +void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf,
763 + const uint8* u_buf,
764 + const uint8* v_buf,
770 // Doubler does 4 pixels at a time. Each pixel is replicated.
771 // This is the fastest of the scalers.
772 -void DoubleYUVToRGB32Row(const uint8* y_buf,
773 - const uint8* u_buf,
774 - const uint8* v_buf,
777 +// Only defined on Windows x86-32.
778 +void DoubleYUVToRGB32Row_SSE(const uint8* y_buf,
779 + const uint8* u_buf,
780 + const uint8* v_buf,
784 // Handles arbitrary scaling up or down.
785 // Mirroring is supported, but not 90 or 270 degree rotation.
786 // Chroma is under sampled every 2 pixels for performance.
787 void ScaleYUVToRGB32Row(const uint8* y_buf,
794 +void ScaleYUVToRGB32Row(const uint8* y_buf,
795 + const uint8* u_buf,
796 + const uint8* v_buf,
801 +void ScaleYUVToRGB32Row_C(const uint8* y_buf,
802 + const uint8* u_buf,
803 + const uint8* v_buf,
808 // Handles arbitrary scaling up or down with bilinear filtering.
809 // Mirroring is supported, but not 90 or 270 degree rotation.
810 // Chroma is under sampled every 2 pixels for performance.
811 // This is the slowest of the scalers.
812 void LinearScaleYUVToRGB32Row(const uint8* y_buf,
819 +void LinearScaleYUVToRGB32Row(const uint8* y_buf,
820 + const uint8* u_buf,
821 + const uint8* v_buf,
826 +void LinearScaleYUVToRGB32Row_C(const uint8* y_buf,
827 + const uint8* u_buf,
828 + const uint8* v_buf,
834 #if defined(_MSC_VER)
835 #define SIMD_ALIGNED(var) __declspec(align(16)) var
837 #define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
839 extern SIMD_ALIGNED(int16 kCoefficientsRgbY[768][4]);
841 -// Method to force C version.
843 -//#define USE_SSE2 0
845 -#if !defined(USE_MMX)
846 -// Windows, Mac and Linux/BSD use MMX
847 -#if defined(__MMX__) || defined(_MSC_VER)
854 -#if !defined(USE_SSE2)
855 -#if defined(__SSE2__) || defined(ARCH_CPU_X86_64) || _M_IX86_FP==2
862 // x64 uses MMX2 (SSE) so emms is not required.
863 // Warning C4799: function has no EMMS instruction.
864 // EMMS() is slow and should be called by the calling function once per image.
865 -#if USE_MMX && !defined(ARCH_CPU_X86_64)
866 +#if defined(ARCH_CPU_X86) && !defined(ARCH_CPU_X86_64)
867 #if defined(_MSC_VER)
868 #define EMMS() __asm emms
869 #pragma warning(disable: 4799)
871 #define EMMS() asm("emms")
875 diff --git a/gfx/ycbcr/yuv_row_c.cpp b/gfx/ycbcr/yuv_row_c.cpp
876 --- a/gfx/ycbcr/yuv_row_c.cpp
877 +++ b/gfx/ycbcr/yuv_row_c.cpp
879 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
880 // Use of this source code is governed by a BSD-style license that can be
881 // found in the LICENSE file.
883 -#include "media/base/yuv_row.h"
886 -#include "base/logging.h"
888 +#include "yuv_row.h"
895 -#if USE_SSE2 && defined(ARCH_CPU_X86_64)
897 -// AMD64 ABI uses register paremters.
898 -void FastConvertYUVToRGB32Row(const uint8* y_buf, // rdi
899 - const uint8* u_buf, // rsi
900 - const uint8* v_buf, // rdx
901 - uint8* rgb_buf, // rcx
906 - "movzb (%1),%%r10\n"
908 - "movzb (%2),%%r11\n"
910 - "movq 2048(%5,%%r10,8),%%xmm0\n"
911 - "movzb (%0),%%r10\n"
912 - "movq 4096(%5,%%r11,8),%%xmm1\n"
913 - "movzb 0x1(%0),%%r11\n"
914 - "paddsw %%xmm1,%%xmm0\n"
915 - "movq (%5,%%r10,8),%%xmm2\n"
917 - "movq (%5,%%r11,8),%%xmm3\n"
918 - "paddsw %%xmm0,%%xmm2\n"
919 - "paddsw %%xmm0,%%xmm3\n"
920 - "shufps $0x44,%%xmm3,%%xmm2\n"
921 - "psraw $0x6,%%xmm2\n"
922 - "packuswb %%xmm2,%%xmm2\n"
923 - "movq %%xmm2,0x0(%3)\n"
927 - "jns convertloop\n"
933 - "movzb (%1),%%r10\n"
934 - "movq 2048(%5,%%r10,8),%%xmm0\n"
935 - "movzb (%2),%%r10\n"
936 - "movq 4096(%5,%%r10,8),%%xmm1\n"
937 - "paddsw %%xmm1,%%xmm0\n"
938 - "movzb (%0),%%r10\n"
939 - "movq (%5,%%r10,8),%%xmm1\n"
940 - "paddsw %%xmm0,%%xmm1\n"
941 - "psraw $0x6,%%xmm1\n"
942 - "packuswb %%xmm1,%%xmm1\n"
943 - "movd %%xmm1,0x0(%3)\n"
946 - : "r"(y_buf), // %0
949 - "r"(rgb_buf), // %3
951 - "r" (kCoefficientsRgbY) // %5
952 - : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
956 -void ScaleYUVToRGB32Row(const uint8* y_buf, // rdi
957 - const uint8* u_buf, // rsi
958 - const uint8* v_buf, // rdx
959 - uint8* rgb_buf, // rcx
961 - int source_dx) { // r9
963 - "xor %%r11,%%r11\n"
968 - "mov %%r11,%%r10\n"
969 - "sar $0x11,%%r10\n"
970 - "movzb (%1,%%r10,1),%%rax\n"
971 - "movq 2048(%5,%%rax,8),%%xmm0\n"
972 - "movzb (%2,%%r10,1),%%rax\n"
973 - "movq 4096(%5,%%rax,8),%%xmm1\n"
974 - "lea (%%r11,%6),%%r10\n"
975 - "sar $0x10,%%r11\n"
976 - "movzb (%0,%%r11,1),%%rax\n"
977 - "paddsw %%xmm1,%%xmm0\n"
978 - "movq (%5,%%rax,8),%%xmm1\n"
979 - "lea (%%r10,%6),%%r11\n"
980 - "sar $0x10,%%r10\n"
981 - "movzb (%0,%%r10,1),%%rax\n"
982 - "movq (%5,%%rax,8),%%xmm2\n"
983 - "paddsw %%xmm0,%%xmm1\n"
984 - "paddsw %%xmm0,%%xmm2\n"
985 - "shufps $0x44,%%xmm2,%%xmm1\n"
986 - "psraw $0x6,%%xmm1\n"
987 - "packuswb %%xmm1,%%xmm1\n"
988 - "movq %%xmm1,0x0(%3)\n"
997 - "mov %%r11,%%r10\n"
998 - "sar $0x11,%%r10\n"
999 - "movzb (%1,%%r10,1),%%rax\n"
1000 - "movq 2048(%5,%%rax,8),%%xmm0\n"
1001 - "movzb (%2,%%r10,1),%%rax\n"
1002 - "movq 4096(%5,%%rax,8),%%xmm1\n"
1003 - "paddsw %%xmm1,%%xmm0\n"
1004 - "sar $0x10,%%r11\n"
1005 - "movzb (%0,%%r11,1),%%rax\n"
1006 - "movq (%5,%%rax,8),%%xmm1\n"
1007 - "paddsw %%xmm0,%%xmm1\n"
1008 - "psraw $0x6,%%xmm1\n"
1009 - "packuswb %%xmm1,%%xmm1\n"
1010 - "movd %%xmm1,0x0(%3)\n"
1014 - : "r"(y_buf), // %0
1017 - "r"(rgb_buf), // %3
1019 - "r" (kCoefficientsRgbY), // %5
1020 - "r"(static_cast<long>(source_dx)) // %6
1021 - : "memory", "r10", "r11", "rax", "xmm0", "xmm1", "xmm2"
1025 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,
1026 - const uint8* u_buf,
1027 - const uint8* v_buf,
1032 - "xor %%r11,%%r11\n" // x = 0
1034 - "js .lscalenext\n"
1035 - "cmp $0x20000,%6\n" // if source_dx >= 2.0
1036 - "jl .lscalehalf\n"
1037 - "mov $0x8000,%%r11\n" // x = 0.5 for 1/2 or less
1041 - "mov %%r11,%%r10\n"
1042 - "sar $0x11,%%r10\n"
1044 - "movzb (%1, %%r10, 1), %%r13 \n"
1045 - "movzb 1(%1, %%r10, 1), %%r14 \n"
1046 - "mov %%r11, %%rax \n"
1047 - "and $0x1fffe, %%rax \n"
1048 - "imul %%rax, %%r14 \n"
1049 - "xor $0x1fffe, %%rax \n"
1050 - "imul %%rax, %%r13 \n"
1051 - "add %%r14, %%r13 \n"
1052 - "shr $17, %%r13 \n"
1053 - "movq 2048(%5,%%r13,8), %%xmm0\n"
1055 - "movzb (%2, %%r10, 1), %%r13 \n"
1056 - "movzb 1(%2, %%r10, 1), %%r14 \n"
1057 - "mov %%r11, %%rax \n"
1058 - "and $0x1fffe, %%rax \n"
1059 - "imul %%rax, %%r14 \n"
1060 - "xor $0x1fffe, %%rax \n"
1061 - "imul %%rax, %%r13 \n"
1062 - "add %%r14, %%r13 \n"
1063 - "shr $17, %%r13 \n"
1064 - "movq 4096(%5,%%r13,8), %%xmm1\n"
1066 - "mov %%r11, %%rax \n"
1067 - "lea (%%r11,%6),%%r10\n"
1068 - "sar $0x10,%%r11\n"
1069 - "paddsw %%xmm1,%%xmm0\n"
1071 - "movzb (%0, %%r11, 1), %%r13 \n"
1072 - "movzb 1(%0, %%r11, 1), %%r14 \n"
1073 - "and $0xffff, %%rax \n"
1074 - "imul %%rax, %%r14 \n"
1075 - "xor $0xffff, %%rax \n"
1076 - "imul %%rax, %%r13 \n"
1077 - "add %%r14, %%r13 \n"
1078 - "shr $16, %%r13 \n"
1079 - "movq (%5,%%r13,8),%%xmm1\n"
1081 - "mov %%r10, %%rax \n"
1082 - "lea (%%r10,%6),%%r11\n"
1083 - "sar $0x10,%%r10\n"
1085 - "movzb (%0,%%r10,1), %%r13 \n"
1086 - "movzb 1(%0,%%r10,1), %%r14 \n"
1087 - "and $0xffff, %%rax \n"
1088 - "imul %%rax, %%r14 \n"
1089 - "xor $0xffff, %%rax \n"
1090 - "imul %%rax, %%r13 \n"
1091 - "add %%r14, %%r13 \n"
1092 - "shr $16, %%r13 \n"
1093 - "movq (%5,%%r13,8),%%xmm2\n"
1095 - "paddsw %%xmm0,%%xmm1\n"
1096 - "paddsw %%xmm0,%%xmm2\n"
1097 - "shufps $0x44,%%xmm2,%%xmm1\n"
1098 - "psraw $0x6,%%xmm1\n"
1099 - "packuswb %%xmm1,%%xmm1\n"
1100 - "movq %%xmm1,0x0(%3)\n"
1103 - "jns .lscaleloop\n"
1107 - "js .lscaledone\n"
1109 - "mov %%r11,%%r10\n"
1110 - "sar $0x11,%%r10\n"
1112 - "movzb (%1,%%r10,1), %%r13 \n"
1113 - "movq 2048(%5,%%r13,8),%%xmm0\n"
1115 - "movzb (%2,%%r10,1), %%r13 \n"
1116 - "movq 4096(%5,%%r13,8),%%xmm1\n"
1118 - "paddsw %%xmm1,%%xmm0\n"
1119 - "sar $0x10,%%r11\n"
1121 - "movzb (%0,%%r11,1), %%r13 \n"
1122 - "movq (%5,%%r13,8),%%xmm1\n"
1124 - "paddsw %%xmm0,%%xmm1\n"
1125 - "psraw $0x6,%%xmm1\n"
1126 - "packuswb %%xmm1,%%xmm1\n"
1127 - "movd %%xmm1,0x0(%3)\n"
1131 - : "r"(y_buf), // %0
1134 - "r"(rgb_buf), // %3
1136 - "r" (kCoefficientsRgbY), // %5
1137 - "r"(static_cast<long>(source_dx)) // %6
1138 - : "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2"
1142 -#elif USE_MMX && !defined(ARCH_CPU_X86_64) && !defined(__PIC__)
1144 -// PIC version is slower because less registers are available, so
1145 -// non-PIC is used on platforms where it is possible.
1147 -void FastConvertYUVToRGB32Row(const uint8* y_buf,
1148 - const uint8* u_buf,
1149 - const uint8* v_buf,
1154 - ".global FastConvertYUVToRGB32Row\n"
1155 -"FastConvertYUVToRGB32Row:\n"
1157 - "mov 0x24(%esp),%edx\n"
1158 - "mov 0x28(%esp),%edi\n"
1159 - "mov 0x2c(%esp),%esi\n"
1160 - "mov 0x30(%esp),%ebp\n"
1161 - "mov 0x34(%esp),%ecx\n"
1162 - "jmp convertend\n"
1165 - "movzbl (%edi),%eax\n"
1167 - "movzbl (%esi),%ebx\n"
1169 - "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
1170 - "movzbl (%edx),%eax\n"
1171 - "paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
1172 - "movzbl 0x1(%edx),%ebx\n"
1173 - "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
1175 - "movq kCoefficientsRgbY(,%ebx,8),%mm2\n"
1176 - "paddsw %mm0,%mm1\n"
1177 - "paddsw %mm0,%mm2\n"
1178 - "psraw $0x6,%mm1\n"
1179 - "psraw $0x6,%mm2\n"
1180 - "packuswb %mm2,%mm1\n"
1181 - "movntq %mm1,0x0(%ebp)\n"
1185 - "jns convertloop\n"
1188 - "je convertdone\n"
1190 - "movzbl (%edi),%eax\n"
1191 - "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
1192 - "movzbl (%esi),%eax\n"
1193 - "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
1194 - "movzbl (%edx),%eax\n"
1195 - "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
1196 - "paddsw %mm0,%mm1\n"
1197 - "psraw $0x6,%mm1\n"
1198 - "packuswb %mm1,%mm1\n"
1199 - "movd %mm1,0x0(%ebp)\n"
1206 -void ScaleYUVToRGB32Row(const uint8* y_buf,
1207 - const uint8* u_buf,
1208 - const uint8* v_buf,
1214 - ".global ScaleYUVToRGB32Row\n"
1215 -"ScaleYUVToRGB32Row:\n"
1217 - "mov 0x24(%esp),%edx\n"
1218 - "mov 0x28(%esp),%edi\n"
1219 - "mov 0x2c(%esp),%esi\n"
1220 - "mov 0x30(%esp),%ebp\n"
1221 - "mov 0x34(%esp),%ecx\n"
1227 - "sar $0x11,%eax\n"
1228 - "movzbl (%edi,%eax,1),%eax\n"
1229 - "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
1231 - "sar $0x11,%eax\n"
1232 - "movzbl (%esi,%eax,1),%eax\n"
1233 - "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
1235 - "add 0x38(%esp),%ebx\n"
1236 - "sar $0x10,%eax\n"
1237 - "movzbl (%edx,%eax,1),%eax\n"
1238 - "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
1240 - "add 0x38(%esp),%ebx\n"
1241 - "sar $0x10,%eax\n"
1242 - "movzbl (%edx,%eax,1),%eax\n"
1243 - "movq kCoefficientsRgbY(,%eax,8),%mm2\n"
1244 - "paddsw %mm0,%mm1\n"
1245 - "paddsw %mm0,%mm2\n"
1246 - "psraw $0x6,%mm1\n"
1247 - "psraw $0x6,%mm2\n"
1248 - "packuswb %mm2,%mm1\n"
1249 - "movntq %mm1,0x0(%ebp)\n"
1259 - "sar $0x11,%eax\n"
1260 - "movzbl (%edi,%eax,1),%eax\n"
1261 - "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
1263 - "sar $0x11,%eax\n"
1264 - "movzbl (%esi,%eax,1),%eax\n"
1265 - "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
1267 - "sar $0x10,%eax\n"
1268 - "movzbl (%edx,%eax,1),%eax\n"
1269 - "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
1270 - "paddsw %mm0,%mm1\n"
1271 - "psraw $0x6,%mm1\n"
1272 - "packuswb %mm1,%mm1\n"
1273 - "movd %mm1,0x0(%ebp)\n"
1280 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,
1281 - const uint8* u_buf,
1282 - const uint8* v_buf,
1288 - ".global LinearScaleYUVToRGB32Row\n"
1289 -"LinearScaleYUVToRGB32Row:\n"
1291 - "mov 0x24(%esp),%edx\n"
1292 - "mov 0x28(%esp),%edi\n"
1293 - "mov 0x30(%esp),%ebp\n"
1295 - // source_width = width * source_dx + ebx
1296 - "mov 0x34(%esp), %ecx\n"
1297 - "imull 0x38(%esp), %ecx\n"
1298 - "mov %ecx, 0x34(%esp)\n"
1300 - "mov 0x38(%esp), %ecx\n"
1301 - "xor %ebx,%ebx\n" // x = 0
1302 - "cmp $0x20000,%ecx\n" // if source_dx >= 2.0
1304 - "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less
1305 - "jmp .lscaleend\n"
1309 - "sar $0x11,%eax\n"
1311 - "movzbl (%edi,%eax,1),%ecx\n"
1312 - "movzbl 1(%edi,%eax,1),%esi\n"
1314 - "andl $0x1fffe, %eax \n"
1315 - "imul %eax, %esi \n"
1316 - "xorl $0x1fffe, %eax \n"
1317 - "imul %eax, %ecx \n"
1318 - "addl %esi, %ecx \n"
1319 - "shrl $17, %ecx \n"
1320 - "movq kCoefficientsRgbY+2048(,%ecx,8),%mm0\n"
1322 - "mov 0x2c(%esp),%esi\n"
1324 - "sar $0x11,%eax\n"
1326 - "movzbl (%esi,%eax,1),%ecx\n"
1327 - "movzbl 1(%esi,%eax,1),%esi\n"
1329 - "andl $0x1fffe, %eax \n"
1330 - "imul %eax, %esi \n"
1331 - "xorl $0x1fffe, %eax \n"
1332 - "imul %eax, %ecx \n"
1333 - "addl %esi, %ecx \n"
1334 - "shrl $17, %ecx \n"
1335 - "paddsw kCoefficientsRgbY+4096(,%ecx,8),%mm0\n"
1338 - "sar $0x10,%eax\n"
1339 - "movzbl (%edx,%eax,1),%ecx\n"
1340 - "movzbl 1(%edx,%eax,1),%esi\n"
1342 - "add 0x38(%esp),%ebx\n"
1343 - "andl $0xffff, %eax \n"
1344 - "imul %eax, %esi \n"
1345 - "xorl $0xffff, %eax \n"
1346 - "imul %eax, %ecx \n"
1347 - "addl %esi, %ecx \n"
1348 - "shrl $16, %ecx \n"
1349 - "movq kCoefficientsRgbY(,%ecx,8),%mm1\n"
1351 - "cmp 0x34(%esp), %ebx\n"
1352 - "jge .lscalelastpixel\n"
1355 - "sar $0x10,%eax\n"
1356 - "movzbl (%edx,%eax,1),%ecx\n"
1357 - "movzbl 1(%edx,%eax,1),%esi\n"
1359 - "add 0x38(%esp),%ebx\n"
1360 - "andl $0xffff, %eax \n"
1361 - "imul %eax, %esi \n"
1362 - "xorl $0xffff, %eax \n"
1363 - "imul %eax, %ecx \n"
1364 - "addl %esi, %ecx \n"
1365 - "shrl $16, %ecx \n"
1366 - "movq kCoefficientsRgbY(,%ecx,8),%mm2\n"
1368 - "paddsw %mm0,%mm1\n"
1369 - "paddsw %mm0,%mm2\n"
1370 - "psraw $0x6,%mm1\n"
1371 - "psraw $0x6,%mm2\n"
1372 - "packuswb %mm2,%mm1\n"
1373 - "movntq %mm1,0x0(%ebp)\n"
1377 - "cmp 0x34(%esp), %ebx\n"
1378 - "jl .lscaleloop\n"
1382 -".lscalelastpixel:"
1383 - "paddsw %mm0, %mm1\n"
1384 - "psraw $6, %mm1\n"
1385 - "packuswb %mm1, %mm1\n"
1386 - "movd %mm1, (%ebp)\n"
1391 -#elif USE_MMX && !defined(ARCH_CPU_X86_64) && defined(__PIC__)
1393 -extern void PICConvertYUVToRGB32Row(const uint8* y_buf,
1394 - const uint8* u_buf,
1395 - const uint8* v_buf,
1398 - int16 *kCoefficientsRgbY);
1401 -#if defined(OS_MACOSX)
1402 -"_PICConvertYUVToRGB32Row:\n"
1404 -"PICConvertYUVToRGB32Row:\n"
1407 - "mov 0x24(%esp),%edx\n"
1408 - "mov 0x28(%esp),%edi\n"
1409 - "mov 0x2c(%esp),%esi\n"
1410 - "mov 0x30(%esp),%ebp\n"
1411 - "mov 0x38(%esp),%ecx\n"
1413 - "jmp .Lconvertend\n"
1416 - "movzbl (%edi),%eax\n"
1418 - "movzbl (%esi),%ebx\n"
1420 - "movq 2048(%ecx,%eax,8),%mm0\n"
1421 - "movzbl (%edx),%eax\n"
1422 - "paddsw 4096(%ecx,%ebx,8),%mm0\n"
1423 - "movzbl 0x1(%edx),%ebx\n"
1424 - "movq 0(%ecx,%eax,8),%mm1\n"
1426 - "movq 0(%ecx,%ebx,8),%mm2\n"
1427 - "paddsw %mm0,%mm1\n"
1428 - "paddsw %mm0,%mm2\n"
1429 - "psraw $0x6,%mm1\n"
1430 - "psraw $0x6,%mm2\n"
1431 - "packuswb %mm2,%mm1\n"
1432 - "movntq %mm1,0x0(%ebp)\n"
1435 - "subl $0x2,0x34(%esp)\n"
1436 - "jns .Lconvertloop\n"
1438 - "andl $0x1,0x34(%esp)\n"
1439 - "je .Lconvertdone\n"
1441 - "movzbl (%edi),%eax\n"
1442 - "movq 2048(%ecx,%eax,8),%mm0\n"
1443 - "movzbl (%esi),%eax\n"
1444 - "paddsw 4096(%ecx,%eax,8),%mm0\n"
1445 - "movzbl (%edx),%eax\n"
1446 - "movq 0(%ecx,%eax,8),%mm1\n"
1447 - "paddsw %mm0,%mm1\n"
1448 - "psraw $0x6,%mm1\n"
1449 - "packuswb %mm1,%mm1\n"
1450 - "movd %mm1,0x0(%ebp)\n"
1456 -void FastConvertYUVToRGB32Row(const uint8* y_buf,
1457 - const uint8* u_buf,
1458 - const uint8* v_buf,
1461 - PICConvertYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width,
1462 - &kCoefficientsRgbY[0][0]);
1465 -extern void PICScaleYUVToRGB32Row(const uint8* y_buf,
1466 - const uint8* u_buf,
1467 - const uint8* v_buf,
1471 - int16 *kCoefficientsRgbY);
1475 -#if defined(OS_MACOSX)
1476 -"_PICScaleYUVToRGB32Row:\n"
1478 -"PICScaleYUVToRGB32Row:\n"
1481 - "mov 0x24(%esp),%edx\n"
1482 - "mov 0x28(%esp),%edi\n"
1483 - "mov 0x2c(%esp),%esi\n"
1484 - "mov 0x30(%esp),%ebp\n"
1485 - "mov 0x3c(%esp),%ecx\n"
1491 - "sar $0x11,%eax\n"
1492 - "movzbl (%edi,%eax,1),%eax\n"
1493 - "movq 2048(%ecx,%eax,8),%mm0\n"
1495 - "sar $0x11,%eax\n"
1496 - "movzbl (%esi,%eax,1),%eax\n"
1497 - "paddsw 4096(%ecx,%eax,8),%mm0\n"
1499 - "add 0x38(%esp),%ebx\n"
1500 - "sar $0x10,%eax\n"
1501 - "movzbl (%edx,%eax,1),%eax\n"
1502 - "movq 0(%ecx,%eax,8),%mm1\n"
1504 - "add 0x38(%esp),%ebx\n"
1505 - "sar $0x10,%eax\n"
1506 - "movzbl (%edx,%eax,1),%eax\n"
1507 - "movq 0(%ecx,%eax,8),%mm2\n"
1508 - "paddsw %mm0,%mm1\n"
1509 - "paddsw %mm0,%mm2\n"
1510 - "psraw $0x6,%mm1\n"
1511 - "psraw $0x6,%mm2\n"
1512 - "packuswb %mm2,%mm1\n"
1513 - "movntq %mm1,0x0(%ebp)\n"
1516 - "subl $0x2,0x34(%esp)\n"
1517 - "jns Lscaleloop\n"
1519 - "andl $0x1,0x34(%esp)\n"
1523 - "sar $0x11,%eax\n"
1524 - "movzbl (%edi,%eax,1),%eax\n"
1525 - "movq 2048(%ecx,%eax,8),%mm0\n"
1527 - "sar $0x11,%eax\n"
1528 - "movzbl (%esi,%eax,1),%eax\n"
1529 - "paddsw 4096(%ecx,%eax,8),%mm0\n"
1531 - "sar $0x10,%eax\n"
1532 - "movzbl (%edx,%eax,1),%eax\n"
1533 - "movq 0(%ecx,%eax,8),%mm1\n"
1534 - "paddsw %mm0,%mm1\n"
1535 - "psraw $0x6,%mm1\n"
1536 - "packuswb %mm1,%mm1\n"
1537 - "movd %mm1,0x0(%ebp)\n"
1545 -void ScaleYUVToRGB32Row(const uint8* y_buf,
1546 - const uint8* u_buf,
1547 - const uint8* v_buf,
1551 - PICScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
1552 - &kCoefficientsRgbY[0][0]);
1555 -void PICLinearScaleYUVToRGB32Row(const uint8* y_buf,
1556 - const uint8* u_buf,
1557 - const uint8* v_buf,
1561 - int16 *kCoefficientsRgbY);
1564 -#if defined(OS_MACOSX)
1565 -"_PICLinearScaleYUVToRGB32Row:\n"
1567 -"PICLinearScaleYUVToRGB32Row:\n"
1570 - "mov 0x24(%esp),%edx\n"
1571 - "mov 0x30(%esp),%ebp\n"
1572 - "mov 0x34(%esp),%ecx\n"
1573 - "mov 0x3c(%esp),%edi\n"
1576 - // source_width = width * source_dx + ebx
1577 - "mov 0x34(%esp), %ecx\n"
1578 - "imull 0x38(%esp), %ecx\n"
1579 - "mov %ecx, 0x34(%esp)\n"
1581 - "mov 0x38(%esp), %ecx\n"
1582 - "xor %ebx,%ebx\n" // x = 0
1583 - "cmp $0x20000,%ecx\n" // if source_dx >= 2.0
1585 - "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less
1586 - "jmp .lscaleend\n"
1589 - "mov 0x28(%esp),%esi\n"
1591 - "sar $0x11,%eax\n"
1593 - "movzbl (%esi,%eax,1),%ecx\n"
1594 - "movzbl 1(%esi,%eax,1),%esi\n"
1596 - "andl $0x1fffe, %eax \n"
1597 - "imul %eax, %esi \n"
1598 - "xorl $0x1fffe, %eax \n"
1599 - "imul %eax, %ecx \n"
1600 - "addl %esi, %ecx \n"
1601 - "shrl $17, %ecx \n"
1602 - "movq 2048(%edi,%ecx,8),%mm0\n"
1604 - "mov 0x2c(%esp),%esi\n"
1606 - "sar $0x11,%eax\n"
1608 - "movzbl (%esi,%eax,1),%ecx\n"
1609 - "movzbl 1(%esi,%eax,1),%esi\n"
1611 - "andl $0x1fffe, %eax \n"
1612 - "imul %eax, %esi \n"
1613 - "xorl $0x1fffe, %eax \n"
1614 - "imul %eax, %ecx \n"
1615 - "addl %esi, %ecx \n"
1616 - "shrl $17, %ecx \n"
1617 - "paddsw 4096(%edi,%ecx,8),%mm0\n"
1620 - "sar $0x10,%eax\n"
1621 - "movzbl (%edx,%eax,1),%ecx\n"
1622 - "movzbl 1(%edx,%eax,1),%esi\n"
1624 - "add 0x38(%esp),%ebx\n"
1625 - "andl $0xffff, %eax \n"
1626 - "imul %eax, %esi \n"
1627 - "xorl $0xffff, %eax \n"
1628 - "imul %eax, %ecx \n"
1629 - "addl %esi, %ecx \n"
1630 - "shrl $16, %ecx \n"
1631 - "movq (%edi,%ecx,8),%mm1\n"
1633 - "cmp 0x34(%esp), %ebx\n"
1634 - "jge .lscalelastpixel\n"
1637 - "sar $0x10,%eax\n"
1638 - "movzbl (%edx,%eax,1),%ecx\n"
1639 - "movzbl 1(%edx,%eax,1),%esi\n"
1641 - "add 0x38(%esp),%ebx\n"
1642 - "andl $0xffff, %eax \n"
1643 - "imul %eax, %esi \n"
1644 - "xorl $0xffff, %eax \n"
1645 - "imul %eax, %ecx \n"
1646 - "addl %esi, %ecx \n"
1647 - "shrl $16, %ecx \n"
1648 - "movq (%edi,%ecx,8),%mm2\n"
1650 - "paddsw %mm0,%mm1\n"
1651 - "paddsw %mm0,%mm2\n"
1652 - "psraw $0x6,%mm1\n"
1653 - "psraw $0x6,%mm2\n"
1654 - "packuswb %mm2,%mm1\n"
1655 - "movntq %mm1,0x0(%ebp)\n"
1659 - "cmp %ebx, 0x34(%esp)\n"
1660 - "jg .lscaleloop\n"
1664 -".lscalelastpixel:"
1665 - "paddsw %mm0, %mm1\n"
1666 - "psraw $6, %mm1\n"
1667 - "packuswb %mm1, %mm1\n"
1668 - "movd %mm1, (%ebp)\n"
1673 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,
1674 - const uint8* u_buf,
1675 - const uint8* v_buf,
1679 - PICLinearScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
1680 - &kCoefficientsRgbY[0][0]);
1685 // C reference code that mimic the YUV assembly.
1686 #define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
1687 #define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
1688 (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
1690 static inline void YuvPixel(uint8 y,
1693 @@ -833,66 +39,71 @@ static inline void YuvPixel(uint8 y,
1696 *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) |
1697 (packuswb(g) << 8) |
1698 (packuswb(r) << 16) |
1699 (packuswb(a) << 24);
1702 -void FastConvertYUVToRGB32Row(const uint8* y_buf,
1703 - const uint8* u_buf,
1704 - const uint8* v_buf,
1707 +void FastConvertYUVToRGB32Row_C(const uint8* y_buf,
1708 + const uint8* u_buf,
1709 + const uint8* v_buf,
1712 + unsigned int x_shift) {
1713 for (int x = 0; x < width; x += 2) {
1714 - uint8 u = u_buf[x >> 1];
1715 - uint8 v = v_buf[x >> 1];
1716 + uint8 u = u_buf[x >> x_shift];
1717 + uint8 v = v_buf[x >> x_shift];
1718 uint8 y0 = y_buf[x];
1719 YuvPixel(y0, u, v, rgb_buf);
1720 if ((x + 1) < width) {
1721 uint8 y1 = y_buf[x + 1];
1722 + if (x_shift == 0) {
1726 YuvPixel(y1, u, v, rgb_buf + 4);
1728 rgb_buf += 8; // Advance 2 pixels.
1732 // 16.16 fixed point is used. A shift by 16 isolates the integer.
1733 // A shift by 17 is used to further subsample the chrominence channels.
1734 // & 0xffff isolates the fixed point fraction. >> 2 to get the upper 2 bits,
1735 // for 1/65536 pixel accurate interpolation.
1736 -void ScaleYUVToRGB32Row(const uint8* y_buf,
1737 - const uint8* u_buf,
1738 - const uint8* v_buf,
1742 +void ScaleYUVToRGB32Row_C(const uint8* y_buf,
1743 + const uint8* u_buf,
1744 + const uint8* v_buf,
1749 for (int i = 0; i < width; i += 2) {
1750 int y = y_buf[x >> 16];
1751 int u = u_buf[(x >> 17)];
1752 int v = v_buf[(x >> 17)];
1753 YuvPixel(y, u, v, rgb_buf);
1755 if ((i + 1) < width) {
1757 YuvPixel(y, u, v, rgb_buf+4);
1764 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,
1765 - const uint8* u_buf,
1766 - const uint8* v_buf,
1770 +void LinearScaleYUVToRGB32Row_C(const uint8* y_buf,
1771 + const uint8* u_buf,
1772 + const uint8* v_buf,
1777 if (source_dx >= 0x20000) {
1780 for (int i = 0; i < width; i += 2) {
1781 int y0 = y_buf[x >> 16];
1782 int y1 = y_buf[(x >> 16) + 1];
1783 int u0 = u_buf[(x >> 17)];
1784 @@ -913,11 +124,10 @@ void LinearScaleYUVToRGB32Row(const uint
1785 y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
1786 YuvPixel(y, u, v, rgb_buf+4);
1796 diff --git a/gfx/ycbcr/yuv_row_posix.cpp b/gfx/ycbcr/yuv_row_posix.cpp
1797 --- a/gfx/ycbcr/yuv_row_posix.cpp
1798 +++ b/gfx/ycbcr/yuv_row_posix.cpp
1800 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
1801 // Use of this source code is governed by a BSD-style license that can be
1802 // found in the LICENSE file.
1804 -#include "media/base/yuv_row.h"
1807 -#include "base/logging.h"
1809 +#include "yuv_row.h"
1810 +#include "mozilla/SSE.h"
1817 -#if USE_SSE2 && defined(ARCH_CPU_X86_64)
1818 +#if defined(ARCH_CPU_X86_64)
1820 +// We don't need CPUID guards here, since x86-64 implies SSE2.
1822 // AMD64 ABI uses register paremters.
1823 void FastConvertYUVToRGB32Row(const uint8* y_buf, // rdi
1824 const uint8* u_buf, // rsi
1825 const uint8* v_buf, // rdx
1826 uint8* rgb_buf, // rcx
1829 - "jmp convertend\n"
1833 "movzb (%1),%%r10\n"
1835 "movzb (%2),%%r11\n"
1837 "movq 2048(%5,%%r10,8),%%xmm0\n"
1838 "movzb (%0),%%r10\n"
1839 "movq 4096(%5,%%r11,8),%%xmm1\n"
1840 "movzb 0x1(%0),%%r11\n"
1841 @@ -37,36 +36,36 @@ void FastConvertYUVToRGB32Row(const uint
1842 "movq (%5,%%r11,8),%%xmm3\n"
1843 "paddsw %%xmm0,%%xmm2\n"
1844 "paddsw %%xmm0,%%xmm3\n"
1845 "shufps $0x44,%%xmm3,%%xmm2\n"
1846 "psraw $0x6,%%xmm2\n"
1847 "packuswb %%xmm2,%%xmm2\n"
1848 "movq %%xmm2,0x0(%3)\n"
1853 - "jns convertloop\n"
1860 - "js convertdone\n"
1863 "movzb (%1),%%r10\n"
1864 "movq 2048(%5,%%r10,8),%%xmm0\n"
1865 "movzb (%2),%%r10\n"
1866 "movq 4096(%5,%%r10,8),%%xmm1\n"
1867 "paddsw %%xmm1,%%xmm0\n"
1868 "movzb (%0),%%r10\n"
1869 "movq (%5,%%r10,8),%%xmm1\n"
1870 "paddsw %%xmm0,%%xmm1\n"
1871 "psraw $0x6,%%xmm1\n"
1872 "packuswb %%xmm1,%%xmm1\n"
1873 "movd %%xmm1,0x0(%3)\n"
1882 "r" (kCoefficientsRgbY) // %5
1883 : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
1884 @@ -77,19 +76,19 @@ void ScaleYUVToRGB32Row(const uint8* y_b
1885 const uint8* u_buf, // rsi
1886 const uint8* v_buf, // rdx
1887 uint8* rgb_buf, // rcx
1889 int source_dx) { // r9
1901 "movzb (%1,%%r10,1),%%rax\n"
1902 "movq 2048(%5,%%rax,8),%%xmm0\n"
1903 "movzb (%2,%%r10,1),%%rax\n"
1904 "movq 4096(%5,%%rax,8),%%xmm1\n"
1905 "lea (%%r11,%6),%%r10\n"
1907 @@ -103,38 +102,38 @@ void ScaleYUVToRGB32Row(const uint8* y_b
1908 "paddsw %%xmm0,%%xmm1\n"
1909 "paddsw %%xmm0,%%xmm2\n"
1910 "shufps $0x44,%%xmm2,%%xmm1\n"
1911 "psraw $0x6,%%xmm1\n"
1912 "packuswb %%xmm1,%%xmm1\n"
1913 "movq %%xmm1,0x0(%3)\n"
1928 "movzb (%1,%%r10,1),%%rax\n"
1929 "movq 2048(%5,%%rax,8),%%xmm0\n"
1930 "movzb (%2,%%r10,1),%%rax\n"
1931 "movq 4096(%5,%%rax,8),%%xmm1\n"
1932 "paddsw %%xmm1,%%xmm0\n"
1934 "movzb (%0,%%r11,1),%%rax\n"
1935 "movq (%5,%%rax,8),%%xmm1\n"
1936 "paddsw %%xmm0,%%xmm1\n"
1937 "psraw $0x6,%%xmm1\n"
1938 "packuswb %%xmm1,%%xmm1\n"
1939 "movd %%xmm1,0x0(%3)\n"
1949 "r" (kCoefficientsRgbY), // %5
1950 "r"(static_cast<long>(source_dx)) // %6
1951 @@ -146,23 +145,23 @@ void LinearScaleYUVToRGB32Row(const uint
1958 "xor %%r11,%%r11\n" // x = 0
1960 - "js .lscalenext\n"
1962 "cmp $0x20000,%6\n" // if source_dx >= 2.0
1963 - "jl .lscalehalf\n"
1965 "mov $0x8000,%%r11\n" // x = 0.5 for 1/2 or less
1975 "movzb (%1, %%r10, 1), %%r13 \n"
1976 "movzb 1(%1, %%r10, 1), %%r14 \n"
1977 "mov %%r11, %%rax \n"
1978 "and $0x1fffe, %%rax \n"
1979 "imul %%rax, %%r14 \n"
1980 @@ -215,21 +214,21 @@ void LinearScaleYUVToRGB32Row(const uint
1981 "paddsw %%xmm0,%%xmm1\n"
1982 "paddsw %%xmm0,%%xmm2\n"
1983 "shufps $0x44,%%xmm2,%%xmm1\n"
1984 "psraw $0x6,%%xmm1\n"
1985 "packuswb %%xmm1,%%xmm1\n"
1986 "movq %%xmm1,0x0(%3)\n"
1989 - "jns .lscaleloop\n"
1996 - "js .lscaledone\n"
2002 "movzb (%1,%%r10,1), %%r13 \n"
2003 "movq 2048(%5,%%r13,8),%%xmm0\n"
2005 "movzb (%2,%%r10,1), %%r13 \n"
2006 @@ -241,52 +240,52 @@ void LinearScaleYUVToRGB32Row(const uint
2007 "movzb (%0,%%r11,1), %%r13 \n"
2008 "movq (%5,%%r13,8),%%xmm1\n"
2010 "paddsw %%xmm0,%%xmm1\n"
2011 "psraw $0x6,%%xmm1\n"
2012 "packuswb %%xmm1,%%xmm1\n"
2013 "movd %%xmm1,0x0(%3)\n"
2023 "r" (kCoefficientsRgbY), // %5
2024 "r"(static_cast<long>(source_dx)) // %6
2025 : "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2"
2029 -#elif USE_MMX && !defined(ARCH_CPU_X86_64) && !defined(__PIC__)
2030 +#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && !defined(__PIC__)
2032 // PIC version is slower because less registers are available, so
2033 // non-PIC is used on platforms where it is possible.
2035 -void FastConvertYUVToRGB32Row(const uint8* y_buf,
2036 - const uint8* u_buf,
2037 - const uint8* v_buf,
2040 +void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf,
2041 + const uint8* u_buf,
2042 + const uint8* v_buf,
2047 - ".global FastConvertYUVToRGB32Row\n"
2048 -"FastConvertYUVToRGB32Row:\n"
2049 + ".global FastConvertYUVToRGB32Row_SSE\n"
2050 + ".type FastConvertYUVToRGB32Row_SSE, @function\n"
2051 +"FastConvertYUVToRGB32Row_SSE:\n"
2053 "mov 0x24(%esp),%edx\n"
2054 "mov 0x28(%esp),%edi\n"
2055 "mov 0x2c(%esp),%esi\n"
2056 "mov 0x30(%esp),%ebp\n"
2057 "mov 0x34(%esp),%ecx\n"
2058 - "jmp convertend\n"
2064 "movzbl (%edi),%eax\n"
2066 "movzbl (%esi),%ebx\n"
2068 "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
2069 "movzbl (%edx),%eax\n"
2070 "paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
2071 "movzbl 0x1(%edx),%ebx\n"
2072 @@ -295,59 +294,77 @@ void FastConvertYUVToRGB32Row(const uint
2073 "movq kCoefficientsRgbY(,%ebx,8),%mm2\n"
2074 "paddsw %mm0,%mm1\n"
2075 "paddsw %mm0,%mm2\n"
2078 "packuswb %mm2,%mm1\n"
2079 "movntq %mm1,0x0(%ebp)\n"
2084 - "jns convertloop\n"
2088 - "je convertdone\n"
2091 "movzbl (%edi),%eax\n"
2092 "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
2093 "movzbl (%esi),%eax\n"
2094 "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
2095 "movzbl (%edx),%eax\n"
2096 "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
2097 "paddsw %mm0,%mm1\n"
2099 "packuswb %mm1,%mm1\n"
2100 "movd %mm1,0x0(%ebp)\n"
2105 +#if !defined(XP_MACOSX)
2111 -void ScaleYUVToRGB32Row(const uint8* y_buf,
2112 - const uint8* u_buf,
2113 - const uint8* v_buf,
2117 +void FastConvertYUVToRGB32Row(const uint8* y_buf,
2118 + const uint8* u_buf,
2119 + const uint8* v_buf,
2123 + if (mozilla::supports_sse()) {
2124 + FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width);
2128 + FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
2132 +void ScaleYUVToRGB32Row_SSE(const uint8* y_buf,
2133 + const uint8* u_buf,
2134 + const uint8* v_buf,
2140 - ".global ScaleYUVToRGB32Row\n"
2141 -"ScaleYUVToRGB32Row:\n"
2142 + ".global ScaleYUVToRGB32Row_SSE\n"
2143 + ".type ScaleYUVToRGB32Row_SSE, @function\n"
2144 +"ScaleYUVToRGB32Row_SSE:\n"
2146 "mov 0x24(%esp),%edx\n"
2147 "mov 0x28(%esp),%edi\n"
2148 "mov 0x2c(%esp),%esi\n"
2149 "mov 0x30(%esp),%ebp\n"
2150 "mov 0x34(%esp),%ecx\n"
2160 "movzbl (%edi,%eax,1),%eax\n"
2161 "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
2164 "movzbl (%esi,%eax,1),%eax\n"
2165 "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
2166 @@ -363,22 +380,22 @@ void ScaleYUVToRGB32Row(const uint8* y_b
2167 "movq kCoefficientsRgbY(,%eax,8),%mm2\n"
2168 "paddsw %mm0,%mm1\n"
2169 "paddsw %mm0,%mm2\n"
2172 "packuswb %mm2,%mm1\n"
2173 "movntq %mm1,0x0(%ebp)\n"
2187 "movzbl (%edi,%eax,1),%eax\n"
2188 "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
2191 "movzbl (%esi,%eax,1),%eax\n"
2192 @@ -387,51 +404,71 @@ void ScaleYUVToRGB32Row(const uint8* y_b
2194 "movzbl (%edx,%eax,1),%eax\n"
2195 "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
2196 "paddsw %mm0,%mm1\n"
2198 "packuswb %mm1,%mm1\n"
2199 "movd %mm1,0x0(%ebp)\n"
2205 +#if !defined(XP_MACOSX)
2210 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,
2211 - const uint8* u_buf,
2212 - const uint8* v_buf,
2216 +void ScaleYUVToRGB32Row(const uint8* y_buf,
2217 + const uint8* u_buf,
2218 + const uint8* v_buf,
2223 + if (mozilla::supports_sse()) {
2224 + ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf,
2225 + width, source_dx);
2228 + ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf,
2229 + width, source_dx);
2232 +void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
2233 + const uint8* u_buf,
2234 + const uint8* v_buf,
2240 - ".global LinearScaleYUVToRGB32Row\n"
2241 -"LinearScaleYUVToRGB32Row:\n"
2242 + ".global LinearScaleYUVToRGB32Row_SSE\n"
2243 + ".type LinearScaleYUVToRGB32Row_SSE, @function\n"
2244 +"LinearScaleYUVToRGB32Row_SSE:\n"
2246 "mov 0x24(%esp),%edx\n"
2247 "mov 0x28(%esp),%edi\n"
2248 "mov 0x30(%esp),%ebp\n"
2250 // source_width = width * source_dx + ebx
2251 "mov 0x34(%esp), %ecx\n"
2252 "imull 0x38(%esp), %ecx\n"
2253 "mov %ecx, 0x34(%esp)\n"
2255 "mov 0x38(%esp), %ecx\n"
2256 "xor %ebx,%ebx\n" // x = 0
2257 "cmp $0x20000,%ecx\n" // if source_dx >= 2.0
2260 "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less
2261 - "jmp .lscaleend\n"
2265 - "sar $0x11,%eax\n"
2270 + "sar $0x11,%eax\n"
2272 "movzbl (%edi,%eax,1),%ecx\n"
2273 "movzbl 1(%edi,%eax,1),%esi\n"
2275 "andl $0x1fffe, %eax \n"
2276 "imul %eax, %esi \n"
2277 "xorl $0x1fffe, %eax \n"
2278 "imul %eax, %ecx \n"
2279 @@ -464,17 +501,17 @@ void LinearScaleYUVToRGB32Row(const uint
2280 "imul %eax, %esi \n"
2281 "xorl $0xffff, %eax \n"
2282 "imul %eax, %ecx \n"
2283 "addl %esi, %ecx \n"
2285 "movq kCoefficientsRgbY(,%ecx,8),%mm1\n"
2287 "cmp 0x34(%esp), %ebx\n"
2288 - "jge .lscalelastpixel\n"
2293 "movzbl (%edx,%eax,1),%ecx\n"
2294 "movzbl 1(%edx,%eax,1),%esi\n"
2296 "add 0x38(%esp),%ebx\n"
2297 "andl $0xffff, %eax \n"
2298 @@ -488,56 +525,76 @@ void LinearScaleYUVToRGB32Row(const uint
2299 "paddsw %mm0,%mm1\n"
2300 "paddsw %mm0,%mm2\n"
2303 "packuswb %mm2,%mm1\n"
2304 "movntq %mm1,0x0(%ebp)\n"
2309 "cmp 0x34(%esp), %ebx\n"
2310 - "jl .lscaleloop\n"
2315 -".lscalelastpixel:"
2317 "paddsw %mm0, %mm1\n"
2319 "packuswb %mm1, %mm1\n"
2320 "movd %mm1, (%ebp)\n"
2323 +#if !defined(XP_MACOSX)
2328 -#elif USE_MMX && !defined(ARCH_CPU_X86_64) && defined(__PIC__)
2330 -extern void PICConvertYUVToRGB32Row(const uint8* y_buf,
2331 - const uint8* u_buf,
2332 - const uint8* v_buf,
2335 - int16 *kCoefficientsRgbY);
2336 +void LinearScaleYUVToRGB32Row(const uint8* y_buf,
2337 + const uint8* u_buf,
2338 + const uint8* v_buf,
2343 + if (mozilla::supports_sse()) {
2344 + LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf,
2345 + width, source_dx);
2348 + LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf,
2349 + width, source_dx);
2352 +#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && defined(__PIC__)
2354 +void PICConvertYUVToRGB32Row_SSE(const uint8* y_buf,
2355 + const uint8* u_buf,
2356 + const uint8* v_buf,
2359 + int16 *kCoefficientsRgbY);
2363 -#if defined(OS_MACOSX)
2364 -"_PICConvertYUVToRGB32Row:\n"
2365 +#if defined(XP_MACOSX)
2366 +"_PICConvertYUVToRGB32Row_SSE:\n"
2368 -"PICConvertYUVToRGB32Row:\n"
2369 +"PICConvertYUVToRGB32Row_SSE:\n"
2372 "mov 0x24(%esp),%edx\n"
2373 "mov 0x28(%esp),%edi\n"
2374 "mov 0x2c(%esp),%esi\n"
2375 "mov 0x30(%esp),%ebp\n"
2376 "mov 0x38(%esp),%ecx\n"
2378 - "jmp .Lconvertend\n"
2384 "movzbl (%edi),%eax\n"
2386 "movzbl (%esi),%ebx\n"
2388 "movq 2048(%ecx,%eax,8),%mm0\n"
2389 "movzbl (%edx),%eax\n"
2390 "paddsw 4096(%ecx,%ebx,8),%mm0\n"
2391 "movzbl 0x1(%edx),%ebx\n"
2392 @@ -546,72 +603,81 @@ extern void PICConvertYUVToRGB32Row(cons
2393 "movq 0(%ecx,%ebx,8),%mm2\n"
2394 "paddsw %mm0,%mm1\n"
2395 "paddsw %mm0,%mm2\n"
2398 "packuswb %mm2,%mm1\n"
2399 "movntq %mm1,0x0(%ebp)\n"
2403 "subl $0x2,0x34(%esp)\n"
2404 - "jns .Lconvertloop\n"
2407 "andl $0x1,0x34(%esp)\n"
2408 - "je .Lconvertdone\n"
2411 "movzbl (%edi),%eax\n"
2412 "movq 2048(%ecx,%eax,8),%mm0\n"
2413 "movzbl (%esi),%eax\n"
2414 "paddsw 4096(%ecx,%eax,8),%mm0\n"
2415 "movzbl (%edx),%eax\n"
2416 "movq 0(%ecx,%eax,8),%mm1\n"
2417 "paddsw %mm0,%mm1\n"
2419 "packuswb %mm1,%mm1\n"
2420 "movd %mm1,0x0(%ebp)\n"
2425 +#if !defined(XP_MACOSX)
2430 void FastConvertYUVToRGB32Row(const uint8* y_buf,
2435 - PICConvertYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width,
2436 - &kCoefficientsRgbY[0][0]);
2439 -extern void PICScaleYUVToRGB32Row(const uint8* y_buf,
2442 + if (mozilla::supports_sse()) {
2443 + PICConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
2444 + &kCoefficientsRgbY[0][0]);
2448 + FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
2451 +void PICScaleYUVToRGB32Row_SSE(const uint8* y_buf,
2457 int16 *kCoefficientsRgbY);
2461 -#if defined(OS_MACOSX)
2462 -"_PICScaleYUVToRGB32Row:\n"
2463 +#if defined(XP_MACOSX)
2464 +"_PICScaleYUVToRGB32Row_SSE:\n"
2466 -"PICScaleYUVToRGB32Row:\n"
2467 +"PICScaleYUVToRGB32Row_SSE:\n"
2470 "mov 0x24(%esp),%edx\n"
2471 "mov 0x28(%esp),%edi\n"
2472 "mov 0x2c(%esp),%esi\n"
2473 "mov 0x30(%esp),%ebp\n"
2474 "mov 0x3c(%esp),%ecx\n"
2484 "movzbl (%edi,%eax,1),%eax\n"
2485 "movq 2048(%ecx,%eax,8),%mm0\n"
2488 "movzbl (%esi,%eax,1),%eax\n"
2489 "paddsw 4096(%ecx,%eax,8),%mm0\n"
2490 @@ -627,22 +693,22 @@ extern void PICScaleYUVToRGB32Row(const
2491 "movq 0(%ecx,%eax,8),%mm2\n"
2492 "paddsw %mm0,%mm1\n"
2493 "paddsw %mm0,%mm2\n"
2496 "packuswb %mm2,%mm1\n"
2497 "movntq %mm1,0x0(%ebp)\n"
2501 "subl $0x2,0x34(%esp)\n"
2502 - "jns Lscaleloop\n"
2505 "andl $0x1,0x34(%esp)\n"
2511 "movzbl (%edi,%eax,1),%eax\n"
2512 "movq 2048(%ecx,%eax,8),%mm0\n"
2515 "movzbl (%esi,%eax,1),%eax\n"
2516 @@ -651,66 +717,75 @@ extern void PICScaleYUVToRGB32Row(const
2518 "movzbl (%edx,%eax,1),%eax\n"
2519 "movq 0(%ecx,%eax,8),%mm1\n"
2520 "paddsw %mm0,%mm1\n"
2522 "packuswb %mm1,%mm1\n"
2523 "movd %mm1,0x0(%ebp)\n"
2529 +#if !defined(XP_MACOSX)
2535 void ScaleYUVToRGB32Row(const uint8* y_buf,
2541 - PICScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
2542 - &kCoefficientsRgbY[0][0]);
2545 -void PICLinearScaleYUVToRGB32Row(const uint8* y_buf,
2546 - const uint8* u_buf,
2547 - const uint8* v_buf,
2551 - int16 *kCoefficientsRgbY);
2554 + if (mozilla::supports_sse()) {
2555 + PICScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
2556 + &kCoefficientsRgbY[0][0]);
2560 + ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
2563 +void PICLinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
2564 + const uint8* u_buf,
2565 + const uint8* v_buf,
2569 + int16 *kCoefficientsRgbY);
2573 -#if defined(OS_MACOSX)
2574 -"_PICLinearScaleYUVToRGB32Row:\n"
2575 +#if defined(XP_MACOSX)
2576 +"_PICLinearScaleYUVToRGB32Row_SSE:\n"
2578 -"PICLinearScaleYUVToRGB32Row:\n"
2579 +"PICLinearScaleYUVToRGB32Row_SSE:\n"
2582 "mov 0x24(%esp),%edx\n"
2583 "mov 0x30(%esp),%ebp\n"
2584 "mov 0x34(%esp),%ecx\n"
2585 "mov 0x3c(%esp),%edi\n"
2588 // source_width = width * source_dx + ebx
2589 "mov 0x34(%esp), %ecx\n"
2590 "imull 0x38(%esp), %ecx\n"
2591 "mov %ecx, 0x34(%esp)\n"
2593 "mov 0x38(%esp), %ecx\n"
2594 "xor %ebx,%ebx\n" // x = 0
2595 "cmp $0x20000,%ecx\n" // if source_dx >= 2.0
2598 "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less
2599 - "jmp .lscaleend\n"
2605 "mov 0x28(%esp),%esi\n"
2609 "movzbl (%esi,%eax,1),%ecx\n"
2610 "movzbl 1(%esi,%eax,1),%esi\n"
2612 "andl $0x1fffe, %eax \n"
2613 @@ -746,17 +821,17 @@ void PICLinearScaleYUVToRGB32Row(const u
2614 "imul %eax, %esi \n"
2615 "xorl $0xffff, %eax \n"
2616 "imul %eax, %ecx \n"
2617 "addl %esi, %ecx \n"
2619 "movq (%edi,%ecx,8),%mm1\n"
2621 "cmp 0x34(%esp), %ebx\n"
2622 - "jge .lscalelastpixel\n"
2627 "movzbl (%edx,%eax,1),%ecx\n"
2628 "movzbl 1(%edx,%eax,1),%esi\n"
2630 "add 0x38(%esp),%ebx\n"
2631 "andl $0xffff, %eax \n"
2632 @@ -770,154 +845,71 @@ void PICLinearScaleYUVToRGB32Row(const u
2633 "paddsw %mm0,%mm1\n"
2634 "paddsw %mm0,%mm2\n"
2637 "packuswb %mm2,%mm1\n"
2638 "movntq %mm1,0x0(%ebp)\n"
2643 "cmp %ebx, 0x34(%esp)\n"
2644 - "jg .lscaleloop\n"
2649 -".lscalelastpixel:"
2651 "paddsw %mm0, %mm1\n"
2653 "packuswb %mm1, %mm1\n"
2654 "movd %mm1, (%ebp)\n"
2657 +#if !defined(XP_MACOSX)
2663 void LinearScaleYUVToRGB32Row(const uint8* y_buf,
2664 - const uint8* u_buf,
2665 - const uint8* v_buf,
2669 - PICLinearScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
2670 - &kCoefficientsRgbY[0][0]);
2675 -// C reference code that mimic the YUV assembly.
2676 -#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
2677 -#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
2678 - (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
2680 -static inline void YuvPixel(uint8 y,
2685 - int b = kCoefficientsRgbY[256+u][0];
2686 - int g = kCoefficientsRgbY[256+u][1];
2687 - int r = kCoefficientsRgbY[256+u][2];
2688 - int a = kCoefficientsRgbY[256+u][3];
2690 - b = paddsw(b, kCoefficientsRgbY[512+v][0]);
2691 - g = paddsw(g, kCoefficientsRgbY[512+v][1]);
2692 - r = paddsw(r, kCoefficientsRgbY[512+v][2]);
2693 - a = paddsw(a, kCoefficientsRgbY[512+v][3]);
2695 - b = paddsw(b, kCoefficientsRgbY[y][0]);
2696 - g = paddsw(g, kCoefficientsRgbY[y][1]);
2697 - r = paddsw(r, kCoefficientsRgbY[y][2]);
2698 - a = paddsw(a, kCoefficientsRgbY[y][3]);
2705 - *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) |
2706 - (packuswb(g) << 8) |
2707 - (packuswb(r) << 16) |
2708 - (packuswb(a) << 24);
2711 + const uint8* u_buf,
2712 + const uint8* v_buf,
2717 + if (mozilla::supports_sse()) {
2718 + PICLinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
2719 + source_dx, &kCoefficientsRgbY[0][0]);
2723 + LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
2726 void FastConvertYUVToRGB32Row(const uint8* y_buf,
2731 - for (int x = 0; x < width; x += 2) {
2732 - uint8 u = u_buf[x >> 1];
2733 - uint8 v = v_buf[x >> 1];
2734 - uint8 y0 = y_buf[x];
2735 - YuvPixel(y0, u, v, rgb_buf);
2736 - if ((x + 1) < width) {
2737 - uint8 y1 = y_buf[x + 1];
2738 - YuvPixel(y1, u, v, rgb_buf + 4);
2740 - rgb_buf += 8; // Advance 2 pixels.
2744 -// 16.16 fixed point is used. A shift by 16 isolates the integer.
2745 -// A shift by 17 is used to further subsample the chrominence channels.
2746 -// & 0xffff isolates the fixed point fraction. >> 2 to get the upper 2 bits,
2747 -// for 1/65536 pixel accurate interpolation.
2748 + FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
2751 void ScaleYUVToRGB32Row(const uint8* y_buf,
2758 - for (int i = 0; i < width; i += 2) {
2759 - int y = y_buf[x >> 16];
2760 - int u = u_buf[(x >> 17)];
2761 - int v = v_buf[(x >> 17)];
2762 - YuvPixel(y, u, v, rgb_buf);
2764 - if ((i + 1) < width) {
2765 - y = y_buf[x >> 16];
2766 - YuvPixel(y, u, v, rgb_buf+4);
2772 + ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
2775 void LinearScaleYUVToRGB32Row(const uint8* y_buf,
2782 - if (source_dx >= 0x20000) {
2785 - for (int i = 0; i < width; i += 2) {
2786 - int y0 = y_buf[x >> 16];
2787 - int y1 = y_buf[(x >> 16) + 1];
2788 - int u0 = u_buf[(x >> 17)];
2789 - int u1 = u_buf[(x >> 17) + 1];
2790 - int v0 = v_buf[(x >> 17)];
2791 - int v1 = v_buf[(x >> 17) + 1];
2792 - int y_frac = (x & 65535);
2793 - int uv_frac = ((x >> 1) & 65535);
2794 - int y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
2795 - int u = (uv_frac * u1 + (uv_frac ^ 65535) * u0) >> 16;
2796 - int v = (uv_frac * v1 + (uv_frac ^ 65535) * v0) >> 16;
2797 - YuvPixel(y, u, v, rgb_buf);
2799 - if ((i + 1) < width) {
2800 - y0 = y_buf[x >> 16];
2801 - y1 = y_buf[(x >> 16) + 1];
2802 - y_frac = (x & 65535);
2803 - y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
2804 - YuvPixel(y, u, v, rgb_buf+4);
2814 + LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
2819 diff --git a/gfx/ycbcr/yuv_row_table.cpp b/gfx/ycbcr/yuv_row_table.cpp
2820 --- a/gfx/ycbcr/yuv_row_table.cpp
2821 +++ b/gfx/ycbcr/yuv_row_table.cpp
2823 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2824 // Use of this source code is governed by a BSD-style license that can be
2825 // found in the LICENSE file.
2827 -#include "media/base/yuv_row.h"
2828 +#include "yuv_row.h"
2833 static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
2834 static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
2835 static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
2837 diff --git a/gfx/ycbcr/yuv_row_win.cpp b/gfx/ycbcr/yuv_row_win.cpp
2838 --- a/gfx/ycbcr/yuv_row_win.cpp
2839 +++ b/gfx/ycbcr/yuv_row_win.cpp
2841 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2842 // Use of this source code is governed by a BSD-style license that can be
2843 // found in the LICENSE file.
2845 -#include "media/base/yuv_row.h"
2846 +#include "yuv_row.h"
2847 +#include "mozilla/SSE.h"
2849 #define kCoefficientsRgbU kCoefficientsRgbY + 2048
2850 #define kCoefficientsRgbV kCoefficientsRgbY + 4096
2856 -void FastConvertYUVToRGB32Row(const uint8* y_buf,
2857 - const uint8* u_buf,
2858 - const uint8* v_buf,
2861 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
2863 +void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf,
2864 + const uint8* u_buf,
2865 + const uint8* v_buf,
2870 mov edx, [esp + 32 + 4] // Y
2871 mov edi, [esp + 32 + 8] // U
2872 mov esi, [esp + 32 + 12] // V
2873 mov ebp, [esp + 32 + 16] // rgb
2874 mov ecx, [esp + 32 + 20] // width
2876 @@ -64,22 +65,22 @@ void FastConvertYUVToRGB32Row(const uint
2885 -void ConvertYUVToRGB32Row(const uint8* y_buf,
2886 - const uint8* u_buf,
2887 - const uint8* v_buf,
2891 +void ConvertYUVToRGB32Row_SSE(const uint8* y_buf,
2892 + const uint8* u_buf,
2893 + const uint8* v_buf,
2899 mov edx, [esp + 32 + 4] // Y
2900 mov edi, [esp + 32 + 8] // U
2901 mov esi, [esp + 32 + 12] // V
2902 mov ebp, [esp + 32 + 16] // rgb
2903 mov ecx, [esp + 32 + 20] // width
2904 mov ebx, [esp + 32 + 24] // step
2905 @@ -125,23 +126,23 @@ void ConvertYUVToRGB32Row(const uint8* y
2914 -void RotateConvertYUVToRGB32Row(const uint8* y_buf,
2915 - const uint8* u_buf,
2916 - const uint8* v_buf,
2921 +void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf,
2922 + const uint8* u_buf,
2923 + const uint8* v_buf,
2930 mov edx, [esp + 32 + 4] // Y
2931 mov edi, [esp + 32 + 8] // U
2932 mov esi, [esp + 32 + 12] // V
2933 mov ebp, [esp + 32 + 16] // rgb
2934 mov ecx, [esp + 32 + 20] // width
2936 @@ -188,21 +189,21 @@ void RotateConvertYUVToRGB32Row(const ui
2945 -void DoubleYUVToRGB32Row(const uint8* y_buf,
2946 - const uint8* u_buf,
2947 - const uint8* v_buf,
2950 +void DoubleYUVToRGB32Row_SSE(const uint8* y_buf,
2951 + const uint8* u_buf,
2952 + const uint8* v_buf,
2957 mov edx, [esp + 32 + 4] // Y
2958 mov edi, [esp + 32 + 8] // U
2959 mov esi, [esp + 32 + 12] // V
2960 mov ebp, [esp + 32 + 16] // rgb
2961 mov ecx, [esp + 32 + 20] // width
2963 @@ -256,26 +257,26 @@ void DoubleYUVToRGB32Row(const uint8* y_
2971 // This version does general purpose scaling by any amount, up or down.
2972 -// The only thing it can not do it rotation by 90 or 270.
2973 -// For performance the chroma is under sampled, reducing cost of a 3x
2974 +// The only thing it cannot do is rotation by 90 or 270 degrees.
2975 +// For performance, the chroma is under-sampled, reducing the cost of a 3x
2976 // 1080p scale from 8.4 ms to 5.4 ms.
2978 -void ScaleYUVToRGB32Row(const uint8* y_buf,
2979 - const uint8* u_buf,
2980 - const uint8* v_buf,
2984 +void ScaleYUVToRGB32Row_SSE(const uint8* y_buf,
2985 + const uint8* u_buf,
2986 + const uint8* v_buf,
2992 mov edx, [esp + 32 + 4] // Y
2993 mov edi, [esp + 32 + 8] // U
2994 mov esi, [esp + 32 + 12] // V
2995 mov ebp, [esp + 32 + 16] // rgb
2996 mov ecx, [esp + 32 + 20] // width
2998 @@ -333,22 +334,22 @@ void ScaleYUVToRGB32Row(const uint8* y_b
3007 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,
3008 - const uint8* u_buf,
3009 - const uint8* v_buf,
3013 +void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
3014 + const uint8* u_buf,
3015 + const uint8* v_buf,
3021 mov edx, [esp + 32 + 4] // Y
3022 mov edi, [esp + 32 + 8] // U
3023 // [esp + 32 + 12] // V
3024 mov ebp, [esp + 32 + 16] // rgb
3025 mov ecx, [esp + 32 + 20] // width
3026 imul ecx, [esp + 32 + 24] // source_dx
3027 @@ -438,152 +439,60 @@ lscalelastpixel:
3038 -// C reference code that mimic the YUV assembly.
3039 -#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
3040 -#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
3041 - (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
3043 -static inline void YuvPixel(uint8 y,
3048 - int b = kCoefficientsRgbY[256+u][0];
3049 - int g = kCoefficientsRgbY[256+u][1];
3050 - int r = kCoefficientsRgbY[256+u][2];
3051 - int a = kCoefficientsRgbY[256+u][3];
3053 - b = paddsw(b, kCoefficientsRgbY[512+v][0]);
3054 - g = paddsw(g, kCoefficientsRgbY[512+v][1]);
3055 - r = paddsw(r, kCoefficientsRgbY[512+v][2]);
3056 - a = paddsw(a, kCoefficientsRgbY[512+v][3]);
3058 - b = paddsw(b, kCoefficientsRgbY[y][0]);
3059 - g = paddsw(g, kCoefficientsRgbY[y][1]);
3060 - r = paddsw(r, kCoefficientsRgbY[y][2]);
3061 - a = paddsw(a, kCoefficientsRgbY[y][3]);
3068 - *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) |
3069 - (packuswb(g) << 8) |
3070 - (packuswb(r) << 16) |
3071 - (packuswb(a) << 24);
3075 -static inline void YuvPixel(uint8 y,
3082 - movq mm0, [kCoefficientsRgbY+2048 + 8 * eax]
3084 - paddsw mm0, [kCoefficientsRgbY+4096 + 8 * eax]
3086 - movq mm1, [kCoefficientsRgbY + 8 * eax]
3096 +#endif // if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
3098 void FastConvertYUVToRGB32Row(const uint8* y_buf,
3103 - for (int x = 0; x < width; x += 2) {
3104 - uint8 u = u_buf[x >> 1];
3105 - uint8 v = v_buf[x >> 1];
3106 - uint8 y0 = y_buf[x];
3107 - YuvPixel(y0, u, v, rgb_buf);
3108 - if ((x + 1) < width) {
3109 - uint8 y1 = y_buf[x + 1];
3110 - YuvPixel(y1, u, v, rgb_buf + 4);
3112 - rgb_buf += 8; // Advance 2 pixels.
3116 -// 16.16 fixed point is used. A shift by 16 isolates the integer.
3117 -// A shift by 17 is used to further subsample the chrominence channels.
3118 -// & 0xffff isolates the fixed point fraction. >> 2 to get the upper 2 bits,
3119 -// for 1/65536 pixel accurate interpolation.
3120 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
3121 + if (mozilla::supports_sse()) {
3122 + FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width);
3127 + FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
3130 void ScaleYUVToRGB32Row(const uint8* y_buf,
3137 - for (int i = 0; i < width; i += 2) {
3138 - int y = y_buf[x >> 16];
3139 - int u = u_buf[(x >> 17)];
3140 - int v = v_buf[(x >> 17)];
3141 - YuvPixel(y, u, v, rgb_buf);
3143 - if ((i + 1) < width) {
3144 - y = y_buf[x >> 16];
3145 - YuvPixel(y, u, v, rgb_buf+4);
3152 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
3153 + if (mozilla::supports_sse()) {
3154 + ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
3159 + ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
3162 void LinearScaleYUVToRGB32Row(const uint8* y_buf,
3169 - if (source_dx >= 0x20000) {
3172 - for (int i = 0; i < width; i += 2) {
3173 - int y0 = y_buf[x >> 16];
3174 - int y1 = y_buf[(x >> 16) + 1];
3175 - int u0 = u_buf[(x >> 17)];
3176 - int u1 = u_buf[(x >> 17) + 1];
3177 - int v0 = v_buf[(x >> 17)];
3178 - int v1 = v_buf[(x >> 17) + 1];
3179 - int y_frac = (x & 65535);
3180 - int uv_frac = ((x >> 1) & 65535);
3181 - int y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
3182 - int u = (uv_frac * u1 + (uv_frac ^ 65535) * u0) >> 16;
3183 - int v = (uv_frac * v1 + (uv_frac ^ 65535) * v0) >> 16;
3184 - YuvPixel(y, u, v, rgb_buf);
3186 - if ((i + 1) < width) {
3187 - y0 = y_buf[x >> 16];
3188 - y1 = y_buf[(x >> 16) + 1];
3189 - y_frac = (x & 65535);
3190 - y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
3191 - YuvPixel(y, u, v, rgb_buf+4);
3201 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
3202 + if (mozilla::supports_sse()) {
3203 + LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
3209 + LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);