gfx/ycbcr/convert.patch

   1 diff --git a/gfx/ycbcr/yuv_convert.cpp b/gfx/ycbcr/yuv_convert.cpp
   2 --- a/gfx/ycbcr/yuv_convert.cpp
   3 +++ b/gfx/ycbcr/yuv_convert.cpp
   4 @@ -6,145 +6,133 @@
   5  // http://www.fourcc.org/yuv.php
   6  // The actual conversion is best described here
   7  // http://en.wikipedia.org/wiki/YUV
   8  // An article on optimizing YUV conversion using tables instead of multiplies
   9  // http://lestourtereaux.free.fr/papers/data/yuvrgb.pdf
  10  //
  11  // YV12 is a full plane of Y and a half height, half width chroma planes
  12  // YV16 is a full plane of Y and a full height, half width chroma planes
  13 +// YV24 is a full plane of Y and a full height, full width chroma planes
  14  //
  15  // ARGB pixel format is output, which on little endian is stored as BGRA.
  16  // The alpha is set to 255, allowing the application to use RGBA or RGB32.
  17
  18 -#include "media/base/yuv_convert.h"
  19 +#include "yuv_convert.h"
  20
  21  // Header for low level row functions.
  22 -#include "media/base/yuv_row.h"
  23 -
  24 -#if USE_MMX
  25 -#if defined(_MSC_VER)
  26 -#include <intrin.h>
  27 -#else
  28 -#include <mmintrin.h>
  29 -#endif
  30 -#endif
  31 -
  32 -#if USE_SSE2
  33 -#include <emmintrin.h>
  34 -#endif
  35 -
  36 -namespace media {
  37 -
  38 +#include "yuv_row.h"
  39 +#include "mozilla/SSE.h"
  40 +
  41 +#ifdef HAVE_YCBCR_TO_RGB565
  42 +void __attribute((noinline)) yv12_to_rgb565_neon(uint16 *dst, const uint8 *y, const uint8 *u, const uint8 *v, int n, int oddflag);
  43 +#endif
  44 +
  45 +namespace mozilla {
  46 +
  47 +namespace gfx {
  48 +
  49  // 16.16 fixed point arithmetic
  50  const int kFractionBits = 16;
  51  const int kFractionMax = 1 << kFractionBits;
  52  const int kFractionMask = ((1 << kFractionBits) - 1);
  53
  54 +
  55 +// Convert a frame of YUV to 16 bit RGB565.
  56 +NS_GFX_(void) ConvertYCbCrToRGB565(const uint8* y_buf,
  57 +                                  const uint8* u_buf,
  58 +                                  const uint8* v_buf,
  59 +                                  uint8* rgb_buf,
  60 +                                  int pic_x,
  61 +                                  int pic_y,
  62 +                                  int pic_width,
  63 +                                  int pic_height,
  64 +                                  int y_pitch,
  65 +                                  int uv_pitch,
  66 +                                  int rgb_pitch,
  67 +                                  YUVType yuv_type)
  68 +{
  69 +#ifdef HAVE_YCBCR_TO_RGB565
  70 +  for (int i = 0; i < pic_height; i++) {
  71 +    yv12_to_rgb565_neon((uint16*)rgb_buf + pic_width * i,
  72 +                         y_buf + y_pitch * i,
  73 +                         u_buf + uv_pitch * (i / 2),
  74 +                         v_buf + uv_pitch * (i / 2),
  75 +                         pic_width,
  76 +                         0);
  77 +  }
  78 +#endif
  79 +}
  80 +
  81  // Convert a frame of YUV to 32 bit ARGB.
  82 -void ConvertYUVToRGB32(const uint8* y_buf,
  83 -                       const uint8* u_buf,
  84 -                       const uint8* v_buf,
  85 -                       uint8* rgb_buf,
  86 -                       int width,
  87 -                       int height,
  88 -                       int y_pitch,
  89 -                       int uv_pitch,
  90 -                       int rgb_pitch,
  91 -                       YUVType yuv_type) {
  92 -  unsigned int y_shift = yuv_type;
  93 -  for (int y = 0; y < height; ++y) {
  94 -    uint8* rgb_row = rgb_buf + y * rgb_pitch;
  95 -    const uint8* y_ptr = y_buf + y * y_pitch;
  96 -    const uint8* u_ptr = u_buf + (y >> y_shift) * uv_pitch;
  97 -    const uint8* v_ptr = v_buf + (y >> y_shift) * uv_pitch;
  98 -
  99 -    FastConvertYUVToRGB32Row(y_ptr,
 100 -                             u_ptr,
 101 -                             v_ptr,
 102 -                             rgb_row,
 103 -                             width);
 104 -  }
 105 +NS_GFX_(void) ConvertYCbCrToRGB32(const uint8* y_buf,
 106 +                                  const uint8* u_buf,
 107 +                                  const uint8* v_buf,
 108 +                                  uint8* rgb_buf,
 109 +                                  int pic_x,
 110 +                                  int pic_y,
 111 +                                  int pic_width,
 112 +                                  int pic_height,
 113 +                                  int y_pitch,
 114 +                                  int uv_pitch,
 115 +                                  int rgb_pitch,
 116 +                                  YUVType yuv_type) {
 117 +  unsigned int y_shift = yuv_type == YV12 ? 1 : 0;
 118 +  unsigned int x_shift = yuv_type == YV24 ? 0 : 1;
 119 +  // Test for SSE because the optimized code uses movntq, which is not part of MMX.
 120 +  bool has_sse = supports_mmx() && supports_sse();
 121 +  // There is no optimized YV24 SSE routine so we check for this and
 122 +  // fall back to the C code.
 123 +  has_sse &= yuv_type != YV24;
 124 +  bool odd_pic_x = yuv_type != YV24 && pic_x % 2 != 0;
 125 +  int x_width = odd_pic_x ? pic_width - 1 : pic_width;
 126 +
 127 +  for (int y = pic_y; y < pic_height + pic_y; ++y) {
 128 +    uint8* rgb_row = rgb_buf + (y - pic_y) * rgb_pitch;
 129 +    const uint8* y_ptr = y_buf + y * y_pitch + pic_x;
 130 +    const uint8* u_ptr = u_buf + (y >> y_shift) * uv_pitch + (pic_x >> x_shift);
 131 +    const uint8* v_ptr = v_buf + (y >> y_shift) * uv_pitch + (pic_x >> x_shift);
 132 +
 133 +    if (odd_pic_x) {
 134 +      // Handle the single odd pixel manually and use the
 135 +      // fast routines for the remaining pixels.
 136 +      FastConvertYUVToRGB32Row_C(y_ptr++,
 137 +                                 u_ptr++,
 138 +                                 v_ptr++,
 139 +                                 rgb_row,
 140 +                                 1,
 141 +                                 x_shift);
 142 +      rgb_row += 4;
 143 +    }
 144 +
 145 +    if (has_sse) {
 146 +      FastConvertYUVToRGB32Row(y_ptr,
 147 +                               u_ptr,
 148 +                               v_ptr,
 149 +                               rgb_row,
 150 +                               x_width);
 151 +    }
 152 +    else {
 153 +      FastConvertYUVToRGB32Row_C(y_ptr,
 154 +                                 u_ptr,
 155 +                                 v_ptr,
 156 +                                 rgb_row,
 157 +                                 x_width,
 158 +                                 x_shift);
 159 +    }
 160 +  }
 161
 162    // MMX used for FastConvertYUVToRGB32Row requires emms instruction.
 163 -  EMMS();
 164 -}
 165 -
 166 -#if USE_SSE2
 167 -// FilterRows combines two rows of the image using linear interpolation.
 168 -// SSE2 version does 16 pixels at a time
 169 -
 170 -static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
 171 -                       int source_width, int source_y_fraction) {
 172 -  __m128i zero = _mm_setzero_si128();
 173 -  __m128i y1_fraction = _mm_set1_epi16(source_y_fraction);
 174 -  __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction);
 175 -
 176 -  const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr);
 177 -  const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr);
 178 -  __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf);
 179 -  __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width);
 180 -
 181 -  do {
 182 -    __m128i y0 = _mm_loadu_si128(y0_ptr128);
 183 -    __m128i y1 = _mm_loadu_si128(y1_ptr128);
 184 -    __m128i y2 = _mm_unpackhi_epi8(y0, zero);
 185 -    __m128i y3 = _mm_unpackhi_epi8(y1, zero);
 186 -    y0 = _mm_unpacklo_epi8(y0, zero);
 187 -    y1 = _mm_unpacklo_epi8(y1, zero);
 188 -    y0 = _mm_mullo_epi16(y0, y0_fraction);
 189 -    y1 = _mm_mullo_epi16(y1, y1_fraction);
 190 -    y2 = _mm_mullo_epi16(y2, y0_fraction);
 191 -    y3 = _mm_mullo_epi16(y3, y1_fraction);
 192 -    y0 = _mm_add_epi16(y0, y1);
 193 -    y2 = _mm_add_epi16(y2, y3);
 194 -    y0 = _mm_srli_epi16(y0, 8);
 195 -    y2 = _mm_srli_epi16(y2, 8);
 196 -    y0 = _mm_packus_epi16(y0, y2);
 197 -    *dest128++ = y0;
 198 -    ++y0_ptr128;
 199 -    ++y1_ptr128;
 200 -  } while (dest128 < end128);
 201 -}
 202 -#elif USE_MMX
 203 -// MMX version does 8 pixels at a time
 204 -static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
 205 -                       int source_width, int source_y_fraction) {
 206 -  __m64 zero = _mm_setzero_si64();
 207 -  __m64 y1_fraction = _mm_set1_pi16(source_y_fraction);
 208 -  __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction);
 209 -
 210 -  const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr);
 211 -  const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr);
 212 -  __m64* dest64 = reinterpret_cast<__m64*>(ybuf);
 213 -  __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width);
 214 -
 215 -  do {
 216 -    __m64 y0 = *y0_ptr64++;
 217 -    __m64 y1 = *y1_ptr64++;
 218 -    __m64 y2 = _mm_unpackhi_pi8(y0, zero);
 219 -    __m64 y3 = _mm_unpackhi_pi8(y1, zero);
 220 -    y0 = _mm_unpacklo_pi8(y0, zero);
 221 -    y1 = _mm_unpacklo_pi8(y1, zero);
 222 -    y0 = _mm_mullo_pi16(y0, y0_fraction);
 223 -    y1 = _mm_mullo_pi16(y1, y1_fraction);
 224 -    y2 = _mm_mullo_pi16(y2, y0_fraction);
 225 -    y3 = _mm_mullo_pi16(y3, y1_fraction);
 226 -    y0 = _mm_add_pi16(y0, y1);
 227 -    y2 = _mm_add_pi16(y2, y3);
 228 -    y0 = _mm_srli_pi16(y0, 8);
 229 -    y2 = _mm_srli_pi16(y2, 8);
 230 -    y0 = _mm_packs_pu16(y0, y2);
 231 -    *dest64++ = y0;
 232 -  } while (dest64 < end64);
 233 -}
 234 -#else  // no MMX or SSE2
 235 +  if (has_sse)
 236 +    EMMS();
 237 +}
 238 +
 239  // C version does 8 at a time to mimic MMX code
 240 -static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
 241 -                       int source_width, int source_y_fraction) {
 242 +static void FilterRows_C(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
 243 +                         int source_width, int source_y_fraction) {
 244    int y1_fraction = source_y_fraction;
 245    int y0_fraction = 256 - y1_fraction;
 246    uint8* end = ybuf + source_width;
 247    do {
 248      ybuf[0] = (y0_ptr[0] * y0_fraction + y1_ptr[0] * y1_fraction) >> 8;
 249      ybuf[1] = (y0_ptr[1] * y0_fraction + y1_ptr[1] * y1_fraction) >> 8;
 250      ybuf[2] = (y0_ptr[2] * y0_fraction + y1_ptr[2] * y1_fraction) >> 8;
 251      ybuf[3] = (y0_ptr[3] * y0_fraction + y1_ptr[3] * y1_fraction) >> 8;
 252 @@ -152,46 +140,77 @@ static void FilterRows(uint8* ybuf, cons
 253      ybuf[5] = (y0_ptr[5] * y0_fraction + y1_ptr[5] * y1_fraction) >> 8;
 254      ybuf[6] = (y0_ptr[6] * y0_fraction + y1_ptr[6] * y1_fraction) >> 8;
 255      ybuf[7] = (y0_ptr[7] * y0_fraction + y1_ptr[7] * y1_fraction) >> 8;
 256      y0_ptr += 8;
 257      y1_ptr += 8;
 258      ybuf += 8;
 259    } while (ybuf < end);
 260  }
 261 -#endif
 262 +
 263 +#ifdef MOZILLA_MAY_SUPPORT_MMX
 264 +void FilterRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
 265 +                    int source_width, int source_y_fraction);
 266 +#endif
 267 +
 268 +#ifdef MOZILLA_MAY_SUPPORT_SSE2
 269 +void FilterRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
 270 +                     int source_width, int source_y_fraction);
 271 +#endif
 272 +
 273 +static inline void FilterRows(uint8* ybuf, const uint8* y0_ptr,
 274 +                              const uint8* y1_ptr, int source_width,
 275 +                              int source_y_fraction) {
 276 +#ifdef MOZILLA_MAY_SUPPORT_SSE2
 277 +  if (mozilla::supports_sse2()) {
 278 +    FilterRows_SSE2(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
 279 +    return;
 280 +  }
 281 +#endif
 282 +
 283 +#ifdef MOZILLA_MAY_SUPPORT_MMX
 284 +  if (mozilla::supports_mmx()) {
 285 +    FilterRows_MMX(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
 286 +    return;
 287 +  }
 288 +#endif
 289 +
 290 +  FilterRows_C(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
 291 +}
 292
 293
 294  // Scale a frame of YUV to 32 bit ARGB.
 295 -void ScaleYUVToRGB32(const uint8* y_buf,
 296 -                     const uint8* u_buf,
 297 -                     const uint8* v_buf,
 298 -                     uint8* rgb_buf,
 299 -                     int source_width,
 300 -                     int source_height,
 301 -                     int width,
 302 -                     int height,
 303 -                     int y_pitch,
 304 -                     int uv_pitch,
 305 -                     int rgb_pitch,
 306 -                     YUVType yuv_type,
 307 -                     Rotate view_rotate,
 308 -                     ScaleFilter filter) {
 309 +NS_GFX_(void) ScaleYCbCrToRGB32(const uint8* y_buf,
 310 +                                const uint8* u_buf,
 311 +                                const uint8* v_buf,
 312 +                                uint8* rgb_buf,
 313 +                                int source_width,
 314 +                                int source_height,
 315 +                                int width,
 316 +                                int height,
 317 +                                int y_pitch,
 318 +                                int uv_pitch,
 319 +                                int rgb_pitch,
 320 +                                YUVType yuv_type,
 321 +                                Rotate view_rotate,
 322 +                                ScaleFilter filter) {
 323 +  bool has_mmx = supports_mmx();
 324 +
 325    // 4096 allows 3 buffers to fit in 12k.
 326    // Helps performance on CPU with 16K L1 cache.
 327    // Large enough for 3830x2160 and 30" displays which are 2560x1600.
 328    const int kFilterBufferSize = 4096;
 329    // Disable filtering if the screen is too big (to avoid buffer overflows).
 330    // This should never happen to regular users: they don't have monitors
 331    // wider than 4096 pixels.
 332    // TODO(fbarchard): Allow rotated videos to filter.
 333    if (source_width > kFilterBufferSize || view_rotate)
 334      filter = FILTER_NONE;
 335
 336 -  unsigned int y_shift = yuv_type;
 337 +  unsigned int y_shift = yuv_type == YV12 ? 1 : 0;
 338    // Diagram showing origin and direction of source sampling.
 339    // ->0   4<-
 340    // 7       3
 341    //
 342    // 6       5
 343    // ->1   2<-
 344    // Rotations that start at right side of image.
 345    if ((view_rotate == ROTATE_180) ||
 346 @@ -243,17 +262,17 @@ void ScaleYUVToRGB32(const uint8* y_buf,
 347        uv_pitch = 1;
 348      }
 349    }
 350
 351    // Need padding because FilterRows() will write 1 to 16 extra pixels
 352    // after the end for SSE2 version.
 353    uint8 yuvbuf[16 + kFilterBufferSize * 3 + 16];
 354    uint8* ybuf =
 355 -      reinterpret_cast<uint8*>(reinterpret_cast<uintptr_t>(yuvbuf + 15) & ~15);
 356 +      reinterpret_cast<uint8*>(reinterpret_cast<PRUptrdiff>(yuvbuf + 15) & ~15);
 357    uint8* ubuf = ybuf + kFilterBufferSize;
 358    uint8* vbuf = ubuf + kFilterBufferSize;
 359    // TODO(fbarchard): Fixed point math is off by 1 on negatives.
 360    int yscale_fixed = (source_height << kFractionBits) / height;
 361
 362    // TODO(fbarchard): Split this into separate function for better efficiency.
 363    for (int y = 0; y < height; ++y) {
 364      uint8* dest_pixel = rgb_buf + y * rgb_pitch;
 365 @@ -276,17 +295,17 @@ void ScaleYUVToRGB32(const uint8* y_buf,
 366      int source_uv_fraction =
 367          ((source_y_subpixel >> y_shift) & kFractionMask) >> 8;
 368
 369      const uint8* y_ptr = y0_ptr;
 370      const uint8* u_ptr = u0_ptr;
 371      const uint8* v_ptr = v0_ptr;
 372      // Apply vertical filtering if necessary.
 373      // TODO(fbarchard): Remove memcpy when not necessary.
 374 -    if (filter & media::FILTER_BILINEAR_V) {
 375 +    if (filter & mozilla::gfx::FILTER_BILINEAR_V) {
 376        if (yscale_fixed != kFractionMax &&
 377            source_y_fraction && ((source_y + 1) < source_height)) {
 378          FilterRows(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
 379        } else {
 380          memcpy(ybuf, y0_ptr, source_width);
 381        }
 382        y_ptr = ybuf;
 383        ybuf[source_width] = ybuf[source_width-1];
 384 @@ -303,44 +322,50 @@ void ScaleYUVToRGB32(const uint8* y_buf,
 385        u_ptr = ubuf;
 386        v_ptr = vbuf;
 387        ubuf[uv_source_width] = ubuf[uv_source_width - 1];
 388        vbuf[uv_source_width] = vbuf[uv_source_width - 1];
 389      }
 390      if (source_dx == kFractionMax) {  // Not scaled
 391        FastConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
 392                                 dest_pixel, width);
 393 -    } else {
 394 -      if (filter & FILTER_BILINEAR_H) {
 395 +    } else if (filter & FILTER_BILINEAR_H) {
 396          LinearScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
 397                                   dest_pixel, width, source_dx);
 398      } else {
 399  // Specialized scalers and rotation.
 400 -#if USE_MMX && defined(_MSC_VER)
 401 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_MSC_VER) && defined(_M_IX86)
 402 +      if(mozilla::supports_sse()) {
 403          if (width == (source_width * 2)) {
 404 -          DoubleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
 405 -                              dest_pixel, width);
 406 +          DoubleYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,
 407 +                                  dest_pixel, width);
 408          } else if ((source_dx & kFractionMask) == 0) {
 409            // Scaling by integer scale factor. ie half.
 410 -          ConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
 411 -                               dest_pixel, width,
 412 -                               source_dx >> kFractionBits);
 413 +          ConvertYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,
 414 +                                   dest_pixel, width,
 415 +                                   source_dx >> kFractionBits);
 416          } else if (source_dx_uv == source_dx) {  // Not rotated.
 417            ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
 418                               dest_pixel, width, source_dx);
 419          } else {
 420 -          RotateConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
 421 -                                     dest_pixel, width,
 422 -                                     source_dx >> kFractionBits,
 423 -                                     source_dx_uv >> kFractionBits);
 424 +          RotateConvertYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,
 425 +                                         dest_pixel, width,
 426 +                                         source_dx >> kFractionBits,
 427 +                                         source_dx_uv >> kFractionBits);
 428          }
 429 +      }
 430 +      else {
 431 +        ScaleYUVToRGB32Row_C(y_ptr, u_ptr, v_ptr,
 432 +                             dest_pixel, width, source_dx);
 433 +      }
 434  #else
 435 -        ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
 436 -                           dest_pixel, width, source_dx);
 437 -#endif
 438 -      }
 439 +      ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
 440 +                         dest_pixel, width, source_dx);
 441 +#endif
 442      }
 443    }
 444    // MMX used for FastConvertYUVToRGB32Row and FilterRows requires emms.
 445 -  EMMS();
 446 -}
 447 -
 448 -}  // namespace media
 449 +  if (has_mmx)
 450 +    EMMS();
 451 +}
 452 +
 453 +}  // namespace gfx
 454 +}  // namespace mozilla
 455 diff --git a/gfx/ycbcr/yuv_convert.h b/gfx/ycbcr/yuv_convert.h
 456 --- a/gfx/ycbcr/yuv_convert.h
 457 +++ b/gfx/ycbcr/yuv_convert.h
 458 @@ -1,72 +1,98 @@
 459  // Copyright (c) 2010 The Chromium Authors. All rights reserved.
 460  // Use of this source code is governed by a BSD-style license that can be
 461  // found in the LICENSE file.
 462
 463  #ifndef MEDIA_BASE_YUV_CONVERT_H_
 464  #define MEDIA_BASE_YUV_CONVERT_H_
 465
 466 -#include "base/basictypes.h"
 467 -
 468 -namespace media {
 469 -
 470 +#include "chromium_types.h"
 471 +#include "gfxCore.h"
 472 +
 473 +#ifdef __arm__
 474 +#define HAVE_YCBCR_TO_RGB565 1
 475 +#endif
 476 +
 477 +namespace mozilla {
 478 +
 479 +namespace gfx {
 480 +
 481  // Type of YUV surface.
 482  // The value of these enums matter as they are used to shift vertical indices.
 483  enum YUVType {
 484 -  YV16 = 0,           // YV16 is half width and full height chroma channels.
 485 -  YV12 = 1,           // YV12 is half width and half height chroma channels.
 486 +  YV12 = 0,           // YV12 is half width and half height chroma channels.
 487 +  YV16 = 1,           // YV16 is half width and full height chroma channels.
 488 +  YV24 = 2            // YV24 is full width and full height chroma channels.
 489  };
 490
 491  // Mirror means flip the image horizontally, as in looking in a mirror.
 492  // Rotate happens after mirroring.
 493  enum Rotate {
 494    ROTATE_0,           // Rotation off.
 495    ROTATE_90,          // Rotate clockwise.
 496    ROTATE_180,         // Rotate upside down.
 497    ROTATE_270,         // Rotate counter clockwise.
 498    MIRROR_ROTATE_0,    // Mirror horizontally.
 499    MIRROR_ROTATE_90,   // Mirror then Rotate clockwise.
 500    MIRROR_ROTATE_180,  // Mirror vertically.
 501 -  MIRROR_ROTATE_270,  // Transpose.
 502 +  MIRROR_ROTATE_270   // Transpose.
 503  };
 504
 505  // Filter affects how scaling looks.
 506  enum ScaleFilter {
 507    FILTER_NONE = 0,        // No filter (point sampled).
 508    FILTER_BILINEAR_H = 1,  // Bilinear horizontal filter.
 509    FILTER_BILINEAR_V = 2,  // Bilinear vertical filter.
 510 -  FILTER_BILINEAR = 3,    // Bilinear filter.
 511 +  FILTER_BILINEAR = 3     // Bilinear filter.
 512  };
 513
 514 +// Convert a frame of YUV to 16 bit RGB565.
 515 +// Pass in YV12 formats
 516 +NS_GFX_(void) ConvertYCbCrToRGB565(const uint8* yplane,
 517 +                                  const uint8* uplane,
 518 +                                  const uint8* vplane,
 519 +                                  uint8* rgbframe,
 520 +                                  int pic_x,
 521 +                                  int pic_y,
 522 +                                  int pic_width,
 523 +                                  int pic_height,
 524 +                                  int ystride,
 525 +                                  int uvstride,
 526 +                                  int rgbstride,
 527 +                                  YUVType yuv_type);
 528 +
 529  // Convert a frame of YUV to 32 bit ARGB.
 530  // Pass in YV16/YV12 depending on source format
 531 -void ConvertYUVToRGB32(const uint8* yplane,
 532 -                       const uint8* uplane,
 533 -                       const uint8* vplane,
 534 -                       uint8* rgbframe,
 535 -                       int width,
 536 -                       int height,
 537 -                       int ystride,
 538 -                       int uvstride,
 539 -                       int rgbstride,
 540 -                       YUVType yuv_type);
 541 +NS_GFX_(void) ConvertYCbCrToRGB32(const uint8* yplane,
 542 +                                  const uint8* uplane,
 543 +                                  const uint8* vplane,
 544 +                                  uint8* rgbframe,
 545 +                                  int pic_x,
 546 +                                  int pic_y,
 547 +                                  int pic_width,
 548 +                                  int pic_height,
 549 +                                  int ystride,
 550 +                                  int uvstride,
 551 +                                  int rgbstride,
 552 +                                  YUVType yuv_type);
 553
 554  // Scale a frame of YUV to 32 bit ARGB.
 555  // Supports rotation and mirroring.
 556 -void ScaleYUVToRGB32(const uint8* yplane,
 557 -                     const uint8* uplane,
 558 -                     const uint8* vplane,
 559 -                     uint8* rgbframe,
 560 -                     int source_width,
 561 -                     int source_height,
 562 -                     int width,
 563 -                     int height,
 564 -                     int ystride,
 565 -                     int uvstride,
 566 -                     int rgbstride,
 567 -                     YUVType yuv_type,
 568 -                     Rotate view_rotate,
 569 -                     ScaleFilter filter);
 570 -
 571 -}  // namespace media
 572 -
 573 +NS_GFX_(void) ScaleYCbCrToRGB32(const uint8* yplane,
 574 +                                const uint8* uplane,
 575 +                                const uint8* vplane,
 576 +                                uint8* rgbframe,
 577 +                                int source_width,
 578 +                                int source_height,
 579 +                                int width,
 580 +                                int height,
 581 +                                int ystride,
 582 +                                int uvstride,
 583 +                                int rgbstride,
 584 +                                YUVType yuv_type,
 585 +                                Rotate view_rotate,
 586 +                                ScaleFilter filter);
 587 +
 588 +}  // namespace gfx
 589 +}  // namespace mozilla
 590 +
 591  #endif  // MEDIA_BASE_YUV_CONVERT_H_
 592 diff --git a/gfx/ycbcr/yuv_convert_mmx.cpp b/gfx/ycbcr/yuv_convert_mmx.cpp
 593 new file mode 100644
 594 --- /dev/null
 595 +++ b/gfx/ycbcr/yuv_convert_mmx.cpp
 596 @@ -0,0 +1,45 @@
 597 +// Copyright (c) 2010 The Chromium Authors. All rights reserved.
 598 +// Use of this source code is governed by a BSD-style license that can be
 599 +// found in the LICENSE file.
 600 +
 601 +#include <mmintrin.h>
 602 +#include "yuv_row.h"
 603 +
 604 +namespace mozilla {
 605 +namespace gfx {
 606 +
 607 +// FilterRows combines two rows of the image using linear interpolation.
 608 +// MMX version does 8 pixels at a time.
 609 +void FilterRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
 610 +                    int source_width, int source_y_fraction) {
 611 +  __m64 zero = _mm_setzero_si64();
 612 +  __m64 y1_fraction = _mm_set1_pi16(source_y_fraction);
 613 +  __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction);
 614 +
 615 +  const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr);
 616 +  const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr);
 617 +  __m64* dest64 = reinterpret_cast<__m64*>(ybuf);
 618 +  __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width);
 619 +
 620 +  do {
 621 +    __m64 y0 = *y0_ptr64++;
 622 +    __m64 y1 = *y1_ptr64++;
 623 +    __m64 y2 = _mm_unpackhi_pi8(y0, zero);
 624 +    __m64 y3 = _mm_unpackhi_pi8(y1, zero);
 625 +    y0 = _mm_unpacklo_pi8(y0, zero);
 626 +    y1 = _mm_unpacklo_pi8(y1, zero);
 627 +    y0 = _mm_mullo_pi16(y0, y0_fraction);
 628 +    y1 = _mm_mullo_pi16(y1, y1_fraction);
 629 +    y2 = _mm_mullo_pi16(y2, y0_fraction);
 630 +    y3 = _mm_mullo_pi16(y3, y1_fraction);
 631 +    y0 = _mm_add_pi16(y0, y1);
 632 +    y2 = _mm_add_pi16(y2, y3);
 633 +    y0 = _mm_srli_pi16(y0, 8);
 634 +    y2 = _mm_srli_pi16(y2, 8);
 635 +    y0 = _mm_packs_pu16(y0, y2);
 636 +    *dest64++ = y0;
 637 +  } while (dest64 < end64);
 638 +}
 639 +
 640 +}
 641 +}
 642 diff --git a/gfx/ycbcr/yuv_convert_sse2.cpp b/gfx/ycbcr/yuv_convert_sse2.cpp
 643 new file mode 100644
 644 --- /dev/null
 645 +++ b/gfx/ycbcr/yuv_convert_sse2.cpp
 646 @@ -0,0 +1,47 @@
 647 +// Copyright (c) 2010 The Chromium Authors. All rights reserved.
 648 +// Use of this source code is governed by a BSD-style license that can be
 649 +// found in the LICENSE file.
 650 +
 651 +#include <emmintrin.h>
 652 +#include "yuv_row.h"
 653 +
 654 +namespace mozilla {
 655 +namespace gfx {
 656 +
 657 +// FilterRows combines two rows of the image using linear interpolation.
 658 +// SSE2 version does 16 pixels at a time.
 659 +void FilterRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
 660 +                     int source_width, int source_y_fraction) {
 661 +  __m128i zero = _mm_setzero_si128();
 662 +  __m128i y1_fraction = _mm_set1_epi16(source_y_fraction);
 663 +  __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction);
 664 +
 665 +  const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr);
 666 +  const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr);
 667 +  __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf);
 668 +  __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width);
 669 +
 670 +  do {
 671 +    __m128i y0 = _mm_loadu_si128(y0_ptr128);
 672 +    __m128i y1 = _mm_loadu_si128(y1_ptr128);
 673 +    __m128i y2 = _mm_unpackhi_epi8(y0, zero);
 674 +    __m128i y3 = _mm_unpackhi_epi8(y1, zero);
 675 +    y0 = _mm_unpacklo_epi8(y0, zero);
 676 +    y1 = _mm_unpacklo_epi8(y1, zero);
 677 +    y0 = _mm_mullo_epi16(y0, y0_fraction);
 678 +    y1 = _mm_mullo_epi16(y1, y1_fraction);
 679 +    y2 = _mm_mullo_epi16(y2, y0_fraction);
 680 +    y3 = _mm_mullo_epi16(y3, y1_fraction);
 681 +    y0 = _mm_add_epi16(y0, y1);
 682 +    y2 = _mm_add_epi16(y2, y3);
 683 +    y0 = _mm_srli_epi16(y0, 8);
 684 +    y2 = _mm_srli_epi16(y2, 8);
 685 +    y0 = _mm_packus_epi16(y0, y2);
 686 +    *dest128++ = y0;
 687 +    ++y0_ptr128;
 688 +    ++y1_ptr128;
 689 +  } while (dest128 < end128);
 690 +}
 691 +
 692 +}
 693 +}
 694 diff --git a/gfx/ycbcr/yuv_row.h b/gfx/ycbcr/yuv_row.h
 695 --- a/gfx/ycbcr/yuv_row.h
 696 +++ b/gfx/ycbcr/yuv_row.h
 697 @@ -5,109 +5,133 @@
 698  // yuv_row internal functions to handle YUV conversion and scaling to RGB.
 699  // These functions are used from both yuv_convert.cc and yuv_scale.cc.
 700
 701  // TODO(fbarchard): Write function that can handle rotation and scaling.
 702
 703  #ifndef MEDIA_BASE_YUV_ROW_H_
 704  #define MEDIA_BASE_YUV_ROW_H_
 705
 706 -#include "base/basictypes.h"
 707 +#include "chromium_types.h"
 708
 709  extern "C" {
 710  // Can only do 1x.
 711  // This is the second fastest of the scalers.
 712  void FastConvertYUVToRGB32Row(const uint8* y_buf,
 713                                const uint8* u_buf,
 714                                const uint8* v_buf,
 715                                uint8* rgb_buf,
 716                                int width);
 717
 718 -// Can do 1x, half size or any scale down by an integer amount.
 719 -// Step can be negative (mirroring, rotate 180).
 720 -// This is the third fastest of the scalers.
 721 -void ConvertYUVToRGB32Row(const uint8* y_buf,
 722 -                          const uint8* u_buf,
 723 -                          const uint8* v_buf,
 724 -                          uint8* rgb_buf,
 725 -                          int width,
 726 -                          int step);
 727 -
 728 -// Rotate is like Convert, but applies different step to Y versus U and V.
 729 -// This allows rotation by 90 or 270, by stepping by stride.
 730 -// This is the forth fastest of the scalers.
 731 -void RotateConvertYUVToRGB32Row(const uint8* y_buf,
 732 +void FastConvertYUVToRGB32Row_C(const uint8* y_buf,
 733                                  const uint8* u_buf,
 734                                  const uint8* v_buf,
 735                                  uint8* rgb_buf,
 736                                  int width,
 737 -                                int ystep,
 738 -                                int uvstep);
 739 +                                unsigned int x_shift);
 740 +
 741 +void FastConvertYUVToRGB32Row(const uint8* y_buf,
 742 +                              const uint8* u_buf,
 743 +                              const uint8* v_buf,
 744 +                              uint8* rgb_buf,
 745 +                              int width);
 746 +
 747 +// Can do 1x, half size or any scale down by an integer amount.
 748 +// Step can be negative (mirroring, rotate 180).
 749 +// This is the third fastest of the scalers.
 750 +// Only defined on Windows x86-32.
 751 +void ConvertYUVToRGB32Row_SSE(const uint8* y_buf,
 752 +                              const uint8* u_buf,
 753 +                              const uint8* v_buf,
 754 +                              uint8* rgb_buf,
 755 +                              int width,
 756 +                              int step);
 757 +
 758 +// Rotate is like Convert, but applies different step to Y versus U and V.
 759 +// This allows rotation by 90 or 270, by stepping by stride.
 760 +// This is the fourth fastest of the scalers.
 761 +// Only defined on Windows x86-32.
 762 +void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf,
 763 +                                    const uint8* u_buf,
 764 +                                    const uint8* v_buf,
 765 +                                    uint8* rgb_buf,
 766 +                                    int width,
 767 +                                    int ystep,
 768 +                                    int uvstep);
 769
 770  // Doubler does 4 pixels at a time.  Each pixel is replicated.
 771  // This is the fastest of the scalers.
 772 -void DoubleYUVToRGB32Row(const uint8* y_buf,
 773 -                         const uint8* u_buf,
 774 -                         const uint8* v_buf,
 775 -                         uint8* rgb_buf,
 776 -                         int width);
 777 +// Only defined on Windows x86-32.
 778 +void DoubleYUVToRGB32Row_SSE(const uint8* y_buf,
 779 +                             const uint8* u_buf,
 780 +                             const uint8* v_buf,
 781 +                             uint8* rgb_buf,
 782 +                             int width);
 783
 784  // Handles arbitrary scaling up or down.
 785  // Mirroring is supported, but not 90 or 270 degree rotation.
 786  // Chroma is under sampled every 2 pixels for performance.
 787  void ScaleYUVToRGB32Row(const uint8* y_buf,
 788                          const uint8* u_buf,
 789                          const uint8* v_buf,
 790                          uint8* rgb_buf,
 791                          int width,
 792                          int source_dx);
 793
 794 +void ScaleYUVToRGB32Row(const uint8* y_buf,
 795 +                        const uint8* u_buf,
 796 +                        const uint8* v_buf,
 797 +                        uint8* rgb_buf,
 798 +                        int width,
 799 +                        int source_dx);
 800 +
 801 +void ScaleYUVToRGB32Row_C(const uint8* y_buf,
 802 +                          const uint8* u_buf,
 803 +                          const uint8* v_buf,
 804 +                          uint8* rgb_buf,
 805 +                          int width,
 806 +                          int source_dx);
 807 +
 808  // Handles arbitrary scaling up or down with bilinear filtering.
 809  // Mirroring is supported, but not 90 or 270 degree rotation.
 810  // Chroma is under sampled every 2 pixels for performance.
 811  // This is the slowest of the scalers.
 812  void LinearScaleYUVToRGB32Row(const uint8* y_buf,
 813                                const uint8* u_buf,
 814                                const uint8* v_buf,
 815                                uint8* rgb_buf,
 816                                int width,
 817                                int source_dx);
 818
 819 +void LinearScaleYUVToRGB32Row(const uint8* y_buf,
 820 +                              const uint8* u_buf,
 821 +                              const uint8* v_buf,
 822 +                              uint8* rgb_buf,
 823 +                              int width,
 824 +                              int source_dx);
 825 +
 826 +void LinearScaleYUVToRGB32Row_C(const uint8* y_buf,
 827 +                                const uint8* u_buf,
 828 +                                const uint8* v_buf,
 829 +                                uint8* rgb_buf,
 830 +                                int width,
 831 +                                int source_dx);
 832 +
 833 +
 834  #if defined(_MSC_VER)
 835  #define SIMD_ALIGNED(var) __declspec(align(16)) var
 836  #else
 837  #define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
 838  #endif
 839  extern SIMD_ALIGNED(int16 kCoefficientsRgbY[768][4]);
 840
 841 -// Method to force C version.
 842 -//#define USE_MMX 0
 843 -//#define USE_SSE2 0
 844 -
 845 -#if !defined(USE_MMX)
 846 -// Windows, Mac and Linux/BSD use MMX
 847 -#if defined(__MMX__) || defined(_MSC_VER)
 848 -#define USE_MMX 1
 849 -#else
 850 -#define USE_MMX 0
 851 -#endif
 852 -#endif
 853 -
 854 -#if !defined(USE_SSE2)
 855 -#if defined(__SSE2__) || defined(ARCH_CPU_X86_64) || _M_IX86_FP==2
 856 -#define USE_SSE2 1
 857 -#else
 858 -#define USE_SSE2 0
 859 -#endif
 860 -#endif
 861 -
 862  // x64 uses MMX2 (SSE) so emms is not required.
 863  // Warning C4799: function has no EMMS instruction.
 864  // EMMS() is slow and should be called by the calling function once per image.
 865 -#if USE_MMX && !defined(ARCH_CPU_X86_64)
 866 +#if defined(ARCH_CPU_X86) && !defined(ARCH_CPU_X86_64)
 867  #if defined(_MSC_VER)
 868  #define EMMS() __asm emms
 869  #pragma warning(disable: 4799)
 870  #else
 871  #define EMMS() asm("emms")
 872  #endif
 873  #else
 874  #define EMMS()
 875 diff --git a/gfx/ycbcr/yuv_row_c.cpp b/gfx/ycbcr/yuv_row_c.cpp
 876 --- a/gfx/ycbcr/yuv_row_c.cpp
 877 +++ b/gfx/ycbcr/yuv_row_c.cpp
 878 @@ -1,812 +1,18 @@
 879  // Copyright (c) 2010 The Chromium Authors. All rights reserved.
 880  // Use of this source code is governed by a BSD-style license that can be
 881  // found in the LICENSE file.
 882
 883 -#include "media/base/yuv_row.h"
 884 -
 885 -#ifdef _DEBUG
 886 -#include "base/logging.h"
 887 -#else
 888 +#include "yuv_row.h"
 889 +
 890  #define DCHECK(a)
 891 -#endif
 892
 893  extern "C" {
 894
 895 -#if USE_SSE2 && defined(ARCH_CPU_X86_64)
 896 -
 897 -// AMD64 ABI uses register paremters.
 898 -void FastConvertYUVToRGB32Row(const uint8* y_buf,  // rdi
 899 -                              const uint8* u_buf,  // rsi
 900 -                              const uint8* v_buf,  // rdx
 901 -                              uint8* rgb_buf,      // rcx
 902 -                              int width) {         // r8
 903 -  asm(
 904 -  "jmp    convertend\n"
 905 -"convertloop:"
 906 -  "movzb  (%1),%%r10\n"
 907 -  "add    $0x1,%1\n"
 908 -  "movzb  (%2),%%r11\n"
 909 -  "add    $0x1,%2\n"
 910 -  "movq   2048(%5,%%r10,8),%%xmm0\n"
 911 -  "movzb  (%0),%%r10\n"
 912 -  "movq   4096(%5,%%r11,8),%%xmm1\n"
 913 -  "movzb  0x1(%0),%%r11\n"
 914 -  "paddsw %%xmm1,%%xmm0\n"
 915 -  "movq   (%5,%%r10,8),%%xmm2\n"
 916 -  "add    $0x2,%0\n"
 917 -  "movq   (%5,%%r11,8),%%xmm3\n"
 918 -  "paddsw %%xmm0,%%xmm2\n"
 919 -  "paddsw %%xmm0,%%xmm3\n"
 920 -  "shufps $0x44,%%xmm3,%%xmm2\n"
 921 -  "psraw  $0x6,%%xmm2\n"
 922 -  "packuswb %%xmm2,%%xmm2\n"
 923 -  "movq   %%xmm2,0x0(%3)\n"
 924 -  "add    $0x8,%3\n"
 925 -"convertend:"
 926 -  "sub    $0x2,%4\n"
 927 -  "jns    convertloop\n"
 928 -
 929 -"convertnext:"
 930 -  "add    $0x1,%4\n"
 931 -  "js     convertdone\n"
 932 -
 933 -  "movzb  (%1),%%r10\n"
 934 -  "movq   2048(%5,%%r10,8),%%xmm0\n"
 935 -  "movzb  (%2),%%r10\n"
 936 -  "movq   4096(%5,%%r10,8),%%xmm1\n"
 937 -  "paddsw %%xmm1,%%xmm0\n"
 938 -  "movzb  (%0),%%r10\n"
 939 -  "movq   (%5,%%r10,8),%%xmm1\n"
 940 -  "paddsw %%xmm0,%%xmm1\n"
 941 -  "psraw  $0x6,%%xmm1\n"
 942 -  "packuswb %%xmm1,%%xmm1\n"
 943 -  "movd   %%xmm1,0x0(%3)\n"
 944 -"convertdone:"
 945 -  :
 946 -  : "r"(y_buf),  // %0
 947 -    "r"(u_buf),  // %1
 948 -    "r"(v_buf),  // %2
 949 -    "r"(rgb_buf),  // %3
 950 -    "r"(width),  // %4
 951 -    "r" (kCoefficientsRgbY)  // %5
 952 -  : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
 953 -);
 954 -}
 955 -
 956 -void ScaleYUVToRGB32Row(const uint8* y_buf,  // rdi
 957 -                        const uint8* u_buf,  // rsi
 958 -                        const uint8* v_buf,  // rdx
 959 -                        uint8* rgb_buf,      // rcx
 960 -                        int width,           // r8
 961 -                        int source_dx) {     // r9
 962 -  asm(
 963 -  "xor    %%r11,%%r11\n"
 964 -  "sub    $0x2,%4\n"
 965 -  "js     scalenext\n"
 966 -
 967 -"scaleloop:"
 968 -  "mov    %%r11,%%r10\n"
 969 -  "sar    $0x11,%%r10\n"
 970 -  "movzb  (%1,%%r10,1),%%rax\n"
 971 -  "movq   2048(%5,%%rax,8),%%xmm0\n"
 972 -  "movzb  (%2,%%r10,1),%%rax\n"
 973 -  "movq   4096(%5,%%rax,8),%%xmm1\n"
 974 -  "lea    (%%r11,%6),%%r10\n"
 975 -  "sar    $0x10,%%r11\n"
 976 -  "movzb  (%0,%%r11,1),%%rax\n"
 977 -  "paddsw %%xmm1,%%xmm0\n"
 978 -  "movq   (%5,%%rax,8),%%xmm1\n"
 979 -  "lea    (%%r10,%6),%%r11\n"
 980 -  "sar    $0x10,%%r10\n"
 981 -  "movzb  (%0,%%r10,1),%%rax\n"
 982 -  "movq   (%5,%%rax,8),%%xmm2\n"
 983 -  "paddsw %%xmm0,%%xmm1\n"
 984 -  "paddsw %%xmm0,%%xmm2\n"
 985 -  "shufps $0x44,%%xmm2,%%xmm1\n"
 986 -  "psraw  $0x6,%%xmm1\n"
 987 -  "packuswb %%xmm1,%%xmm1\n"
 988 -  "movq   %%xmm1,0x0(%3)\n"
 989 -  "add    $0x8,%3\n"
 990 -  "sub    $0x2,%4\n"
 991 -  "jns    scaleloop\n"
 992 -
 993 -"scalenext:"
 994 -  "add    $0x1,%4\n"
 995 -  "js     scaledone\n"
 996 -
 997 -  "mov    %%r11,%%r10\n"
 998 -  "sar    $0x11,%%r10\n"
 999 -  "movzb  (%1,%%r10,1),%%rax\n"
1000 -  "movq   2048(%5,%%rax,8),%%xmm0\n"
1001 -  "movzb  (%2,%%r10,1),%%rax\n"
1002 -  "movq   4096(%5,%%rax,8),%%xmm1\n"
1003 -  "paddsw %%xmm1,%%xmm0\n"
1004 -  "sar    $0x10,%%r11\n"
1005 -  "movzb  (%0,%%r11,1),%%rax\n"
1006 -  "movq   (%5,%%rax,8),%%xmm1\n"
1007 -  "paddsw %%xmm0,%%xmm1\n"
1008 -  "psraw  $0x6,%%xmm1\n"
1009 -  "packuswb %%xmm1,%%xmm1\n"
1010 -  "movd   %%xmm1,0x0(%3)\n"
1011 -
1012 -"scaledone:"
1013 -  :
1014 -  : "r"(y_buf),  // %0
1015 -    "r"(u_buf),  // %1
1016 -    "r"(v_buf),  // %2
1017 -    "r"(rgb_buf),  // %3
1018 -    "r"(width),  // %4
1019 -    "r" (kCoefficientsRgbY),  // %5
1020 -    "r"(static_cast<long>(source_dx))  // %6
1021 -  : "memory", "r10", "r11", "rax", "xmm0", "xmm1", "xmm2"
1022 -);
1023 -}
1024 -
1025 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,
1026 -                              const uint8* u_buf,
1027 -                              const uint8* v_buf,
1028 -                              uint8* rgb_buf,
1029 -                              int width,
1030 -                              int source_dx) {
1031 -  asm(
1032 -  "xor    %%r11,%%r11\n"   // x = 0
1033 -  "sub    $0x2,%4\n"
1034 -  "js     .lscalenext\n"
1035 -  "cmp    $0x20000,%6\n"   // if source_dx >= 2.0
1036 -  "jl     .lscalehalf\n"
1037 -  "mov    $0x8000,%%r11\n" // x = 0.5 for 1/2 or less
1038 -".lscalehalf:"
1039 -
1040 -".lscaleloop:"
1041 -  "mov    %%r11,%%r10\n"
1042 -  "sar    $0x11,%%r10\n"
1043 -
1044 -  "movzb  (%1, %%r10, 1), %%r13 \n"
1045 -  "movzb  1(%1, %%r10, 1), %%r14 \n"
1046 -  "mov    %%r11, %%rax \n"
1047 -  "and    $0x1fffe, %%rax \n"
1048 -  "imul   %%rax, %%r14 \n"
1049 -  "xor    $0x1fffe, %%rax \n"
1050 -  "imul   %%rax, %%r13 \n"
1051 -  "add    %%r14, %%r13 \n"
1052 -  "shr    $17, %%r13 \n"
1053 -  "movq   2048(%5,%%r13,8), %%xmm0\n"
1054 -
1055 -  "movzb  (%2, %%r10, 1), %%r13 \n"
1056 -  "movzb  1(%2, %%r10, 1), %%r14 \n"
1057 -  "mov    %%r11, %%rax \n"
1058 -  "and    $0x1fffe, %%rax \n"
1059 -  "imul   %%rax, %%r14 \n"
1060 -  "xor    $0x1fffe, %%rax \n"
1061 -  "imul   %%rax, %%r13 \n"
1062 -  "add    %%r14, %%r13 \n"
1063 -  "shr    $17, %%r13 \n"
1064 -  "movq   4096(%5,%%r13,8), %%xmm1\n"
1065 -
1066 -  "mov    %%r11, %%rax \n"
1067 -  "lea    (%%r11,%6),%%r10\n"
1068 -  "sar    $0x10,%%r11\n"
1069 -  "paddsw %%xmm1,%%xmm0\n"
1070 -
1071 -  "movzb  (%0, %%r11, 1), %%r13 \n"
1072 -  "movzb  1(%0, %%r11, 1), %%r14 \n"
1073 -  "and    $0xffff, %%rax \n"
1074 -  "imul   %%rax, %%r14 \n"
1075 -  "xor    $0xffff, %%rax \n"
1076 -  "imul   %%rax, %%r13 \n"
1077 -  "add    %%r14, %%r13 \n"
1078 -  "shr    $16, %%r13 \n"
1079 -  "movq   (%5,%%r13,8),%%xmm1\n"
1080 -
1081 -  "mov    %%r10, %%rax \n"
1082 -  "lea    (%%r10,%6),%%r11\n"
1083 -  "sar    $0x10,%%r10\n"
1084 -
1085 -  "movzb  (%0,%%r10,1), %%r13 \n"
1086 -  "movzb  1(%0,%%r10,1), %%r14 \n"
1087 -  "and    $0xffff, %%rax \n"
1088 -  "imul   %%rax, %%r14 \n"
1089 -  "xor    $0xffff, %%rax \n"
1090 -  "imul   %%rax, %%r13 \n"
1091 -  "add    %%r14, %%r13 \n"
1092 -  "shr    $16, %%r13 \n"
1093 -  "movq   (%5,%%r13,8),%%xmm2\n"
1094 -
1095 -  "paddsw %%xmm0,%%xmm1\n"
1096 -  "paddsw %%xmm0,%%xmm2\n"
1097 -  "shufps $0x44,%%xmm2,%%xmm1\n"
1098 -  "psraw  $0x6,%%xmm1\n"
1099 -  "packuswb %%xmm1,%%xmm1\n"
1100 -  "movq   %%xmm1,0x0(%3)\n"
1101 -  "add    $0x8,%3\n"
1102 -  "sub    $0x2,%4\n"
1103 -  "jns    .lscaleloop\n"
1104 -
1105 -".lscalenext:"
1106 -  "add    $0x1,%4\n"
1107 -  "js     .lscaledone\n"
1108 -
1109 -  "mov    %%r11,%%r10\n"
1110 -  "sar    $0x11,%%r10\n"
1111 -
1112 -  "movzb  (%1,%%r10,1), %%r13 \n"
1113 -  "movq   2048(%5,%%r13,8),%%xmm0\n"
1114 -
1115 -  "movzb  (%2,%%r10,1), %%r13 \n"
1116 -  "movq   4096(%5,%%r13,8),%%xmm1\n"
1117 -
1118 -  "paddsw %%xmm1,%%xmm0\n"
1119 -  "sar    $0x10,%%r11\n"
1120 -
1121 -  "movzb  (%0,%%r11,1), %%r13 \n"
1122 -  "movq   (%5,%%r13,8),%%xmm1\n"
1123 -
1124 -  "paddsw %%xmm0,%%xmm1\n"
1125 -  "psraw  $0x6,%%xmm1\n"
1126 -  "packuswb %%xmm1,%%xmm1\n"
1127 -  "movd   %%xmm1,0x0(%3)\n"
1128 -
1129 -".lscaledone:"
1130 -  :
1131 -  : "r"(y_buf),  // %0
1132 -    "r"(u_buf),  // %1
1133 -    "r"(v_buf),  // %2
1134 -    "r"(rgb_buf),  // %3
1135 -    "r"(width),  // %4
1136 -    "r" (kCoefficientsRgbY),  // %5
1137 -    "r"(static_cast<long>(source_dx))  // %6
1138 -  : "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2"
1139 -);
1140 -}
1141 -
1142 -#elif USE_MMX && !defined(ARCH_CPU_X86_64) && !defined(__PIC__)
1143 -
1144 -// PIC version is slower because less registers are available, so
1145 -// non-PIC is used on platforms where it is possible.
1146 -
1147 -void FastConvertYUVToRGB32Row(const uint8* y_buf,
1148 -                              const uint8* u_buf,
1149 -                              const uint8* v_buf,
1150 -                              uint8* rgb_buf,
1151 -                              int width);
1152 -  asm(
1153 -  ".text\n"
1154 -  ".global FastConvertYUVToRGB32Row\n"
1155 -"FastConvertYUVToRGB32Row:\n"
1156 -  "pusha\n"
1157 -  "mov    0x24(%esp),%edx\n"
1158 -  "mov    0x28(%esp),%edi\n"
1159 -  "mov    0x2c(%esp),%esi\n"
1160 -  "mov    0x30(%esp),%ebp\n"
1161 -  "mov    0x34(%esp),%ecx\n"
1162 -  "jmp    convertend\n"
1163 -
1164 -"convertloop:"
1165 -  "movzbl (%edi),%eax\n"
1166 -  "add    $0x1,%edi\n"
1167 -  "movzbl (%esi),%ebx\n"
1168 -  "add    $0x1,%esi\n"
1169 -  "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
1170 -  "movzbl (%edx),%eax\n"
1171 -  "paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
1172 -  "movzbl 0x1(%edx),%ebx\n"
1173 -  "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
1174 -  "add    $0x2,%edx\n"
1175 -  "movq   kCoefficientsRgbY(,%ebx,8),%mm2\n"
1176 -  "paddsw %mm0,%mm1\n"
1177 -  "paddsw %mm0,%mm2\n"
1178 -  "psraw  $0x6,%mm1\n"
1179 -  "psraw  $0x6,%mm2\n"
1180 -  "packuswb %mm2,%mm1\n"
1181 -  "movntq %mm1,0x0(%ebp)\n"
1182 -  "add    $0x8,%ebp\n"
1183 -"convertend:"
1184 -  "sub    $0x2,%ecx\n"
1185 -  "jns    convertloop\n"
1186 -
1187 -  "and    $0x1,%ecx\n"
1188 -  "je     convertdone\n"
1189 -
1190 -  "movzbl (%edi),%eax\n"
1191 -  "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
1192 -  "movzbl (%esi),%eax\n"
1193 -  "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
1194 -  "movzbl (%edx),%eax\n"
1195 -  "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
1196 -  "paddsw %mm0,%mm1\n"
1197 -  "psraw  $0x6,%mm1\n"
1198 -  "packuswb %mm1,%mm1\n"
1199 -  "movd   %mm1,0x0(%ebp)\n"
1200 -"convertdone:"
1201 -  "popa\n"
1202 -  "ret\n"
1203 -);
1204 -
1205 -
1206 -void ScaleYUVToRGB32Row(const uint8* y_buf,
1207 -                        const uint8* u_buf,
1208 -                        const uint8* v_buf,
1209 -                        uint8* rgb_buf,
1210 -                        int width,
1211 -                        int source_dx);
1212 -  asm(
1213 -  ".text\n"
1214 -  ".global ScaleYUVToRGB32Row\n"
1215 -"ScaleYUVToRGB32Row:\n"
1216 -  "pusha\n"
1217 -  "mov    0x24(%esp),%edx\n"
1218 -  "mov    0x28(%esp),%edi\n"
1219 -  "mov    0x2c(%esp),%esi\n"
1220 -  "mov    0x30(%esp),%ebp\n"
1221 -  "mov    0x34(%esp),%ecx\n"
1222 -  "xor    %ebx,%ebx\n"
1223 -  "jmp    scaleend\n"
1224 -
1225 -"scaleloop:"
1226 -  "mov    %ebx,%eax\n"
1227 -  "sar    $0x11,%eax\n"
1228 -  "movzbl (%edi,%eax,1),%eax\n"
1229 -  "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
1230 -  "mov    %ebx,%eax\n"
1231 -  "sar    $0x11,%eax\n"
1232 -  "movzbl (%esi,%eax,1),%eax\n"
1233 -  "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
1234 -  "mov    %ebx,%eax\n"
1235 -  "add    0x38(%esp),%ebx\n"
1236 -  "sar    $0x10,%eax\n"
1237 -  "movzbl (%edx,%eax,1),%eax\n"
1238 -  "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
1239 -  "mov    %ebx,%eax\n"
1240 -  "add    0x38(%esp),%ebx\n"
1241 -  "sar    $0x10,%eax\n"
1242 -  "movzbl (%edx,%eax,1),%eax\n"
1243 -  "movq   kCoefficientsRgbY(,%eax,8),%mm2\n"
1244 -  "paddsw %mm0,%mm1\n"
1245 -  "paddsw %mm0,%mm2\n"
1246 -  "psraw  $0x6,%mm1\n"
1247 -  "psraw  $0x6,%mm2\n"
1248 -  "packuswb %mm2,%mm1\n"
1249 -  "movntq %mm1,0x0(%ebp)\n"
1250 -  "add    $0x8,%ebp\n"
1251 -"scaleend:"
1252 -  "sub    $0x2,%ecx\n"
1253 -  "jns    scaleloop\n"
1254 -
1255 -  "and    $0x1,%ecx\n"
1256 -  "je     scaledone\n"
1257 -
1258 -  "mov    %ebx,%eax\n"
1259 -  "sar    $0x11,%eax\n"
1260 -  "movzbl (%edi,%eax,1),%eax\n"
1261 -  "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
1262 -  "mov    %ebx,%eax\n"
1263 -  "sar    $0x11,%eax\n"
1264 -  "movzbl (%esi,%eax,1),%eax\n"
1265 -  "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
1266 -  "mov    %ebx,%eax\n"
1267 -  "sar    $0x10,%eax\n"
1268 -  "movzbl (%edx,%eax,1),%eax\n"
1269 -  "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
1270 -  "paddsw %mm0,%mm1\n"
1271 -  "psraw  $0x6,%mm1\n"
1272 -  "packuswb %mm1,%mm1\n"
1273 -  "movd   %mm1,0x0(%ebp)\n"
1274 -
1275 -"scaledone:"
1276 -  "popa\n"
1277 -  "ret\n"
1278 -);
1279 -
1280 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,
1281 -                              const uint8* u_buf,
1282 -                              const uint8* v_buf,
1283 -                              uint8* rgb_buf,
1284 -                              int width,
1285 -                              int source_dx);
1286 -  asm(
1287 -  ".text\n"
1288 -  ".global LinearScaleYUVToRGB32Row\n"
1289 -"LinearScaleYUVToRGB32Row:\n"
1290 -  "pusha\n"
1291 -  "mov    0x24(%esp),%edx\n"
1292 -  "mov    0x28(%esp),%edi\n"
1293 -  "mov    0x30(%esp),%ebp\n"
1294 -
1295 -  // source_width = width * source_dx + ebx
1296 -  "mov    0x34(%esp), %ecx\n"
1297 -  "imull  0x38(%esp), %ecx\n"
1298 -  "mov    %ecx, 0x34(%esp)\n"
1299 -
1300 -  "mov    0x38(%esp), %ecx\n"
1301 -  "xor    %ebx,%ebx\n"     // x = 0
1302 -  "cmp    $0x20000,%ecx\n" // if source_dx >= 2.0
1303 -  "jl     .lscaleend\n"
1304 -  "mov    $0x8000,%ebx\n"  // x = 0.5 for 1/2 or less
1305 -  "jmp    .lscaleend\n"
1306 -
1307 -".lscaleloop:"
1308 -  "mov    %ebx,%eax\n"
1309 -  "sar    $0x11,%eax\n"
1310 -
1311 -  "movzbl (%edi,%eax,1),%ecx\n"
1312 -  "movzbl 1(%edi,%eax,1),%esi\n"
1313 -  "mov    %ebx,%eax\n"
1314 -  "andl   $0x1fffe, %eax \n"
1315 -  "imul   %eax, %esi \n"
1316 -  "xorl   $0x1fffe, %eax \n"
1317 -  "imul   %eax, %ecx \n"
1318 -  "addl   %esi, %ecx \n"
1319 -  "shrl   $17, %ecx \n"
1320 -  "movq   kCoefficientsRgbY+2048(,%ecx,8),%mm0\n"
1321 -
1322 -  "mov    0x2c(%esp),%esi\n"
1323 -  "mov    %ebx,%eax\n"
1324 -  "sar    $0x11,%eax\n"
1325 -
1326 -  "movzbl (%esi,%eax,1),%ecx\n"
1327 -  "movzbl 1(%esi,%eax,1),%esi\n"
1328 -  "mov    %ebx,%eax\n"
1329 -  "andl   $0x1fffe, %eax \n"
1330 -  "imul   %eax, %esi \n"
1331 -  "xorl   $0x1fffe, %eax \n"
1332 -  "imul   %eax, %ecx \n"
1333 -  "addl   %esi, %ecx \n"
1334 -  "shrl   $17, %ecx \n"
1335 -  "paddsw kCoefficientsRgbY+4096(,%ecx,8),%mm0\n"
1336 -
1337 -  "mov    %ebx,%eax\n"
1338 -  "sar    $0x10,%eax\n"
1339 -  "movzbl (%edx,%eax,1),%ecx\n"
1340 -  "movzbl 1(%edx,%eax,1),%esi\n"
1341 -  "mov    %ebx,%eax\n"
1342 -  "add    0x38(%esp),%ebx\n"
1343 -  "andl   $0xffff, %eax \n"
1344 -  "imul   %eax, %esi \n"
1345 -  "xorl   $0xffff, %eax \n"
1346 -  "imul   %eax, %ecx \n"
1347 -  "addl   %esi, %ecx \n"
1348 -  "shrl   $16, %ecx \n"
1349 -  "movq   kCoefficientsRgbY(,%ecx,8),%mm1\n"
1350 -
1351 -  "cmp    0x34(%esp), %ebx\n"
1352 -  "jge    .lscalelastpixel\n"
1353 -
1354 -  "mov    %ebx,%eax\n"
1355 -  "sar    $0x10,%eax\n"
1356 -  "movzbl (%edx,%eax,1),%ecx\n"
1357 -  "movzbl 1(%edx,%eax,1),%esi\n"
1358 -  "mov    %ebx,%eax\n"
1359 -  "add    0x38(%esp),%ebx\n"
1360 -  "andl   $0xffff, %eax \n"
1361 -  "imul   %eax, %esi \n"
1362 -  "xorl   $0xffff, %eax \n"
1363 -  "imul   %eax, %ecx \n"
1364 -  "addl   %esi, %ecx \n"
1365 -  "shrl   $16, %ecx \n"
1366 -  "movq   kCoefficientsRgbY(,%ecx,8),%mm2\n"
1367 -
1368 -  "paddsw %mm0,%mm1\n"
1369 -  "paddsw %mm0,%mm2\n"
1370 -  "psraw  $0x6,%mm1\n"
1371 -  "psraw  $0x6,%mm2\n"
1372 -  "packuswb %mm2,%mm1\n"
1373 -  "movntq %mm1,0x0(%ebp)\n"
1374 -  "add    $0x8,%ebp\n"
1375 -
1376 -".lscaleend:"
1377 -  "cmp    0x34(%esp), %ebx\n"
1378 -  "jl     .lscaleloop\n"
1379 -  "popa\n"
1380 -  "ret\n"
1381 -
1382 -".lscalelastpixel:"
1383 -  "paddsw %mm0, %mm1\n"
1384 -  "psraw $6, %mm1\n"
1385 -  "packuswb %mm1, %mm1\n"
1386 -  "movd %mm1, (%ebp)\n"
1387 -  "popa\n"
1388 -  "ret\n"
1389 -);
1390 -
1391 -#elif USE_MMX && !defined(ARCH_CPU_X86_64) && defined(__PIC__)
1392 -
1393 -extern void PICConvertYUVToRGB32Row(const uint8* y_buf,
1394 -                                    const uint8* u_buf,
1395 -                                    const uint8* v_buf,
1396 -                                    uint8* rgb_buf,
1397 -                                    int width,
1398 -                                    int16 *kCoefficientsRgbY);
1399 -  asm(
1400 -  ".text\n"
1401 -#if defined(OS_MACOSX)
1402 -"_PICConvertYUVToRGB32Row:\n"
1403 -#else
1404 -"PICConvertYUVToRGB32Row:\n"
1405 -#endif
1406 -  "pusha\n"
1407 -  "mov    0x24(%esp),%edx\n"
1408 -  "mov    0x28(%esp),%edi\n"
1409 -  "mov    0x2c(%esp),%esi\n"
1410 -  "mov    0x30(%esp),%ebp\n"
1411 -  "mov    0x38(%esp),%ecx\n"
1412 -
1413 -  "jmp    .Lconvertend\n"
1414 -
1415 -".Lconvertloop:"
1416 -  "movzbl (%edi),%eax\n"
1417 -  "add    $0x1,%edi\n"
1418 -  "movzbl (%esi),%ebx\n"
1419 -  "add    $0x1,%esi\n"
1420 -  "movq   2048(%ecx,%eax,8),%mm0\n"
1421 -  "movzbl (%edx),%eax\n"
1422 -  "paddsw 4096(%ecx,%ebx,8),%mm0\n"
1423 -  "movzbl 0x1(%edx),%ebx\n"
1424 -  "movq   0(%ecx,%eax,8),%mm1\n"
1425 -  "add    $0x2,%edx\n"
1426 -  "movq   0(%ecx,%ebx,8),%mm2\n"
1427 -  "paddsw %mm0,%mm1\n"
1428 -  "paddsw %mm0,%mm2\n"
1429 -  "psraw  $0x6,%mm1\n"
1430 -  "psraw  $0x6,%mm2\n"
1431 -  "packuswb %mm2,%mm1\n"
1432 -  "movntq %mm1,0x0(%ebp)\n"
1433 -  "add    $0x8,%ebp\n"
1434 -".Lconvertend:"
1435 -  "subl   $0x2,0x34(%esp)\n"
1436 -  "jns    .Lconvertloop\n"
1437 -
1438 -  "andl   $0x1,0x34(%esp)\n"
1439 -  "je     .Lconvertdone\n"
1440 -
1441 -  "movzbl (%edi),%eax\n"
1442 -  "movq   2048(%ecx,%eax,8),%mm0\n"
1443 -  "movzbl (%esi),%eax\n"
1444 -  "paddsw 4096(%ecx,%eax,8),%mm0\n"
1445 -  "movzbl (%edx),%eax\n"
1446 -  "movq   0(%ecx,%eax,8),%mm1\n"
1447 -  "paddsw %mm0,%mm1\n"
1448 -  "psraw  $0x6,%mm1\n"
1449 -  "packuswb %mm1,%mm1\n"
1450 -  "movd   %mm1,0x0(%ebp)\n"
1451 -".Lconvertdone:\n"
1452 -  "popa\n"
1453 -  "ret\n"
1454 -);
1455 -
1456 -void FastConvertYUVToRGB32Row(const uint8* y_buf,
1457 -                              const uint8* u_buf,
1458 -                              const uint8* v_buf,
1459 -                              uint8* rgb_buf,
1460 -                              int width) {
1461 -  PICConvertYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width,
1462 -                          &kCoefficientsRgbY[0][0]);
1463 -}
1464 -
1465 -extern void PICScaleYUVToRGB32Row(const uint8* y_buf,
1466 -                               const uint8* u_buf,
1467 -                               const uint8* v_buf,
1468 -                               uint8* rgb_buf,
1469 -                               int width,
1470 -                               int source_dx,
1471 -                               int16 *kCoefficientsRgbY);
1472 -
1473 -  asm(
1474 -  ".text\n"
1475 -#if defined(OS_MACOSX)
1476 -"_PICScaleYUVToRGB32Row:\n"
1477 -#else
1478 -"PICScaleYUVToRGB32Row:\n"
1479 -#endif
1480 -  "pusha\n"
1481 -  "mov    0x24(%esp),%edx\n"
1482 -  "mov    0x28(%esp),%edi\n"
1483 -  "mov    0x2c(%esp),%esi\n"
1484 -  "mov    0x30(%esp),%ebp\n"
1485 -  "mov    0x3c(%esp),%ecx\n"
1486 -  "xor    %ebx,%ebx\n"
1487 -  "jmp    Lscaleend\n"
1488 -
1489 -"Lscaleloop:"
1490 -  "mov    %ebx,%eax\n"
1491 -  "sar    $0x11,%eax\n"
1492 -  "movzbl (%edi,%eax,1),%eax\n"
1493 -  "movq   2048(%ecx,%eax,8),%mm0\n"
1494 -  "mov    %ebx,%eax\n"
1495 -  "sar    $0x11,%eax\n"
1496 -  "movzbl (%esi,%eax,1),%eax\n"
1497 -  "paddsw 4096(%ecx,%eax,8),%mm0\n"
1498 -  "mov    %ebx,%eax\n"
1499 -  "add    0x38(%esp),%ebx\n"
1500 -  "sar    $0x10,%eax\n"
1501 -  "movzbl (%edx,%eax,1),%eax\n"
1502 -  "movq   0(%ecx,%eax,8),%mm1\n"
1503 -  "mov    %ebx,%eax\n"
1504 -  "add    0x38(%esp),%ebx\n"
1505 -  "sar    $0x10,%eax\n"
1506 -  "movzbl (%edx,%eax,1),%eax\n"
1507 -  "movq   0(%ecx,%eax,8),%mm2\n"
1508 -  "paddsw %mm0,%mm1\n"
1509 -  "paddsw %mm0,%mm2\n"
1510 -  "psraw  $0x6,%mm1\n"
1511 -  "psraw  $0x6,%mm2\n"
1512 -  "packuswb %mm2,%mm1\n"
1513 -  "movntq %mm1,0x0(%ebp)\n"
1514 -  "add    $0x8,%ebp\n"
1515 -"Lscaleend:"
1516 -  "subl   $0x2,0x34(%esp)\n"
1517 -  "jns    Lscaleloop\n"
1518 -
1519 -  "andl   $0x1,0x34(%esp)\n"
1520 -  "je     Lscaledone\n"
1521 -
1522 -  "mov    %ebx,%eax\n"
1523 -  "sar    $0x11,%eax\n"
1524 -  "movzbl (%edi,%eax,1),%eax\n"
1525 -  "movq   2048(%ecx,%eax,8),%mm0\n"
1526 -  "mov    %ebx,%eax\n"
1527 -  "sar    $0x11,%eax\n"
1528 -  "movzbl (%esi,%eax,1),%eax\n"
1529 -  "paddsw 4096(%ecx,%eax,8),%mm0\n"
1530 -  "mov    %ebx,%eax\n"
1531 -  "sar    $0x10,%eax\n"
1532 -  "movzbl (%edx,%eax,1),%eax\n"
1533 -  "movq   0(%ecx,%eax,8),%mm1\n"
1534 -  "paddsw %mm0,%mm1\n"
1535 -  "psraw  $0x6,%mm1\n"
1536 -  "packuswb %mm1,%mm1\n"
1537 -  "movd   %mm1,0x0(%ebp)\n"
1538 -
1539 -"Lscaledone:"
1540 -  "popa\n"
1541 -  "ret\n"
1542 -);
1543 -
1544 -
1545 -void ScaleYUVToRGB32Row(const uint8* y_buf,
1546 -                        const uint8* u_buf,
1547 -                        const uint8* v_buf,
1548 -                        uint8* rgb_buf,
1549 -                        int width,
1550 -                        int source_dx) {
1551 -  PICScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
1552 -                        &kCoefficientsRgbY[0][0]);
1553 -}
1554 -
1555 -void PICLinearScaleYUVToRGB32Row(const uint8* y_buf,
1556 -                                 const uint8* u_buf,
1557 -                                 const uint8* v_buf,
1558 -                                 uint8* rgb_buf,
1559 -                                 int width,
1560 -                                 int source_dx,
1561 -                                 int16 *kCoefficientsRgbY);
1562 -  asm(
1563 -  ".text\n"
1564 -#if defined(OS_MACOSX)
1565 -"_PICLinearScaleYUVToRGB32Row:\n"
1566 -#else
1567 -"PICLinearScaleYUVToRGB32Row:\n"
1568 -#endif
1569 -  "pusha\n"
1570 -  "mov    0x24(%esp),%edx\n"
1571 -  "mov    0x30(%esp),%ebp\n"
1572 -  "mov    0x34(%esp),%ecx\n"
1573 -  "mov    0x3c(%esp),%edi\n"
1574 -  "xor    %ebx,%ebx\n"
1575 -
1576 -  // source_width = width * source_dx + ebx
1577 -  "mov    0x34(%esp), %ecx\n"
1578 -  "imull  0x38(%esp), %ecx\n"
1579 -  "mov    %ecx, 0x34(%esp)\n"
1580 -
1581 -  "mov    0x38(%esp), %ecx\n"
1582 -  "xor    %ebx,%ebx\n"     // x = 0
1583 -  "cmp    $0x20000,%ecx\n" // if source_dx >= 2.0
1584 -  "jl     .lscaleend\n"
1585 -  "mov    $0x8000,%ebx\n"  // x = 0.5 for 1/2 or less
1586 -  "jmp    .lscaleend\n"
1587 -
1588 -".lscaleloop:"
1589 -  "mov    0x28(%esp),%esi\n"
1590 -  "mov    %ebx,%eax\n"
1591 -  "sar    $0x11,%eax\n"
1592 -
1593 -  "movzbl (%esi,%eax,1),%ecx\n"
1594 -  "movzbl 1(%esi,%eax,1),%esi\n"
1595 -  "mov    %ebx,%eax\n"
1596 -  "andl   $0x1fffe, %eax \n"
1597 -  "imul   %eax, %esi \n"
1598 -  "xorl   $0x1fffe, %eax \n"
1599 -  "imul   %eax, %ecx \n"
1600 -  "addl   %esi, %ecx \n"
1601 -  "shrl   $17, %ecx \n"
1602 -  "movq   2048(%edi,%ecx,8),%mm0\n"
1603 -
1604 -  "mov    0x2c(%esp),%esi\n"
1605 -  "mov    %ebx,%eax\n"
1606 -  "sar    $0x11,%eax\n"
1607 -
1608 -  "movzbl (%esi,%eax,1),%ecx\n"
1609 -  "movzbl 1(%esi,%eax,1),%esi\n"
1610 -  "mov    %ebx,%eax\n"
1611 -  "andl   $0x1fffe, %eax \n"
1612 -  "imul   %eax, %esi \n"
1613 -  "xorl   $0x1fffe, %eax \n"
1614 -  "imul   %eax, %ecx \n"
1615 -  "addl   %esi, %ecx \n"
1616 -  "shrl   $17, %ecx \n"
1617 -  "paddsw 4096(%edi,%ecx,8),%mm0\n"
1618 -
1619 -  "mov    %ebx,%eax\n"
1620 -  "sar    $0x10,%eax\n"
1621 -  "movzbl (%edx,%eax,1),%ecx\n"
1622 -  "movzbl 1(%edx,%eax,1),%esi\n"
1623 -  "mov    %ebx,%eax\n"
1624 -  "add    0x38(%esp),%ebx\n"
1625 -  "andl   $0xffff, %eax \n"
1626 -  "imul   %eax, %esi \n"
1627 -  "xorl   $0xffff, %eax \n"
1628 -  "imul   %eax, %ecx \n"
1629 -  "addl   %esi, %ecx \n"
1630 -  "shrl   $16, %ecx \n"
1631 -  "movq   (%edi,%ecx,8),%mm1\n"
1632 -
1633 -  "cmp    0x34(%esp), %ebx\n"
1634 -  "jge    .lscalelastpixel\n"
1635 -
1636 -  "mov    %ebx,%eax\n"
1637 -  "sar    $0x10,%eax\n"
1638 -  "movzbl (%edx,%eax,1),%ecx\n"
1639 -  "movzbl 1(%edx,%eax,1),%esi\n"
1640 -  "mov    %ebx,%eax\n"
1641 -  "add    0x38(%esp),%ebx\n"
1642 -  "andl   $0xffff, %eax \n"
1643 -  "imul   %eax, %esi \n"
1644 -  "xorl   $0xffff, %eax \n"
1645 -  "imul   %eax, %ecx \n"
1646 -  "addl   %esi, %ecx \n"
1647 -  "shrl   $16, %ecx \n"
1648 -  "movq   (%edi,%ecx,8),%mm2\n"
1649 -
1650 -  "paddsw %mm0,%mm1\n"
1651 -  "paddsw %mm0,%mm2\n"
1652 -  "psraw  $0x6,%mm1\n"
1653 -  "psraw  $0x6,%mm2\n"
1654 -  "packuswb %mm2,%mm1\n"
1655 -  "movntq %mm1,0x0(%ebp)\n"
1656 -  "add    $0x8,%ebp\n"
1657 -
1658 -".lscaleend:"
1659 -  "cmp    %ebx, 0x34(%esp)\n"
1660 -  "jg     .lscaleloop\n"
1661 -  "popa\n"
1662 -  "ret\n"
1663 -
1664 -".lscalelastpixel:"
1665 -  "paddsw %mm0, %mm1\n"
1666 -  "psraw $6, %mm1\n"
1667 -  "packuswb %mm1, %mm1\n"
1668 -  "movd %mm1, (%ebp)\n"
1669 -  "popa\n"
1670 -  "ret\n"
1671 -);
1672 -
1673 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,
1674 -                        const uint8* u_buf,
1675 -                        const uint8* v_buf,
1676 -                        uint8* rgb_buf,
1677 -                        int width,
1678 -                        int source_dx) {
1679 -  PICLinearScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
1680 -                              &kCoefficientsRgbY[0][0]);
1681 -}
1682 -
1683 -#else  // USE_MMX
1684 -
1685  // C reference code that mimic the YUV assembly.
1686  #define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
1687  #define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
1688      (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
1689
1690  static inline void YuvPixel(uint8 y,
1691                              uint8 u,
1692                              uint8 v,
1693 @@ -833,66 +39,71 @@ static inline void YuvPixel(uint8 y,
1694    a >>= 6;
1695
1696    *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) |
1697                                          (packuswb(g) << 8) |
1698                                          (packuswb(r) << 16) |
1699                                          (packuswb(a) << 24);
1700  }
1701
1702 -void FastConvertYUVToRGB32Row(const uint8* y_buf,
1703 -                              const uint8* u_buf,
1704 -                              const uint8* v_buf,
1705 -                              uint8* rgb_buf,
1706 -                              int width) {
1707 +void FastConvertYUVToRGB32Row_C(const uint8* y_buf,
1708 +                                const uint8* u_buf,
1709 +                                const uint8* v_buf,
1710 +                                uint8* rgb_buf,
1711 +                                int width,
1712 +                                unsigned int x_shift) {
1713    for (int x = 0; x < width; x += 2) {
1714 -    uint8 u = u_buf[x >> 1];
1715 -    uint8 v = v_buf[x >> 1];
1716 +    uint8 u = u_buf[x >> x_shift];
1717 +    uint8 v = v_buf[x >> x_shift];
1718      uint8 y0 = y_buf[x];
1719      YuvPixel(y0, u, v, rgb_buf);
1720      if ((x + 1) < width) {
1721        uint8 y1 = y_buf[x + 1];
1722 +      if (x_shift == 0) {
1723 +        u = u_buf[x + 1];
1724 +        v = v_buf[x + 1];
1725 +      }
1726        YuvPixel(y1, u, v, rgb_buf + 4);
1727      }
1728      rgb_buf += 8;  // Advance 2 pixels.
1729    }
1730  }
1731
1732  // 16.16 fixed point is used.  A shift by 16 isolates the integer.
1733  // A shift by 17 is used to further subsample the chrominence channels.
1734  // & 0xffff isolates the fixed point fraction.  >> 2 to get the upper 2 bits,
1735  // for 1/65536 pixel accurate interpolation.
1736 -void ScaleYUVToRGB32Row(const uint8* y_buf,
1737 -                        const uint8* u_buf,
1738 -                        const uint8* v_buf,
1739 -                        uint8* rgb_buf,
1740 -                        int width,
1741 -                        int source_dx) {
1742 +void ScaleYUVToRGB32Row_C(const uint8* y_buf,
1743 +                          const uint8* u_buf,
1744 +                          const uint8* v_buf,
1745 +                          uint8* rgb_buf,
1746 +                          int width,
1747 +                          int source_dx) {
1748    int x = 0;
1749    for (int i = 0; i < width; i += 2) {
1750      int y = y_buf[x >> 16];
1751      int u = u_buf[(x >> 17)];
1752      int v = v_buf[(x >> 17)];
1753      YuvPixel(y, u, v, rgb_buf);
1754      x += source_dx;
1755      if ((i + 1) < width) {
1756        y = y_buf[x >> 16];
1757        YuvPixel(y, u, v, rgb_buf+4);
1758        x += source_dx;
1759      }
1760      rgb_buf += 8;
1761    }
1762  }
1763
1764 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,
1765 -                              const uint8* u_buf,
1766 -                              const uint8* v_buf,
1767 -                              uint8* rgb_buf,
1768 -                              int width,
1769 -                              int source_dx) {
1770 +void LinearScaleYUVToRGB32Row_C(const uint8* y_buf,
1771 +                                const uint8* u_buf,
1772 +                                const uint8* v_buf,
1773 +                                uint8* rgb_buf,
1774 +                                int width,
1775 +                                int source_dx) {
1776    int x = 0;
1777    if (source_dx >= 0x20000) {
1778      x = 32768;
1779    }
1780    for (int i = 0; i < width; i += 2) {
1781      int y0 = y_buf[x >> 16];
1782      int y1 = y_buf[(x >> 16) + 1];
1783      int u0 = u_buf[(x >> 17)];
1784 @@ -913,11 +124,10 @@ void LinearScaleYUVToRGB32Row(const uint
1785        y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
1786        YuvPixel(y, u, v, rgb_buf+4);
1787        x += source_dx;
1788      }
1789      rgb_buf += 8;
1790    }
1791  }
1792
1793 -#endif  // USE_MMX
1794  }  // extern "C"
1795
1796 diff --git a/gfx/ycbcr/yuv_row_posix.cpp b/gfx/ycbcr/yuv_row_posix.cpp
1797 --- a/gfx/ycbcr/yuv_row_posix.cpp
1798 +++ b/gfx/ycbcr/yuv_row_posix.cpp
1799 @@ -1,33 +1,32 @@
1800  // Copyright (c) 2010 The Chromium Authors. All rights reserved.
1801  // Use of this source code is governed by a BSD-style license that can be
1802  // found in the LICENSE file.
1803
1804 -#include "media/base/yuv_row.h"
1805 -
1806 -#ifdef _DEBUG
1807 -#include "base/logging.h"
1808 -#else
1809 +#include "yuv_row.h"
1810 +#include "mozilla/SSE.h"
1811 +
1812  #define DCHECK(a)
1813 -#endif
1814
1815  extern "C" {
1816
1817 -#if USE_SSE2 && defined(ARCH_CPU_X86_64)
1818 +#if defined(ARCH_CPU_X86_64)
1819 +
1820 +// We don't need CPUID guards here, since x86-64 implies SSE2.
1821
1822  // AMD64 ABI uses register paremters.
1823  void FastConvertYUVToRGB32Row(const uint8* y_buf,  // rdi
1824                                const uint8* u_buf,  // rsi
1825                                const uint8* v_buf,  // rdx
1826                                uint8* rgb_buf,      // rcx
1827                                int width) {         // r8
1828    asm(
1829 -  "jmp    convertend\n"
1830 -"convertloop:"
1831 +  "jmp    1f\n"
1832 +"0:"
1833    "movzb  (%1),%%r10\n"
1834    "add    $0x1,%1\n"
1835    "movzb  (%2),%%r11\n"
1836    "add    $0x1,%2\n"
1837    "movq   2048(%5,%%r10,8),%%xmm0\n"
1838    "movzb  (%0),%%r10\n"
1839    "movq   4096(%5,%%r11,8),%%xmm1\n"
1840    "movzb  0x1(%0),%%r11\n"
1841 @@ -37,36 +36,36 @@ void FastConvertYUVToRGB32Row(const uint
1842    "movq   (%5,%%r11,8),%%xmm3\n"
1843    "paddsw %%xmm0,%%xmm2\n"
1844    "paddsw %%xmm0,%%xmm3\n"
1845    "shufps $0x44,%%xmm3,%%xmm2\n"
1846    "psraw  $0x6,%%xmm2\n"
1847    "packuswb %%xmm2,%%xmm2\n"
1848    "movq   %%xmm2,0x0(%3)\n"
1849    "add    $0x8,%3\n"
1850 -"convertend:"
1851 +"1:"
1852    "sub    $0x2,%4\n"
1853 -  "jns    convertloop\n"
1854 -
1855 -"convertnext:"
1856 +  "jns    0b\n"
1857 +
1858 +"2:"
1859    "add    $0x1,%4\n"
1860 -  "js     convertdone\n"
1861 +  "js     3f\n"
1862
1863    "movzb  (%1),%%r10\n"
1864    "movq   2048(%5,%%r10,8),%%xmm0\n"
1865    "movzb  (%2),%%r10\n"
1866    "movq   4096(%5,%%r10,8),%%xmm1\n"
1867    "paddsw %%xmm1,%%xmm0\n"
1868    "movzb  (%0),%%r10\n"
1869    "movq   (%5,%%r10,8),%%xmm1\n"
1870    "paddsw %%xmm0,%%xmm1\n"
1871    "psraw  $0x6,%%xmm1\n"
1872    "packuswb %%xmm1,%%xmm1\n"
1873    "movd   %%xmm1,0x0(%3)\n"
1874 -"convertdone:"
1875 +"3:"
1876    :
1877    : "r"(y_buf),  // %0
1878      "r"(u_buf),  // %1
1879      "r"(v_buf),  // %2
1880      "r"(rgb_buf),  // %3
1881      "r"(width),  // %4
1882      "r" (kCoefficientsRgbY)  // %5
1883    : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
1884 @@ -77,19 +76,19 @@ void ScaleYUVToRGB32Row(const uint8* y_b
1885                          const uint8* u_buf,  // rsi
1886                          const uint8* v_buf,  // rdx
1887                          uint8* rgb_buf,      // rcx
1888                          int width,           // r8
1889                          int source_dx) {     // r9
1890    asm(
1891    "xor    %%r11,%%r11\n"
1892    "sub    $0x2,%4\n"
1893 -  "js     scalenext\n"
1894 -
1895 -"scaleloop:"
1896 +  "js     1f\n"
1897 +
1898 +"0:"
1899    "mov    %%r11,%%r10\n"
1900    "sar    $0x11,%%r10\n"
1901    "movzb  (%1,%%r10,1),%%rax\n"
1902    "movq   2048(%5,%%rax,8),%%xmm0\n"
1903    "movzb  (%2,%%r10,1),%%rax\n"
1904    "movq   4096(%5,%%rax,8),%%xmm1\n"
1905    "lea    (%%r11,%6),%%r10\n"
1906    "sar    $0x10,%%r11\n"
1907 @@ -103,38 +102,38 @@ void ScaleYUVToRGB32Row(const uint8* y_b
1908    "paddsw %%xmm0,%%xmm1\n"
1909    "paddsw %%xmm0,%%xmm2\n"
1910    "shufps $0x44,%%xmm2,%%xmm1\n"
1911    "psraw  $0x6,%%xmm1\n"
1912    "packuswb %%xmm1,%%xmm1\n"
1913    "movq   %%xmm1,0x0(%3)\n"
1914    "add    $0x8,%3\n"
1915    "sub    $0x2,%4\n"
1916 -  "jns    scaleloop\n"
1917 -
1918 -"scalenext:"
1919 +  "jns    0b\n"
1920 +
1921 +"1:"
1922    "add    $0x1,%4\n"
1923 -  "js     scaledone\n"
1924 +  "js     2f\n"
1925
1926    "mov    %%r11,%%r10\n"
1927    "sar    $0x11,%%r10\n"
1928    "movzb  (%1,%%r10,1),%%rax\n"
1929    "movq   2048(%5,%%rax,8),%%xmm0\n"
1930    "movzb  (%2,%%r10,1),%%rax\n"
1931    "movq   4096(%5,%%rax,8),%%xmm1\n"
1932    "paddsw %%xmm1,%%xmm0\n"
1933    "sar    $0x10,%%r11\n"
1934    "movzb  (%0,%%r11,1),%%rax\n"
1935    "movq   (%5,%%rax,8),%%xmm1\n"
1936    "paddsw %%xmm0,%%xmm1\n"
1937    "psraw  $0x6,%%xmm1\n"
1938    "packuswb %%xmm1,%%xmm1\n"
1939    "movd   %%xmm1,0x0(%3)\n"
1940
1941 -"scaledone:"
1942 +"2:"
1943    :
1944    : "r"(y_buf),  // %0
1945      "r"(u_buf),  // %1
1946      "r"(v_buf),  // %2
1947      "r"(rgb_buf),  // %3
1948      "r"(width),  // %4
1949      "r" (kCoefficientsRgbY),  // %5
1950      "r"(static_cast<long>(source_dx))  // %6
1951 @@ -146,23 +145,23 @@ void LinearScaleYUVToRGB32Row(const uint
1952                                const uint8* u_buf,
1953                                const uint8* v_buf,
1954                                uint8* rgb_buf,
1955                                int width,
1956                                int source_dx) {
1957    asm(
1958    "xor    %%r11,%%r11\n"   // x = 0
1959    "sub    $0x2,%4\n"
1960 -  "js     .lscalenext\n"
1961 +  "js     2f\n"
1962    "cmp    $0x20000,%6\n"   // if source_dx >= 2.0
1963 -  "jl     .lscalehalf\n"
1964 +  "jl     0f\n"
1965    "mov    $0x8000,%%r11\n" // x = 0.5 for 1/2 or less
1966 -".lscalehalf:"
1967 -
1968 -".lscaleloop:"
1969 +"0:"
1970 +
1971 +"1:"
1972    "mov    %%r11,%%r10\n"
1973    "sar    $0x11,%%r10\n"
1974
1975    "movzb  (%1, %%r10, 1), %%r13 \n"
1976    "movzb  1(%1, %%r10, 1), %%r14 \n"
1977    "mov    %%r11, %%rax \n"
1978    "and    $0x1fffe, %%rax \n"
1979    "imul   %%rax, %%r14 \n"
1980 @@ -215,21 +214,21 @@ void LinearScaleYUVToRGB32Row(const uint
1981    "paddsw %%xmm0,%%xmm1\n"
1982    "paddsw %%xmm0,%%xmm2\n"
1983    "shufps $0x44,%%xmm2,%%xmm1\n"
1984    "psraw  $0x6,%%xmm1\n"
1985    "packuswb %%xmm1,%%xmm1\n"
1986    "movq   %%xmm1,0x0(%3)\n"
1987    "add    $0x8,%3\n"
1988    "sub    $0x2,%4\n"
1989 -  "jns    .lscaleloop\n"
1990 -
1991 -".lscalenext:"
1992 +  "jns    1b\n"
1993 +
1994 +"2:"
1995    "add    $0x1,%4\n"
1996 -  "js     .lscaledone\n"
1997 +  "js     3f\n"
1998
1999    "mov    %%r11,%%r10\n"
2000    "sar    $0x11,%%r10\n"
2001
2002    "movzb  (%1,%%r10,1), %%r13 \n"
2003    "movq   2048(%5,%%r13,8),%%xmm0\n"
2004
2005    "movzb  (%2,%%r10,1), %%r13 \n"
2006 @@ -241,52 +240,52 @@ void LinearScaleYUVToRGB32Row(const uint
2007    "movzb  (%0,%%r11,1), %%r13 \n"
2008    "movq   (%5,%%r13,8),%%xmm1\n"
2009
2010    "paddsw %%xmm0,%%xmm1\n"
2011    "psraw  $0x6,%%xmm1\n"
2012    "packuswb %%xmm1,%%xmm1\n"
2013    "movd   %%xmm1,0x0(%3)\n"
2014
2015 -".lscaledone:"
2016 +"3:"
2017    :
2018    : "r"(y_buf),  // %0
2019      "r"(u_buf),  // %1
2020      "r"(v_buf),  // %2
2021      "r"(rgb_buf),  // %3
2022      "r"(width),  // %4
2023      "r" (kCoefficientsRgbY),  // %5
2024      "r"(static_cast<long>(source_dx))  // %6
2025    : "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2"
2026  );
2027  }
2028
2029 -#elif USE_MMX && !defined(ARCH_CPU_X86_64) && !defined(__PIC__)
2030 +#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && !defined(__PIC__)
2031
2032  // PIC version is slower because less registers are available, so
2033  // non-PIC is used on platforms where it is possible.
2034 -
2035 -void FastConvertYUVToRGB32Row(const uint8* y_buf,
2036 -                              const uint8* u_buf,
2037 -                              const uint8* v_buf,
2038 -                              uint8* rgb_buf,
2039 -                              int width);
2040 +void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf,
2041 +                                  const uint8* u_buf,
2042 +                                  const uint8* v_buf,
2043 +                                  uint8* rgb_buf,
2044 +                                  int width);
2045    asm(
2046    ".text\n"
2047 -  ".global FastConvertYUVToRGB32Row\n"
2048 -"FastConvertYUVToRGB32Row:\n"
2049 +  ".global FastConvertYUVToRGB32Row_SSE\n"
2050 +  ".type FastConvertYUVToRGB32Row_SSE, @function\n"
2051 +"FastConvertYUVToRGB32Row_SSE:\n"
2052    "pusha\n"
2053    "mov    0x24(%esp),%edx\n"
2054    "mov    0x28(%esp),%edi\n"
2055    "mov    0x2c(%esp),%esi\n"
2056    "mov    0x30(%esp),%ebp\n"
2057    "mov    0x34(%esp),%ecx\n"
2058 -  "jmp    convertend\n"
2059 -
2060 -"convertloop:"
2061 +  "jmp    1f\n"
2062 +
2063 +"0:"
2064    "movzbl (%edi),%eax\n"
2065    "add    $0x1,%edi\n"
2066    "movzbl (%esi),%ebx\n"
2067    "add    $0x1,%esi\n"
2068    "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
2069    "movzbl (%edx),%eax\n"
2070    "paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
2071    "movzbl 0x1(%edx),%ebx\n"
2072 @@ -295,59 +294,77 @@ void FastConvertYUVToRGB32Row(const uint
2073    "movq   kCoefficientsRgbY(,%ebx,8),%mm2\n"
2074    "paddsw %mm0,%mm1\n"
2075    "paddsw %mm0,%mm2\n"
2076    "psraw  $0x6,%mm1\n"
2077    "psraw  $0x6,%mm2\n"
2078    "packuswb %mm2,%mm1\n"
2079    "movntq %mm1,0x0(%ebp)\n"
2080    "add    $0x8,%ebp\n"
2081 -"convertend:"
2082 +"1:"
2083    "sub    $0x2,%ecx\n"
2084 -  "jns    convertloop\n"
2085 +  "jns    0b\n"
2086
2087    "and    $0x1,%ecx\n"
2088 -  "je     convertdone\n"
2089 +  "je     2f\n"
2090
2091    "movzbl (%edi),%eax\n"
2092    "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
2093    "movzbl (%esi),%eax\n"
2094    "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
2095    "movzbl (%edx),%eax\n"
2096    "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
2097    "paddsw %mm0,%mm1\n"
2098    "psraw  $0x6,%mm1\n"
2099    "packuswb %mm1,%mm1\n"
2100    "movd   %mm1,0x0(%ebp)\n"
2101 -"convertdone:"
2102 +"2:"
2103    "popa\n"
2104    "ret\n"
2105 +#if !defined(XP_MACOSX)
2106 +  ".previous\n"
2107 +#endif
2108  );
2109
2110 -
2111 -void ScaleYUVToRGB32Row(const uint8* y_buf,
2112 -                        const uint8* u_buf,
2113 -                        const uint8* v_buf,
2114 -                        uint8* rgb_buf,
2115 -                        int width,
2116 -                        int source_dx);
2117 +void FastConvertYUVToRGB32Row(const uint8* y_buf,
2118 +                              const uint8* u_buf,
2119 +                              const uint8* v_buf,
2120 +                              uint8* rgb_buf,
2121 +                              int width)
2122 +{
2123 +  if (mozilla::supports_sse()) {
2124 +    FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width);
2125 +    return;  // Handled by the assembly routine; skip the C fallback.
2126 +  }
2127 +  // SSE not available at runtime: use the C row converter with x_shift = 1.
2128 +  FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
2129 +}
2130 +
2131 +
2132 +void ScaleYUVToRGB32Row_SSE(const uint8* y_buf,
2133 +                            const uint8* u_buf,
2134 +                            const uint8* v_buf,
2135 +                            uint8* rgb_buf,
2136 +                            int width,
2137 +                            int source_dx);
2138    asm(
2139    ".text\n"
2140 -  ".global ScaleYUVToRGB32Row\n"
2141 -"ScaleYUVToRGB32Row:\n"
2142 +  ".global ScaleYUVToRGB32Row_SSE\n"
2143 +  ".type ScaleYUVToRGB32Row_SSE, @function\n"
2144 +"ScaleYUVToRGB32Row_SSE:\n"
2145    "pusha\n"
2146    "mov    0x24(%esp),%edx\n"
2147    "mov    0x28(%esp),%edi\n"
2148    "mov    0x2c(%esp),%esi\n"
2149    "mov    0x30(%esp),%ebp\n"
2150    "mov    0x34(%esp),%ecx\n"
2151    "xor    %ebx,%ebx\n"
2152 -  "jmp    scaleend\n"
2153 -
2154 -"scaleloop:"
2155 +  "jmp    1f\n"
2156 +
2157 +"0:"
2158    "mov    %ebx,%eax\n"
2159    "sar    $0x11,%eax\n"
2160    "movzbl (%edi,%eax,1),%eax\n"
2161    "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
2162    "mov    %ebx,%eax\n"
2163    "sar    $0x11,%eax\n"
2164    "movzbl (%esi,%eax,1),%eax\n"
2165    "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
2166 @@ -363,22 +380,22 @@ void ScaleYUVToRGB32Row(const uint8* y_b
2167    "movq   kCoefficientsRgbY(,%eax,8),%mm2\n"
2168    "paddsw %mm0,%mm1\n"
2169    "paddsw %mm0,%mm2\n"
2170    "psraw  $0x6,%mm1\n"
2171    "psraw  $0x6,%mm2\n"
2172    "packuswb %mm2,%mm1\n"
2173    "movntq %mm1,0x0(%ebp)\n"
2174    "add    $0x8,%ebp\n"
2175 -"scaleend:"
2176 +"1:"
2177    "sub    $0x2,%ecx\n"
2178 -  "jns    scaleloop\n"
2179 +  "jns    0b\n"
2180
2181    "and    $0x1,%ecx\n"
2182 -  "je     scaledone\n"
2183 +  "je     2f\n"
2184
2185    "mov    %ebx,%eax\n"
2186    "sar    $0x11,%eax\n"
2187    "movzbl (%edi,%eax,1),%eax\n"
2188    "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
2189    "mov    %ebx,%eax\n"
2190    "sar    $0x11,%eax\n"
2191    "movzbl (%esi,%eax,1),%eax\n"
2192 @@ -387,51 +404,71 @@ void ScaleYUVToRGB32Row(const uint8* y_b
2193    "sar    $0x10,%eax\n"
2194    "movzbl (%edx,%eax,1),%eax\n"
2195    "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
2196    "paddsw %mm0,%mm1\n"
2197    "psraw  $0x6,%mm1\n"
2198    "packuswb %mm1,%mm1\n"
2199    "movd   %mm1,0x0(%ebp)\n"
2200
2201 -"scaledone:"
2202 +"2:"
2203    "popa\n"
2204    "ret\n"
2205 +#if !defined(XP_MACOSX)
2206 +  ".previous\n"
2207 +#endif
2208  );
2209
2210 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,
2211 -                              const uint8* u_buf,
2212 -                              const uint8* v_buf,
2213 -                              uint8* rgb_buf,
2214 -                              int width,
2215 -                              int source_dx);
2216 +void ScaleYUVToRGB32Row(const uint8* y_buf,
2217 +                        const uint8* u_buf,
2218 +                        const uint8* v_buf,
2219 +                        uint8* rgb_buf,
2220 +                        int width,
2221 +                        int source_dx)
2222 +{
2223 +  if (mozilla::supports_sse()) {
2224 +    ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf,
2225 +                           width, source_dx);
2226 +    return;  // Handled by the assembly routine; skip the C fallback.
2227 +  }
2228 +  // SSE not available at runtime: use the portable C implementation.
2229 +  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
2230 +}
2231 +
2232 +void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
2233 +                                  const uint8* u_buf,
2234 +                                  const uint8* v_buf,
2235 +                                  uint8* rgb_buf,
2236 +                                  int width,
2237 +                                  int source_dx);
2238    asm(
2239    ".text\n"
2240 -  ".global LinearScaleYUVToRGB32Row\n"
2241 -"LinearScaleYUVToRGB32Row:\n"
2242 +  ".global LinearScaleYUVToRGB32Row_SSE\n"
2243 +  ".type LinearScaleYUVToRGB32Row_SSE, @function\n"
2244 +"LinearScaleYUVToRGB32Row_SSE:\n"
2245    "pusha\n"
2246    "mov    0x24(%esp),%edx\n"
2247    "mov    0x28(%esp),%edi\n"
2248    "mov    0x30(%esp),%ebp\n"
2249
2250    // source_width = width * source_dx + ebx
2251    "mov    0x34(%esp), %ecx\n"
2252    "imull  0x38(%esp), %ecx\n"
2253    "mov    %ecx, 0x34(%esp)\n"
2254
2255    "mov    0x38(%esp), %ecx\n"
2256    "xor    %ebx,%ebx\n"     // x = 0
2257    "cmp    $0x20000,%ecx\n" // if source_dx >= 2.0
2258 -  "jl     .lscaleend\n"
2259 +  "jl     1f\n"
2260    "mov    $0x8000,%ebx\n"  // x = 0.5 for 1/2 or less
2261 -  "jmp    .lscaleend\n"
2262 -
2263 -".lscaleloop:"
2264 -  "mov    %ebx,%eax\n"
2265 -  "sar    $0x11,%eax\n"
2266 +  "jmp    1f\n"
2267 +
2268 +"0:"
2269 +  "mov    %ebx,%eax\n"
2270 +  "sar    $0x11,%eax\n"
2271
2272    "movzbl (%edi,%eax,1),%ecx\n"
2273    "movzbl 1(%edi,%eax,1),%esi\n"
2274    "mov    %ebx,%eax\n"
2275    "andl   $0x1fffe, %eax \n"
2276    "imul   %eax, %esi \n"
2277    "xorl   $0x1fffe, %eax \n"
2278    "imul   %eax, %ecx \n"
2279 @@ -464,17 +501,17 @@ void LinearScaleYUVToRGB32Row(const uint
2280    "imul   %eax, %esi \n"
2281    "xorl   $0xffff, %eax \n"
2282    "imul   %eax, %ecx \n"
2283    "addl   %esi, %ecx \n"
2284    "shrl   $16, %ecx \n"
2285    "movq   kCoefficientsRgbY(,%ecx,8),%mm1\n"
2286
2287    "cmp    0x34(%esp), %ebx\n"
2288 -  "jge    .lscalelastpixel\n"
2289 +  "jge    2f\n"
2290
2291    "mov    %ebx,%eax\n"
2292    "sar    $0x10,%eax\n"
2293    "movzbl (%edx,%eax,1),%ecx\n"
2294    "movzbl 1(%edx,%eax,1),%esi\n"
2295    "mov    %ebx,%eax\n"
2296    "add    0x38(%esp),%ebx\n"
2297    "andl   $0xffff, %eax \n"
2298 @@ -488,56 +525,76 @@ void LinearScaleYUVToRGB32Row(const uint
2299    "paddsw %mm0,%mm1\n"
2300    "paddsw %mm0,%mm2\n"
2301    "psraw  $0x6,%mm1\n"
2302    "psraw  $0x6,%mm2\n"
2303    "packuswb %mm2,%mm1\n"
2304    "movntq %mm1,0x0(%ebp)\n"
2305    "add    $0x8,%ebp\n"
2306
2307 -".lscaleend:"
2308 +"1:"
2309    "cmp    0x34(%esp), %ebx\n"
2310 -  "jl     .lscaleloop\n"
2311 +  "jl     0b\n"
2312    "popa\n"
2313    "ret\n"
2314
2315 -".lscalelastpixel:"
2316 +"2:"
2317    "paddsw %mm0, %mm1\n"
2318    "psraw $6, %mm1\n"
2319    "packuswb %mm1, %mm1\n"
2320    "movd %mm1, (%ebp)\n"
2321    "popa\n"
2322    "ret\n"
2323 +#if !defined(XP_MACOSX)
2324 +  ".previous\n"
2325 +#endif
2326  );
2327
2328 -#elif USE_MMX && !defined(ARCH_CPU_X86_64) && defined(__PIC__)
2329 -
2330 -extern void PICConvertYUVToRGB32Row(const uint8* y_buf,
2331 -                                    const uint8* u_buf,
2332 -                                    const uint8* v_buf,
2333 -                                    uint8* rgb_buf,
2334 -                                    int width,
2335 -                                    int16 *kCoefficientsRgbY);
2336 +void LinearScaleYUVToRGB32Row(const uint8* y_buf,
2337 +                              const uint8* u_buf,
2338 +                              const uint8* v_buf,
2339 +                              uint8* rgb_buf,
2340 +                              int width,
2341 +                              int source_dx)
2342 +{
2343 +  if (mozilla::supports_sse()) {
2344 +    LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf,
2345 +                                 width, source_dx);
2346 +    return;  // Handled by the assembly routine; skip the C fallback.
2347 +  }
2348 +  // SSE not available at runtime: use the portable C implementation.
2349 +  LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
2350 +}
2351 +
2352 +#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && defined(__PIC__)
2353 +
2354 +void PICConvertYUVToRGB32Row_SSE(const uint8* y_buf,
2355 +                                 const uint8* u_buf,
2356 +                                 const uint8* v_buf,
2357 +                                 uint8* rgb_buf,
2358 +                                 int width,
2359 +                                 int16 *kCoefficientsRgbY);
2360 +
2361    asm(
2362    ".text\n"
2363 -#if defined(OS_MACOSX)
2364 -"_PICConvertYUVToRGB32Row:\n"
2365 +#if defined(XP_MACOSX)
2366 +"_PICConvertYUVToRGB32Row_SSE:\n"
2367  #else
2368 -"PICConvertYUVToRGB32Row:\n"
2369 +"PICConvertYUVToRGB32Row_SSE:\n"
2370  #endif
2371    "pusha\n"
2372    "mov    0x24(%esp),%edx\n"
2373    "mov    0x28(%esp),%edi\n"
2374    "mov    0x2c(%esp),%esi\n"
2375    "mov    0x30(%esp),%ebp\n"
2376    "mov    0x38(%esp),%ecx\n"
2377
2378 -  "jmp    .Lconvertend\n"
2379 -
2380 -".Lconvertloop:"
2381 +  "jmp    1f\n"
2382 +
2383 +"0:"
2384    "movzbl (%edi),%eax\n"
2385    "add    $0x1,%edi\n"
2386    "movzbl (%esi),%ebx\n"
2387    "add    $0x1,%esi\n"
2388    "movq   2048(%ecx,%eax,8),%mm0\n"
2389    "movzbl (%edx),%eax\n"
2390    "paddsw 4096(%ecx,%ebx,8),%mm0\n"
2391    "movzbl 0x1(%edx),%ebx\n"
2392 @@ -546,72 +603,81 @@ extern void PICConvertYUVToRGB32Row(cons
2393    "movq   0(%ecx,%ebx,8),%mm2\n"
2394    "paddsw %mm0,%mm1\n"
2395    "paddsw %mm0,%mm2\n"
2396    "psraw  $0x6,%mm1\n"
2397    "psraw  $0x6,%mm2\n"
2398    "packuswb %mm2,%mm1\n"
2399    "movntq %mm1,0x0(%ebp)\n"
2400    "add    $0x8,%ebp\n"
2401 -".Lconvertend:"
2402 +"1:"
2403    "subl   $0x2,0x34(%esp)\n"
2404 -  "jns    .Lconvertloop\n"
2405 +  "jns    0b\n"
2406
2407    "andl   $0x1,0x34(%esp)\n"
2408 -  "je     .Lconvertdone\n"
2409 +  "je     2f\n"
2410
2411    "movzbl (%edi),%eax\n"
2412    "movq   2048(%ecx,%eax,8),%mm0\n"
2413    "movzbl (%esi),%eax\n"
2414    "paddsw 4096(%ecx,%eax,8),%mm0\n"
2415    "movzbl (%edx),%eax\n"
2416    "movq   0(%ecx,%eax,8),%mm1\n"
2417    "paddsw %mm0,%mm1\n"
2418    "psraw  $0x6,%mm1\n"
2419    "packuswb %mm1,%mm1\n"
2420    "movd   %mm1,0x0(%ebp)\n"
2421 -".Lconvertdone:\n"
2422 +"2:"
2423    "popa\n"
2424    "ret\n"
2425 +#if !defined(XP_MACOSX)
2426 +  ".previous\n"
2427 +#endif
2428  );
2429
2430  void FastConvertYUVToRGB32Row(const uint8* y_buf,
2431                                const uint8* u_buf,
2432                                const uint8* v_buf,
2433                                uint8* rgb_buf,
2434 -                              int width) {
2435 -  PICConvertYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width,
2436 -                          &kCoefficientsRgbY[0][0]);
2437 -}
2438 -
2439 -extern void PICScaleYUVToRGB32Row(const uint8* y_buf,
2440 +                              int width)
2441 +{
2442 +  if (mozilla::supports_sse()) {
2443 +    PICConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
2444 +                                &kCoefficientsRgbY[0][0]);
2445 +    return;
2446 +  }
2447 +
2448 +  FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
2449 +}
2450 +
2451 +void PICScaleYUVToRGB32Row_SSE(const uint8* y_buf,
2452                                 const uint8* u_buf,
2453                                 const uint8* v_buf,
2454                                 uint8* rgb_buf,
2455                                 int width,
2456                                 int source_dx,
2457                                 int16 *kCoefficientsRgbY);
2458
2459    asm(
2460    ".text\n"
2461 -#if defined(OS_MACOSX)
2462 -"_PICScaleYUVToRGB32Row:\n"
2463 +#if defined(XP_MACOSX)
2464 +"_PICScaleYUVToRGB32Row_SSE:\n"
2465  #else
2466 -"PICScaleYUVToRGB32Row:\n"
2467 +"PICScaleYUVToRGB32Row_SSE:\n"
2468  #endif
2469    "pusha\n"
2470    "mov    0x24(%esp),%edx\n"
2471    "mov    0x28(%esp),%edi\n"
2472    "mov    0x2c(%esp),%esi\n"
2473    "mov    0x30(%esp),%ebp\n"
2474    "mov    0x3c(%esp),%ecx\n"
2475    "xor    %ebx,%ebx\n"
2476 -  "jmp    Lscaleend\n"
2477 -
2478 -"Lscaleloop:"
2479 +  "jmp    1f\n"
2480 +
2481 +"0:"
2482    "mov    %ebx,%eax\n"
2483    "sar    $0x11,%eax\n"
2484    "movzbl (%edi,%eax,1),%eax\n"
2485    "movq   2048(%ecx,%eax,8),%mm0\n"
2486    "mov    %ebx,%eax\n"
2487    "sar    $0x11,%eax\n"
2488    "movzbl (%esi,%eax,1),%eax\n"
2489    "paddsw 4096(%ecx,%eax,8),%mm0\n"
2490 @@ -627,22 +693,22 @@ extern void PICScaleYUVToRGB32Row(const
2491    "movq   0(%ecx,%eax,8),%mm2\n"
2492    "paddsw %mm0,%mm1\n"
2493    "paddsw %mm0,%mm2\n"
2494    "psraw  $0x6,%mm1\n"
2495    "psraw  $0x6,%mm2\n"
2496    "packuswb %mm2,%mm1\n"
2497    "movntq %mm1,0x0(%ebp)\n"
2498    "add    $0x8,%ebp\n"
2499 -"Lscaleend:"
2500 +"1:"
2501    "subl   $0x2,0x34(%esp)\n"
2502 -  "jns    Lscaleloop\n"
2503 +  "jns    0b\n"
2504
2505    "andl   $0x1,0x34(%esp)\n"
2506 -  "je     Lscaledone\n"
2507 +  "je     2f\n"
2508
2509    "mov    %ebx,%eax\n"
2510    "sar    $0x11,%eax\n"
2511    "movzbl (%edi,%eax,1),%eax\n"
2512    "movq   2048(%ecx,%eax,8),%mm0\n"
2513    "mov    %ebx,%eax\n"
2514    "sar    $0x11,%eax\n"
2515    "movzbl (%esi,%eax,1),%eax\n"
2516 @@ -651,66 +717,75 @@ extern void PICScaleYUVToRGB32Row(const
2517    "sar    $0x10,%eax\n"
2518    "movzbl (%edx,%eax,1),%eax\n"
2519    "movq   0(%ecx,%eax,8),%mm1\n"
2520    "paddsw %mm0,%mm1\n"
2521    "psraw  $0x6,%mm1\n"
2522    "packuswb %mm1,%mm1\n"
2523    "movd   %mm1,0x0(%ebp)\n"
2524
2525 -"Lscaledone:"
2526 +"2:"
2527    "popa\n"
2528    "ret\n"
2529 +#if !defined(XP_MACOSX)
2530 +  ".previous\n"
2531 +#endif
2532  );
2533
2534 -
2535  void ScaleYUVToRGB32Row(const uint8* y_buf,
2536                          const uint8* u_buf,
2537                          const uint8* v_buf,
2538                          uint8* rgb_buf,
2539                          int width,
2540 -                        int source_dx) {
2541 -  PICScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
2542 -                        &kCoefficientsRgbY[0][0]);
2543 -}
2544 -
2545 -void PICLinearScaleYUVToRGB32Row(const uint8* y_buf,
2546 -                                 const uint8* u_buf,
2547 -                                 const uint8* v_buf,
2548 -                                 uint8* rgb_buf,
2549 -                                 int width,
2550 -                                 int source_dx,
2551 -                                 int16 *kCoefficientsRgbY);
2552 +                        int source_dx)
2553 +{
2554 +  if (mozilla::supports_sse()) {
2555 +    PICScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
2556 +                              &kCoefficientsRgbY[0][0]);
2557 +    return;
2558 +  }
2559 +
2560 +  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
2561 +}
2562 +
2563 +void PICLinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
2564 +                                     const uint8* u_buf,
2565 +                                     const uint8* v_buf,
2566 +                                     uint8* rgb_buf,
2567 +                                     int width,
2568 +                                     int source_dx,
2569 +                                     int16 *kCoefficientsRgbY);
2570 +
2571    asm(
2572    ".text\n"
2573 -#if defined(OS_MACOSX)
2574 -"_PICLinearScaleYUVToRGB32Row:\n"
2575 +#if defined(XP_MACOSX)
2576 +"_PICLinearScaleYUVToRGB32Row_SSE:\n"
2577  #else
2578 -"PICLinearScaleYUVToRGB32Row:\n"
2579 +"PICLinearScaleYUVToRGB32Row_SSE:\n"
2580  #endif
2581    "pusha\n"
2582    "mov    0x24(%esp),%edx\n"
2583    "mov    0x30(%esp),%ebp\n"
2584    "mov    0x34(%esp),%ecx\n"
2585    "mov    0x3c(%esp),%edi\n"
2586    "xor    %ebx,%ebx\n"
2587
2588    // source_width = width * source_dx + ebx
2589    "mov    0x34(%esp), %ecx\n"
2590    "imull  0x38(%esp), %ecx\n"
2591    "mov    %ecx, 0x34(%esp)\n"
2592
2593    "mov    0x38(%esp), %ecx\n"
2594    "xor    %ebx,%ebx\n"     // x = 0
2595    "cmp    $0x20000,%ecx\n" // if source_dx >= 2.0
2596 -  "jl     .lscaleend\n"
2597 +  "jl     1f\n"
2598    "mov    $0x8000,%ebx\n"  // x = 0.5 for 1/2 or less
2599 -  "jmp    .lscaleend\n"
2600 -
2601 -".lscaleloop:"
2602 +  "jmp    1f\n"
2603 +
2604 +"0:"
2605    "mov    0x28(%esp),%esi\n"
2606    "mov    %ebx,%eax\n"
2607    "sar    $0x11,%eax\n"
2608
2609    "movzbl (%esi,%eax,1),%ecx\n"
2610    "movzbl 1(%esi,%eax,1),%esi\n"
2611    "mov    %ebx,%eax\n"
2612    "andl   $0x1fffe, %eax \n"
2613 @@ -746,17 +821,17 @@ void PICLinearScaleYUVToRGB32Row(const u
2614    "imul   %eax, %esi \n"
2615    "xorl   $0xffff, %eax \n"
2616    "imul   %eax, %ecx \n"
2617    "addl   %esi, %ecx \n"
2618    "shrl   $16, %ecx \n"
2619    "movq   (%edi,%ecx,8),%mm1\n"
2620
2621    "cmp    0x34(%esp), %ebx\n"
2622 -  "jge    .lscalelastpixel\n"
2623 +  "jge    2f\n"
2624
2625    "mov    %ebx,%eax\n"
2626    "sar    $0x10,%eax\n"
2627    "movzbl (%edx,%eax,1),%ecx\n"
2628    "movzbl 1(%edx,%eax,1),%esi\n"
2629    "mov    %ebx,%eax\n"
2630    "add    0x38(%esp),%ebx\n"
2631    "andl   $0xffff, %eax \n"
2632 @@ -770,154 +845,71 @@ void PICLinearScaleYUVToRGB32Row(const u
2633    "paddsw %mm0,%mm1\n"
2634    "paddsw %mm0,%mm2\n"
2635    "psraw  $0x6,%mm1\n"
2636    "psraw  $0x6,%mm2\n"
2637    "packuswb %mm2,%mm1\n"
2638    "movntq %mm1,0x0(%ebp)\n"
2639    "add    $0x8,%ebp\n"
2640
2641 -".lscaleend:"
2642 +"1:"
2643    "cmp    %ebx, 0x34(%esp)\n"
2644 -  "jg     .lscaleloop\n"
2645 +  "jg     0b\n"
2646    "popa\n"
2647    "ret\n"
2648
2649 -".lscalelastpixel:"
2650 +"2:"
2651    "paddsw %mm0, %mm1\n"
2652    "psraw $6, %mm1\n"
2653    "packuswb %mm1, %mm1\n"
2654    "movd %mm1, (%ebp)\n"
2655    "popa\n"
2656    "ret\n"
2657 +#if !defined(XP_MACOSX)
2658 +  ".previous\n"
2659 +#endif
2660  );
2661
2662 +
2663  void LinearScaleYUVToRGB32Row(const uint8* y_buf,
2664 -                        const uint8* u_buf,
2665 -                        const uint8* v_buf,
2666 -                        uint8* rgb_buf,
2667 -                        int width,
2668 -                        int source_dx) {
2669 -  PICLinearScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
2670 -                              &kCoefficientsRgbY[0][0]);
2671 -}
2672 -
2673 -#else  // USE_MMX
2674 -
2675 -// C reference code that mimic the YUV assembly.
2676 -#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
2677 -#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
2678 -    (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
2679 -
2680 -static inline void YuvPixel(uint8 y,
2681 -                            uint8 u,
2682 -                            uint8 v,
2683 -                            uint8* rgb_buf) {
2684 -
2685 -  int b = kCoefficientsRgbY[256+u][0];
2686 -  int g = kCoefficientsRgbY[256+u][1];
2687 -  int r = kCoefficientsRgbY[256+u][2];
2688 -  int a = kCoefficientsRgbY[256+u][3];
2689 -
2690 -  b = paddsw(b, kCoefficientsRgbY[512+v][0]);
2691 -  g = paddsw(g, kCoefficientsRgbY[512+v][1]);
2692 -  r = paddsw(r, kCoefficientsRgbY[512+v][2]);
2693 -  a = paddsw(a, kCoefficientsRgbY[512+v][3]);
2694 -
2695 -  b = paddsw(b, kCoefficientsRgbY[y][0]);
2696 -  g = paddsw(g, kCoefficientsRgbY[y][1]);
2697 -  r = paddsw(r, kCoefficientsRgbY[y][2]);
2698 -  a = paddsw(a, kCoefficientsRgbY[y][3]);
2699 -
2700 -  b >>= 6;
2701 -  g >>= 6;
2702 -  r >>= 6;
2703 -  a >>= 6;
2704 -
2705 -  *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) |
2706 -                                        (packuswb(g) << 8) |
2707 -                                        (packuswb(r) << 16) |
2708 -                                        (packuswb(a) << 24);
2709 -}
2710 -
2711 +                              const uint8* u_buf,
2712 +                              const uint8* v_buf,
2713 +                              uint8* rgb_buf,
2714 +                              int width,
2715 +                              int source_dx)
2716 +{
2717 +  if (mozilla::supports_sse()) {
2718 +    PICLinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
2719 +                                    source_dx, &kCoefficientsRgbY[0][0]);
2720 +    return;
2721 +  }
2722 +
2723 +  LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
2724 +}
2725 +#else
2726  void FastConvertYUVToRGB32Row(const uint8* y_buf,
2727                                const uint8* u_buf,
2728                                const uint8* v_buf,
2729                                uint8* rgb_buf,
2730                                int width) {
2731 -  for (int x = 0; x < width; x += 2) {
2732 -    uint8 u = u_buf[x >> 1];
2733 -    uint8 v = v_buf[x >> 1];
2734 -    uint8 y0 = y_buf[x];
2735 -    YuvPixel(y0, u, v, rgb_buf);
2736 -    if ((x + 1) < width) {
2737 -      uint8 y1 = y_buf[x + 1];
2738 -      YuvPixel(y1, u, v, rgb_buf + 4);
2739 -    }
2740 -    rgb_buf += 8;  // Advance 2 pixels.
2741 -  }
2742 -}
2743 -
2744 -// 16.16 fixed point is used.  A shift by 16 isolates the integer.
2745 -// A shift by 17 is used to further subsample the chrominence channels.
2746 -// & 0xffff isolates the fixed point fraction.  >> 2 to get the upper 2 bits,
2747 -// for 1/65536 pixel accurate interpolation.
2748 +  FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
2749 +}
2750 +
2751  void ScaleYUVToRGB32Row(const uint8* y_buf,
2752                          const uint8* u_buf,
2753                          const uint8* v_buf,
2754                          uint8* rgb_buf,
2755                          int width,
2756                          int source_dx) {
2757 -  int x = 0;
2758 -  for (int i = 0; i < width; i += 2) {
2759 -    int y = y_buf[x >> 16];
2760 -    int u = u_buf[(x >> 17)];
2761 -    int v = v_buf[(x >> 17)];
2762 -    YuvPixel(y, u, v, rgb_buf);
2763 -    x += source_dx;
2764 -    if ((i + 1) < width) {
2765 -      y = y_buf[x >> 16];
2766 -      YuvPixel(y, u, v, rgb_buf+4);
2767 -      x += source_dx;
2768 -    }
2769 -    rgb_buf += 8;
2770 -  }
2771 -}
2772 +  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
2773 +}
2774
2775  void LinearScaleYUVToRGB32Row(const uint8* y_buf,
2776                                const uint8* u_buf,
2777                                const uint8* v_buf,
2778                                uint8* rgb_buf,
2779                                int width,
2780                                int source_dx) {
2781 -  int x = 0;
2782 -  if (source_dx >= 0x20000) {
2783 -    x = 32768;
2784 -  }
2785 -  for (int i = 0; i < width; i += 2) {
2786 -    int y0 = y_buf[x >> 16];
2787 -    int y1 = y_buf[(x >> 16) + 1];
2788 -    int u0 = u_buf[(x >> 17)];
2789 -    int u1 = u_buf[(x >> 17) + 1];
2790 -    int v0 = v_buf[(x >> 17)];
2791 -    int v1 = v_buf[(x >> 17) + 1];
2792 -    int y_frac = (x & 65535);
2793 -    int uv_frac = ((x >> 1) & 65535);
2794 -    int y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
2795 -    int u = (uv_frac * u1 + (uv_frac ^ 65535) * u0) >> 16;
2796 -    int v = (uv_frac * v1 + (uv_frac ^ 65535) * v0) >> 16;
2797 -    YuvPixel(y, u, v, rgb_buf);
2798 -    x += source_dx;
2799 -    if ((i + 1) < width) {
2800 -      y0 = y_buf[x >> 16];
2801 -      y1 = y_buf[(x >> 16) + 1];
2802 -      y_frac = (x & 65535);
2803 -      y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
2804 -      YuvPixel(y, u, v, rgb_buf+4);
2805 -      x += source_dx;
2806 -    }
2807 -    rgb_buf += 8;
2808 -  }
2809 -}
2810 -
2811 -#endif  // USE_MMX
2812 -}  // extern "C"
2813 -
2814 +  LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
2815 +}
2816 +#endif
2817 +
2818 +}  // extern "C"
2819 diff --git a/gfx/ycbcr/yuv_row_table.cpp b/gfx/ycbcr/yuv_row_table.cpp
2820 --- a/gfx/ycbcr/yuv_row_table.cpp
2821 +++ b/gfx/ycbcr/yuv_row_table.cpp
2822 @@ -1,13 +1,13 @@
2823  // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2824  // Use of this source code is governed by a BSD-style license that can be
2825  // found in the LICENSE file.
2826
2827 -#include "media/base/yuv_row.h"
2828 +#include "yuv_row.h"
2829
2830  extern "C" {
2831
2832  #define RGBY(i) { \
2833    static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
2834    static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
2835    static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
2836    0 \
2837 diff --git a/gfx/ycbcr/yuv_row_win.cpp b/gfx/ycbcr/yuv_row_win.cpp
2838 --- a/gfx/ycbcr/yuv_row_win.cpp
2839 +++ b/gfx/ycbcr/yuv_row_win.cpp
2840 @@ -1,26 +1,27 @@
2841  // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2842  // Use of this source code is governed by a BSD-style license that can be
2843  // found in the LICENSE file.
2844
2845 -#include "media/base/yuv_row.h"
2846 +#include "yuv_row.h"
2847 +#include "mozilla/SSE.h"
2848
2849  #define kCoefficientsRgbU kCoefficientsRgbY + 2048
2850  #define kCoefficientsRgbV kCoefficientsRgbY + 4096
2851
2852  extern "C" {
2853
2854 -#if USE_MMX
2855 -__declspec(naked)
2856 -void FastConvertYUVToRGB32Row(const uint8* y_buf,
2857 -                              const uint8* u_buf,
2858 -                              const uint8* v_buf,
2859 -                              uint8* rgb_buf,
2860 -                              int width) {
2861 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
2862 +__declspec(naked)
2863 +void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf,
2864 +                                  const uint8* u_buf,
2865 +                                  const uint8* v_buf,
2866 +                                  uint8* rgb_buf,
2867 +                                  int width) {
2868    __asm {
2869      pushad
2870      mov       edx, [esp + 32 + 4]   // Y
2871      mov       edi, [esp + 32 + 8]   // U
2872      mov       esi, [esp + 32 + 12]  // V
2873      mov       ebp, [esp + 32 + 16]  // rgb
2874      mov       ecx, [esp + 32 + 20]  // width
2875      jmp       convertend
2876 @@ -64,22 +65,22 @@ void FastConvertYUVToRGB32Row(const uint
2877   convertdone :
2878
2879      popad
2880      ret
2881    }
2882  }
2883
2884  __declspec(naked)
2885 -void ConvertYUVToRGB32Row(const uint8* y_buf,
2886 -                          const uint8* u_buf,
2887 -                          const uint8* v_buf,
2888 -                          uint8* rgb_buf,
2889 -                          int width,
2890 -                          int step) {
2891 +void ConvertYUVToRGB32Row_SSE(const uint8* y_buf,
2892 +                              const uint8* u_buf,
2893 +                              const uint8* v_buf,
2894 +                              uint8* rgb_buf,
2895 +                              int width,
2896 +                              int step) {
2897    __asm {
2898      pushad
2899      mov       edx, [esp + 32 + 4]   // Y
2900      mov       edi, [esp + 32 + 8]   // U
2901      mov       esi, [esp + 32 + 12]  // V
2902      mov       ebp, [esp + 32 + 16]  // rgb
2903      mov       ecx, [esp + 32 + 20]  // width
2904      mov       ebx, [esp + 32 + 24]  // step
2905 @@ -125,23 +126,23 @@ void ConvertYUVToRGB32Row(const uint8* y
2906   wdone :
2907
2908      popad
2909      ret
2910    }
2911  }
2912
2913  __declspec(naked)
2914 -void RotateConvertYUVToRGB32Row(const uint8* y_buf,
2915 -                                const uint8* u_buf,
2916 -                                const uint8* v_buf,
2917 -                                uint8* rgb_buf,
2918 -                                int width,
2919 -                                int ystep,
2920 -                                int uvstep) {
2921 +void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf,
2922 +                                    const uint8* u_buf,
2923 +                                    const uint8* v_buf,
2924 +                                    uint8* rgb_buf,
2925 +                                    int width,
2926 +                                    int ystep,
2927 +                                    int uvstep) {
2928    __asm {
2929      pushad
2930      mov       edx, [esp + 32 + 4]   // Y
2931      mov       edi, [esp + 32 + 8]   // U
2932      mov       esi, [esp + 32 + 12]  // V
2933      mov       ebp, [esp + 32 + 16]  // rgb
2934      mov       ecx, [esp + 32 + 20]  // width
2935      jmp       wend
2936 @@ -188,21 +189,21 @@ void RotateConvertYUVToRGB32Row(const ui
2937   wdone :
2938
2939      popad
2940      ret
2941    }
2942  }
2943
2944  __declspec(naked)
2945 -void DoubleYUVToRGB32Row(const uint8* y_buf,
2946 -                         const uint8* u_buf,
2947 -                         const uint8* v_buf,
2948 -                         uint8* rgb_buf,
2949 -                         int width) {
2950 +void DoubleYUVToRGB32Row_SSE(const uint8* y_buf,
2951 +                             const uint8* u_buf,
2952 +                             const uint8* v_buf,
2953 +                             uint8* rgb_buf,
2954 +                             int width) {
2955    __asm {
2956      pushad
2957      mov       edx, [esp + 32 + 4]   // Y
2958      mov       edi, [esp + 32 + 8]   // U
2959      mov       esi, [esp + 32 + 12]  // V
2960      mov       ebp, [esp + 32 + 16]  // rgb
2961      mov       ecx, [esp + 32 + 20]  // width
2962      jmp       wend
2963 @@ -256,26 +257,26 @@ void DoubleYUVToRGB32Row(const uint8* y_
2964      jns       wloop1
2965   wdone :
2966      popad
2967      ret
2968    }
2969  }
2970
2971  // This version does general purpose scaling by any amount, up or down.
2972 -// The only thing it can not do it rotation by 90 or 270.
2973 -// For performance the chroma is under sampled, reducing cost of a 3x
2974 +// The only thing it cannot do is rotation by 90 or 270.
2975 +// For performance the chroma is under-sampled, reducing cost of a 3x
2976  // 1080p scale from 8.4 ms to 5.4 ms.
2977  __declspec(naked)
2978 -void ScaleYUVToRGB32Row(const uint8* y_buf,
2979 -                        const uint8* u_buf,
2980 -                        const uint8* v_buf,
2981 -                        uint8* rgb_buf,
2982 -                        int width,
2983 -                        int source_dx) {
2984 +void ScaleYUVToRGB32Row_SSE(const uint8* y_buf,
2985 +                            const uint8* u_buf,
2986 +                            const uint8* v_buf,
2987 +                            uint8* rgb_buf,
2988 +                            int width,
2989 +                            int source_dx) {
2990    __asm {
2991      pushad
2992      mov       edx, [esp + 32 + 4]   // Y
2993      mov       edi, [esp + 32 + 8]   // U
2994      mov       esi, [esp + 32 + 12]  // V
2995      mov       ebp, [esp + 32 + 16]  // rgb
2996      mov       ecx, [esp + 32 + 20]  // width
2997      xor       ebx, ebx              // x
2998 @@ -333,22 +334,22 @@ void ScaleYUVToRGB32Row(const uint8* y_b
2999
3000   scaledone :
3001      popad
3002      ret
3003    }
3004  }
3005
3006  __declspec(naked)
3007 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,
3008 -                              const uint8* u_buf,
3009 -                              const uint8* v_buf,
3010 -                              uint8* rgb_buf,
3011 -                              int width,
3012 -                              int source_dx) {
3013 +void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
3014 +                                  const uint8* u_buf,
3015 +                                  const uint8* v_buf,
3016 +                                  uint8* rgb_buf,
3017 +                                  int width,
3018 +                                  int source_dx) {
3019    __asm {
3020      pushad
3021      mov       edx, [esp + 32 + 4]  // Y
3022      mov       edi, [esp + 32 + 8]  // U
3023                  // [esp + 32 + 12] // V
3024      mov       ebp, [esp + 32 + 16] // rgb
3025      mov       ecx, [esp + 32 + 20] // width
3026      imul      ecx, [esp + 32 + 24] // source_dx
3027 @@ -438,152 +439,60 @@ lscalelastpixel:
3028      paddsw    mm1, mm0
3029      psraw     mm1, 6
3030      packuswb  mm1, mm1
3031      movd      [ebp], mm1
3032      popad
3033      ret
3034    };
3035  }
3036 -#else  // USE_MMX
3037 -
3038 -// C reference code that mimic the YUV assembly.
3039 -#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
3040 -#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
3041 -    (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
3042 -
3043 -static inline void YuvPixel(uint8 y,
3044 -                            uint8 u,
3045 -                            uint8 v,
3046 -                            uint8* rgb_buf) {
3047 -
3048 -  int b = kCoefficientsRgbY[256+u][0];
3049 -  int g = kCoefficientsRgbY[256+u][1];
3050 -  int r = kCoefficientsRgbY[256+u][2];
3051 -  int a = kCoefficientsRgbY[256+u][3];
3052 -
3053 -  b = paddsw(b, kCoefficientsRgbY[512+v][0]);
3054 -  g = paddsw(g, kCoefficientsRgbY[512+v][1]);
3055 -  r = paddsw(r, kCoefficientsRgbY[512+v][2]);
3056 -  a = paddsw(a, kCoefficientsRgbY[512+v][3]);
3057 -
3058 -  b = paddsw(b, kCoefficientsRgbY[y][0]);
3059 -  g = paddsw(g, kCoefficientsRgbY[y][1]);
3060 -  r = paddsw(r, kCoefficientsRgbY[y][2]);
3061 -  a = paddsw(a, kCoefficientsRgbY[y][3]);
3062 -
3063 -  b >>= 6;
3064 -  g >>= 6;
3065 -  r >>= 6;
3066 -  a >>= 6;
3067 -
3068 -  *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) |
3069 -                                        (packuswb(g) << 8) |
3070 -                                        (packuswb(r) << 16) |
3071 -                                        (packuswb(a) << 24);
3072 -}
3073 -
3074 -#if TEST_MMX_YUV
3075 -static inline void YuvPixel(uint8 y,
3076 -                            uint8 u,
3077 -                            uint8 v,
3078 -                            uint8* rgb_buf) {
3079 -
3080 -  __asm {
3081 -    movzx     eax, u
3082 -    movq      mm0, [kCoefficientsRgbY+2048 + 8 * eax]
3083 -    movzx     eax, v
3084 -    paddsw    mm0, [kCoefficientsRgbY+4096 + 8 * eax]
3085 -    movzx     eax, y
3086 -    movq      mm1, [kCoefficientsRgbY + 8 * eax]
3087 -    paddsw    mm1, mm0
3088 -    psraw     mm1, 6
3089 -    packuswb  mm1, mm1
3090 -    mov       eax, rgb_buf
3091 -    movd      [eax], mm1
3092 -    emms
3093 -  }
3094 -}
3095 -#endif
3096 +#endif // if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
3097
3098  void FastConvertYUVToRGB32Row(const uint8* y_buf,
3099                                const uint8* u_buf,
3100                                const uint8* v_buf,
3101                                uint8* rgb_buf,
3102                                int width) {
3103 -  for (int x = 0; x < width; x += 2) {
3104 -    uint8 u = u_buf[x >> 1];
3105 -    uint8 v = v_buf[x >> 1];
3106 -    uint8 y0 = y_buf[x];
3107 -    YuvPixel(y0, u, v, rgb_buf);
3108 -    if ((x + 1) < width) {
3109 -      uint8 y1 = y_buf[x + 1];
3110 -      YuvPixel(y1, u, v, rgb_buf + 4);
3111 -    }
3112 -    rgb_buf += 8;  // Advance 2 pixels.
3113 -  }
3114 -}
3115 -
3116 -// 16.16 fixed point is used.  A shift by 16 isolates the integer.
3117 -// A shift by 17 is used to further subsample the chrominence channels.
3118 -// & 0xffff isolates the fixed point fraction.  >> 2 to get the upper 2 bits,
3119 -// for 1/65536 pixel accurate interpolation.
3120 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
3121 +  if (mozilla::supports_sse()) {
3122 +    FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width);
3123 +    return;
3124 +  }
3125 +#endif
3126 +
3127 +  FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
3128 +}
3129 +
3130  void ScaleYUVToRGB32Row(const uint8* y_buf,
3131                          const uint8* u_buf,
3132                          const uint8* v_buf,
3133                          uint8* rgb_buf,
3134                          int width,
3135                          int source_dx) {
3136 -  int x = 0;
3137 -  for (int i = 0; i < width; i += 2) {
3138 -    int y = y_buf[x >> 16];
3139 -    int u = u_buf[(x >> 17)];
3140 -    int v = v_buf[(x >> 17)];
3141 -    YuvPixel(y, u, v, rgb_buf);
3142 -    x += source_dx;
3143 -    if ((i + 1) < width) {
3144 -      y = y_buf[x >> 16];
3145 -      YuvPixel(y, u, v, rgb_buf+4);
3146 -      x += source_dx;
3147 -    }
3148 -    rgb_buf += 8;
3149 -  }
3150 -}
3151 +
3152 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
3153 +  if (mozilla::supports_sse()) {
3154 +    ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
3155 +    return;
3156 +  }
3157 +#endif
3158 +
3159 +  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
3160 +}
3161
3162  void LinearScaleYUVToRGB32Row(const uint8* y_buf,
3163                                const uint8* u_buf,
3164                                const uint8* v_buf,
3165                                uint8* rgb_buf,
3166                                int width,
3167                                int source_dx) {
3168 -  int x = 0;
3169 -  if (source_dx >= 0x20000) {
3170 -    x = 32768;
3171 -  }
3172 -  for (int i = 0; i < width; i += 2) {
3173 -    int y0 = y_buf[x >> 16];
3174 -    int y1 = y_buf[(x >> 16) + 1];
3175 -    int u0 = u_buf[(x >> 17)];
3176 -    int u1 = u_buf[(x >> 17) + 1];
3177 -    int v0 = v_buf[(x >> 17)];
3178 -    int v1 = v_buf[(x >> 17) + 1];
3179 -    int y_frac = (x & 65535);
3180 -    int uv_frac = ((x >> 1) & 65535);
3181 -    int y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
3182 -    int u = (uv_frac * u1 + (uv_frac ^ 65535) * u0) >> 16;
3183 -    int v = (uv_frac * v1 + (uv_frac ^ 65535) * v0) >> 16;
3184 -    YuvPixel(y, u, v, rgb_buf);
3185 -    x += source_dx;
3186 -    if ((i + 1) < width) {
3187 -      y0 = y_buf[x >> 16];
3188 -      y1 = y_buf[(x >> 16) + 1];
3189 -      y_frac = (x & 65535);
3190 -      y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
3191 -      YuvPixel(y, u, v, rgb_buf+4);
3192 -      x += source_dx;
3193 -    }
3194 -    rgb_buf += 8;
3195 -  }
3196 -}
3197 -
3198 -#endif  // USE_MMX
3199 -}  // extern "C"
3200 -
3201 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
3202 +  if (mozilla::supports_sse()) {
3203 +    LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
3204 +                                 source_dx);
3205 +    return;
3206 +  }
3207 +#endif
3208 +
3209 +  LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
3210 +}
3211 +
3212 +} // extern "C"