CLOSED TREE: TraceMonkey merge head. (a=blockers)
[mozilla-central.git] / gfx / ycbcr / convert.patch
blob 471dd3ca9638916ae398cd34e7a0d9650da1f0e7
1 diff --git a/gfx/ycbcr/yuv_convert.cpp b/gfx/ycbcr/yuv_convert.cpp
2 --- a/gfx/ycbcr/yuv_convert.cpp
3 +++ b/gfx/ycbcr/yuv_convert.cpp
4 @@ -6,145 +6,133 @@
5 // http://www.fourcc.org/yuv.php
6 // The actual conversion is best described here
7 // http://en.wikipedia.org/wiki/YUV
8 // An article on optimizing YUV conversion using tables instead of multiplies
9 // http://lestourtereaux.free.fr/papers/data/yuvrgb.pdf
11 // YV12 is a full plane of Y and a half height, half width chroma planes
12 // YV16 is a full plane of Y and a full height, half width chroma planes
13 +// YV24 is a full plane of Y and a full height, full width chroma planes
15 // ARGB pixel format is output, which on little endian is stored as BGRA.
16 // The alpha is set to 255, allowing the application to use RGBA or RGB32.
18 -#include "media/base/yuv_convert.h"
19 +#include "yuv_convert.h"
21 // Header for low level row functions.
22 -#include "media/base/yuv_row.h"
24 -#if USE_MMX
25 -#if defined(_MSC_VER)
26 -#include <intrin.h>
27 -#else
28 -#include <mmintrin.h>
29 -#endif
30 -#endif
32 -#if USE_SSE2
33 -#include <emmintrin.h>
34 -#endif
36 -namespace media {
38 +#include "yuv_row.h"
39 +#include "mozilla/SSE.h"
41 +#ifdef HAVE_YCBCR_TO_RGB565
42 +void __attribute((noinline)) yv12_to_rgb565_neon(uint16 *dst, const uint8 *y, const uint8 *u, const uint8 *v, int n, int oddflag);
43 +#endif
45 +namespace mozilla {
47 +namespace gfx {
49 // 16.16 fixed point arithmetic
50 const int kFractionBits = 16;
51 const int kFractionMax = 1 << kFractionBits;
52 const int kFractionMask = ((1 << kFractionBits) - 1);
55 +// Convert a frame of YUV to 16 bit RGB565.
56 +NS_GFX_(void) ConvertYCbCrToRGB565(const uint8* y_buf,
57 + const uint8* u_buf,
58 + const uint8* v_buf,
59 + uint8* rgb_buf,
60 + int pic_x,
61 + int pic_y,
62 + int pic_width,
63 + int pic_height,
64 + int y_pitch,
65 + int uv_pitch,
66 + int rgb_pitch,
67 + YUVType yuv_type)
69 +#ifdef HAVE_YCBCR_TO_RGB565
70 + for (int i = 0; i < pic_height; i++) {
71 + yv12_to_rgb565_neon((uint16*)rgb_buf + pic_width * i,
72 + y_buf + y_pitch * i,
73 + u_buf + uv_pitch * (i / 2),
74 + v_buf + uv_pitch * (i / 2),
75 + pic_width,
76 + 0);
77 + }
78 +#endif
81 // Convert a frame of YUV to 32 bit ARGB.
82 -void ConvertYUVToRGB32(const uint8* y_buf,
83 - const uint8* u_buf,
84 - const uint8* v_buf,
85 - uint8* rgb_buf,
86 - int width,
87 - int height,
88 - int y_pitch,
89 - int uv_pitch,
90 - int rgb_pitch,
91 - YUVType yuv_type) {
92 - unsigned int y_shift = yuv_type;
93 - for (int y = 0; y < height; ++y) {
94 - uint8* rgb_row = rgb_buf + y * rgb_pitch;
95 - const uint8* y_ptr = y_buf + y * y_pitch;
96 - const uint8* u_ptr = u_buf + (y >> y_shift) * uv_pitch;
97 - const uint8* v_ptr = v_buf + (y >> y_shift) * uv_pitch;
99 - FastConvertYUVToRGB32Row(y_ptr,
100 - u_ptr,
101 - v_ptr,
102 - rgb_row,
103 - width);
105 +NS_GFX_(void) ConvertYCbCrToRGB32(const uint8* y_buf,
106 + const uint8* u_buf,
107 + const uint8* v_buf,
108 + uint8* rgb_buf,
109 + int pic_x,
110 + int pic_y,
111 + int pic_width,
112 + int pic_height,
113 + int y_pitch,
114 + int uv_pitch,
115 + int rgb_pitch,
116 + YUVType yuv_type) {
117 + unsigned int y_shift = yuv_type == YV12 ? 1 : 0;
118 + unsigned int x_shift = yuv_type == YV24 ? 0 : 1;
119 + // Test for SSE because the optimized code uses movntq, which is not part of MMX.
120 + bool has_sse = supports_mmx() && supports_sse();
121 + // There is no optimized YV24 SSE routine so we check for this and
122 + // fall back to the C code.
123 + has_sse &= yuv_type != YV24;
124 + bool odd_pic_x = yuv_type != YV24 && pic_x % 2 != 0;
125 + int x_width = odd_pic_x ? pic_width - 1 : pic_width;
127 + for (int y = pic_y; y < pic_height + pic_y; ++y) {
128 + uint8* rgb_row = rgb_buf + (y - pic_y) * rgb_pitch;
129 + const uint8* y_ptr = y_buf + y * y_pitch + pic_x;
130 + const uint8* u_ptr = u_buf + (y >> y_shift) * uv_pitch + (pic_x >> x_shift);
131 + const uint8* v_ptr = v_buf + (y >> y_shift) * uv_pitch + (pic_x >> x_shift);
133 + if (odd_pic_x) {
134 + // Handle the single odd pixel manually and use the
135 + // fast routines for the remaining pixels.
136 + FastConvertYUVToRGB32Row_C(y_ptr++,
137 + u_ptr++,
138 + v_ptr++,
139 + rgb_row,
140 + 1,
141 + x_shift);
142 + rgb_row += 4;
145 + if (has_sse) {
146 + FastConvertYUVToRGB32Row(y_ptr,
147 + u_ptr,
148 + v_ptr,
149 + rgb_row,
150 + x_width);
152 + else {
153 + FastConvertYUVToRGB32Row_C(y_ptr,
154 + u_ptr,
155 + v_ptr,
156 + rgb_row,
157 + x_width,
158 + x_shift);
162 // MMX used for FastConvertYUVToRGB32Row requires emms instruction.
163 - EMMS();
166 -#if USE_SSE2
167 -// FilterRows combines two rows of the image using linear interpolation.
168 -// SSE2 version does 16 pixels at a time
170 -static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
171 - int source_width, int source_y_fraction) {
172 - __m128i zero = _mm_setzero_si128();
173 - __m128i y1_fraction = _mm_set1_epi16(source_y_fraction);
174 - __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction);
176 - const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr);
177 - const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr);
178 - __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf);
179 - __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width);
181 - do {
182 - __m128i y0 = _mm_loadu_si128(y0_ptr128);
183 - __m128i y1 = _mm_loadu_si128(y1_ptr128);
184 - __m128i y2 = _mm_unpackhi_epi8(y0, zero);
185 - __m128i y3 = _mm_unpackhi_epi8(y1, zero);
186 - y0 = _mm_unpacklo_epi8(y0, zero);
187 - y1 = _mm_unpacklo_epi8(y1, zero);
188 - y0 = _mm_mullo_epi16(y0, y0_fraction);
189 - y1 = _mm_mullo_epi16(y1, y1_fraction);
190 - y2 = _mm_mullo_epi16(y2, y0_fraction);
191 - y3 = _mm_mullo_epi16(y3, y1_fraction);
192 - y0 = _mm_add_epi16(y0, y1);
193 - y2 = _mm_add_epi16(y2, y3);
194 - y0 = _mm_srli_epi16(y0, 8);
195 - y2 = _mm_srli_epi16(y2, 8);
196 - y0 = _mm_packus_epi16(y0, y2);
197 - *dest128++ = y0;
198 - ++y0_ptr128;
199 - ++y1_ptr128;
200 - } while (dest128 < end128);
202 -#elif USE_MMX
203 -// MMX version does 8 pixels at a time
204 -static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
205 - int source_width, int source_y_fraction) {
206 - __m64 zero = _mm_setzero_si64();
207 - __m64 y1_fraction = _mm_set1_pi16(source_y_fraction);
208 - __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction);
210 - const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr);
211 - const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr);
212 - __m64* dest64 = reinterpret_cast<__m64*>(ybuf);
213 - __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width);
215 - do {
216 - __m64 y0 = *y0_ptr64++;
217 - __m64 y1 = *y1_ptr64++;
218 - __m64 y2 = _mm_unpackhi_pi8(y0, zero);
219 - __m64 y3 = _mm_unpackhi_pi8(y1, zero);
220 - y0 = _mm_unpacklo_pi8(y0, zero);
221 - y1 = _mm_unpacklo_pi8(y1, zero);
222 - y0 = _mm_mullo_pi16(y0, y0_fraction);
223 - y1 = _mm_mullo_pi16(y1, y1_fraction);
224 - y2 = _mm_mullo_pi16(y2, y0_fraction);
225 - y3 = _mm_mullo_pi16(y3, y1_fraction);
226 - y0 = _mm_add_pi16(y0, y1);
227 - y2 = _mm_add_pi16(y2, y3);
228 - y0 = _mm_srli_pi16(y0, 8);
229 - y2 = _mm_srli_pi16(y2, 8);
230 - y0 = _mm_packs_pu16(y0, y2);
231 - *dest64++ = y0;
232 - } while (dest64 < end64);
234 -#else // no MMX or SSE2
235 + if (has_sse)
236 + EMMS();
239 // C version does 8 at a time to mimic MMX code
240 -static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
241 - int source_width, int source_y_fraction) {
242 +static void FilterRows_C(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
243 + int source_width, int source_y_fraction) {
244 int y1_fraction = source_y_fraction;
245 int y0_fraction = 256 - y1_fraction;
246 uint8* end = ybuf + source_width;
247 do {
248 ybuf[0] = (y0_ptr[0] * y0_fraction + y1_ptr[0] * y1_fraction) >> 8;
249 ybuf[1] = (y0_ptr[1] * y0_fraction + y1_ptr[1] * y1_fraction) >> 8;
250 ybuf[2] = (y0_ptr[2] * y0_fraction + y1_ptr[2] * y1_fraction) >> 8;
251 ybuf[3] = (y0_ptr[3] * y0_fraction + y1_ptr[3] * y1_fraction) >> 8;
252 @@ -152,46 +140,77 @@ static void FilterRows(uint8* ybuf, cons
253 ybuf[5] = (y0_ptr[5] * y0_fraction + y1_ptr[5] * y1_fraction) >> 8;
254 ybuf[6] = (y0_ptr[6] * y0_fraction + y1_ptr[6] * y1_fraction) >> 8;
255 ybuf[7] = (y0_ptr[7] * y0_fraction + y1_ptr[7] * y1_fraction) >> 8;
256 y0_ptr += 8;
257 y1_ptr += 8;
258 ybuf += 8;
259 } while (ybuf < end);
261 -#endif
263 +#ifdef MOZILLA_MAY_SUPPORT_MMX
264 +void FilterRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
265 + int source_width, int source_y_fraction);
266 +#endif
268 +#ifdef MOZILLA_MAY_SUPPORT_SSE2
269 +void FilterRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
270 + int source_width, int source_y_fraction);
271 +#endif
273 +static inline void FilterRows(uint8* ybuf, const uint8* y0_ptr,
274 + const uint8* y1_ptr, int source_width,
275 + int source_y_fraction) {
276 +#ifdef MOZILLA_MAY_SUPPORT_SSE2
277 + if (mozilla::supports_sse2()) {
278 + FilterRows_SSE2(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
279 + return;
281 +#endif
283 +#ifdef MOZILLA_MAY_SUPPORT_MMX
284 + if (mozilla::supports_mmx()) {
285 + FilterRows_MMX(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
286 + return;
288 +#endif
290 + FilterRows_C(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
294 // Scale a frame of YUV to 32 bit ARGB.
295 -void ScaleYUVToRGB32(const uint8* y_buf,
296 - const uint8* u_buf,
297 - const uint8* v_buf,
298 - uint8* rgb_buf,
299 - int source_width,
300 - int source_height,
301 - int width,
302 - int height,
303 - int y_pitch,
304 - int uv_pitch,
305 - int rgb_pitch,
306 - YUVType yuv_type,
307 - Rotate view_rotate,
308 - ScaleFilter filter) {
309 +NS_GFX_(void) ScaleYCbCrToRGB32(const uint8* y_buf,
310 + const uint8* u_buf,
311 + const uint8* v_buf,
312 + uint8* rgb_buf,
313 + int source_width,
314 + int source_height,
315 + int width,
316 + int height,
317 + int y_pitch,
318 + int uv_pitch,
319 + int rgb_pitch,
320 + YUVType yuv_type,
321 + Rotate view_rotate,
322 + ScaleFilter filter) {
323 + bool has_mmx = supports_mmx();
325 // 4096 allows 3 buffers to fit in 12k.
326 // Helps performance on CPU with 16K L1 cache.
327 // Large enough for 3830x2160 and 30" displays which are 2560x1600.
328 const int kFilterBufferSize = 4096;
329 // Disable filtering if the screen is too big (to avoid buffer overflows).
330 // This should never happen to regular users: they don't have monitors
331 // wider than 4096 pixels.
332 // TODO(fbarchard): Allow rotated videos to filter.
333 if (source_width > kFilterBufferSize || view_rotate)
334 filter = FILTER_NONE;
336 - unsigned int y_shift = yuv_type;
337 + unsigned int y_shift = yuv_type == YV12 ? 1 : 0;
338 // Diagram showing origin and direction of source sampling.
339 // ->0 4<-
340 // 7 3
342 // 6 5
343 // ->1 2<-
344 // Rotations that start at right side of image.
345 if ((view_rotate == ROTATE_180) ||
346 @@ -243,17 +262,17 @@ void ScaleYUVToRGB32(const uint8* y_buf,
347 uv_pitch = 1;
351 // Need padding because FilterRows() will write 1 to 16 extra pixels
352 // after the end for SSE2 version.
353 uint8 yuvbuf[16 + kFilterBufferSize * 3 + 16];
354 uint8* ybuf =
355 - reinterpret_cast<uint8*>(reinterpret_cast<uintptr_t>(yuvbuf + 15) & ~15);
356 + reinterpret_cast<uint8*>(reinterpret_cast<PRUptrdiff>(yuvbuf + 15) & ~15);
357 uint8* ubuf = ybuf + kFilterBufferSize;
358 uint8* vbuf = ubuf + kFilterBufferSize;
359 // TODO(fbarchard): Fixed point math is off by 1 on negatives.
360 int yscale_fixed = (source_height << kFractionBits) / height;
362 // TODO(fbarchard): Split this into separate function for better efficiency.
363 for (int y = 0; y < height; ++y) {
364 uint8* dest_pixel = rgb_buf + y * rgb_pitch;
365 @@ -276,17 +295,17 @@ void ScaleYUVToRGB32(const uint8* y_buf,
366 int source_uv_fraction =
367 ((source_y_subpixel >> y_shift) & kFractionMask) >> 8;
369 const uint8* y_ptr = y0_ptr;
370 const uint8* u_ptr = u0_ptr;
371 const uint8* v_ptr = v0_ptr;
372 // Apply vertical filtering if necessary.
373 // TODO(fbarchard): Remove memcpy when not necessary.
374 - if (filter & media::FILTER_BILINEAR_V) {
375 + if (filter & mozilla::gfx::FILTER_BILINEAR_V) {
376 if (yscale_fixed != kFractionMax &&
377 source_y_fraction && ((source_y + 1) < source_height)) {
378 FilterRows(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
379 } else {
380 memcpy(ybuf, y0_ptr, source_width);
382 y_ptr = ybuf;
383 ybuf[source_width] = ybuf[source_width-1];
384 @@ -303,44 +322,50 @@ void ScaleYUVToRGB32(const uint8* y_buf,
385 u_ptr = ubuf;
386 v_ptr = vbuf;
387 ubuf[uv_source_width] = ubuf[uv_source_width - 1];
388 vbuf[uv_source_width] = vbuf[uv_source_width - 1];
390 if (source_dx == kFractionMax) { // Not scaled
391 FastConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
392 dest_pixel, width);
393 - } else {
394 - if (filter & FILTER_BILINEAR_H) {
395 + } else if (filter & FILTER_BILINEAR_H) {
396 LinearScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
397 dest_pixel, width, source_dx);
398 } else {
399 // Specialized scalers and rotation.
400 -#if USE_MMX && defined(_MSC_VER)
401 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_MSC_VER) && defined(_M_IX86)
402 + if(mozilla::supports_sse()) {
403 if (width == (source_width * 2)) {
404 - DoubleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
405 - dest_pixel, width);
406 + DoubleYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,
407 + dest_pixel, width);
408 } else if ((source_dx & kFractionMask) == 0) {
409 // Scaling by integer scale factor. ie half.
410 - ConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
411 - dest_pixel, width,
412 - source_dx >> kFractionBits);
413 + ConvertYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,
414 + dest_pixel, width,
415 + source_dx >> kFractionBits);
416 } else if (source_dx_uv == source_dx) { // Not rotated.
417 ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
418 dest_pixel, width, source_dx);
419 } else {
420 - RotateConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
421 - dest_pixel, width,
422 - source_dx >> kFractionBits,
423 - source_dx_uv >> kFractionBits);
424 + RotateConvertYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,
425 + dest_pixel, width,
426 + source_dx >> kFractionBits,
427 + source_dx_uv >> kFractionBits);
430 + else {
431 + ScaleYUVToRGB32Row_C(y_ptr, u_ptr, v_ptr,
432 + dest_pixel, width, source_dx);
434 #else
435 - ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
436 - dest_pixel, width, source_dx);
437 -#endif
439 + ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
440 + dest_pixel, width, source_dx);
441 +#endif
444 // MMX used for FastConvertYUVToRGB32Row and FilterRows requires emms.
445 - EMMS();
448 -} // namespace media
449 + if (has_mmx)
450 + EMMS();
453 +} // namespace gfx
454 +} // namespace mozilla
455 diff --git a/gfx/ycbcr/yuv_convert.h b/gfx/ycbcr/yuv_convert.h
456 --- a/gfx/ycbcr/yuv_convert.h
457 +++ b/gfx/ycbcr/yuv_convert.h
458 @@ -1,72 +1,98 @@
459 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
460 // Use of this source code is governed by a BSD-style license that can be
461 // found in the LICENSE file.
463 #ifndef MEDIA_BASE_YUV_CONVERT_H_
464 #define MEDIA_BASE_YUV_CONVERT_H_
466 -#include "base/basictypes.h"
468 -namespace media {
470 +#include "chromium_types.h"
471 +#include "gfxCore.h"
473 +#ifdef __arm__
474 +#define HAVE_YCBCR_TO_RGB565 1
475 +#endif
477 +namespace mozilla {
479 +namespace gfx {
481 // Type of YUV surface.
482 // The value of these enums matter as they are used to shift vertical indices.
483 enum YUVType {
484 - YV16 = 0, // YV16 is half width and full height chroma channels.
485 - YV12 = 1, // YV12 is half width and half height chroma channels.
486 + YV12 = 0, // YV12 is half width and half height chroma channels.
487 + YV16 = 1, // YV16 is half width and full height chroma channels.
488 + YV24 = 2 // YV24 is full width and full height chroma channels.
491 // Mirror means flip the image horizontally, as in looking in a mirror.
492 // Rotate happens after mirroring.
493 enum Rotate {
494 ROTATE_0, // Rotation off.
495 ROTATE_90, // Rotate clockwise.
496 ROTATE_180, // Rotate upside down.
497 ROTATE_270, // Rotate counter clockwise.
498 MIRROR_ROTATE_0, // Mirror horizontally.
499 MIRROR_ROTATE_90, // Mirror then Rotate clockwise.
500 MIRROR_ROTATE_180, // Mirror vertically.
501 - MIRROR_ROTATE_270, // Transpose.
502 + MIRROR_ROTATE_270 // Transpose.
505 // Filter affects how scaling looks.
506 enum ScaleFilter {
507 FILTER_NONE = 0, // No filter (point sampled).
508 FILTER_BILINEAR_H = 1, // Bilinear horizontal filter.
509 FILTER_BILINEAR_V = 2, // Bilinear vertical filter.
510 - FILTER_BILINEAR = 3, // Bilinear filter.
511 + FILTER_BILINEAR = 3 // Bilinear filter.
514 +// Convert a frame of YUV to 16 bit RGB565.
515 +// Pass in YV12 formats
516 +NS_GFX_(void) ConvertYCbCrToRGB565(const uint8* yplane,
517 + const uint8* uplane,
518 + const uint8* vplane,
519 + uint8* rgbframe,
520 + int pic_x,
521 + int pic_y,
522 + int pic_width,
523 + int pic_height,
524 + int ystride,
525 + int uvstride,
526 + int rgbstride,
527 + YUVType yuv_type);
529 // Convert a frame of YUV to 32 bit ARGB.
530 // Pass in YV16/YV12 depending on source format
531 -void ConvertYUVToRGB32(const uint8* yplane,
532 - const uint8* uplane,
533 - const uint8* vplane,
534 - uint8* rgbframe,
535 - int width,
536 - int height,
537 - int ystride,
538 - int uvstride,
539 - int rgbstride,
540 - YUVType yuv_type);
541 +NS_GFX_(void) ConvertYCbCrToRGB32(const uint8* yplane,
542 + const uint8* uplane,
543 + const uint8* vplane,
544 + uint8* rgbframe,
545 + int pic_x,
546 + int pic_y,
547 + int pic_width,
548 + int pic_height,
549 + int ystride,
550 + int uvstride,
551 + int rgbstride,
552 + YUVType yuv_type);
554 // Scale a frame of YUV to 32 bit ARGB.
555 // Supports rotation and mirroring.
556 -void ScaleYUVToRGB32(const uint8* yplane,
557 - const uint8* uplane,
558 - const uint8* vplane,
559 - uint8* rgbframe,
560 - int source_width,
561 - int source_height,
562 - int width,
563 - int height,
564 - int ystride,
565 - int uvstride,
566 - int rgbstride,
567 - YUVType yuv_type,
568 - Rotate view_rotate,
569 - ScaleFilter filter);
571 -} // namespace media
573 +NS_GFX_(void) ScaleYCbCrToRGB32(const uint8* yplane,
574 + const uint8* uplane,
575 + const uint8* vplane,
576 + uint8* rgbframe,
577 + int source_width,
578 + int source_height,
579 + int width,
580 + int height,
581 + int ystride,
582 + int uvstride,
583 + int rgbstride,
584 + YUVType yuv_type,
585 + Rotate view_rotate,
586 + ScaleFilter filter);
588 +} // namespace gfx
589 +} // namespace mozilla
591 #endif // MEDIA_BASE_YUV_CONVERT_H_
592 diff --git a/gfx/ycbcr/yuv_convert_mmx.cpp b/gfx/ycbcr/yuv_convert_mmx.cpp
593 new file mode 100644
594 --- /dev/null
595 +++ b/gfx/ycbcr/yuv_convert_mmx.cpp
596 @@ -0,0 +1,45 @@
597 +// Copyright (c) 2010 The Chromium Authors. All rights reserved.
598 +// Use of this source code is governed by a BSD-style license that can be
599 +// found in the LICENSE file.
601 +#include <mmintrin.h>
602 +#include "yuv_row.h"
604 +namespace mozilla {
605 +namespace gfx {
607 +// FilterRows combines two rows of the image using linear interpolation.
608 +// MMX version does 8 pixels at a time.
609 +void FilterRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
610 + int source_width, int source_y_fraction) {
611 + __m64 zero = _mm_setzero_si64();
612 + __m64 y1_fraction = _mm_set1_pi16(source_y_fraction);
613 + __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction);
615 + const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr);
616 + const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr);
617 + __m64* dest64 = reinterpret_cast<__m64*>(ybuf);
618 + __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width);
620 + do {
621 + __m64 y0 = *y0_ptr64++;
622 + __m64 y1 = *y1_ptr64++;
623 + __m64 y2 = _mm_unpackhi_pi8(y0, zero);
624 + __m64 y3 = _mm_unpackhi_pi8(y1, zero);
625 + y0 = _mm_unpacklo_pi8(y0, zero);
626 + y1 = _mm_unpacklo_pi8(y1, zero);
627 + y0 = _mm_mullo_pi16(y0, y0_fraction);
628 + y1 = _mm_mullo_pi16(y1, y1_fraction);
629 + y2 = _mm_mullo_pi16(y2, y0_fraction);
630 + y3 = _mm_mullo_pi16(y3, y1_fraction);
631 + y0 = _mm_add_pi16(y0, y1);
632 + y2 = _mm_add_pi16(y2, y3);
633 + y0 = _mm_srli_pi16(y0, 8);
634 + y2 = _mm_srli_pi16(y2, 8);
635 + y0 = _mm_packs_pu16(y0, y2);
636 + *dest64++ = y0;
637 + } while (dest64 < end64);
642 diff --git a/gfx/ycbcr/yuv_convert_sse2.cpp b/gfx/ycbcr/yuv_convert_sse2.cpp
643 new file mode 100644
644 --- /dev/null
645 +++ b/gfx/ycbcr/yuv_convert_sse2.cpp
646 @@ -0,0 +1,47 @@
647 +// Copyright (c) 2010 The Chromium Authors. All rights reserved.
648 +// Use of this source code is governed by a BSD-style license that can be
649 +// found in the LICENSE file.
651 +#include <emmintrin.h>
652 +#include "yuv_row.h"
654 +namespace mozilla {
655 +namespace gfx {
657 +// FilterRows combines two rows of the image using linear interpolation.
658 +// SSE2 version does 16 pixels at a time.
659 +void FilterRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
660 + int source_width, int source_y_fraction) {
661 + __m128i zero = _mm_setzero_si128();
662 + __m128i y1_fraction = _mm_set1_epi16(source_y_fraction);
663 + __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction);
665 + const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr);
666 + const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr);
667 + __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf);
668 + __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width);
670 + do {
671 + __m128i y0 = _mm_loadu_si128(y0_ptr128);
672 + __m128i y1 = _mm_loadu_si128(y1_ptr128);
673 + __m128i y2 = _mm_unpackhi_epi8(y0, zero);
674 + __m128i y3 = _mm_unpackhi_epi8(y1, zero);
675 + y0 = _mm_unpacklo_epi8(y0, zero);
676 + y1 = _mm_unpacklo_epi8(y1, zero);
677 + y0 = _mm_mullo_epi16(y0, y0_fraction);
678 + y1 = _mm_mullo_epi16(y1, y1_fraction);
679 + y2 = _mm_mullo_epi16(y2, y0_fraction);
680 + y3 = _mm_mullo_epi16(y3, y1_fraction);
681 + y0 = _mm_add_epi16(y0, y1);
682 + y2 = _mm_add_epi16(y2, y3);
683 + y0 = _mm_srli_epi16(y0, 8);
684 + y2 = _mm_srli_epi16(y2, 8);
685 + y0 = _mm_packus_epi16(y0, y2);
686 + *dest128++ = y0;
687 + ++y0_ptr128;
688 + ++y1_ptr128;
689 + } while (dest128 < end128);
694 diff --git a/gfx/ycbcr/yuv_row.h b/gfx/ycbcr/yuv_row.h
695 --- a/gfx/ycbcr/yuv_row.h
696 +++ b/gfx/ycbcr/yuv_row.h
697 @@ -5,109 +5,133 @@
698 // yuv_row internal functions to handle YUV conversion and scaling to RGB.
699 // These functions are used from both yuv_convert.cc and yuv_scale.cc.
701 // TODO(fbarchard): Write function that can handle rotation and scaling.
703 #ifndef MEDIA_BASE_YUV_ROW_H_
704 #define MEDIA_BASE_YUV_ROW_H_
706 -#include "base/basictypes.h"
707 +#include "chromium_types.h"
709 extern "C" {
710 // Can only do 1x.
711 // This is the second fastest of the scalers.
712 void FastConvertYUVToRGB32Row(const uint8* y_buf,
713 const uint8* u_buf,
714 const uint8* v_buf,
715 uint8* rgb_buf,
716 int width);
718 -// Can do 1x, half size or any scale down by an integer amount.
719 -// Step can be negative (mirroring, rotate 180).
720 -// This is the third fastest of the scalers.
721 -void ConvertYUVToRGB32Row(const uint8* y_buf,
722 - const uint8* u_buf,
723 - const uint8* v_buf,
724 - uint8* rgb_buf,
725 - int width,
726 - int step);
728 -// Rotate is like Convert, but applies different step to Y versus U and V.
729 -// This allows rotation by 90 or 270, by stepping by stride.
730 -// This is the forth fastest of the scalers.
731 -void RotateConvertYUVToRGB32Row(const uint8* y_buf,
732 +void FastConvertYUVToRGB32Row_C(const uint8* y_buf,
733 const uint8* u_buf,
734 const uint8* v_buf,
735 uint8* rgb_buf,
736 int width,
737 - int ystep,
738 - int uvstep);
739 + unsigned int x_shift);
741 +void FastConvertYUVToRGB32Row(const uint8* y_buf,
742 + const uint8* u_buf,
743 + const uint8* v_buf,
744 + uint8* rgb_buf,
745 + int width);
747 +// Can do 1x, half size or any scale down by an integer amount.
748 +// Step can be negative (mirroring, rotate 180).
749 +// This is the third fastest of the scalers.
750 +// Only defined on Windows x86-32.
751 +void ConvertYUVToRGB32Row_SSE(const uint8* y_buf,
752 + const uint8* u_buf,
753 + const uint8* v_buf,
754 + uint8* rgb_buf,
755 + int width,
756 + int step);
758 +// Rotate is like Convert, but applies different step to Y versus U and V.
759 +// This allows rotation by 90 or 270, by stepping by stride.
760 +// This is the fourth fastest of the scalers.
761 +// Only defined on Windows x86-32.
762 +void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf,
763 + const uint8* u_buf,
764 + const uint8* v_buf,
765 + uint8* rgb_buf,
766 + int width,
767 + int ystep,
768 + int uvstep);
770 // Doubler does 4 pixels at a time. Each pixel is replicated.
771 // This is the fastest of the scalers.
772 -void DoubleYUVToRGB32Row(const uint8* y_buf,
773 - const uint8* u_buf,
774 - const uint8* v_buf,
775 - uint8* rgb_buf,
776 - int width);
777 +// Only defined on Windows x86-32.
778 +void DoubleYUVToRGB32Row_SSE(const uint8* y_buf,
779 + const uint8* u_buf,
780 + const uint8* v_buf,
781 + uint8* rgb_buf,
782 + int width);
784 // Handles arbitrary scaling up or down.
785 // Mirroring is supported, but not 90 or 270 degree rotation.
786 // Chroma is under sampled every 2 pixels for performance.
787 void ScaleYUVToRGB32Row(const uint8* y_buf,
788 const uint8* u_buf,
789 const uint8* v_buf,
790 uint8* rgb_buf,
791 int width,
792 int source_dx);
794 +void ScaleYUVToRGB32Row(const uint8* y_buf,
795 + const uint8* u_buf,
796 + const uint8* v_buf,
797 + uint8* rgb_buf,
798 + int width,
799 + int source_dx);
801 +void ScaleYUVToRGB32Row_C(const uint8* y_buf,
802 + const uint8* u_buf,
803 + const uint8* v_buf,
804 + uint8* rgb_buf,
805 + int width,
806 + int source_dx);
808 // Handles arbitrary scaling up or down with bilinear filtering.
809 // Mirroring is supported, but not 90 or 270 degree rotation.
810 // Chroma is under sampled every 2 pixels for performance.
811 // This is the slowest of the scalers.
812 void LinearScaleYUVToRGB32Row(const uint8* y_buf,
813 const uint8* u_buf,
814 const uint8* v_buf,
815 uint8* rgb_buf,
816 int width,
817 int source_dx);
819 +void LinearScaleYUVToRGB32Row(const uint8* y_buf,
820 + const uint8* u_buf,
821 + const uint8* v_buf,
822 + uint8* rgb_buf,
823 + int width,
824 + int source_dx);
826 +void LinearScaleYUVToRGB32Row_C(const uint8* y_buf,
827 + const uint8* u_buf,
828 + const uint8* v_buf,
829 + uint8* rgb_buf,
830 + int width,
831 + int source_dx);
834 #if defined(_MSC_VER)
835 #define SIMD_ALIGNED(var) __declspec(align(16)) var
836 #else
837 #define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
838 #endif
839 extern SIMD_ALIGNED(int16 kCoefficientsRgbY[768][4]);
841 -// Method to force C version.
842 -//#define USE_MMX 0
843 -//#define USE_SSE2 0
845 -#if !defined(USE_MMX)
846 -// Windows, Mac and Linux/BSD use MMX
847 -#if defined(__MMX__) || defined(_MSC_VER)
848 -#define USE_MMX 1
849 -#else
850 -#define USE_MMX 0
851 -#endif
852 -#endif
854 -#if !defined(USE_SSE2)
855 -#if defined(__SSE2__) || defined(ARCH_CPU_X86_64) || _M_IX86_FP==2
856 -#define USE_SSE2 1
857 -#else
858 -#define USE_SSE2 0
859 -#endif
860 -#endif
862 // x64 uses MMX2 (SSE) so emms is not required.
863 // Warning C4799: function has no EMMS instruction.
864 // EMMS() is slow and should be called by the calling function once per image.
865 -#if USE_MMX && !defined(ARCH_CPU_X86_64)
866 +#if defined(ARCH_CPU_X86) && !defined(ARCH_CPU_X86_64)
867 #if defined(_MSC_VER)
868 #define EMMS() __asm emms
869 #pragma warning(disable: 4799)
870 #else
871 #define EMMS() asm("emms")
872 #endif
873 #else
874 #define EMMS()
875 diff --git a/gfx/ycbcr/yuv_row_c.cpp b/gfx/ycbcr/yuv_row_c.cpp
876 --- a/gfx/ycbcr/yuv_row_c.cpp
877 +++ b/gfx/ycbcr/yuv_row_c.cpp
878 @@ -1,812 +1,18 @@
879 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
880 // Use of this source code is governed by a BSD-style license that can be
881 // found in the LICENSE file.
883 -#include "media/base/yuv_row.h"
885 -#ifdef _DEBUG
886 -#include "base/logging.h"
887 -#else
888 +#include "yuv_row.h"
890 #define DCHECK(a)
891 -#endif
893 extern "C" {
895 -#if USE_SSE2 && defined(ARCH_CPU_X86_64)
897 -// AMD64 ABI uses register paremters.
898 -void FastConvertYUVToRGB32Row(const uint8* y_buf, // rdi
899 - const uint8* u_buf, // rsi
900 - const uint8* v_buf, // rdx
901 - uint8* rgb_buf, // rcx
902 - int width) { // r8
903 - asm(
904 - "jmp convertend\n"
905 -"convertloop:"
906 - "movzb (%1),%%r10\n"
907 - "add $0x1,%1\n"
908 - "movzb (%2),%%r11\n"
909 - "add $0x1,%2\n"
910 - "movq 2048(%5,%%r10,8),%%xmm0\n"
911 - "movzb (%0),%%r10\n"
912 - "movq 4096(%5,%%r11,8),%%xmm1\n"
913 - "movzb 0x1(%0),%%r11\n"
914 - "paddsw %%xmm1,%%xmm0\n"
915 - "movq (%5,%%r10,8),%%xmm2\n"
916 - "add $0x2,%0\n"
917 - "movq (%5,%%r11,8),%%xmm3\n"
918 - "paddsw %%xmm0,%%xmm2\n"
919 - "paddsw %%xmm0,%%xmm3\n"
920 - "shufps $0x44,%%xmm3,%%xmm2\n"
921 - "psraw $0x6,%%xmm2\n"
922 - "packuswb %%xmm2,%%xmm2\n"
923 - "movq %%xmm2,0x0(%3)\n"
924 - "add $0x8,%3\n"
925 -"convertend:"
926 - "sub $0x2,%4\n"
927 - "jns convertloop\n"
929 -"convertnext:"
930 - "add $0x1,%4\n"
931 - "js convertdone\n"
933 - "movzb (%1),%%r10\n"
934 - "movq 2048(%5,%%r10,8),%%xmm0\n"
935 - "movzb (%2),%%r10\n"
936 - "movq 4096(%5,%%r10,8),%%xmm1\n"
937 - "paddsw %%xmm1,%%xmm0\n"
938 - "movzb (%0),%%r10\n"
939 - "movq (%5,%%r10,8),%%xmm1\n"
940 - "paddsw %%xmm0,%%xmm1\n"
941 - "psraw $0x6,%%xmm1\n"
942 - "packuswb %%xmm1,%%xmm1\n"
943 - "movd %%xmm1,0x0(%3)\n"
944 -"convertdone:"
946 - : "r"(y_buf), // %0
947 - "r"(u_buf), // %1
948 - "r"(v_buf), // %2
949 - "r"(rgb_buf), // %3
950 - "r"(width), // %4
951 - "r" (kCoefficientsRgbY) // %5
952 - : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
956 -void ScaleYUVToRGB32Row(const uint8* y_buf, // rdi
957 - const uint8* u_buf, // rsi
958 - const uint8* v_buf, // rdx
959 - uint8* rgb_buf, // rcx
960 - int width, // r8
961 - int source_dx) { // r9
962 - asm(
963 - "xor %%r11,%%r11\n"
964 - "sub $0x2,%4\n"
965 - "js scalenext\n"
967 -"scaleloop:"
968 - "mov %%r11,%%r10\n"
969 - "sar $0x11,%%r10\n"
970 - "movzb (%1,%%r10,1),%%rax\n"
971 - "movq 2048(%5,%%rax,8),%%xmm0\n"
972 - "movzb (%2,%%r10,1),%%rax\n"
973 - "movq 4096(%5,%%rax,8),%%xmm1\n"
974 - "lea (%%r11,%6),%%r10\n"
975 - "sar $0x10,%%r11\n"
976 - "movzb (%0,%%r11,1),%%rax\n"
977 - "paddsw %%xmm1,%%xmm0\n"
978 - "movq (%5,%%rax,8),%%xmm1\n"
979 - "lea (%%r10,%6),%%r11\n"
980 - "sar $0x10,%%r10\n"
981 - "movzb (%0,%%r10,1),%%rax\n"
982 - "movq (%5,%%rax,8),%%xmm2\n"
983 - "paddsw %%xmm0,%%xmm1\n"
984 - "paddsw %%xmm0,%%xmm2\n"
985 - "shufps $0x44,%%xmm2,%%xmm1\n"
986 - "psraw $0x6,%%xmm1\n"
987 - "packuswb %%xmm1,%%xmm1\n"
988 - "movq %%xmm1,0x0(%3)\n"
989 - "add $0x8,%3\n"
990 - "sub $0x2,%4\n"
991 - "jns scaleloop\n"
993 -"scalenext:"
994 - "add $0x1,%4\n"
995 - "js scaledone\n"
997 - "mov %%r11,%%r10\n"
998 - "sar $0x11,%%r10\n"
999 - "movzb (%1,%%r10,1),%%rax\n"
1000 - "movq 2048(%5,%%rax,8),%%xmm0\n"
1001 - "movzb (%2,%%r10,1),%%rax\n"
1002 - "movq 4096(%5,%%rax,8),%%xmm1\n"
1003 - "paddsw %%xmm1,%%xmm0\n"
1004 - "sar $0x10,%%r11\n"
1005 - "movzb (%0,%%r11,1),%%rax\n"
1006 - "movq (%5,%%rax,8),%%xmm1\n"
1007 - "paddsw %%xmm0,%%xmm1\n"
1008 - "psraw $0x6,%%xmm1\n"
1009 - "packuswb %%xmm1,%%xmm1\n"
1010 - "movd %%xmm1,0x0(%3)\n"
1012 -"scaledone:"
1014 - : "r"(y_buf), // %0
1015 - "r"(u_buf), // %1
1016 - "r"(v_buf), // %2
1017 - "r"(rgb_buf), // %3
1018 - "r"(width), // %4
1019 - "r" (kCoefficientsRgbY), // %5
1020 - "r"(static_cast<long>(source_dx)) // %6
1021 - : "memory", "r10", "r11", "rax", "xmm0", "xmm1", "xmm2"
1025 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,
1026 - const uint8* u_buf,
1027 - const uint8* v_buf,
1028 - uint8* rgb_buf,
1029 - int width,
1030 - int source_dx) {
1031 - asm(
1032 - "xor %%r11,%%r11\n" // x = 0
1033 - "sub $0x2,%4\n"
1034 - "js .lscalenext\n"
1035 - "cmp $0x20000,%6\n" // if source_dx >= 2.0
1036 - "jl .lscalehalf\n"
1037 - "mov $0x8000,%%r11\n" // x = 0.5 for 1/2 or less
1038 -".lscalehalf:"
1040 -".lscaleloop:"
1041 - "mov %%r11,%%r10\n"
1042 - "sar $0x11,%%r10\n"
1044 - "movzb (%1, %%r10, 1), %%r13 \n"
1045 - "movzb 1(%1, %%r10, 1), %%r14 \n"
1046 - "mov %%r11, %%rax \n"
1047 - "and $0x1fffe, %%rax \n"
1048 - "imul %%rax, %%r14 \n"
1049 - "xor $0x1fffe, %%rax \n"
1050 - "imul %%rax, %%r13 \n"
1051 - "add %%r14, %%r13 \n"
1052 - "shr $17, %%r13 \n"
1053 - "movq 2048(%5,%%r13,8), %%xmm0\n"
1055 - "movzb (%2, %%r10, 1), %%r13 \n"
1056 - "movzb 1(%2, %%r10, 1), %%r14 \n"
1057 - "mov %%r11, %%rax \n"
1058 - "and $0x1fffe, %%rax \n"
1059 - "imul %%rax, %%r14 \n"
1060 - "xor $0x1fffe, %%rax \n"
1061 - "imul %%rax, %%r13 \n"
1062 - "add %%r14, %%r13 \n"
1063 - "shr $17, %%r13 \n"
1064 - "movq 4096(%5,%%r13,8), %%xmm1\n"
1066 - "mov %%r11, %%rax \n"
1067 - "lea (%%r11,%6),%%r10\n"
1068 - "sar $0x10,%%r11\n"
1069 - "paddsw %%xmm1,%%xmm0\n"
1071 - "movzb (%0, %%r11, 1), %%r13 \n"
1072 - "movzb 1(%0, %%r11, 1), %%r14 \n"
1073 - "and $0xffff, %%rax \n"
1074 - "imul %%rax, %%r14 \n"
1075 - "xor $0xffff, %%rax \n"
1076 - "imul %%rax, %%r13 \n"
1077 - "add %%r14, %%r13 \n"
1078 - "shr $16, %%r13 \n"
1079 - "movq (%5,%%r13,8),%%xmm1\n"
1081 - "mov %%r10, %%rax \n"
1082 - "lea (%%r10,%6),%%r11\n"
1083 - "sar $0x10,%%r10\n"
1085 - "movzb (%0,%%r10,1), %%r13 \n"
1086 - "movzb 1(%0,%%r10,1), %%r14 \n"
1087 - "and $0xffff, %%rax \n"
1088 - "imul %%rax, %%r14 \n"
1089 - "xor $0xffff, %%rax \n"
1090 - "imul %%rax, %%r13 \n"
1091 - "add %%r14, %%r13 \n"
1092 - "shr $16, %%r13 \n"
1093 - "movq (%5,%%r13,8),%%xmm2\n"
1095 - "paddsw %%xmm0,%%xmm1\n"
1096 - "paddsw %%xmm0,%%xmm2\n"
1097 - "shufps $0x44,%%xmm2,%%xmm1\n"
1098 - "psraw $0x6,%%xmm1\n"
1099 - "packuswb %%xmm1,%%xmm1\n"
1100 - "movq %%xmm1,0x0(%3)\n"
1101 - "add $0x8,%3\n"
1102 - "sub $0x2,%4\n"
1103 - "jns .lscaleloop\n"
1105 -".lscalenext:"
1106 - "add $0x1,%4\n"
1107 - "js .lscaledone\n"
1109 - "mov %%r11,%%r10\n"
1110 - "sar $0x11,%%r10\n"
1112 - "movzb (%1,%%r10,1), %%r13 \n"
1113 - "movq 2048(%5,%%r13,8),%%xmm0\n"
1115 - "movzb (%2,%%r10,1), %%r13 \n"
1116 - "movq 4096(%5,%%r13,8),%%xmm1\n"
1118 - "paddsw %%xmm1,%%xmm0\n"
1119 - "sar $0x10,%%r11\n"
1121 - "movzb (%0,%%r11,1), %%r13 \n"
1122 - "movq (%5,%%r13,8),%%xmm1\n"
1124 - "paddsw %%xmm0,%%xmm1\n"
1125 - "psraw $0x6,%%xmm1\n"
1126 - "packuswb %%xmm1,%%xmm1\n"
1127 - "movd %%xmm1,0x0(%3)\n"
1129 -".lscaledone:"
1131 - : "r"(y_buf), // %0
1132 - "r"(u_buf), // %1
1133 - "r"(v_buf), // %2
1134 - "r"(rgb_buf), // %3
1135 - "r"(width), // %4
1136 - "r" (kCoefficientsRgbY), // %5
1137 - "r"(static_cast<long>(source_dx)) // %6
1138 - : "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2"
1142 -#elif USE_MMX && !defined(ARCH_CPU_X86_64) && !defined(__PIC__)
1144 -// PIC version is slower because less registers are available, so
1145 -// non-PIC is used on platforms where it is possible.
1147 -void FastConvertYUVToRGB32Row(const uint8* y_buf,
1148 - const uint8* u_buf,
1149 - const uint8* v_buf,
1150 - uint8* rgb_buf,
1151 - int width);
1152 - asm(
1153 - ".text\n"
1154 - ".global FastConvertYUVToRGB32Row\n"
1155 -"FastConvertYUVToRGB32Row:\n"
1156 - "pusha\n"
1157 - "mov 0x24(%esp),%edx\n"
1158 - "mov 0x28(%esp),%edi\n"
1159 - "mov 0x2c(%esp),%esi\n"
1160 - "mov 0x30(%esp),%ebp\n"
1161 - "mov 0x34(%esp),%ecx\n"
1162 - "jmp convertend\n"
1164 -"convertloop:"
1165 - "movzbl (%edi),%eax\n"
1166 - "add $0x1,%edi\n"
1167 - "movzbl (%esi),%ebx\n"
1168 - "add $0x1,%esi\n"
1169 - "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
1170 - "movzbl (%edx),%eax\n"
1171 - "paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
1172 - "movzbl 0x1(%edx),%ebx\n"
1173 - "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
1174 - "add $0x2,%edx\n"
1175 - "movq kCoefficientsRgbY(,%ebx,8),%mm2\n"
1176 - "paddsw %mm0,%mm1\n"
1177 - "paddsw %mm0,%mm2\n"
1178 - "psraw $0x6,%mm1\n"
1179 - "psraw $0x6,%mm2\n"
1180 - "packuswb %mm2,%mm1\n"
1181 - "movntq %mm1,0x0(%ebp)\n"
1182 - "add $0x8,%ebp\n"
1183 -"convertend:"
1184 - "sub $0x2,%ecx\n"
1185 - "jns convertloop\n"
1187 - "and $0x1,%ecx\n"
1188 - "je convertdone\n"
1190 - "movzbl (%edi),%eax\n"
1191 - "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
1192 - "movzbl (%esi),%eax\n"
1193 - "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
1194 - "movzbl (%edx),%eax\n"
1195 - "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
1196 - "paddsw %mm0,%mm1\n"
1197 - "psraw $0x6,%mm1\n"
1198 - "packuswb %mm1,%mm1\n"
1199 - "movd %mm1,0x0(%ebp)\n"
1200 -"convertdone:"
1201 - "popa\n"
1202 - "ret\n"
1206 -void ScaleYUVToRGB32Row(const uint8* y_buf,
1207 - const uint8* u_buf,
1208 - const uint8* v_buf,
1209 - uint8* rgb_buf,
1210 - int width,
1211 - int source_dx);
1212 - asm(
1213 - ".text\n"
1214 - ".global ScaleYUVToRGB32Row\n"
1215 -"ScaleYUVToRGB32Row:\n"
1216 - "pusha\n"
1217 - "mov 0x24(%esp),%edx\n"
1218 - "mov 0x28(%esp),%edi\n"
1219 - "mov 0x2c(%esp),%esi\n"
1220 - "mov 0x30(%esp),%ebp\n"
1221 - "mov 0x34(%esp),%ecx\n"
1222 - "xor %ebx,%ebx\n"
1223 - "jmp scaleend\n"
1225 -"scaleloop:"
1226 - "mov %ebx,%eax\n"
1227 - "sar $0x11,%eax\n"
1228 - "movzbl (%edi,%eax,1),%eax\n"
1229 - "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
1230 - "mov %ebx,%eax\n"
1231 - "sar $0x11,%eax\n"
1232 - "movzbl (%esi,%eax,1),%eax\n"
1233 - "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
1234 - "mov %ebx,%eax\n"
1235 - "add 0x38(%esp),%ebx\n"
1236 - "sar $0x10,%eax\n"
1237 - "movzbl (%edx,%eax,1),%eax\n"
1238 - "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
1239 - "mov %ebx,%eax\n"
1240 - "add 0x38(%esp),%ebx\n"
1241 - "sar $0x10,%eax\n"
1242 - "movzbl (%edx,%eax,1),%eax\n"
1243 - "movq kCoefficientsRgbY(,%eax,8),%mm2\n"
1244 - "paddsw %mm0,%mm1\n"
1245 - "paddsw %mm0,%mm2\n"
1246 - "psraw $0x6,%mm1\n"
1247 - "psraw $0x6,%mm2\n"
1248 - "packuswb %mm2,%mm1\n"
1249 - "movntq %mm1,0x0(%ebp)\n"
1250 - "add $0x8,%ebp\n"
1251 -"scaleend:"
1252 - "sub $0x2,%ecx\n"
1253 - "jns scaleloop\n"
1255 - "and $0x1,%ecx\n"
1256 - "je scaledone\n"
1258 - "mov %ebx,%eax\n"
1259 - "sar $0x11,%eax\n"
1260 - "movzbl (%edi,%eax,1),%eax\n"
1261 - "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
1262 - "mov %ebx,%eax\n"
1263 - "sar $0x11,%eax\n"
1264 - "movzbl (%esi,%eax,1),%eax\n"
1265 - "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
1266 - "mov %ebx,%eax\n"
1267 - "sar $0x10,%eax\n"
1268 - "movzbl (%edx,%eax,1),%eax\n"
1269 - "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
1270 - "paddsw %mm0,%mm1\n"
1271 - "psraw $0x6,%mm1\n"
1272 - "packuswb %mm1,%mm1\n"
1273 - "movd %mm1,0x0(%ebp)\n"
1275 -"scaledone:"
1276 - "popa\n"
1277 - "ret\n"
1280 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,
1281 - const uint8* u_buf,
1282 - const uint8* v_buf,
1283 - uint8* rgb_buf,
1284 - int width,
1285 - int source_dx);
1286 - asm(
1287 - ".text\n"
1288 - ".global LinearScaleYUVToRGB32Row\n"
1289 -"LinearScaleYUVToRGB32Row:\n"
1290 - "pusha\n"
1291 - "mov 0x24(%esp),%edx\n"
1292 - "mov 0x28(%esp),%edi\n"
1293 - "mov 0x30(%esp),%ebp\n"
1295 - // source_width = width * source_dx + ebx
1296 - "mov 0x34(%esp), %ecx\n"
1297 - "imull 0x38(%esp), %ecx\n"
1298 - "mov %ecx, 0x34(%esp)\n"
1300 - "mov 0x38(%esp), %ecx\n"
1301 - "xor %ebx,%ebx\n" // x = 0
1302 - "cmp $0x20000,%ecx\n" // if source_dx >= 2.0
1303 - "jl .lscaleend\n"
1304 - "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less
1305 - "jmp .lscaleend\n"
1307 -".lscaleloop:"
1308 - "mov %ebx,%eax\n"
1309 - "sar $0x11,%eax\n"
1311 - "movzbl (%edi,%eax,1),%ecx\n"
1312 - "movzbl 1(%edi,%eax,1),%esi\n"
1313 - "mov %ebx,%eax\n"
1314 - "andl $0x1fffe, %eax \n"
1315 - "imul %eax, %esi \n"
1316 - "xorl $0x1fffe, %eax \n"
1317 - "imul %eax, %ecx \n"
1318 - "addl %esi, %ecx \n"
1319 - "shrl $17, %ecx \n"
1320 - "movq kCoefficientsRgbY+2048(,%ecx,8),%mm0\n"
1322 - "mov 0x2c(%esp),%esi\n"
1323 - "mov %ebx,%eax\n"
1324 - "sar $0x11,%eax\n"
1326 - "movzbl (%esi,%eax,1),%ecx\n"
1327 - "movzbl 1(%esi,%eax,1),%esi\n"
1328 - "mov %ebx,%eax\n"
1329 - "andl $0x1fffe, %eax \n"
1330 - "imul %eax, %esi \n"
1331 - "xorl $0x1fffe, %eax \n"
1332 - "imul %eax, %ecx \n"
1333 - "addl %esi, %ecx \n"
1334 - "shrl $17, %ecx \n"
1335 - "paddsw kCoefficientsRgbY+4096(,%ecx,8),%mm0\n"
1337 - "mov %ebx,%eax\n"
1338 - "sar $0x10,%eax\n"
1339 - "movzbl (%edx,%eax,1),%ecx\n"
1340 - "movzbl 1(%edx,%eax,1),%esi\n"
1341 - "mov %ebx,%eax\n"
1342 - "add 0x38(%esp),%ebx\n"
1343 - "andl $0xffff, %eax \n"
1344 - "imul %eax, %esi \n"
1345 - "xorl $0xffff, %eax \n"
1346 - "imul %eax, %ecx \n"
1347 - "addl %esi, %ecx \n"
1348 - "shrl $16, %ecx \n"
1349 - "movq kCoefficientsRgbY(,%ecx,8),%mm1\n"
1351 - "cmp 0x34(%esp), %ebx\n"
1352 - "jge .lscalelastpixel\n"
1354 - "mov %ebx,%eax\n"
1355 - "sar $0x10,%eax\n"
1356 - "movzbl (%edx,%eax,1),%ecx\n"
1357 - "movzbl 1(%edx,%eax,1),%esi\n"
1358 - "mov %ebx,%eax\n"
1359 - "add 0x38(%esp),%ebx\n"
1360 - "andl $0xffff, %eax \n"
1361 - "imul %eax, %esi \n"
1362 - "xorl $0xffff, %eax \n"
1363 - "imul %eax, %ecx \n"
1364 - "addl %esi, %ecx \n"
1365 - "shrl $16, %ecx \n"
1366 - "movq kCoefficientsRgbY(,%ecx,8),%mm2\n"
1368 - "paddsw %mm0,%mm1\n"
1369 - "paddsw %mm0,%mm2\n"
1370 - "psraw $0x6,%mm1\n"
1371 - "psraw $0x6,%mm2\n"
1372 - "packuswb %mm2,%mm1\n"
1373 - "movntq %mm1,0x0(%ebp)\n"
1374 - "add $0x8,%ebp\n"
1376 -".lscaleend:"
1377 - "cmp 0x34(%esp), %ebx\n"
1378 - "jl .lscaleloop\n"
1379 - "popa\n"
1380 - "ret\n"
1382 -".lscalelastpixel:"
1383 - "paddsw %mm0, %mm1\n"
1384 - "psraw $6, %mm1\n"
1385 - "packuswb %mm1, %mm1\n"
1386 - "movd %mm1, (%ebp)\n"
1387 - "popa\n"
1388 - "ret\n"
1391 -#elif USE_MMX && !defined(ARCH_CPU_X86_64) && defined(__PIC__)
1393 -extern void PICConvertYUVToRGB32Row(const uint8* y_buf,
1394 - const uint8* u_buf,
1395 - const uint8* v_buf,
1396 - uint8* rgb_buf,
1397 - int width,
1398 - int16 *kCoefficientsRgbY);
1399 - asm(
1400 - ".text\n"
1401 -#if defined(OS_MACOSX)
1402 -"_PICConvertYUVToRGB32Row:\n"
1403 -#else
1404 -"PICConvertYUVToRGB32Row:\n"
1405 -#endif
1406 - "pusha\n"
1407 - "mov 0x24(%esp),%edx\n"
1408 - "mov 0x28(%esp),%edi\n"
1409 - "mov 0x2c(%esp),%esi\n"
1410 - "mov 0x30(%esp),%ebp\n"
1411 - "mov 0x38(%esp),%ecx\n"
1413 - "jmp .Lconvertend\n"
1415 -".Lconvertloop:"
1416 - "movzbl (%edi),%eax\n"
1417 - "add $0x1,%edi\n"
1418 - "movzbl (%esi),%ebx\n"
1419 - "add $0x1,%esi\n"
1420 - "movq 2048(%ecx,%eax,8),%mm0\n"
1421 - "movzbl (%edx),%eax\n"
1422 - "paddsw 4096(%ecx,%ebx,8),%mm0\n"
1423 - "movzbl 0x1(%edx),%ebx\n"
1424 - "movq 0(%ecx,%eax,8),%mm1\n"
1425 - "add $0x2,%edx\n"
1426 - "movq 0(%ecx,%ebx,8),%mm2\n"
1427 - "paddsw %mm0,%mm1\n"
1428 - "paddsw %mm0,%mm2\n"
1429 - "psraw $0x6,%mm1\n"
1430 - "psraw $0x6,%mm2\n"
1431 - "packuswb %mm2,%mm1\n"
1432 - "movntq %mm1,0x0(%ebp)\n"
1433 - "add $0x8,%ebp\n"
1434 -".Lconvertend:"
1435 - "subl $0x2,0x34(%esp)\n"
1436 - "jns .Lconvertloop\n"
1438 - "andl $0x1,0x34(%esp)\n"
1439 - "je .Lconvertdone\n"
1441 - "movzbl (%edi),%eax\n"
1442 - "movq 2048(%ecx,%eax,8),%mm0\n"
1443 - "movzbl (%esi),%eax\n"
1444 - "paddsw 4096(%ecx,%eax,8),%mm0\n"
1445 - "movzbl (%edx),%eax\n"
1446 - "movq 0(%ecx,%eax,8),%mm1\n"
1447 - "paddsw %mm0,%mm1\n"
1448 - "psraw $0x6,%mm1\n"
1449 - "packuswb %mm1,%mm1\n"
1450 - "movd %mm1,0x0(%ebp)\n"
1451 -".Lconvertdone:\n"
1452 - "popa\n"
1453 - "ret\n"
1456 -void FastConvertYUVToRGB32Row(const uint8* y_buf,
1457 - const uint8* u_buf,
1458 - const uint8* v_buf,
1459 - uint8* rgb_buf,
1460 - int width) {
1461 - PICConvertYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width,
1462 - &kCoefficientsRgbY[0][0]);
1465 -extern void PICScaleYUVToRGB32Row(const uint8* y_buf,
1466 - const uint8* u_buf,
1467 - const uint8* v_buf,
1468 - uint8* rgb_buf,
1469 - int width,
1470 - int source_dx,
1471 - int16 *kCoefficientsRgbY);
1473 - asm(
1474 - ".text\n"
1475 -#if defined(OS_MACOSX)
1476 -"_PICScaleYUVToRGB32Row:\n"
1477 -#else
1478 -"PICScaleYUVToRGB32Row:\n"
1479 -#endif
1480 - "pusha\n"
1481 - "mov 0x24(%esp),%edx\n"
1482 - "mov 0x28(%esp),%edi\n"
1483 - "mov 0x2c(%esp),%esi\n"
1484 - "mov 0x30(%esp),%ebp\n"
1485 - "mov 0x3c(%esp),%ecx\n"
1486 - "xor %ebx,%ebx\n"
1487 - "jmp Lscaleend\n"
1489 -"Lscaleloop:"
1490 - "mov %ebx,%eax\n"
1491 - "sar $0x11,%eax\n"
1492 - "movzbl (%edi,%eax,1),%eax\n"
1493 - "movq 2048(%ecx,%eax,8),%mm0\n"
1494 - "mov %ebx,%eax\n"
1495 - "sar $0x11,%eax\n"
1496 - "movzbl (%esi,%eax,1),%eax\n"
1497 - "paddsw 4096(%ecx,%eax,8),%mm0\n"
1498 - "mov %ebx,%eax\n"
1499 - "add 0x38(%esp),%ebx\n"
1500 - "sar $0x10,%eax\n"
1501 - "movzbl (%edx,%eax,1),%eax\n"
1502 - "movq 0(%ecx,%eax,8),%mm1\n"
1503 - "mov %ebx,%eax\n"
1504 - "add 0x38(%esp),%ebx\n"
1505 - "sar $0x10,%eax\n"
1506 - "movzbl (%edx,%eax,1),%eax\n"
1507 - "movq 0(%ecx,%eax,8),%mm2\n"
1508 - "paddsw %mm0,%mm1\n"
1509 - "paddsw %mm0,%mm2\n"
1510 - "psraw $0x6,%mm1\n"
1511 - "psraw $0x6,%mm2\n"
1512 - "packuswb %mm2,%mm1\n"
1513 - "movntq %mm1,0x0(%ebp)\n"
1514 - "add $0x8,%ebp\n"
1515 -"Lscaleend:"
1516 - "subl $0x2,0x34(%esp)\n"
1517 - "jns Lscaleloop\n"
1519 - "andl $0x1,0x34(%esp)\n"
1520 - "je Lscaledone\n"
1522 - "mov %ebx,%eax\n"
1523 - "sar $0x11,%eax\n"
1524 - "movzbl (%edi,%eax,1),%eax\n"
1525 - "movq 2048(%ecx,%eax,8),%mm0\n"
1526 - "mov %ebx,%eax\n"
1527 - "sar $0x11,%eax\n"
1528 - "movzbl (%esi,%eax,1),%eax\n"
1529 - "paddsw 4096(%ecx,%eax,8),%mm0\n"
1530 - "mov %ebx,%eax\n"
1531 - "sar $0x10,%eax\n"
1532 - "movzbl (%edx,%eax,1),%eax\n"
1533 - "movq 0(%ecx,%eax,8),%mm1\n"
1534 - "paddsw %mm0,%mm1\n"
1535 - "psraw $0x6,%mm1\n"
1536 - "packuswb %mm1,%mm1\n"
1537 - "movd %mm1,0x0(%ebp)\n"
1539 -"Lscaledone:"
1540 - "popa\n"
1541 - "ret\n"
1545 -void ScaleYUVToRGB32Row(const uint8* y_buf,
1546 - const uint8* u_buf,
1547 - const uint8* v_buf,
1548 - uint8* rgb_buf,
1549 - int width,
1550 - int source_dx) {
1551 - PICScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
1552 - &kCoefficientsRgbY[0][0]);
1555 -void PICLinearScaleYUVToRGB32Row(const uint8* y_buf,
1556 - const uint8* u_buf,
1557 - const uint8* v_buf,
1558 - uint8* rgb_buf,
1559 - int width,
1560 - int source_dx,
1561 - int16 *kCoefficientsRgbY);
1562 - asm(
1563 - ".text\n"
1564 -#if defined(OS_MACOSX)
1565 -"_PICLinearScaleYUVToRGB32Row:\n"
1566 -#else
1567 -"PICLinearScaleYUVToRGB32Row:\n"
1568 -#endif
1569 - "pusha\n"
1570 - "mov 0x24(%esp),%edx\n"
1571 - "mov 0x30(%esp),%ebp\n"
1572 - "mov 0x34(%esp),%ecx\n"
1573 - "mov 0x3c(%esp),%edi\n"
1574 - "xor %ebx,%ebx\n"
1576 - // source_width = width * source_dx + ebx
1577 - "mov 0x34(%esp), %ecx\n"
1578 - "imull 0x38(%esp), %ecx\n"
1579 - "mov %ecx, 0x34(%esp)\n"
1581 - "mov 0x38(%esp), %ecx\n"
1582 - "xor %ebx,%ebx\n" // x = 0
1583 - "cmp $0x20000,%ecx\n" // if source_dx >= 2.0
1584 - "jl .lscaleend\n"
1585 - "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less
1586 - "jmp .lscaleend\n"
1588 -".lscaleloop:"
1589 - "mov 0x28(%esp),%esi\n"
1590 - "mov %ebx,%eax\n"
1591 - "sar $0x11,%eax\n"
1593 - "movzbl (%esi,%eax,1),%ecx\n"
1594 - "movzbl 1(%esi,%eax,1),%esi\n"
1595 - "mov %ebx,%eax\n"
1596 - "andl $0x1fffe, %eax \n"
1597 - "imul %eax, %esi \n"
1598 - "xorl $0x1fffe, %eax \n"
1599 - "imul %eax, %ecx \n"
1600 - "addl %esi, %ecx \n"
1601 - "shrl $17, %ecx \n"
1602 - "movq 2048(%edi,%ecx,8),%mm0\n"
1604 - "mov 0x2c(%esp),%esi\n"
1605 - "mov %ebx,%eax\n"
1606 - "sar $0x11,%eax\n"
1608 - "movzbl (%esi,%eax,1),%ecx\n"
1609 - "movzbl 1(%esi,%eax,1),%esi\n"
1610 - "mov %ebx,%eax\n"
1611 - "andl $0x1fffe, %eax \n"
1612 - "imul %eax, %esi \n"
1613 - "xorl $0x1fffe, %eax \n"
1614 - "imul %eax, %ecx \n"
1615 - "addl %esi, %ecx \n"
1616 - "shrl $17, %ecx \n"
1617 - "paddsw 4096(%edi,%ecx,8),%mm0\n"
1619 - "mov %ebx,%eax\n"
1620 - "sar $0x10,%eax\n"
1621 - "movzbl (%edx,%eax,1),%ecx\n"
1622 - "movzbl 1(%edx,%eax,1),%esi\n"
1623 - "mov %ebx,%eax\n"
1624 - "add 0x38(%esp),%ebx\n"
1625 - "andl $0xffff, %eax \n"
1626 - "imul %eax, %esi \n"
1627 - "xorl $0xffff, %eax \n"
1628 - "imul %eax, %ecx \n"
1629 - "addl %esi, %ecx \n"
1630 - "shrl $16, %ecx \n"
1631 - "movq (%edi,%ecx,8),%mm1\n"
1633 - "cmp 0x34(%esp), %ebx\n"
1634 - "jge .lscalelastpixel\n"
1636 - "mov %ebx,%eax\n"
1637 - "sar $0x10,%eax\n"
1638 - "movzbl (%edx,%eax,1),%ecx\n"
1639 - "movzbl 1(%edx,%eax,1),%esi\n"
1640 - "mov %ebx,%eax\n"
1641 - "add 0x38(%esp),%ebx\n"
1642 - "andl $0xffff, %eax \n"
1643 - "imul %eax, %esi \n"
1644 - "xorl $0xffff, %eax \n"
1645 - "imul %eax, %ecx \n"
1646 - "addl %esi, %ecx \n"
1647 - "shrl $16, %ecx \n"
1648 - "movq (%edi,%ecx,8),%mm2\n"
1650 - "paddsw %mm0,%mm1\n"
1651 - "paddsw %mm0,%mm2\n"
1652 - "psraw $0x6,%mm1\n"
1653 - "psraw $0x6,%mm2\n"
1654 - "packuswb %mm2,%mm1\n"
1655 - "movntq %mm1,0x0(%ebp)\n"
1656 - "add $0x8,%ebp\n"
1658 -".lscaleend:"
1659 - "cmp %ebx, 0x34(%esp)\n"
1660 - "jg .lscaleloop\n"
1661 - "popa\n"
1662 - "ret\n"
1664 -".lscalelastpixel:"
1665 - "paddsw %mm0, %mm1\n"
1666 - "psraw $6, %mm1\n"
1667 - "packuswb %mm1, %mm1\n"
1668 - "movd %mm1, (%ebp)\n"
1669 - "popa\n"
1670 - "ret\n"
1673 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,
1674 - const uint8* u_buf,
1675 - const uint8* v_buf,
1676 - uint8* rgb_buf,
1677 - int width,
1678 - int source_dx) {
1679 - PICLinearScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
1680 - &kCoefficientsRgbY[0][0]);
1683 -#else // USE_MMX
1685 // C reference code that mimic the YUV assembly.
1686 #define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
1687 #define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
1688 (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
1690 static inline void YuvPixel(uint8 y,
1691 uint8 u,
1692 uint8 v,
1693 @@ -833,66 +39,71 @@ static inline void YuvPixel(uint8 y,
1694 a >>= 6;
1696 *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) |
1697 (packuswb(g) << 8) |
1698 (packuswb(r) << 16) |
1699 (packuswb(a) << 24);
1702 -void FastConvertYUVToRGB32Row(const uint8* y_buf,
1703 - const uint8* u_buf,
1704 - const uint8* v_buf,
1705 - uint8* rgb_buf,
1706 - int width) {
1707 +void FastConvertYUVToRGB32Row_C(const uint8* y_buf,
1708 + const uint8* u_buf,
1709 + const uint8* v_buf,
1710 + uint8* rgb_buf,
1711 + int width,
1712 + unsigned int x_shift) {
1713 for (int x = 0; x < width; x += 2) {
1714 - uint8 u = u_buf[x >> 1];
1715 - uint8 v = v_buf[x >> 1];
1716 + uint8 u = u_buf[x >> x_shift];
1717 + uint8 v = v_buf[x >> x_shift];
1718 uint8 y0 = y_buf[x];
1719 YuvPixel(y0, u, v, rgb_buf);
1720 if ((x + 1) < width) {
1721 uint8 y1 = y_buf[x + 1];
1722 + if (x_shift == 0) {
1723 + u = u_buf[x + 1];
1724 + v = v_buf[x + 1];
1726 YuvPixel(y1, u, v, rgb_buf + 4);
1728 rgb_buf += 8; // Advance 2 pixels.
1732 // 16.16 fixed point is used. A shift by 16 isolates the integer.
1733 // A shift by 17 is used to further subsample the chrominence channels.
1734 // & 0xffff isolates the fixed point fraction. >> 2 to get the upper 2 bits,
1735 // for 1/65536 pixel accurate interpolation.
1736 -void ScaleYUVToRGB32Row(const uint8* y_buf,
1737 - const uint8* u_buf,
1738 - const uint8* v_buf,
1739 - uint8* rgb_buf,
1740 - int width,
1741 - int source_dx) {
1742 +void ScaleYUVToRGB32Row_C(const uint8* y_buf,
1743 + const uint8* u_buf,
1744 + const uint8* v_buf,
1745 + uint8* rgb_buf,
1746 + int width,
1747 + int source_dx) {
1748 int x = 0;
1749 for (int i = 0; i < width; i += 2) {
1750 int y = y_buf[x >> 16];
1751 int u = u_buf[(x >> 17)];
1752 int v = v_buf[(x >> 17)];
1753 YuvPixel(y, u, v, rgb_buf);
1754 x += source_dx;
1755 if ((i + 1) < width) {
1756 y = y_buf[x >> 16];
1757 YuvPixel(y, u, v, rgb_buf+4);
1758 x += source_dx;
1760 rgb_buf += 8;
1764 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,
1765 - const uint8* u_buf,
1766 - const uint8* v_buf,
1767 - uint8* rgb_buf,
1768 - int width,
1769 - int source_dx) {
1770 +void LinearScaleYUVToRGB32Row_C(const uint8* y_buf,
1771 + const uint8* u_buf,
1772 + const uint8* v_buf,
1773 + uint8* rgb_buf,
1774 + int width,
1775 + int source_dx) {
1776 int x = 0;
1777 if (source_dx >= 0x20000) {
1778 x = 32768;
1780 for (int i = 0; i < width; i += 2) {
1781 int y0 = y_buf[x >> 16];
1782 int y1 = y_buf[(x >> 16) + 1];
1783 int u0 = u_buf[(x >> 17)];
1784 @@ -913,11 +124,10 @@ void LinearScaleYUVToRGB32Row(const uint
1785 y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
1786 YuvPixel(y, u, v, rgb_buf+4);
1787 x += source_dx;
1789 rgb_buf += 8;
1793 -#endif // USE_MMX
1794 } // extern "C"
1796 diff --git a/gfx/ycbcr/yuv_row_posix.cpp b/gfx/ycbcr/yuv_row_posix.cpp
1797 --- a/gfx/ycbcr/yuv_row_posix.cpp
1798 +++ b/gfx/ycbcr/yuv_row_posix.cpp
1799 @@ -1,33 +1,32 @@
1800 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
1801 // Use of this source code is governed by a BSD-style license that can be
1802 // found in the LICENSE file.
1804 -#include "media/base/yuv_row.h"
1806 -#ifdef _DEBUG
1807 -#include "base/logging.h"
1808 -#else
1809 +#include "yuv_row.h"
1810 +#include "mozilla/SSE.h"
1812 #define DCHECK(a)
1813 -#endif
1815 extern "C" {
1817 -#if USE_SSE2 && defined(ARCH_CPU_X86_64)
1818 +#if defined(ARCH_CPU_X86_64)
1820 +// We don't need CPUID guards here, since x86-64 implies SSE2.
1822 // AMD64 ABI uses register paremters.
1823 void FastConvertYUVToRGB32Row(const uint8* y_buf, // rdi
1824 const uint8* u_buf, // rsi
1825 const uint8* v_buf, // rdx
1826 uint8* rgb_buf, // rcx
1827 int width) { // r8
1828 asm(
1829 - "jmp convertend\n"
1830 -"convertloop:"
1831 + "jmp 1f\n"
1832 +"0:"
1833 "movzb (%1),%%r10\n"
1834 "add $0x1,%1\n"
1835 "movzb (%2),%%r11\n"
1836 "add $0x1,%2\n"
1837 "movq 2048(%5,%%r10,8),%%xmm0\n"
1838 "movzb (%0),%%r10\n"
1839 "movq 4096(%5,%%r11,8),%%xmm1\n"
1840 "movzb 0x1(%0),%%r11\n"
1841 @@ -37,36 +36,36 @@ void FastConvertYUVToRGB32Row(const uint
1842 "movq (%5,%%r11,8),%%xmm3\n"
1843 "paddsw %%xmm0,%%xmm2\n"
1844 "paddsw %%xmm0,%%xmm3\n"
1845 "shufps $0x44,%%xmm3,%%xmm2\n"
1846 "psraw $0x6,%%xmm2\n"
1847 "packuswb %%xmm2,%%xmm2\n"
1848 "movq %%xmm2,0x0(%3)\n"
1849 "add $0x8,%3\n"
1850 -"convertend:"
1851 +"1:"
1852 "sub $0x2,%4\n"
1853 - "jns convertloop\n"
1855 -"convertnext:"
1856 + "jns 0b\n"
1858 +"2:"
1859 "add $0x1,%4\n"
1860 - "js convertdone\n"
1861 + "js 3f\n"
1863 "movzb (%1),%%r10\n"
1864 "movq 2048(%5,%%r10,8),%%xmm0\n"
1865 "movzb (%2),%%r10\n"
1866 "movq 4096(%5,%%r10,8),%%xmm1\n"
1867 "paddsw %%xmm1,%%xmm0\n"
1868 "movzb (%0),%%r10\n"
1869 "movq (%5,%%r10,8),%%xmm1\n"
1870 "paddsw %%xmm0,%%xmm1\n"
1871 "psraw $0x6,%%xmm1\n"
1872 "packuswb %%xmm1,%%xmm1\n"
1873 "movd %%xmm1,0x0(%3)\n"
1874 -"convertdone:"
1875 +"3:"
1877 : "r"(y_buf), // %0
1878 "r"(u_buf), // %1
1879 "r"(v_buf), // %2
1880 "r"(rgb_buf), // %3
1881 "r"(width), // %4
1882 "r" (kCoefficientsRgbY) // %5
1883 : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
1884 @@ -77,19 +76,19 @@ void ScaleYUVToRGB32Row(const uint8* y_b
1885 const uint8* u_buf, // rsi
1886 const uint8* v_buf, // rdx
1887 uint8* rgb_buf, // rcx
1888 int width, // r8
1889 int source_dx) { // r9
1890 asm(
1891 "xor %%r11,%%r11\n"
1892 "sub $0x2,%4\n"
1893 - "js scalenext\n"
1895 -"scaleloop:"
1896 + "js 1f\n"
1898 +"0:"
1899 "mov %%r11,%%r10\n"
1900 "sar $0x11,%%r10\n"
1901 "movzb (%1,%%r10,1),%%rax\n"
1902 "movq 2048(%5,%%rax,8),%%xmm0\n"
1903 "movzb (%2,%%r10,1),%%rax\n"
1904 "movq 4096(%5,%%rax,8),%%xmm1\n"
1905 "lea (%%r11,%6),%%r10\n"
1906 "sar $0x10,%%r11\n"
1907 @@ -103,38 +102,38 @@ void ScaleYUVToRGB32Row(const uint8* y_b
1908 "paddsw %%xmm0,%%xmm1\n"
1909 "paddsw %%xmm0,%%xmm2\n"
1910 "shufps $0x44,%%xmm2,%%xmm1\n"
1911 "psraw $0x6,%%xmm1\n"
1912 "packuswb %%xmm1,%%xmm1\n"
1913 "movq %%xmm1,0x0(%3)\n"
1914 "add $0x8,%3\n"
1915 "sub $0x2,%4\n"
1916 - "jns scaleloop\n"
1918 -"scalenext:"
1919 + "jns 0b\n"
1921 +"1:"
1922 "add $0x1,%4\n"
1923 - "js scaledone\n"
1924 + "js 2f\n"
1926 "mov %%r11,%%r10\n"
1927 "sar $0x11,%%r10\n"
1928 "movzb (%1,%%r10,1),%%rax\n"
1929 "movq 2048(%5,%%rax,8),%%xmm0\n"
1930 "movzb (%2,%%r10,1),%%rax\n"
1931 "movq 4096(%5,%%rax,8),%%xmm1\n"
1932 "paddsw %%xmm1,%%xmm0\n"
1933 "sar $0x10,%%r11\n"
1934 "movzb (%0,%%r11,1),%%rax\n"
1935 "movq (%5,%%rax,8),%%xmm1\n"
1936 "paddsw %%xmm0,%%xmm1\n"
1937 "psraw $0x6,%%xmm1\n"
1938 "packuswb %%xmm1,%%xmm1\n"
1939 "movd %%xmm1,0x0(%3)\n"
1941 -"scaledone:"
1942 +"2:"
1944 : "r"(y_buf), // %0
1945 "r"(u_buf), // %1
1946 "r"(v_buf), // %2
1947 "r"(rgb_buf), // %3
1948 "r"(width), // %4
1949 "r" (kCoefficientsRgbY), // %5
1950 "r"(static_cast<long>(source_dx)) // %6
1951 @@ -146,23 +145,23 @@ void LinearScaleYUVToRGB32Row(const uint
1952 const uint8* u_buf,
1953 const uint8* v_buf,
1954 uint8* rgb_buf,
1955 int width,
1956 int source_dx) {
1957 asm(
1958 "xor %%r11,%%r11\n" // x = 0
1959 "sub $0x2,%4\n"
1960 - "js .lscalenext\n"
1961 + "js 2f\n"
1962 "cmp $0x20000,%6\n" // if source_dx >= 2.0
1963 - "jl .lscalehalf\n"
1964 + "jl 0f\n"
1965 "mov $0x8000,%%r11\n" // x = 0.5 for 1/2 or less
1966 -".lscalehalf:"
1968 -".lscaleloop:"
1969 +"0:"
1971 +"1:"
1972 "mov %%r11,%%r10\n"
1973 "sar $0x11,%%r10\n"
1975 "movzb (%1, %%r10, 1), %%r13 \n"
1976 "movzb 1(%1, %%r10, 1), %%r14 \n"
1977 "mov %%r11, %%rax \n"
1978 "and $0x1fffe, %%rax \n"
1979 "imul %%rax, %%r14 \n"
1980 @@ -215,21 +214,21 @@ void LinearScaleYUVToRGB32Row(const uint
1981 "paddsw %%xmm0,%%xmm1\n"
1982 "paddsw %%xmm0,%%xmm2\n"
1983 "shufps $0x44,%%xmm2,%%xmm1\n"
1984 "psraw $0x6,%%xmm1\n"
1985 "packuswb %%xmm1,%%xmm1\n"
1986 "movq %%xmm1,0x0(%3)\n"
1987 "add $0x8,%3\n"
1988 "sub $0x2,%4\n"
1989 - "jns .lscaleloop\n"
1991 -".lscalenext:"
1992 + "jns 1b\n"
1994 +"2:"
1995 "add $0x1,%4\n"
1996 - "js .lscaledone\n"
1997 + "js 3f\n"
1999 "mov %%r11,%%r10\n"
2000 "sar $0x11,%%r10\n"
2002 "movzb (%1,%%r10,1), %%r13 \n"
2003 "movq 2048(%5,%%r13,8),%%xmm0\n"
2005 "movzb (%2,%%r10,1), %%r13 \n"
2006 @@ -241,52 +240,52 @@ void LinearScaleYUVToRGB32Row(const uint
2007 "movzb (%0,%%r11,1), %%r13 \n"
2008 "movq (%5,%%r13,8),%%xmm1\n"
2010 "paddsw %%xmm0,%%xmm1\n"
2011 "psraw $0x6,%%xmm1\n"
2012 "packuswb %%xmm1,%%xmm1\n"
2013 "movd %%xmm1,0x0(%3)\n"
2015 -".lscaledone:"
2016 +"3:"
2018 : "r"(y_buf), // %0
2019 "r"(u_buf), // %1
2020 "r"(v_buf), // %2
2021 "r"(rgb_buf), // %3
2022 "r"(width), // %4
2023 "r" (kCoefficientsRgbY), // %5
2024 "r"(static_cast<long>(source_dx)) // %6
2025 : "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2"
2029 -#elif USE_MMX && !defined(ARCH_CPU_X86_64) && !defined(__PIC__)
2030 +#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && !defined(__PIC__)
2032 // PIC version is slower because less registers are available, so
2033 // non-PIC is used on platforms where it is possible.
2035 -void FastConvertYUVToRGB32Row(const uint8* y_buf,
2036 - const uint8* u_buf,
2037 - const uint8* v_buf,
2038 - uint8* rgb_buf,
2039 - int width);
2040 +void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf,
2041 + const uint8* u_buf,
2042 + const uint8* v_buf,
2043 + uint8* rgb_buf,
2044 + int width);
2045 asm(
2046 ".text\n"
2047 - ".global FastConvertYUVToRGB32Row\n"
2048 -"FastConvertYUVToRGB32Row:\n"
2049 + ".global FastConvertYUVToRGB32Row_SSE\n"
2050 + ".type FastConvertYUVToRGB32Row_SSE, @function\n"
2051 +"FastConvertYUVToRGB32Row_SSE:\n"
2052 "pusha\n"
2053 "mov 0x24(%esp),%edx\n"
2054 "mov 0x28(%esp),%edi\n"
2055 "mov 0x2c(%esp),%esi\n"
2056 "mov 0x30(%esp),%ebp\n"
2057 "mov 0x34(%esp),%ecx\n"
2058 - "jmp convertend\n"
2060 -"convertloop:"
2061 + "jmp 1f\n"
2063 +"0:"
2064 "movzbl (%edi),%eax\n"
2065 "add $0x1,%edi\n"
2066 "movzbl (%esi),%ebx\n"
2067 "add $0x1,%esi\n"
2068 "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
2069 "movzbl (%edx),%eax\n"
2070 "paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
2071 "movzbl 0x1(%edx),%ebx\n"
2072 @@ -295,59 +294,77 @@ void FastConvertYUVToRGB32Row(const uint
2073 "movq kCoefficientsRgbY(,%ebx,8),%mm2\n"
2074 "paddsw %mm0,%mm1\n"
2075 "paddsw %mm0,%mm2\n"
2076 "psraw $0x6,%mm1\n"
2077 "psraw $0x6,%mm2\n"
2078 "packuswb %mm2,%mm1\n"
2079 "movntq %mm1,0x0(%ebp)\n"
2080 "add $0x8,%ebp\n"
2081 -"convertend:"
2082 +"1:"
2083 "sub $0x2,%ecx\n"
2084 - "jns convertloop\n"
2085 + "jns 0b\n"
2087 "and $0x1,%ecx\n"
2088 - "je convertdone\n"
2089 + "je 2f\n"
2091 "movzbl (%edi),%eax\n"
2092 "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
2093 "movzbl (%esi),%eax\n"
2094 "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
2095 "movzbl (%edx),%eax\n"
2096 "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
2097 "paddsw %mm0,%mm1\n"
2098 "psraw $0x6,%mm1\n"
2099 "packuswb %mm1,%mm1\n"
2100 "movd %mm1,0x0(%ebp)\n"
2101 -"convertdone:"
2102 +"2:"
2103 "popa\n"
2104 "ret\n"
2105 +#if !defined(XP_MACOSX)
2106 + ".previous\n"
2107 +#endif
2111 -void ScaleYUVToRGB32Row(const uint8* y_buf,
2112 - const uint8* u_buf,
2113 - const uint8* v_buf,
2114 - uint8* rgb_buf,
2115 - int width,
2116 - int source_dx);
2117 +void FastConvertYUVToRGB32Row(const uint8* y_buf,
2118 + const uint8* u_buf,
2119 + const uint8* v_buf,
2120 + uint8* rgb_buf,
2121 + int width)
2123 + if (mozilla::supports_sse()) {
2124 + FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width);
2125 + return;
2128 + FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
2132 +void ScaleYUVToRGB32Row_SSE(const uint8* y_buf,
2133 + const uint8* u_buf,
2134 + const uint8* v_buf,
2135 + uint8* rgb_buf,
2136 + int width,
2137 + int source_dx);
2138 asm(
2139 ".text\n"
2140 - ".global ScaleYUVToRGB32Row\n"
2141 -"ScaleYUVToRGB32Row:\n"
2142 + ".global ScaleYUVToRGB32Row_SSE\n"
2143 + ".type ScaleYUVToRGB32Row_SSE, @function\n"
2144 +"ScaleYUVToRGB32Row_SSE:\n"
2145 "pusha\n"
2146 "mov 0x24(%esp),%edx\n"
2147 "mov 0x28(%esp),%edi\n"
2148 "mov 0x2c(%esp),%esi\n"
2149 "mov 0x30(%esp),%ebp\n"
2150 "mov 0x34(%esp),%ecx\n"
2151 "xor %ebx,%ebx\n"
2152 - "jmp scaleend\n"
2154 -"scaleloop:"
2155 + "jmp 1f\n"
2157 +"0:"
2158 "mov %ebx,%eax\n"
2159 "sar $0x11,%eax\n"
2160 "movzbl (%edi,%eax,1),%eax\n"
2161 "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
2162 "mov %ebx,%eax\n"
2163 "sar $0x11,%eax\n"
2164 "movzbl (%esi,%eax,1),%eax\n"
2165 "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
2166 @@ -363,22 +380,22 @@ void ScaleYUVToRGB32Row(const uint8* y_b
2167 "movq kCoefficientsRgbY(,%eax,8),%mm2\n"
2168 "paddsw %mm0,%mm1\n"
2169 "paddsw %mm0,%mm2\n"
2170 "psraw $0x6,%mm1\n"
2171 "psraw $0x6,%mm2\n"
2172 "packuswb %mm2,%mm1\n"
2173 "movntq %mm1,0x0(%ebp)\n"
2174 "add $0x8,%ebp\n"
2175 -"scaleend:"
2176 +"1:"
2177 "sub $0x2,%ecx\n"
2178 - "jns scaleloop\n"
2179 + "jns 0b\n"
2181 "and $0x1,%ecx\n"
2182 - "je scaledone\n"
2183 + "je 2f\n"
2185 "mov %ebx,%eax\n"
2186 "sar $0x11,%eax\n"
2187 "movzbl (%edi,%eax,1),%eax\n"
2188 "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
2189 "mov %ebx,%eax\n"
2190 "sar $0x11,%eax\n"
2191 "movzbl (%esi,%eax,1),%eax\n"
2192 @@ -387,51 +404,71 @@ void ScaleYUVToRGB32Row(const uint8* y_b
2193 "sar $0x10,%eax\n"
2194 "movzbl (%edx,%eax,1),%eax\n"
2195 "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
2196 "paddsw %mm0,%mm1\n"
2197 "psraw $0x6,%mm1\n"
2198 "packuswb %mm1,%mm1\n"
2199 "movd %mm1,0x0(%ebp)\n"
2201 -"scaledone:"
2202 +"2:"
2203 "popa\n"
2204 "ret\n"
2205 +#if !defined(XP_MACOSX)
2206 + ".previous\n"
2207 +#endif
2210 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,
2211 - const uint8* u_buf,
2212 - const uint8* v_buf,
2213 - uint8* rgb_buf,
2214 - int width,
2215 - int source_dx);
2216 +void ScaleYUVToRGB32Row(const uint8* y_buf,
2217 + const uint8* u_buf,
2218 + const uint8* v_buf,
2219 + uint8* rgb_buf,
2220 + int width,
2221 + int source_dx)
2223 + if (mozilla::supports_sse()) {
2224 + ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf,
2225 + width, source_dx);
2228 + ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf,
2229 + width, source_dx);
2232 +void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
2233 + const uint8* u_buf,
2234 + const uint8* v_buf,
2235 + uint8* rgb_buf,
2236 + int width,
2237 + int source_dx);
2238 asm(
2239 ".text\n"
2240 - ".global LinearScaleYUVToRGB32Row\n"
2241 -"LinearScaleYUVToRGB32Row:\n"
2242 + ".global LinearScaleYUVToRGB32Row_SSE\n"
2243 + ".type LinearScaleYUVToRGB32Row_SSE, @function\n"
2244 +"LinearScaleYUVToRGB32Row_SSE:\n"
2245 "pusha\n"
2246 "mov 0x24(%esp),%edx\n"
2247 "mov 0x28(%esp),%edi\n"
2248 "mov 0x30(%esp),%ebp\n"
2250 // source_width = width * source_dx + ebx
2251 "mov 0x34(%esp), %ecx\n"
2252 "imull 0x38(%esp), %ecx\n"
2253 "mov %ecx, 0x34(%esp)\n"
2255 "mov 0x38(%esp), %ecx\n"
2256 "xor %ebx,%ebx\n" // x = 0
2257 "cmp $0x20000,%ecx\n" // if source_dx >= 2.0
2258 - "jl .lscaleend\n"
2259 + "jl 1f\n"
2260 "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less
2261 - "jmp .lscaleend\n"
2263 -".lscaleloop:"
2264 - "mov %ebx,%eax\n"
2265 - "sar $0x11,%eax\n"
2266 + "jmp 1f\n"
2268 +"0:"
2269 + "mov %ebx,%eax\n"
2270 + "sar $0x11,%eax\n"
2272 "movzbl (%edi,%eax,1),%ecx\n"
2273 "movzbl 1(%edi,%eax,1),%esi\n"
2274 "mov %ebx,%eax\n"
2275 "andl $0x1fffe, %eax \n"
2276 "imul %eax, %esi \n"
2277 "xorl $0x1fffe, %eax \n"
2278 "imul %eax, %ecx \n"
2279 @@ -464,17 +501,17 @@ void LinearScaleYUVToRGB32Row(const uint
2280 "imul %eax, %esi \n"
2281 "xorl $0xffff, %eax \n"
2282 "imul %eax, %ecx \n"
2283 "addl %esi, %ecx \n"
2284 "shrl $16, %ecx \n"
2285 "movq kCoefficientsRgbY(,%ecx,8),%mm1\n"
2287 "cmp 0x34(%esp), %ebx\n"
2288 - "jge .lscalelastpixel\n"
2289 + "jge 2f\n"
2291 "mov %ebx,%eax\n"
2292 "sar $0x10,%eax\n"
2293 "movzbl (%edx,%eax,1),%ecx\n"
2294 "movzbl 1(%edx,%eax,1),%esi\n"
2295 "mov %ebx,%eax\n"
2296 "add 0x38(%esp),%ebx\n"
2297 "andl $0xffff, %eax \n"
2298 @@ -488,56 +525,76 @@ void LinearScaleYUVToRGB32Row(const uint
2299 "paddsw %mm0,%mm1\n"
2300 "paddsw %mm0,%mm2\n"
2301 "psraw $0x6,%mm1\n"
2302 "psraw $0x6,%mm2\n"
2303 "packuswb %mm2,%mm1\n"
2304 "movntq %mm1,0x0(%ebp)\n"
2305 "add $0x8,%ebp\n"
2307 -".lscaleend:"
2308 +"1:"
2309 "cmp 0x34(%esp), %ebx\n"
2310 - "jl .lscaleloop\n"
2311 + "jl 0b\n"
2312 "popa\n"
2313 "ret\n"
2315 -".lscalelastpixel:"
2316 +"2:"
2317 "paddsw %mm0, %mm1\n"
2318 "psraw $6, %mm1\n"
2319 "packuswb %mm1, %mm1\n"
2320 "movd %mm1, (%ebp)\n"
2321 "popa\n"
2322 "ret\n"
2323 +#if !defined(XP_MACOSX)
2324 + ".previous\n"
2325 +#endif
2328 -#elif USE_MMX && !defined(ARCH_CPU_X86_64) && defined(__PIC__)
2330 -extern void PICConvertYUVToRGB32Row(const uint8* y_buf,
2331 - const uint8* u_buf,
2332 - const uint8* v_buf,
2333 - uint8* rgb_buf,
2334 - int width,
2335 - int16 *kCoefficientsRgbY);
2336 +void LinearScaleYUVToRGB32Row(const uint8* y_buf,
2337 + const uint8* u_buf,
2338 + const uint8* v_buf,
2339 + uint8* rgb_buf,
2340 + int width,
2341 + int source_dx)
2343 + if (mozilla::supports_sse()) {
2344 + LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf,
2345 + width, source_dx);
2348 + LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf,
2349 + width, source_dx);
2352 +#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && defined(__PIC__)
2354 +void PICConvertYUVToRGB32Row_SSE(const uint8* y_buf,
2355 + const uint8* u_buf,
2356 + const uint8* v_buf,
2357 + uint8* rgb_buf,
2358 + int width,
2359 + int16 *kCoefficientsRgbY);
2361 asm(
2362 ".text\n"
2363 -#if defined(OS_MACOSX)
2364 -"_PICConvertYUVToRGB32Row:\n"
2365 +#if defined(XP_MACOSX)
2366 +"_PICConvertYUVToRGB32Row_SSE:\n"
2367 #else
2368 -"PICConvertYUVToRGB32Row:\n"
2369 +"PICConvertYUVToRGB32Row_SSE:\n"
2370 #endif
2371 "pusha\n"
2372 "mov 0x24(%esp),%edx\n"
2373 "mov 0x28(%esp),%edi\n"
2374 "mov 0x2c(%esp),%esi\n"
2375 "mov 0x30(%esp),%ebp\n"
2376 "mov 0x38(%esp),%ecx\n"
2378 - "jmp .Lconvertend\n"
2380 -".Lconvertloop:"
2381 + "jmp 1f\n"
2383 +"0:"
2384 "movzbl (%edi),%eax\n"
2385 "add $0x1,%edi\n"
2386 "movzbl (%esi),%ebx\n"
2387 "add $0x1,%esi\n"
2388 "movq 2048(%ecx,%eax,8),%mm0\n"
2389 "movzbl (%edx),%eax\n"
2390 "paddsw 4096(%ecx,%ebx,8),%mm0\n"
2391 "movzbl 0x1(%edx),%ebx\n"
2392 @@ -546,72 +603,81 @@ extern void PICConvertYUVToRGB32Row(cons
2393 "movq 0(%ecx,%ebx,8),%mm2\n"
2394 "paddsw %mm0,%mm1\n"
2395 "paddsw %mm0,%mm2\n"
2396 "psraw $0x6,%mm1\n"
2397 "psraw $0x6,%mm2\n"
2398 "packuswb %mm2,%mm1\n"
2399 "movntq %mm1,0x0(%ebp)\n"
2400 "add $0x8,%ebp\n"
2401 -".Lconvertend:"
2402 +"1:"
2403 "subl $0x2,0x34(%esp)\n"
2404 - "jns .Lconvertloop\n"
2405 + "jns 0b\n"
2407 "andl $0x1,0x34(%esp)\n"
2408 - "je .Lconvertdone\n"
2409 + "je 2f\n"
2411 "movzbl (%edi),%eax\n"
2412 "movq 2048(%ecx,%eax,8),%mm0\n"
2413 "movzbl (%esi),%eax\n"
2414 "paddsw 4096(%ecx,%eax,8),%mm0\n"
2415 "movzbl (%edx),%eax\n"
2416 "movq 0(%ecx,%eax,8),%mm1\n"
2417 "paddsw %mm0,%mm1\n"
2418 "psraw $0x6,%mm1\n"
2419 "packuswb %mm1,%mm1\n"
2420 "movd %mm1,0x0(%ebp)\n"
2421 -".Lconvertdone:\n"
2422 +"2:"
2423 "popa\n"
2424 "ret\n"
2425 +#if !defined(XP_MACOSX)
2426 + ".previous\n"
2427 +#endif
2430 void FastConvertYUVToRGB32Row(const uint8* y_buf,
2431 const uint8* u_buf,
2432 const uint8* v_buf,
2433 uint8* rgb_buf,
2434 - int width) {
2435 - PICConvertYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width,
2436 - &kCoefficientsRgbY[0][0]);
2439 -extern void PICScaleYUVToRGB32Row(const uint8* y_buf,
2440 + int width)
2442 + if (mozilla::supports_sse()) {
2443 + PICConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
2444 + &kCoefficientsRgbY[0][0]);
2445 + return;
2448 + FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
2451 +void PICScaleYUVToRGB32Row_SSE(const uint8* y_buf,
2452 const uint8* u_buf,
2453 const uint8* v_buf,
2454 uint8* rgb_buf,
2455 int width,
2456 int source_dx,
2457 int16 *kCoefficientsRgbY);
2459 asm(
2460 ".text\n"
2461 -#if defined(OS_MACOSX)
2462 -"_PICScaleYUVToRGB32Row:\n"
2463 +#if defined(XP_MACOSX)
2464 +"_PICScaleYUVToRGB32Row_SSE:\n"
2465 #else
2466 -"PICScaleYUVToRGB32Row:\n"
2467 +"PICScaleYUVToRGB32Row_SSE:\n"
2468 #endif
2469 "pusha\n"
2470 "mov 0x24(%esp),%edx\n"
2471 "mov 0x28(%esp),%edi\n"
2472 "mov 0x2c(%esp),%esi\n"
2473 "mov 0x30(%esp),%ebp\n"
2474 "mov 0x3c(%esp),%ecx\n"
2475 "xor %ebx,%ebx\n"
2476 - "jmp Lscaleend\n"
2478 -"Lscaleloop:"
2479 + "jmp 1f\n"
2481 +"0:"
2482 "mov %ebx,%eax\n"
2483 "sar $0x11,%eax\n"
2484 "movzbl (%edi,%eax,1),%eax\n"
2485 "movq 2048(%ecx,%eax,8),%mm0\n"
2486 "mov %ebx,%eax\n"
2487 "sar $0x11,%eax\n"
2488 "movzbl (%esi,%eax,1),%eax\n"
2489 "paddsw 4096(%ecx,%eax,8),%mm0\n"
2490 @@ -627,22 +693,22 @@ extern void PICScaleYUVToRGB32Row(const
2491 "movq 0(%ecx,%eax,8),%mm2\n"
2492 "paddsw %mm0,%mm1\n"
2493 "paddsw %mm0,%mm2\n"
2494 "psraw $0x6,%mm1\n"
2495 "psraw $0x6,%mm2\n"
2496 "packuswb %mm2,%mm1\n"
2497 "movntq %mm1,0x0(%ebp)\n"
2498 "add $0x8,%ebp\n"
2499 -"Lscaleend:"
2500 +"1:"
2501 "subl $0x2,0x34(%esp)\n"
2502 - "jns Lscaleloop\n"
2503 + "jns 0b\n"
2505 "andl $0x1,0x34(%esp)\n"
2506 - "je Lscaledone\n"
2507 + "je 2f\n"
2509 "mov %ebx,%eax\n"
2510 "sar $0x11,%eax\n"
2511 "movzbl (%edi,%eax,1),%eax\n"
2512 "movq 2048(%ecx,%eax,8),%mm0\n"
2513 "mov %ebx,%eax\n"
2514 "sar $0x11,%eax\n"
2515 "movzbl (%esi,%eax,1),%eax\n"
2516 @@ -651,66 +717,75 @@ extern void PICScaleYUVToRGB32Row(const
2517 "sar $0x10,%eax\n"
2518 "movzbl (%edx,%eax,1),%eax\n"
2519 "movq 0(%ecx,%eax,8),%mm1\n"
2520 "paddsw %mm0,%mm1\n"
2521 "psraw $0x6,%mm1\n"
2522 "packuswb %mm1,%mm1\n"
2523 "movd %mm1,0x0(%ebp)\n"
2525 -"Lscaledone:"
2526 +"2:"
2527 "popa\n"
2528 "ret\n"
2529 +#if !defined(XP_MACOSX)
2530 + ".previous\n"
2531 +#endif
2535 void ScaleYUVToRGB32Row(const uint8* y_buf,
2536 const uint8* u_buf,
2537 const uint8* v_buf,
2538 uint8* rgb_buf,
2539 int width,
2540 - int source_dx) {
2541 - PICScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
2542 - &kCoefficientsRgbY[0][0]);
2545 -void PICLinearScaleYUVToRGB32Row(const uint8* y_buf,
2546 - const uint8* u_buf,
2547 - const uint8* v_buf,
2548 - uint8* rgb_buf,
2549 - int width,
2550 - int source_dx,
2551 - int16 *kCoefficientsRgbY);
2552 + int source_dx)
2554 + if (mozilla::supports_sse()) {
2555 + PICScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
2556 + &kCoefficientsRgbY[0][0]);
2557 + return;
2560 + ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
2563 +void PICLinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
2564 + const uint8* u_buf,
2565 + const uint8* v_buf,
2566 + uint8* rgb_buf,
2567 + int width,
2568 + int source_dx,
2569 + int16 *kCoefficientsRgbY);
2571 asm(
2572 ".text\n"
2573 -#if defined(OS_MACOSX)
2574 -"_PICLinearScaleYUVToRGB32Row:\n"
2575 +#if defined(XP_MACOSX)
2576 +"_PICLinearScaleYUVToRGB32Row_SSE:\n"
2577 #else
2578 -"PICLinearScaleYUVToRGB32Row:\n"
2579 +"PICLinearScaleYUVToRGB32Row_SSE:\n"
2580 #endif
2581 "pusha\n"
2582 "mov 0x24(%esp),%edx\n"
2583 "mov 0x30(%esp),%ebp\n"
2584 "mov 0x34(%esp),%ecx\n"
2585 "mov 0x3c(%esp),%edi\n"
2586 "xor %ebx,%ebx\n"
2588 // source_width = width * source_dx + ebx
2589 "mov 0x34(%esp), %ecx\n"
2590 "imull 0x38(%esp), %ecx\n"
2591 "mov %ecx, 0x34(%esp)\n"
2593 "mov 0x38(%esp), %ecx\n"
2594 "xor %ebx,%ebx\n" // x = 0
2595 "cmp $0x20000,%ecx\n" // if source_dx >= 2.0
2596 - "jl .lscaleend\n"
2597 + "jl 1f\n"
2598 "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less
2599 - "jmp .lscaleend\n"
2601 -".lscaleloop:"
2602 + "jmp 1f\n"
2604 +"0:"
2605 "mov 0x28(%esp),%esi\n"
2606 "mov %ebx,%eax\n"
2607 "sar $0x11,%eax\n"
2609 "movzbl (%esi,%eax,1),%ecx\n"
2610 "movzbl 1(%esi,%eax,1),%esi\n"
2611 "mov %ebx,%eax\n"
2612 "andl $0x1fffe, %eax \n"
2613 @@ -746,17 +821,17 @@ void PICLinearScaleYUVToRGB32Row(const u
2614 "imul %eax, %esi \n"
2615 "xorl $0xffff, %eax \n"
2616 "imul %eax, %ecx \n"
2617 "addl %esi, %ecx \n"
2618 "shrl $16, %ecx \n"
2619 "movq (%edi,%ecx,8),%mm1\n"
2621 "cmp 0x34(%esp), %ebx\n"
2622 - "jge .lscalelastpixel\n"
2623 + "jge 2f\n"
2625 "mov %ebx,%eax\n"
2626 "sar $0x10,%eax\n"
2627 "movzbl (%edx,%eax,1),%ecx\n"
2628 "movzbl 1(%edx,%eax,1),%esi\n"
2629 "mov %ebx,%eax\n"
2630 "add 0x38(%esp),%ebx\n"
2631 "andl $0xffff, %eax \n"
2632 @@ -770,154 +845,71 @@ void PICLinearScaleYUVToRGB32Row(const u
2633 "paddsw %mm0,%mm1\n"
2634 "paddsw %mm0,%mm2\n"
2635 "psraw $0x6,%mm1\n"
2636 "psraw $0x6,%mm2\n"
2637 "packuswb %mm2,%mm1\n"
2638 "movntq %mm1,0x0(%ebp)\n"
2639 "add $0x8,%ebp\n"
2641 -".lscaleend:"
2642 +"1:"
2643 "cmp %ebx, 0x34(%esp)\n"
2644 - "jg .lscaleloop\n"
2645 + "jg 0b\n"
2646 "popa\n"
2647 "ret\n"
2649 -".lscalelastpixel:"
2650 +"2:"
2651 "paddsw %mm0, %mm1\n"
2652 "psraw $6, %mm1\n"
2653 "packuswb %mm1, %mm1\n"
2654 "movd %mm1, (%ebp)\n"
2655 "popa\n"
2656 "ret\n"
2657 +#if !defined(XP_MACOSX)
2658 + ".previous\n"
2659 +#endif
2663 void LinearScaleYUVToRGB32Row(const uint8* y_buf,
2664 - const uint8* u_buf,
2665 - const uint8* v_buf,
2666 - uint8* rgb_buf,
2667 - int width,
2668 - int source_dx) {
2669 - PICLinearScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
2670 - &kCoefficientsRgbY[0][0]);
2673 -#else // USE_MMX
2675 -// C reference code that mimic the YUV assembly.
2676 -#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
2677 -#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
2678 - (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
2680 -static inline void YuvPixel(uint8 y,
2681 - uint8 u,
2682 - uint8 v,
2683 - uint8* rgb_buf) {
2685 - int b = kCoefficientsRgbY[256+u][0];
2686 - int g = kCoefficientsRgbY[256+u][1];
2687 - int r = kCoefficientsRgbY[256+u][2];
2688 - int a = kCoefficientsRgbY[256+u][3];
2690 - b = paddsw(b, kCoefficientsRgbY[512+v][0]);
2691 - g = paddsw(g, kCoefficientsRgbY[512+v][1]);
2692 - r = paddsw(r, kCoefficientsRgbY[512+v][2]);
2693 - a = paddsw(a, kCoefficientsRgbY[512+v][3]);
2695 - b = paddsw(b, kCoefficientsRgbY[y][0]);
2696 - g = paddsw(g, kCoefficientsRgbY[y][1]);
2697 - r = paddsw(r, kCoefficientsRgbY[y][2]);
2698 - a = paddsw(a, kCoefficientsRgbY[y][3]);
2700 - b >>= 6;
2701 - g >>= 6;
2702 - r >>= 6;
2703 - a >>= 6;
2705 - *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) |
2706 - (packuswb(g) << 8) |
2707 - (packuswb(r) << 16) |
2708 - (packuswb(a) << 24);
2711 + const uint8* u_buf,
2712 + const uint8* v_buf,
2713 + uint8* rgb_buf,
2714 + int width,
2715 + int source_dx)
2717 + if (mozilla::supports_sse()) {
2718 + PICLinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
2719 + source_dx, &kCoefficientsRgbY[0][0]);
2720 + return;
2723 + LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
2725 +#else
2726 void FastConvertYUVToRGB32Row(const uint8* y_buf,
2727 const uint8* u_buf,
2728 const uint8* v_buf,
2729 uint8* rgb_buf,
2730 int width) {
2731 - for (int x = 0; x < width; x += 2) {
2732 - uint8 u = u_buf[x >> 1];
2733 - uint8 v = v_buf[x >> 1];
2734 - uint8 y0 = y_buf[x];
2735 - YuvPixel(y0, u, v, rgb_buf);
2736 - if ((x + 1) < width) {
2737 - uint8 y1 = y_buf[x + 1];
2738 - YuvPixel(y1, u, v, rgb_buf + 4);
2740 - rgb_buf += 8; // Advance 2 pixels.
2744 -// 16.16 fixed point is used. A shift by 16 isolates the integer.
2745 -// A shift by 17 is used to further subsample the chrominence channels.
2746 -// & 0xffff isolates the fixed point fraction. >> 2 to get the upper 2 bits,
2747 -// for 1/65536 pixel accurate interpolation.
2748 + FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
2751 void ScaleYUVToRGB32Row(const uint8* y_buf,
2752 const uint8* u_buf,
2753 const uint8* v_buf,
2754 uint8* rgb_buf,
2755 int width,
2756 int source_dx) {
2757 - int x = 0;
2758 - for (int i = 0; i < width; i += 2) {
2759 - int y = y_buf[x >> 16];
2760 - int u = u_buf[(x >> 17)];
2761 - int v = v_buf[(x >> 17)];
2762 - YuvPixel(y, u, v, rgb_buf);
2763 - x += source_dx;
2764 - if ((i + 1) < width) {
2765 - y = y_buf[x >> 16];
2766 - YuvPixel(y, u, v, rgb_buf+4);
2767 - x += source_dx;
2769 - rgb_buf += 8;
2772 + ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
2775 void LinearScaleYUVToRGB32Row(const uint8* y_buf,
2776 const uint8* u_buf,
2777 const uint8* v_buf,
2778 uint8* rgb_buf,
2779 int width,
2780 int source_dx) {
2781 - int x = 0;
2782 - if (source_dx >= 0x20000) {
2783 - x = 32768;
2785 - for (int i = 0; i < width; i += 2) {
2786 - int y0 = y_buf[x >> 16];
2787 - int y1 = y_buf[(x >> 16) + 1];
2788 - int u0 = u_buf[(x >> 17)];
2789 - int u1 = u_buf[(x >> 17) + 1];
2790 - int v0 = v_buf[(x >> 17)];
2791 - int v1 = v_buf[(x >> 17) + 1];
2792 - int y_frac = (x & 65535);
2793 - int uv_frac = ((x >> 1) & 65535);
2794 - int y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
2795 - int u = (uv_frac * u1 + (uv_frac ^ 65535) * u0) >> 16;
2796 - int v = (uv_frac * v1 + (uv_frac ^ 65535) * v0) >> 16;
2797 - YuvPixel(y, u, v, rgb_buf);
2798 - x += source_dx;
2799 - if ((i + 1) < width) {
2800 - y0 = y_buf[x >> 16];
2801 - y1 = y_buf[(x >> 16) + 1];
2802 - y_frac = (x & 65535);
2803 - y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
2804 - YuvPixel(y, u, v, rgb_buf+4);
2805 - x += source_dx;
2807 - rgb_buf += 8;
2811 -#endif // USE_MMX
2812 -} // extern "C"
2814 + LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
2816 +#endif
2819 diff --git a/gfx/ycbcr/yuv_row_table.cpp b/gfx/ycbcr/yuv_row_table.cpp
2820 --- a/gfx/ycbcr/yuv_row_table.cpp
2821 +++ b/gfx/ycbcr/yuv_row_table.cpp
2822 @@ -1,13 +1,13 @@
2823 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2824 // Use of this source code is governed by a BSD-style license that can be
2825 // found in the LICENSE file.
2827 -#include "media/base/yuv_row.h"
2828 +#include "yuv_row.h"
2830 extern "C" {
2832 #define RGBY(i) { \
2833 static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
2834 static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
2835 static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
2837 diff --git a/gfx/ycbcr/yuv_row_win.cpp b/gfx/ycbcr/yuv_row_win.cpp
2838 --- a/gfx/ycbcr/yuv_row_win.cpp
2839 +++ b/gfx/ycbcr/yuv_row_win.cpp
2840 @@ -1,26 +1,27 @@
2841 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2842 // Use of this source code is governed by a BSD-style license that can be
2843 // found in the LICENSE file.
2845 -#include "media/base/yuv_row.h"
2846 +#include "yuv_row.h"
2847 +#include "mozilla/SSE.h"
2849 #define kCoefficientsRgbU kCoefficientsRgbY + 2048
2850 #define kCoefficientsRgbV kCoefficientsRgbY + 4096
2852 extern "C" {
2854 -#if USE_MMX
2855 -__declspec(naked)
2856 -void FastConvertYUVToRGB32Row(const uint8* y_buf,
2857 - const uint8* u_buf,
2858 - const uint8* v_buf,
2859 - uint8* rgb_buf,
2860 - int width) {
2861 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
2862 +__declspec(naked)
2863 +void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf,
2864 + const uint8* u_buf,
2865 + const uint8* v_buf,
2866 + uint8* rgb_buf,
2867 + int width) {
2868 __asm {
2869 pushad
2870 mov edx, [esp + 32 + 4] // Y
2871 mov edi, [esp + 32 + 8] // U
2872 mov esi, [esp + 32 + 12] // V
2873 mov ebp, [esp + 32 + 16] // rgb
2874 mov ecx, [esp + 32 + 20] // width
2875 jmp convertend
2876 @@ -64,22 +65,22 @@ void FastConvertYUVToRGB32Row(const uint
2877 convertdone :
2879 popad
2884 __declspec(naked)
2885 -void ConvertYUVToRGB32Row(const uint8* y_buf,
2886 - const uint8* u_buf,
2887 - const uint8* v_buf,
2888 - uint8* rgb_buf,
2889 - int width,
2890 - int step) {
2891 +void ConvertYUVToRGB32Row_SSE(const uint8* y_buf,
2892 + const uint8* u_buf,
2893 + const uint8* v_buf,
2894 + uint8* rgb_buf,
2895 + int width,
2896 + int step) {
2897 __asm {
2898 pushad
2899 mov edx, [esp + 32 + 4] // Y
2900 mov edi, [esp + 32 + 8] // U
2901 mov esi, [esp + 32 + 12] // V
2902 mov ebp, [esp + 32 + 16] // rgb
2903 mov ecx, [esp + 32 + 20] // width
2904 mov ebx, [esp + 32 + 24] // step
2905 @@ -125,23 +126,23 @@ void ConvertYUVToRGB32Row(const uint8* y
2906 wdone :
2908 popad
2913 __declspec(naked)
2914 -void RotateConvertYUVToRGB32Row(const uint8* y_buf,
2915 - const uint8* u_buf,
2916 - const uint8* v_buf,
2917 - uint8* rgb_buf,
2918 - int width,
2919 - int ystep,
2920 - int uvstep) {
2921 +void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf,
2922 + const uint8* u_buf,
2923 + const uint8* v_buf,
2924 + uint8* rgb_buf,
2925 + int width,
2926 + int ystep,
2927 + int uvstep) {
2928 __asm {
2929 pushad
2930 mov edx, [esp + 32 + 4] // Y
2931 mov edi, [esp + 32 + 8] // U
2932 mov esi, [esp + 32 + 12] // V
2933 mov ebp, [esp + 32 + 16] // rgb
2934 mov ecx, [esp + 32 + 20] // width
2935 jmp wend
2936 @@ -188,21 +189,21 @@ void RotateConvertYUVToRGB32Row(const ui
2937 wdone :
2939 popad
2944 __declspec(naked)
2945 -void DoubleYUVToRGB32Row(const uint8* y_buf,
2946 - const uint8* u_buf,
2947 - const uint8* v_buf,
2948 - uint8* rgb_buf,
2949 - int width) {
2950 +void DoubleYUVToRGB32Row_SSE(const uint8* y_buf,
2951 + const uint8* u_buf,
2952 + const uint8* v_buf,
2953 + uint8* rgb_buf,
2954 + int width) {
2955 __asm {
2956 pushad
2957 mov edx, [esp + 32 + 4] // Y
2958 mov edi, [esp + 32 + 8] // U
2959 mov esi, [esp + 32 + 12] // V
2960 mov ebp, [esp + 32 + 16] // rgb
2961 mov ecx, [esp + 32 + 20] // width
2962 jmp wend
2963 @@ -256,26 +257,26 @@ void DoubleYUVToRGB32Row(const uint8* y_
2964 jns wloop1
2965 wdone :
2966 popad
2971 // This version does general purpose scaling by any amount, up or down.
2972 -// The only thing it can not do it rotation by 90 or 270.
2973 -// For performance the chroma is under sampled, reducing cost of a 3x
2974 +// The only thing it cannot do is rotation by 90 or 270.
2975 +// For performance the chroma is under-sampled, reducing cost of a 3x
2976 // 1080p scale from 8.4 ms to 5.4 ms.
2977 __declspec(naked)
2978 -void ScaleYUVToRGB32Row(const uint8* y_buf,
2979 - const uint8* u_buf,
2980 - const uint8* v_buf,
2981 - uint8* rgb_buf,
2982 - int width,
2983 - int source_dx) {
2984 +void ScaleYUVToRGB32Row_SSE(const uint8* y_buf,
2985 + const uint8* u_buf,
2986 + const uint8* v_buf,
2987 + uint8* rgb_buf,
2988 + int width,
2989 + int source_dx) {
2990 __asm {
2991 pushad
2992 mov edx, [esp + 32 + 4] // Y
2993 mov edi, [esp + 32 + 8] // U
2994 mov esi, [esp + 32 + 12] // V
2995 mov ebp, [esp + 32 + 16] // rgb
2996 mov ecx, [esp + 32 + 20] // width
2997 xor ebx, ebx // x
2998 @@ -333,22 +334,22 @@ void ScaleYUVToRGB32Row(const uint8* y_b
3000 scaledone :
3001 popad
3006 __declspec(naked)
3007 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,
3008 - const uint8* u_buf,
3009 - const uint8* v_buf,
3010 - uint8* rgb_buf,
3011 - int width,
3012 - int source_dx) {
3013 +void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
3014 + const uint8* u_buf,
3015 + const uint8* v_buf,
3016 + uint8* rgb_buf,
3017 + int width,
3018 + int source_dx) {
3019 __asm {
3020 pushad
3021 mov edx, [esp + 32 + 4] // Y
3022 mov edi, [esp + 32 + 8] // U
3023 // [esp + 32 + 12] // V
3024 mov ebp, [esp + 32 + 16] // rgb
3025 mov ecx, [esp + 32 + 20] // width
3026 imul ecx, [esp + 32 + 24] // source_dx
3027 @@ -438,152 +439,60 @@ lscalelastpixel:
3028 paddsw mm1, mm0
3029 psraw mm1, 6
3030 packuswb mm1, mm1
3031 movd [ebp], mm1
3032 popad
3036 -#else // USE_MMX
3038 -// C reference code that mimic the YUV assembly.
3039 -#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
3040 -#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
3041 - (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
3043 -static inline void YuvPixel(uint8 y,
3044 - uint8 u,
3045 - uint8 v,
3046 - uint8* rgb_buf) {
3048 - int b = kCoefficientsRgbY[256+u][0];
3049 - int g = kCoefficientsRgbY[256+u][1];
3050 - int r = kCoefficientsRgbY[256+u][2];
3051 - int a = kCoefficientsRgbY[256+u][3];
3053 - b = paddsw(b, kCoefficientsRgbY[512+v][0]);
3054 - g = paddsw(g, kCoefficientsRgbY[512+v][1]);
3055 - r = paddsw(r, kCoefficientsRgbY[512+v][2]);
3056 - a = paddsw(a, kCoefficientsRgbY[512+v][3]);
3058 - b = paddsw(b, kCoefficientsRgbY[y][0]);
3059 - g = paddsw(g, kCoefficientsRgbY[y][1]);
3060 - r = paddsw(r, kCoefficientsRgbY[y][2]);
3061 - a = paddsw(a, kCoefficientsRgbY[y][3]);
3063 - b >>= 6;
3064 - g >>= 6;
3065 - r >>= 6;
3066 - a >>= 6;
3068 - *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) |
3069 - (packuswb(g) << 8) |
3070 - (packuswb(r) << 16) |
3071 - (packuswb(a) << 24);
3074 -#if TEST_MMX_YUV
3075 -static inline void YuvPixel(uint8 y,
3076 - uint8 u,
3077 - uint8 v,
3078 - uint8* rgb_buf) {
3080 - __asm {
3081 - movzx eax, u
3082 - movq mm0, [kCoefficientsRgbY+2048 + 8 * eax]
3083 - movzx eax, v
3084 - paddsw mm0, [kCoefficientsRgbY+4096 + 8 * eax]
3085 - movzx eax, y
3086 - movq mm1, [kCoefficientsRgbY + 8 * eax]
3087 - paddsw mm1, mm0
3088 - psraw mm1, 6
3089 - packuswb mm1, mm1
3090 - mov eax, rgb_buf
3091 - movd [eax], mm1
3092 - emms
3095 -#endif
3096 +#endif // if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
3098 void FastConvertYUVToRGB32Row(const uint8* y_buf,
3099 const uint8* u_buf,
3100 const uint8* v_buf,
3101 uint8* rgb_buf,
3102 int width) {
3103 - for (int x = 0; x < width; x += 2) {
3104 - uint8 u = u_buf[x >> 1];
3105 - uint8 v = v_buf[x >> 1];
3106 - uint8 y0 = y_buf[x];
3107 - YuvPixel(y0, u, v, rgb_buf);
3108 - if ((x + 1) < width) {
3109 - uint8 y1 = y_buf[x + 1];
3110 - YuvPixel(y1, u, v, rgb_buf + 4);
3112 - rgb_buf += 8; // Advance 2 pixels.
3116 -// 16.16 fixed point is used. A shift by 16 isolates the integer.
3117 -// A shift by 17 is used to further subsample the chrominence channels.
3118 -// & 0xffff isolates the fixed point fraction. >> 2 to get the upper 2 bits,
3119 -// for 1/65536 pixel accurate interpolation.
3120 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
3121 + if (mozilla::supports_sse()) {
3122 + FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width);
3123 + return;
3125 +#endif
3127 + FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
3130 void ScaleYUVToRGB32Row(const uint8* y_buf,
3131 const uint8* u_buf,
3132 const uint8* v_buf,
3133 uint8* rgb_buf,
3134 int width,
3135 int source_dx) {
3136 - int x = 0;
3137 - for (int i = 0; i < width; i += 2) {
3138 - int y = y_buf[x >> 16];
3139 - int u = u_buf[(x >> 17)];
3140 - int v = v_buf[(x >> 17)];
3141 - YuvPixel(y, u, v, rgb_buf);
3142 - x += source_dx;
3143 - if ((i + 1) < width) {
3144 - y = y_buf[x >> 16];
3145 - YuvPixel(y, u, v, rgb_buf+4);
3146 - x += source_dx;
3148 - rgb_buf += 8;
3152 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
3153 + if (mozilla::supports_sse()) {
3154 + ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
3155 + return;
3157 +#endif
3159 + ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
3162 void LinearScaleYUVToRGB32Row(const uint8* y_buf,
3163 const uint8* u_buf,
3164 const uint8* v_buf,
3165 uint8* rgb_buf,
3166 int width,
3167 int source_dx) {
3168 - int x = 0;
3169 - if (source_dx >= 0x20000) {
3170 - x = 32768;
3172 - for (int i = 0; i < width; i += 2) {
3173 - int y0 = y_buf[x >> 16];
3174 - int y1 = y_buf[(x >> 16) + 1];
3175 - int u0 = u_buf[(x >> 17)];
3176 - int u1 = u_buf[(x >> 17) + 1];
3177 - int v0 = v_buf[(x >> 17)];
3178 - int v1 = v_buf[(x >> 17) + 1];
3179 - int y_frac = (x & 65535);
3180 - int uv_frac = ((x >> 1) & 65535);
3181 - int y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
3182 - int u = (uv_frac * u1 + (uv_frac ^ 65535) * u0) >> 16;
3183 - int v = (uv_frac * v1 + (uv_frac ^ 65535) * v0) >> 16;
3184 - YuvPixel(y, u, v, rgb_buf);
3185 - x += source_dx;
3186 - if ((i + 1) < width) {
3187 - y0 = y_buf[x >> 16];
3188 - y1 = y_buf[(x >> 16) + 1];
3189 - y_frac = (x & 65535);
3190 - y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
3191 - YuvPixel(y, u, v, rgb_buf+4);
3192 - x += source_dx;
3194 - rgb_buf += 8;
3198 -#endif // USE_MMX
3199 -} // extern "C"
3201 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
3202 + if (mozilla::supports_sse()) {
3203 + LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
3204 + source_dx);
3205 + return;
3207 +#endif
3209 + LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
3212 +} // extern "C"