/* Provenance (gitweb): 2009-06-17 Jeffrey Stedfast <fejj@novell.com>
 * [moon.git] / src / yuv-converter.cpp
 * blob a5ec2980ee2f3117e043719685dc7f6deeb3f1ee
 */
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
/*
 * yuv-converter.cpp: YUV2RGB converters for the pipeline
 *
 * Contact:
 *   Moonlight List (moonlight-list@lists.ximian.com)
 *
 * Copyright 2008 Novell, Inc. (http://www.novell.com)
 *
 * See the LICENSE file included with the distribution for details.
 *
 */
14 #include <config.h>
16 #include <glib.h>
18 #include <stdlib.h>
20 #include "yuv-converter.h"
/* R = 1.164 * (Y - 16) + 1.596 * (V - 128)
 * G = 1.164 * (Y - 16) - 0.813 * (V - 128) - 0.391 * (U - 128)
 * B = 1.164 * (Y - 16) + 2.018 * (U - 128)
 *
 * The SIMD code uses these coefficients scaled by 64 and shifts each
 * product right by 6 afterwards:
 *
 * R V coefficient = 1.596*64 = 102 = 0x66
 * G V coefficient = 0.813*64 = 52 = 0x34 (-ve) == 0xFFCC
 * G U coefficient = 0.391*64 = 25 = 0x19 (-ve) == 0xFFE7
 * B U coefficient = 2.018*64 = 129 = 0x81
 * Y coefficient = 1.164*64 = 74 = 0x4a
 *
 * Each constant below is one 64-bit lane.  The coefficients and UV_128
 * are replicated per 16-bit word (they are used with word-wise
 * pmullw/psubsw); Y_16 is replicated per byte (it is used with the
 * byte-wise psubusb); ALPHA_MASK is all ones and supplies the 0xFF alpha
 * bytes that get interleaved into the output.
 */
#define RED_V_C    0x0066006600660066ULL
#define GREEN_V_C  0xffccffccffccffccULL
#define GREEN_U_C  0xffe7ffe7ffe7ffe7ULL
#define BLUE_U_C   0x0081008100810081ULL
#define Y_C        0x004a004a004a004aULL
#define UV_128     0x0080008000800080ULL
#define Y_16       0x1010101010101010ULL
#define ALPHA_MASK 0xFFFFFFFFFFFFFFFFULL
42 #if HAVE_SSE2 || HAVE_MMX
43 static const guint64 simd_table [16] __attribute__ ((aligned (16))) = {
44 RED_V_C, RED_V_C,
45 GREEN_V_C, GREEN_V_C,
46 GREEN_U_C, GREEN_U_C,
47 BLUE_U_C, BLUE_U_C,
48 Y_C, Y_C,
49 UV_128, UV_128,
50 Y_16, Y_16,
51 ALPHA_MASK, ALPHA_MASK,
/*
 * PREFETCH(memory): issue a prefetchnta hint for the address held in
 * `memory`.
 *
 * The macro expands to a single statement WITHOUT a trailing semicolon,
 * so that `if (c) PREFETCH (p); else ...` parses as intended; the caller
 * supplies the semicolon.
 */
#define PREFETCH(memory) do {				\
	__asm__ __volatile__ (				\
		"prefetchnta (%0);"			\
		: : "r" (memory));			\
} while (0)
/*
 * Name of the scratch general-purpose register that
 * CALC_COLOR_MODIFIERS uses for its pointer-alignment test: the
 * full-width accumulator on x86-64, the 32-bit one otherwise.
 */
#if defined(__x86_64__)
#define ALIGN_CMP_REG "rax"
#else
#define ALIGN_CMP_REG "eax"
#endif
/*
 * CALC_COLOR_MODIFIERS: leave the chroma-derived colour offsets for the
 * next group of pixels in SIMD registers r2 (Dred), r3 (Dgreen) and
 * r1 (Dblue), where rN means reg_type followed by N, and save a copy at
 * offsets 0/16/32 of coeff_storage for RESTORE_COLOR_MODIFIERS.
 *
 *   mov_instr     full-register move mnemonic ("movdqa" / "movq")
 *   reg_type      register name prefix ("xmm" / "mm")
 *   alignment     decimal string mask ANDed with the U pointer ("15" / "7")
 *   align_reg     scratch general-purpose register name (clobbered)
 *   u, v          pointers into the U and V planes
 *   coeff_storage scratch buffer; offsets 0..80 are accessed
 *
 * When (u & alignment) == 0 a full register of U and of V is loaded and
 * the offsets for both the low and the high half are computed: the low
 * half is left in r1..r3 and stored at 0/16/32, the high half is stored
 * at 48/64/80.  Otherwise nothing is loaded from u/v: the high half
 * saved by an earlier invocation is reloaded from 48/64/80 and copied to
 * 0/16/32.  The scheme therefore relies on the caller alternating
 * between an aligned U pointer and one half a register further on.
 *
 * r1..r7 are overwritten but deliberately not listed as clobbers, because
 * r1..r3 carry the result into the following YUV2RGB_* asm statement.
 * The "memory" clobber tells the compiler that coeff_storage is written.
 *
 * Expands to one statement without a trailing semicolon.  The string
 * pieces are separated from the macro arguments by spaces so that they
 * are not lexed as C++11 user-defined literal suffixes.
 */
#define CALC_COLOR_MODIFIERS(mov_instr, reg_type, alignment, align_reg, u, v, coeff_storage) do { \
	__asm__ __volatile__ ( \
		"mov %0, %%" align_reg ";" \
		"and $" alignment ", %%" align_reg ";" \
		"test %%" align_reg ", %%" align_reg ";" \
		"je 1f;" \
		\
		mov_instr " 48(%2), %%" reg_type "2;"		/* restore Dred[hi] */ \
		mov_instr " 64(%2), %%" reg_type "3;"		/* restore Dgreen[hi] */ \
		mov_instr " 80(%2), %%" reg_type "1;"		/* restore Dblue[hi] */ \
		\
		mov_instr " %%" reg_type "2, (%2);"		/* backup Dred */ \
		mov_instr " %%" reg_type "3, 16(%2);"		/* backup Dgreen */ \
		mov_instr " %%" reg_type "1, 32(%2);"		/* backup Dblue */ \
		\
		"jmp 2f;" \
		\
		"1:" \
		"pxor %%" reg_type "7, %%" reg_type "7;"	/* r7 = 0 */ \
		\
		mov_instr " (%0), %%" reg_type "1;"		/* r1 = U bytes */ \
		mov_instr " (%1), %%" reg_type "2;"		/* r2 = V bytes */ \
		\
		mov_instr " %%" reg_type "1, %%" reg_type "5;" \
		mov_instr " %%" reg_type "2, %%" reg_type "6;" \
		\
		"punpckhbw %%" reg_type "7, %%" reg_type "5;"	/* r5 = U[hi] as words */ \
		"punpckhbw %%" reg_type "7, %%" reg_type "6;"	/* r6 = V[hi] as words */ \
		\
		"punpcklbw %%" reg_type "7, %%" reg_type "1;"	/* r1 = U[lo] as words */ \
		"punpcklbw %%" reg_type "7, %%" reg_type "2;"	/* r2 = V[lo] as words */ \
		\
		mov_instr " 80(%3), %%" reg_type "7;"		/* r7 = 128 */ \
		\
		"psubsw %%" reg_type "7, %%" reg_type "5;"	/* U[hi] = U[hi] - 128 */ \
		"psubsw %%" reg_type "7, %%" reg_type "6;"	/* V[hi] = V[hi] - 128 */ \
		\
		"psubsw %%" reg_type "7, %%" reg_type "1;"	/* U[lo] = U[lo] - 128 */ \
		"psubsw %%" reg_type "7, %%" reg_type "2;"	/* V[lo] = V[lo] - 128 */ \
		\
		mov_instr " %%" reg_type "5, %%" reg_type "3;" \
		mov_instr " %%" reg_type "6, %%" reg_type "4;" \
		\
		mov_instr " 32(%3), %%" reg_type "7;" \
		"pmullw %%" reg_type "7, %%" reg_type "3;"	/* calculate Ugreen[hi] */ \
		"psraw $6, %%" reg_type "3;"			/* Ugreen[hi] = Ugreen[hi] / 64 */ \
		mov_instr " 16(%3), %%" reg_type "7;" \
		"pmullw %%" reg_type "7, %%" reg_type "4;"	/* calculate Vgreen[hi] */ \
		"psraw $6, %%" reg_type "4;"			/* Vgreen[hi] = Vgreen[hi] / 64 */ \
		"paddsw %%" reg_type "4, %%" reg_type "3;"	/* Dgreen[hi] = Ugreen[hi] + Vgreen[hi] */ \
		\
		mov_instr " %%" reg_type "3, 64(%2);"		/* backup Dgreen[hi] (r3 is reused below) */ \
		\
		mov_instr " %%" reg_type "1, %%" reg_type "3;" \
		mov_instr " %%" reg_type "2, %%" reg_type "4;" \
		\
		mov_instr " 32(%3), %%" reg_type "7;" \
		"pmullw %%" reg_type "7, %%" reg_type "3;"	/* calculate Ugreen[lo] */ \
		"psraw $6, %%" reg_type "3;"			/* Ugreen[lo] = Ugreen[lo] / 64 */ \
		mov_instr " 16(%3), %%" reg_type "7;" \
		"pmullw %%" reg_type "7, %%" reg_type "4;"	/* calculate Vgreen[lo] */ \
		"psraw $6, %%" reg_type "4;"			/* Vgreen[lo] = Vgreen[lo] / 64 */ \
		"paddsw %%" reg_type "4, %%" reg_type "3;"	/* Dgreen[lo] = Ugreen[lo] + Vgreen[lo] */ \
		\
		mov_instr " 48(%3), %%" reg_type "7;" \
		"pmullw %%" reg_type "7, %%" reg_type "5;"	/* calculate Dblue[hi] */ \
		"psraw $6, %%" reg_type "5;"			/* Dblue[hi] = Dblue[hi] / 64 */ \
		"pmullw %%" reg_type "7, %%" reg_type "1;"	/* calculate Dblue[lo] */ \
		"psraw $6, %%" reg_type "1;"			/* Dblue[lo] = Dblue[lo] / 64 */ \
		\
		mov_instr " (%3), %%" reg_type "7;" \
		"pmullw %%" reg_type "7, %%" reg_type "6;"	/* calculate Dred[hi] */ \
		"psraw $6, %%" reg_type "6;"			/* Dred[hi] = Dred[hi] / 64 */ \
		"pmullw %%" reg_type "7, %%" reg_type "2;"	/* calculate Dred[lo] */ \
		"psraw $6, %%" reg_type "2;"			/* Dred[lo] = Dred[lo] / 64 */ \
		\
		mov_instr " %%" reg_type "6, 48(%2);"		/* backup Dred[hi] */ \
		mov_instr " %%" reg_type "5, 80(%2);"		/* backup Dblue[hi] */ \
		\
		mov_instr " %%" reg_type "2, 0(%2);"		/* backup Dred[lo] */ \
		mov_instr " %%" reg_type "3, 16(%2);"		/* backup Dgreen[lo] */ \
		mov_instr " %%" reg_type "1, 32(%2);"		/* backup Dblue[lo] */ \
		"2:" \
		: : "r" (u), "r" (v), "r" (coeff_storage), "r" (&simd_table) : "%" align_reg, "memory"); \
} while (0)
152 #define RESTORE_COLOR_MODIFIERS(mov_instr, reg_type, coeff_storage) do { \
153 __asm__ __volatile__ ( \
154 mov_instr " (%0), %%"reg_type"2;" /* restore Dred */ \
155 mov_instr " 16(%0), %%"reg_type"3;" /* restore Dgreen */ \
156 mov_instr " 32(%0), %%"reg_type"1;" /* restore Dblue */ \
157 : : "r" (coeff_storage)); \
158 } while (0);
/*
 * YUV2RGB_INTEL_SIMD: convert one register's worth of Y samples into
 * BGRA pixels (rN below means reg_type followed by N).
 *
 * On entry r2/r3/r1 must hold Dred/Dgreen/Dblue as left there by
 * CALC_COLOR_MODIFIERS or RESTORE_COLOR_MODIFIERS; all of r0..r7 are
 * overwritten, including those three.
 *
 *   mov_instr        full-register move mnemonic ("movdqa" / "movq")
 *   reg_type         register name prefix ("xmm" / "mm")
 *   output_offset1-3 decimal strings: byte offsets of the 2nd, 3rd and
 *                    4th output store relative to dest
 *   y_plane          pointer to the Y samples to load
 *   dest             pointer to the output; four full registers are
 *                    stored (at 0 and at the three offsets)
 *
 * The "memory" clobber tells the compiler that dest is written.
 * Expands to one statement without a trailing semicolon; string pieces
 * are space-separated from the macro arguments so they are not lexed as
 * C++11 literal suffixes.
 */
#define YUV2RGB_INTEL_SIMD(mov_instr, reg_type, output_offset1, output_offset2, output_offset3, y_plane, dest) do { \
	__asm__ __volatile__ ( \
		mov_instr " (%0), %%" reg_type "0;"		/* Load Y plane into r0 */ \
		mov_instr " 96(%2), %%" reg_type "7;"		/* Load 16 into r7 */ \
		"psubusb %%" reg_type "7, %%" reg_type "0;"	/* Y = Y - 16 (unsigned saturating) */ \
		\
		mov_instr " %%" reg_type "0, %%" reg_type "4;"	/* r4 == r0 */ \
		\
		"psllw $8, %%" reg_type "0;"			/* r0 [00 Y0 00 Y2 ...] */ \
		"psrlw $8, %%" reg_type "0;"			/* r0 [Y0 00 Y2 00 ...] */ \
		"psrlw $8, %%" reg_type "4;"			/* r4 [Y1 00 Y3 00 ...] */ \
		\
		mov_instr " 64(%2), %%" reg_type "7;" \
		"pmullw %%" reg_type "7, %%" reg_type "0;"	/* calculate Y*Yc[even] */ \
		"pmullw %%" reg_type "7, %%" reg_type "4;"	/* calculate Y*Yc[odd] */ \
		"psraw $6, %%" reg_type "0;"			/* Yyc[even] = Yyc[even] / 64 */ \
		"psraw $6, %%" reg_type "4;"			/* Yyc[odd] = Yyc[odd] / 64 */ \
		\
		mov_instr " %%" reg_type "2, %%" reg_type "6;"	/* r6 = DR */ \
		mov_instr " %%" reg_type "3, %%" reg_type "7;"	/* r7 = DG */ \
		mov_instr " %%" reg_type "1, %%" reg_type "5;"	/* r5 = DB */ \
		\
		"paddsw %%" reg_type "0, %%" reg_type "2;"	/* CY[even] + DR */ \
		"paddsw %%" reg_type "0, %%" reg_type "3;"	/* CY[even] + DG */ \
		"paddsw %%" reg_type "0, %%" reg_type "1;"	/* CY[even] + DB */ \
		\
		"paddsw %%" reg_type "4, %%" reg_type "6;"	/* CY[odd] + DR */ \
		"paddsw %%" reg_type "4, %%" reg_type "7;"	/* CY[odd] + DG */ \
		"paddsw %%" reg_type "4, %%" reg_type "5;"	/* CY[odd] + DB */ \
		\
		"packuswb %%" reg_type "2, %%" reg_type "2;"	/* Clamp RGB to [0-255] */ \
		"packuswb %%" reg_type "3, %%" reg_type "3;" \
		"packuswb %%" reg_type "1, %%" reg_type "1;" \
		\
		"packuswb %%" reg_type "6, %%" reg_type "6;" \
		"packuswb %%" reg_type "7, %%" reg_type "7;" \
		"packuswb %%" reg_type "5, %%" reg_type "5;" \
		\
		"punpcklbw %%" reg_type "6, %%" reg_type "2;"	/* r2 [R0 R1 R2 R3 ...] */ \
		"punpcklbw %%" reg_type "7, %%" reg_type "3;"	/* r3 [G0 G1 G2 G3 ...] */ \
		"punpcklbw %%" reg_type "5, %%" reg_type "1;"	/* r1 [B0 B1 B2 B3 ...] */ \
		\
		mov_instr " %%" reg_type "2, %%" reg_type "5;"	/* copy RGB: r5 = R */ \
		mov_instr " %%" reg_type "3, %%" reg_type "7;"	/* r7 = G */ \
		mov_instr " %%" reg_type "1, %%" reg_type "6;"	/* r6 = B */ \
		\
		mov_instr " 112(%2), %%" reg_type "4;"		/* r4 = FF FF FF ... (alpha) */ \
		"punpcklbw %%" reg_type "2, %%" reg_type "1;"	/* r1 [B0 R0 B1 R1 ...] (low half) */ \
		"punpcklbw %%" reg_type "4, %%" reg_type "3;"	/* r3 [G0 FF G1 FF ...] (low half) */ \
		\
		mov_instr " %%" reg_type "1, %%" reg_type "0;"	/* r0 = copy of r1 */ \
		\
		"punpcklbw %%" reg_type "3, %%" reg_type "1;"	/* r1 [B0 G0 R0 FF B1 G1 R1 FF ...] 1st quarter */ \
		"punpckhbw %%" reg_type "3, %%" reg_type "0;"	/* r0 [B G R FF ...] 2nd quarter */ \
		\
		mov_instr " %%" reg_type "1, (%1);"		/* output BGRA */ \
		mov_instr " %%" reg_type "0, " output_offset1 "(%1);" \
		\
		"punpckhbw %%" reg_type "5, %%" reg_type "6;"	/* r6 [B R B R ...] (high half) */ \
		"punpckhbw %%" reg_type "4, %%" reg_type "7;"	/* r7 [G FF G FF ...] (high half) */ \
		\
		mov_instr " %%" reg_type "6, %%" reg_type "0;"	/* r0 = copy of r6 */ \
		\
		"punpcklbw %%" reg_type "7, %%" reg_type "6;"	/* r6 [B G R FF ...] 3rd quarter */ \
		"punpckhbw %%" reg_type "7, %%" reg_type "0;"	/* r0 [B G R FF ...] 4th quarter */ \
		\
		mov_instr " %%" reg_type "6, " output_offset2 "(%1);" \
		mov_instr " %%" reg_type "0, " output_offset3 "(%1);" \
		: : "r" (y_plane), "r" (dest), "r" (&simd_table) : "memory"); \
} while (0)
230 #endif
#if HAVE_SSE2
/* SSE2 flavour: 16-byte movdqa moves on xmm registers, output stores at
 * dest+0/16/32/48. */
#define YUV2RGB_SSE(y_plane, dest) YUV2RGB_INTEL_SIMD("movdqa", "xmm", "16", "32", "48", y_plane, dest)
#endif

#if HAVE_MMX
/* MMX flavour: 8-byte movq moves on mm registers, output stores at
 * dest+0/8/16/24. */
#define YUV2RGB_MMX(y_plane, dest) YUV2RGB_INTEL_SIMD("movq", "mm", "8", "16", "24", y_plane, dest)
#endif
240 static inline void YUV444ToBGRA(guint8 Y, guint8 U, guint8 V, guint8 *dst)
242 dst[2] = CLAMP((298 * (Y - 16) + 409 * (V - 128) + 128) >> 8, 0, 255);
243 dst[1] = CLAMP((298 * (Y - 16) - 100 * (U - 128) - 208 * (V - 128) + 128) >> 8, 0, 255);
244 dst[0] = CLAMP((298 * (Y - 16) + 516 * (U - 128) + 128) >> 8, 0, 255);
245 dst[3] = 0xFF;
/*
 * YUVConverterInfo
 */
252 bool
253 YUVConverterInfo::Supports (MoonPixelFormat input, MoonPixelFormat output)
255 return input != MoonPixelFormatNone && output != MoonPixelFormatNone;
258 IImageConverter*
259 YUVConverterInfo::Create (Media* media, VideoStream* stream)
261 return new YUVConverter (media, stream);
/*
 * YUVConverter
 */
268 YUVConverter::YUVConverter (Media* media, VideoStream* stream) : IImageConverter (Type::YUVCONVERTER, media, stream)
270 #if defined(__amd64__) && defined(__x86_64__)
271 have_mmx = true;
272 have_sse2 = true;
273 #else
274 # if HAVE_MMX
275 int have_cpuid = 0;
276 int features = 0;
278 have_mmx = false;
279 have_sse2 = false;
281 __asm__ __volatile__ (
282 "pushfl;"
283 "popl %%eax;"
284 "movl %%eax, %%edx;"
285 "xorl $0x200000, %%eax;"
286 "pushl %%eax;"
287 "popfl;"
288 "pushfl;"
289 "popl %%eax;"
290 "xorl %%edx, %%eax;"
291 "andl $0x200000, %%eax;"
292 "movl %%eax, %0"
293 : "=r" (have_cpuid)
295 : "%eax", "%edx"
298 if (have_cpuid) {
299 __asm__ __volatile__ (
300 "movl $0x0000001, %%eax;"
301 "pushl %%ebx;"
302 "cpuid;"
303 "popl %%ebx;"
304 "movl %%edx, %0;"
305 : "=r" (features)
307 : "%eax"
310 have_mmx = features & 0x00800000;
311 have_sse2 = features & 0x04000000;
313 # else
314 have_mmx = false;
315 have_sse2 = false;
316 # endif
317 #endif
318 if (posix_memalign ((void **)(&rgb_uv), 16, 96))
319 rgb_uv = NULL;
322 YUVConverter::~YUVConverter ()
324 free(rgb_uv);
327 MediaResult
328 YUVConverter::Open ()
330 if (input_format == MoonPixelFormatNone) {
331 Media::Warning (MEDIA_CONVERTER_ERROR, "Invalid input format.");
332 return MEDIA_CONVERTER_ERROR;
335 if (output_format == MoonPixelFormatNone) {
336 Media::Warning (MEDIA_CONVERTER_ERROR, "Invalid output format.");
337 return MEDIA_CONVERTER_ERROR;
340 return MEDIA_SUCCESS;
343 MediaResult
344 YUVConverter::Convert (guint8 *src[], int srcStride[], int srcSlideY, int srcSlideH, guint8* dest[], int dstStride [])
346 guint8 *y_row1 = src[0];
347 guint8 *y_row2 = src[0]+srcStride[0];
349 guint8 *u_plane = src[1];
350 guint8 *v_plane = src[2];
352 guint8 *dest_row1 = dest[0];
353 guint8 *dest_row2 = dest[0]+dstStride[0];
355 int i, j;
357 int width = dstStride[0] >> 2;
358 int height = srcSlideH;
359 int pad = 0;
360 bool aligned = true;
362 if (width != srcStride[0]) {
363 pad = (srcStride[0] - width);
364 if (pad % 16) {
365 g_warning ("This video has padding that prevents us from doing aligned SIMD operations on it.");
366 aligned = false;
370 if (rgb_uv == NULL && posix_memalign ((void **)(&rgb_uv), 16, 96) != 0) {
371 g_warning ("Could not allocate memory for YUVConverter");
372 return MEDIA_OUT_OF_MEMORY;
375 #if HAVE_SSE2
376 if (have_sse2 && aligned) {
377 for (i = 0; i < height >> 1; i ++, y_row1 += srcStride[0], y_row2 += srcStride[0], dest_row1 += dstStride[0], dest_row2 += dstStride[0]) {
378 for (j = 0; j < width >> 4; j ++, y_row1 += 16, y_row2 += 16, u_plane += 8, v_plane += 8, dest_row1 += 64, dest_row2 += 64) {
379 PREFETCH(y_row1);
380 CALC_COLOR_MODIFIERS("movdqa", "xmm", "15", ALIGN_CMP_REG, u_plane, v_plane, rgb_uv);
382 YUV2RGB_SSE(y_row1, dest_row1);
384 PREFETCH(y_row2);
385 RESTORE_COLOR_MODIFIERS("movdqa", "xmm", rgb_uv);
387 YUV2RGB_SSE(y_row2, dest_row2);
389 y_row1 += pad;
390 y_row2 += pad;
391 u_plane += pad >> 1;
392 v_plane += pad >> 1;
394 } else {
395 #endif
396 #if HAVE_MMX
397 if (have_mmx && aligned) {
398 for (i = 0; i < height >> 1; i ++, y_row1 += srcStride[0], y_row2 += srcStride[0], dest_row1 += dstStride[0], dest_row2 += dstStride[0]) {
399 for (j = 0; j < width >> 3; j ++, y_row1 += 8, y_row2 += 8, u_plane += 4, v_plane += 4, dest_row1 += 32, dest_row2 += 32) {
400 PREFETCH(y_row1);
401 CALC_COLOR_MODIFIERS("movq", "mm", "7", ALIGN_CMP_REG, u_plane, v_plane, rgb_uv);
403 YUV2RGB_MMX(y_row1, dest_row1);
405 PREFETCH(y_row2);
406 RESTORE_COLOR_MODIFIERS("movq", "mm", rgb_uv);
408 YUV2RGB_MMX(y_row2, dest_row2);
410 y_row1 += pad;
411 y_row2 += pad;
412 u_plane += pad >> 1;
413 v_plane += pad >> 1;
415 __asm__ __volatile__ ("emms");
416 } else {
417 #endif
418 for (i = 0; i < height >> 1; i ++, y_row1 += srcStride[0], y_row2 += srcStride[0], dest_row1 += dstStride[0], dest_row2 += dstStride[0]) {
419 for (j = 0; j < width >> 1; j ++, dest_row1 += 8, dest_row2 += 8, y_row1 += 2, y_row2 += 2, u_plane += 1, v_plane += 1) {
420 YUV444ToBGRA (*y_row1, *u_plane, *v_plane, dest_row1);
421 YUV444ToBGRA (y_row1[1], *u_plane, *v_plane, (dest_row1+4));
423 YUV444ToBGRA (*y_row2, *u_plane, *v_plane, dest_row2);
424 YUV444ToBGRA (y_row2[1], *u_plane, *v_plane, (dest_row2+4));
426 y_row1 += pad;
427 y_row2 += pad;
428 u_plane += pad >> 1;
429 v_plane += pad >> 1;
431 #if HAVE_MMX
433 #endif
434 #if HAVE_SSE2
436 #endif
437 return MEDIA_SUCCESS;