CLOSED TREE: TraceMonkey merge head. (a=blockers)
[mozilla-central.git] / gfx / ycbcr / arm.patch
blob658009b1ff3822b369a202b4bb3eeb5bd5a9827a
1 diff --git a/gfx/ycbcr/Makefile.in b/gfx/ycbcr/Makefile.in
2 --- a/gfx/ycbcr/Makefile.in
3 +++ b/gfx/ycbcr/Makefile.in
4 @@ -40,16 +40,21 @@ CPPSRCS += yuv_row_posix.cpp \
5 else
6 CPPSRCS += yuv_row_other.cpp \
7 $(NULL)
8 endif # Darwin
9 endif # SunOS
10 endif # linux
11 endif # windows
13 +ifeq (arm,$(findstring arm,$(OS_TEST)))
14 +CPPSRCS += yuv_convert_arm.cpp \
15 + $(NULL)
16 +endif
18 EXTRA_DSO_LDOPTS += \
19 $(LIBS_DIR) \
20 $(EXTRA_DSO_LIBS) \
21 $(XPCOM_LIBS) \
22 $(NSPR_LIBS) \
23 $(NULL)
25 include $(topsrcdir)/config/rules.mk
26 diff --git a/gfx/ycbcr/yuv_convert.cpp b/gfx/ycbcr/yuv_convert.cpp
27 --- a/gfx/ycbcr/yuv_convert.cpp
28 +++ b/gfx/ycbcr/yuv_convert.cpp
29 @@ -19,25 +19,56 @@
30 #include "yuv_convert.h"
32 // Header for low level row functions.
33 #include "yuv_row.h"
34 #define MOZILLA_SSE_INCLUDE_HEADER_FOR_SSE2
35 #define MOZILLA_SSE_INCLUDE_HEADER_FOR_MMX
36 #include "mozilla/SSE.h"
38 +#ifdef HAVE_YCBCR_TO_RGB565
39 +void __attribute((noinline)) yv12_to_rgb565_neon(uint16 *dst, const uint8 *y, const uint8 *u, const uint8 *v, int n, int oddflag);
40 +#endif
42 namespace mozilla {
44 namespace gfx {
46 // 16.16 fixed point arithmetic
47 const int kFractionBits = 16;
48 const int kFractionMax = 1 << kFractionBits;
49 const int kFractionMask = ((1 << kFractionBits) - 1);
52 +// Convert a frame of YUV (YV12 layout) to 16 bit RGB565; does nothing unless HAVE_YCBCR_TO_RGB565 is defined.
53 +NS_GFX_(void) ConvertYCbCrToRGB565(const uint8* y_buf,
54 + const uint8* u_buf,
55 + const uint8* v_buf,
56 + uint8* rgb_buf,
57 + int pic_x, // currently unused by this implementation
58 + int pic_y, // currently unused by this implementation
59 + int pic_width,
60 + int pic_height,
61 + int y_pitch,
62 + int uv_pitch,
63 + int rgb_pitch, // assumed to be in bytes, like the other pitches applied to byte pointers
64 + YUVType yuv_type) // currently unused; chroma is always read at half vertical resolution
66 +#ifdef HAVE_YCBCR_TO_RGB565
67 + for (int i = 0; i < pic_height; i++) {
68 + yv12_to_rgb565_neon((uint16*)(rgb_buf + rgb_pitch * i), // honour the destination stride, not pic_width
69 + y_buf + y_pitch * i,
70 + u_buf + uv_pitch * (i / 2), // one chroma row serves two luma rows
71 + v_buf + uv_pitch * (i / 2),
72 + pic_width,
73 + 0); // oddflag = 0: row starts on an even pixel
74 + }
75 +#endif
78 // Convert a frame of YUV to 32 bit ARGB.
79 NS_GFX_(void) ConvertYCbCrToRGB32(const uint8* y_buf,
80 const uint8* u_buf,
81 const uint8* v_buf,
82 uint8* rgb_buf,
83 int pic_x,
84 int pic_y,
85 int pic_width,
86 diff --git a/gfx/ycbcr/yuv_convert.h b/gfx/ycbcr/yuv_convert.h
87 --- a/gfx/ycbcr/yuv_convert.h
88 +++ b/gfx/ycbcr/yuv_convert.h
89 @@ -2,16 +2,20 @@
90 // Use of this source code is governed by a BSD-style license that can be
91 // found in the LICENSE file.
93 #ifndef MEDIA_BASE_YUV_CONVERT_H_
94 #define MEDIA_BASE_YUV_CONVERT_H_
96 #include "chromium_types.h"
97 #include "gfxCore.h"
99 +#ifdef __arm__
100 +#define HAVE_YCBCR_TO_RGB565 1
101 +#endif
103 namespace mozilla {
105 namespace gfx {
107 // Type of YUV surface.
108 // The value of these enums matter as they are used to shift vertical indices.
109 enum YUVType {
110 @@ -36,16 +40,31 @@ enum Rotate {
111 // Filter affects how scaling looks.
112 enum ScaleFilter {
113 FILTER_NONE = 0, // No filter (point sampled).
114 FILTER_BILINEAR_H = 1, // Bilinear horizontal filter.
115 FILTER_BILINEAR_V = 2, // Bilinear vertical filter.
116 FILTER_BILINEAR = 3 // Bilinear filter.
119 +// Convert a frame of YUV to 16 bit RGB565.
120 +// Pass in YV12 format data. Does nothing unless HAVE_YCBCR_TO_RGB565 is defined.
121 +NS_GFX_(void) ConvertYCbCrToRGB565(const uint8* yplane,
122 + const uint8* uplane,
123 + const uint8* vplane,
124 + uint8* rgbframe,
125 + int pic_x,
126 + int pic_y,
127 + int pic_width,
128 + int pic_height,
129 + int ystride,
130 + int uvstride,
131 + int rgbstride,
132 + YUVType yuv_type);
134 // Convert a frame of YUV to 32 bit ARGB.
135 // Pass in YV16/YV12 depending on source format
136 NS_GFX_(void) ConvertYCbCrToRGB32(const uint8* yplane,
137 const uint8* uplane,
138 const uint8* vplane,
139 uint8* rgbframe,
140 int pic_x,
141 int pic_y,
142 diff --git a/gfx/ycbcr/yuv_convert_arm.cpp b/gfx/ycbcr/yuv_convert_arm.cpp
143 new file mode 100644
144 --- /dev/null
145 +++ b/gfx/ycbcr/yuv_convert_arm.cpp
146 @@ -0,0 +1,201 @@
147 +// Copyright (c) 2010 The Chromium Authors. All rights reserved.
148 +// Use of this source code is governed by a BSD-style license that can be
149 +// found in the LICENSE file.
151 +// contributor Siarhei Siamashka <siarhei.siamashka@gmail.com>
153 +#include "yuv_convert.h"
155 +void __attribute((noinline)) yv12_to_rgb565_neon(uint16 *dst, const uint8 *y, const uint8 *u, const uint8 *v, int n, int oddflag)
157 + static __attribute__((aligned(16))) uint16 acc_r[8] = {
158 + 22840, 22840, 22840, 22840, 22840, 22840, 22840, 22840,
159 + };
160 + static __attribute__((aligned(16))) uint16 acc_g[8] = {
161 + 17312, 17312, 17312, 17312, 17312, 17312, 17312, 17312,
162 + };
163 + static __attribute__((aligned(16))) uint16 acc_b[8] = {
164 + 28832, 28832, 28832, 28832, 28832, 28832, 28832, 28832,
165 + };
166 + /*
167 + * Registers:
168 + * q0, q1 : d0, d1, d2, d3 - are used for initial loading of YUV data
169 + * q2 : d4, d5 - are used for storing converted RGB data
170 + * q3 : d6, d7 - are used for temporary storage
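172 + * q4, q5 : d8, d9, d10, d11 - clipping constants (#15, #20, #31, #36); q6, q7 - unused
174 + * q8, q9 : d16, d17, d18, d19 - are used for expanded Y data
175 + * q10 : d20, d21 - accumulator for red
176 + * q11 : d22, d23 - accumulator for green
177 + * q12 : d24, d25 - accumulator for blue
178 + * q13 : d26, d27 - constants (#16, #149)
179 + * q14, q15 : d28, d29, d30, d31 - constants (#204, #50, #104, #154)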
180 + */
181 + asm volatile (
182 +".fpu neon\n"
183 +".macro convert_macroblock size\n"
184 +/* load up to 16 source pixels */
185 + ".if \\size == 16\n"
186 + "pld [%[y], #64]\n"
187 + "pld [%[u], #64]\n"
188 + "pld [%[v], #64]\n"
189 + "vld1.8 {d1}, [%[y]]!\n"
190 + "vld1.8 {d3}, [%[y]]!\n"
191 + "vld1.8 {d0}, [%[u]]!\n"
192 + "vld1.8 {d2}, [%[v]]!\n"
193 + ".elseif \\size == 8\n"
194 + "vld1.8 {d1}, [%[y]]!\n"
195 + "vld1.8 {d0[0]}, [%[u]]!\n"
196 + "vld1.8 {d0[1]}, [%[u]]!\n"
197 + "vld1.8 {d0[2]}, [%[u]]!\n"
198 + "vld1.8 {d0[3]}, [%[u]]!\n"
199 + "vld1.8 {d2[0]}, [%[v]]!\n"
200 + "vld1.8 {d2[1]}, [%[v]]!\n"
201 + "vld1.8 {d2[2]}, [%[v]]!\n"
202 + "vld1.8 {d2[3]}, [%[v]]!\n"
203 + ".elseif \\size == 4\n"
204 + "vld1.8 {d1[0]}, [%[y]]!\n"
205 + "vld1.8 {d1[1]}, [%[y]]!\n"
206 + "vld1.8 {d1[2]}, [%[y]]!\n"
207 + "vld1.8 {d1[3]}, [%[y]]!\n"
208 + "vld1.8 {d0[0]}, [%[u]]!\n"
209 + "vld1.8 {d0[1]}, [%[u]]!\n"
210 + "vld1.8 {d2[0]}, [%[v]]!\n"
211 + "vld1.8 {d2[1]}, [%[v]]!\n"
212 + ".elseif \\size == 2\n"
213 + "vld1.8 {d1[0]}, [%[y]]!\n"
214 + "vld1.8 {d1[1]}, [%[y]]!\n"
215 + "vld1.8 {d0[0]}, [%[u]]!\n"
216 + "vld1.8 {d2[0]}, [%[v]]!\n"
217 + ".elseif \\size == 1\n"
218 + "vld1.8 {d1[0]}, [%[y]]!\n"
219 + "vld1.8 {d0[0]}, [%[u]]!\n"
220 + "vld1.8 {d2[0]}, [%[v]]!\n"
221 + ".else\n"
222 + ".error \"unsupported macroblock size\"\n"
223 + ".endif\n"
225 + /* d1 - Y data (first 8 bytes) */
226 + /* d3 - Y data (next 8 bytes) */
227 + /* d0 - U data, d2 - V data */
229 + /* split even and odd Y color components */
230 + "vuzp.8 d1, d3\n" /* d1 - evenY, d3 - oddY */
231 + /* clip upper and lower boundaries */
232 + "vqadd.u8 q0, q0, q4\n"
233 + "vqadd.u8 q1, q1, q4\n"
234 + "vqsub.u8 q0, q0, q5\n"
235 + "vqsub.u8 q1, q1, q5\n"
237 + "vshr.u8 d4, d2, #1\n" /* d4 = V >> 1 */
239 + "vmull.u8 q8, d1, d27\n" /* q8 = evenY * 149 */
240 + "vmull.u8 q9, d3, d27\n" /* q9 = oddY * 149 */
242 + "vld1.16 {d20, d21}, [%[acc_r], :128]\n" /* q10 - initialize accumulator for red */
243 + "vsubw.u8 q10, q10, d4\n" /* red acc -= (V >> 1) */
244 + "vmlsl.u8 q10, d2, d28\n" /* red acc -= V * 204 */
245 + "vld1.16 {d22, d23}, [%[acc_g], :128]\n" /* q11 - initialize accumulator for green */
246 + "vmlsl.u8 q11, d2, d30\n" /* green acc -= V * 104 */
247 + "vmlsl.u8 q11, d0, d29\n" /* green acc -= U * 50 */
248 + "vld1.16 {d24, d25}, [%[acc_b], :128]\n" /* q12 - initialize accumulator for blue */
249 + "vmlsl.u8 q12, d0, d30\n" /* blue acc -= U * 104 */
250 + "vmlsl.u8 q12, d0, d31\n" /* blue acc -= U * 154 */
252 + "vhsub.s16 q3, q8, q10\n" /* calculate even red components */
253 + "vhsub.s16 q10, q9, q10\n" /* calculate odd red components */
254 + "vqshrun.s16 d0, q3, #6\n" /* right shift, narrow and saturate even red components */
255 + "vqshrun.s16 d3, q10, #6\n" /* right shift, narrow and saturate odd red components */
257 + "vhadd.s16 q3, q8, q11\n" /* calculate even green components */
258 + "vhadd.s16 q11, q9, q11\n" /* calculate odd green components */
259 + "vqshrun.s16 d1, q3, #6\n" /* right shift, narrow and saturate even green components */
260 + "vqshrun.s16 d4, q11, #6\n" /* right shift, narrow and saturate odd green components */
262 + "vhsub.s16 q3, q8, q12\n" /* calculate even blue components */
263 + "vhsub.s16 q12, q9, q12\n" /* calculate odd blue components */
264 + "vqshrun.s16 d2, q3, #6\n" /* right shift, narrow and saturate even blue components */
265 + "vqshrun.s16 d5, q12, #6\n" /* right shift, narrow and saturate odd blue components */
267 + "vzip.8 d0, d3\n" /* join even and odd red components */
268 + "vzip.8 d1, d4\n" /* join even and odd green components */
269 + "vzip.8 d2, d5\n" /* join even and odd blue components */
271 + "vshll.u8 q3, d0, #8\n\t"
272 + "vshll.u8 q8, d1, #8\n\t"
273 + "vshll.u8 q9, d2, #8\n\t"
274 + "vsri.u16 q3, q8, #5\t\n"
275 + "vsri.u16 q3, q9, #11\t\n"
276 + /* store pixel data to memory */
277 + ".if \\size == 16\n"
278 + " vst1.16 {d6, d7}, [%[dst]]!\n"
279 + " vshll.u8 q3, d3, #8\n\t"
280 + " vshll.u8 q8, d4, #8\n\t"
281 + " vshll.u8 q9, d5, #8\n\t"
282 + " vsri.u16 q3, q8, #5\t\n"
283 + " vsri.u16 q3, q9, #11\t\n"
284 + " vst1.16 {d6, d7}, [%[dst]]!\n"
285 + ".elseif \\size == 8\n"
286 + " vst1.16 {d6, d7}, [%[dst]]!\n"
287 + ".elseif \\size == 4\n"
288 + " vst1.16 {d6}, [%[dst]]!\n"
289 + ".elseif \\size == 2\n"
290 + " vst1.16 {d6[0]}, [%[dst]]!\n"
291 + " vst1.16 {d6[1]}, [%[dst]]!\n"
292 + ".elseif \\size == 1\n"
293 + " vst1.16 {d6[0]}, [%[dst]]!\n"
294 + ".endif\n"
295 + ".endm\n"
297 + "vmov.u8 d8, #15\n" /* add this to U/V to saturate upper boundary */
298 + "vmov.u8 d9, #20\n" /* add this to Y to saturate upper boundary */
299 + "vmov.u8 d10, #31\n" /* sub this from U/V to saturate lower boundary */
300 + "vmov.u8 d11, #36\n" /* sub this from Y to saturate lower boundary */
302 + "vmov.u8 d26, #16\n"
303 + "vmov.u8 d27, #149\n"
304 + "vmov.u8 d28, #204\n"
305 + "vmov.u8 d29, #50\n"
306 + "vmov.u8 d30, #104\n"
307 + "vmov.u8 d31, #154\n"
309 + "cmp %[oddflag], #0\n"
310 + "beq 1f\n"
311 + "convert_macroblock 1\n"
312 + "sub %[n], %[n], #1\n"
313 + "1:\n"
314 + "subs %[n], %[n], #16\n"
315 + "blt 2f\n"
316 + "1:\n"
317 + "convert_macroblock 16\n"
318 + "subs %[n], %[n], #16\n"
319 + "bge 1b\n"
320 + "2:\n"
321 + "tst %[n], #8\n"
322 + "beq 3f\n"
323 + "convert_macroblock 8\n"
324 + "3:\n"
325 + "tst %[n], #4\n"
326 + "beq 4f\n"
327 + "convert_macroblock 4\n"
328 + "4:\n"
329 + "tst %[n], #2\n"
330 + "beq 5f\n"
331 + "convert_macroblock 2\n"
332 + "5:\n"
333 + "tst %[n], #1\n"
334 + "beq 6f\n"
335 + "convert_macroblock 1\n"
336 + "6:\n"
337 + ".purgem convert_macroblock\n"
338 + : [y] "+&r" (y), [u] "+&r" (u), [v] "+&r" (v), [dst] "+&r" (dst), [n] "+&r" (n)
339 + : [acc_r] "r" (&acc_r[0]), [acc_g] "r" (&acc_g[0]), [acc_b] "r" (&acc_b[0]),
340 + [oddflag] "r" (oddflag)
341 + : "cc", "memory",
342 + "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
343 + "d8", "d9", "d10", "d11", /* "d12", "d13", "d14", "d15", */
344 + "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
345 + "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
346 + );