; Copyright (C) 2009-2010 David McPaul
; All rights reserved. Distributed under the terms of the MIT License.
; A rather unoptimised set of sse yuv to rgb converters
; does 4 pixels per loop
; reads 64 bits of yuv 8 bit data and puts
; the y values converted to 16 bit in mm0
; the u values converted to 16 bit and duplicated into mm1
; the v values converted to 16 bit and duplicated into mm2

; does the yuv to rgb conversion using 16 bit fixed point and the
; results are placed into the following registers as 8 bit clamped values

; writes out the rgba pixels as 8 bit values with 0 for alpha

; mm6 used for scratch
; mm7 used for scratch
; r = y + v + (v >> 2) + (v >> 3) + (v >> 5)
; g = y - ((u >> 2) + (u >> 4) + (u >> 5)) - ((v >> 1) + (v >> 3) + (v >> 4) + (v >> 5))
; b = y + u + (u >> 1) + (u >> 2) + (u >> 6)
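; the shift-and-add sums approximate the usual BT.601-style coefficients:
;   1 + 1/4 + 1/8 + 1/32    = 1.40625  (~1.402, v term in r)
;   1/4 + 1/16 + 1/32       = 0.34375  (~0.344, u term in g)
;   1/2 + 1/8 + 1/16 + 1/32 = 0.71875  (~0.714, v term in g)
;   1 + 1/2 + 1/4 + 1/64    = 1.765625 (~1.772, u term in b)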
movq mm7, [Const16] ; loads a constant using data cache (slower on first fetch but then cached)
; psubsw mm0,mm7 ; y = y - 16
; subtract 128 from u and v
movq mm7, [Const128] ; loads a constant using data cache (slower on first fetch but then cached)
psubsw mm1,mm7 ; u = u - 128
psubsw mm2,mm7 ; v = v - 128

pshufw mm5,mm0, 0xE4 ; b = y
; r = r + v + (v >> 2) + (v >> 3) + (v >> 5)
paddsw mm3, mm2 ; add v to r
movq mm7, mm1 ; move u to scratch
pshufw mm6, mm2, 0xE4 ; move v to scratch

psraw mm6,2 ; v >> 2
paddsw mm3, mm6 ; and add to r
psraw mm6,1 ; now v >> 3
paddsw mm3, mm6 ; and add to r
psraw mm6,2 ; now v >> 5
paddsw mm3, mm6 ; and add to r
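; mm3 (red) now holds approximately y + 1.40625 * v, still 16 bit signed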

; b = y + u + (u >> 1) + (u >> 2) + (u >> 6)
paddsw mm5, mm1 ; add u to b
psraw mm7,1 ; u >> 1
paddsw mm5, mm7 ; and add to b
psraw mm7,1 ; now u >> 2
paddsw mm5, mm7 ; and add to b
psraw mm7,4 ; now u >> 6
paddsw mm5, mm7 ; and add to b
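; mm5 (blue) now holds approximately y + 1.765625 * u, still 16 bit signed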

; g = y - (u >> 2) - (u >> 4) - (u >> 5) - (v >> 1) - (v >> 3) - (v >> 4) - (v >> 5)
movq mm7,mm2 ; move v to scratch
pshufw mm6,mm1, 0xE4 ; move u to scratch

psraw mm6,2 ; u >> 2
psubsw mm4,mm6 ; subtract from g
psraw mm6,2 ; now u >> 4
psubsw mm4,mm6 ; subtract from g
psraw mm6,1 ; now u >> 5
psubsw mm4,mm6 ; subtract from g

psraw mm7,1 ; v >> 1
psubsw mm4,mm7 ; subtract from g
psraw mm7,2 ; now v >> 3
psubsw mm4,mm7 ; subtract from g
psraw mm7,1 ; now v >> 4
psubsw mm4,mm7 ; subtract from g
psraw mm7,1 ; now v >> 5
psubsw mm4,mm7 ; subtract from g
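; mm4 (green) now holds approximately y - 0.34375 * u - 0.71875 * v, still 16 bit signed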

%macro rgba32sseoutput 0
packuswb mm3,mm7 ; clamp to 0,255 and pack R to 8 bit per pixel
packuswb mm4,mm7 ; clamp to 0,255 and pack G to 8 bit per pixel
packuswb mm5,mm7 ; clamp to 0,255 and pack B to 8 bit per pixel
; convert to bgra32 packed
punpcklbw mm5,mm4 ; bgbgbgbg
movq mm0, mm5 ; save bg values
punpcklbw mm3,mm7 ; r0r0r0r0
punpcklwd mm5,mm3 ; lower half bgr0bgr0
punpckhwd mm0,mm3 ; upper half bgr0bgr0
; write to output ptr
movq [edi], mm5 ; output first 2 pixels
movq [edi+8], mm0 ; output second 2 pixels
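; each pixel ends up in memory as b,g,r,0 bytes (alpha forced to 0)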
SECTION .data align=16
; void Convert_YUV422_RGBA32_SSE(void *fromPtr, void *toPtr, int width)
%define fromPtr ebp+8
; void Convert_YUV420P_RGBA32_SSE(void *fromYPtr, void *fromUPtr, void *fromVPtr, void *toPtr, int width)
%define width1 ebp+24
%define toPtr1 ebp+20
%define fromVPtr ebp+16
%define fromUPtr ebp+12
%define fromYPtr ebp+8
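; argument offsets assume the usual cdecl prologue (push ebp / mov ebp, esp):
; the first argument sits at [ebp+8] and each further argument is 4 bytes higher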
SECTION .text align=16
cglobal Convert_YUV422_RGBA32_SSE
; loop width / 4 times
REPEATLOOP2: ; loop over width / 4
; YUV422 packed inputer
movq mm0, [esi] ; should have yuyv yuyv
pshufw mm1, mm0, 0xE4 ; copy to mm1
movq mm2, mm0 ; copy to mm2
; build mask and extract the y values
pxor mm7,mm7 ; 0000000000000000
pcmpeqb mm6,mm6 ; ffffffffffffffff
punpckhbw mm6,mm7 ; interleave mm7 into mm6 ff00ff00ff00ff00
pand mm0, mm6 ; clear all but y values leaving y0y0 etc
; extract u and duplicate so each u in yuyv becomes 0u0u
psrld mm6,8 ; 00ff0000 00ff0000
pand mm1, mm6 ; clear all yv values leaving 0u00 etc
psrld mm1,8 ; shift right to get u000
pshufw mm1,mm1, 0xA0 ; copy u values to get u0u0 (SSE not MMX)
; extract v and duplicate so each v in yuyv becomes 0v0v
pslld mm6,16 ; 000000ff000000ff
pand mm2, mm6 ; clear all yu values leaving 000v etc
psrld mm2,8 ; shift right to get 00v0
pshufw mm2,mm2, 0xF5 ; copy v values to get v0v0 (SSE not MMX)
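; mm0 = y0 y1 y2 y3, mm1 = u0 u0 u1 u1, mm2 = v0 v0 v1 v1 (all 16 bit)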
sub ecx, 1 ; sub preferred over dec (dec leaves CF unchanged, which can cause partial-flags stalls on some CPUs)
emms ; empty MMX state so the FPU registers can be used for floating point again
cglobal Convert_YUV420P_RGBA32_SSE

; loop width / 4 times
REPEATLOOP3: ; loop over width / 4
; YUV420 planar inputer
movq mm0, [esi] ; fetch 4 y values (8 bit) yyyy0000
movd mm1, [eax] ; fetch 2 u values (8 bit) uu000000
movd mm2, [ebx] ; fetch 2 v values (8 bit) vv000000
; extract y
pxor mm7,mm7 ; 0000000000000000
punpcklbw mm0,mm7 ; interleave mm7 into mm0 y0y0y0y0
; extract u and duplicate so each becomes 0u0u
punpcklbw mm1,mm7 ; interleave mm7 into mm1 u0u00000
punpcklwd mm1,mm7 ; interleave again u000u000
pshufw mm1,mm1, 0xA0 ; copy u values to get u0u0
; extract v and duplicate so each becomes 0v0v
punpcklbw mm2,mm7 ; interleave mm7 into mm2 v0v00000
punpcklwd mm2,mm7 ; interleave again v000v000
pshufw mm2,mm2, 0xA0 ; copy v values to get v0v0
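; mm0 = y0 y1 y2 y3, mm1 = u0 u0 u1 u1, mm2 = v0 v0 v1 v1 (all 16 bit)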
sub ecx, 1 ; sub preferred over dec (dec leaves CF unchanged, which can cause partial-flags stalls on some CPUs)
SECTION .note.GNU-stack noalloc noexec nowrite progbits