; Copyright (C) 2009-2010 David McPaul
; All rights reserved. Distributed under the terms of the MIT License.
; A rather unoptimised set of sse2 yuv to rgb converters
; does 8 pixels per loop

; the input stage reads 128 bits of yuv 8 bit data and puts
; the y values converted to 16 bit in xmm0
; the u values converted to 16 bit and duplicated into xmm1
; the v values converted to 16 bit and duplicated into xmm2

; the conversion stage does the yuv to rgb conversion using 16 bit fixed point;
; the results are placed into the following registers as 8 bit clamped values
; r values in xmm3
; g values in xmm4
; b values in xmm5

; the output stage writes out the rgba pixels as 8 bit values with 0 for alpha

; xmm6 used for scratch
; xmm7 used for scratch
SECTION .data align=16
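; note: the align=16 here matters -- the tables in this section are used as
; 128 bit memory operands to movdqa/psubsw/pmaddwd below, which fault on
; unaligned addresses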
; r = y + 0 * u + 1.402 * v
; g = y - 0.344 * u - 0.714 * v
; b = y + 1.772 * u + 0 * v
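; in 4.12 fixed point (multiply by 4096, scaled back by the psrad 12 below)
; those coefficients work out to approximately
;   1.402 * 4096 =  5743
;  -0.344 * 4096 = -1409
;  -0.714 * 4096 = -2925
;   1.772 * 4096 =  7258
; so, as a sketch only (values assumed here, not copied from the tables),
; RConst/GConst/BConst would pair the u and v weights to match the u0v0
; word layout consumed by pmaddwd, e.g. RConst dw 0, 5743, 0, 5743, ...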
; psubsw xmm0, [Const16] ; y = y - 16
; subtract 128 from u and v
psubsw xmm3, [Const128] ; u = u - 128, v = v - 128
movdqa xmm4, xmm3 ; duplicate uv into xmm4
pshufd xmm5, xmm3, 0xE4 ; duplicate uv into xmm5 (0xE4 is the identity shuffle)
pmaddwd xmm3, [RConst] ; multiply and add
pmaddwd xmm4, [GConst] ; to get RGB offsets to Y
pmaddwd xmm5, [BConst] ;
psrad xmm3, 12 ; scale back from 4.12 fixed point to the original range
pshuflw xmm3, xmm3, 0xa0 ; duplicate results
pshufhw xmm3, xmm3, 0xa0
pshuflw xmm4, xmm4, 0xa0
pshufhw xmm4, xmm4, 0xa0
pshuflw xmm5, xmm5, 0xa0
pshufhw xmm5, xmm5, 0xa0
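; (the 0xa0 mask = binary 10 10 00 00 selects words 0,0,2,2: each 32 bit
; pmaddwd result is duplicated into a pair of adjacent 16 bit words, so one
; chroma result lines up with the two y values that share it in 4:2:2)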
paddsw xmm3, xmm0 ; add to y
%macro rgba32sse2output 0
packuswb xmm3,xmm7 ; clamp to 0,255 and pack R to 8 bit per pixel
packuswb xmm4,xmm7 ; clamp to 0,255 and pack G to 8 bit per pixel
packuswb xmm5,xmm7 ; clamp to 0,255 and pack B to 8 bit per pixel
; convert to bgra32 packed
punpcklbw xmm5,xmm4 ; bgbgbgbgbgbgbgbg
movdqa xmm0, xmm5 ; save bg values
punpcklbw xmm3,xmm7 ; r0r0r0r0r0r0r0r0
punpcklwd xmm5,xmm3 ; lower half bgr0bgr0bgr0bgr0
punpckhwd xmm0,xmm3 ; upper half bgr0bgr0bgr0bgr0
; write to output ptr
movntdq [edi], xmm5 ; output first 4 pixels bypassing cache
movntdq [edi+16], xmm0 ; output second 4 pixels bypassing cache
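; note: each pixel lands in memory as the bytes b,g,r,0, i.e. B8G8R8A8 with
; alpha forced to zero; movntdq requires edi to be 16 byte aligned, and the
; non-temporal stores are normally followed by an sfence once the loop ends
; so the written pixels are visible before being read elsewhere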
; void Convert_YUV422_RGBA32_SSE2(void *fromPtr, void *toPtr, int width)
%define width ebp+16
%define toPtr ebp+12
%define fromPtr ebp+8
; void Convert_YUV420P_RGBA32_SSE2(void *fromYPtr, void *fromUPtr, void *fromVPtr, void *toPtr, int width)
%define width1 ebp+24
%define toPtr1 ebp+20
%define fromVPtr ebp+16
%define fromUPtr ebp+12
%define fromYPtr ebp+8
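; for reference, these offsets follow the usual cdecl layout after the
; standard "push ebp / mov ebp, esp" prologue:
;   [ebp]    saved ebp
;   [ebp+4]  return address
;   [ebp+8]  first argument
;   [ebp+12] second argument, and so on at 4 byte steps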
SECTION .text align=16

cglobal Convert_YUV422_RGBA32_SSE2
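; register use assumed from the loop body below: esi = fromPtr, edi = toPtr,
; and ecx holds the loop count (width / 8)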
; loop width / 8 times
REPEATLOOP: ; loop over width / 8
prefetchnta [esi+256]
; YUV422 packed input
movdqa xmm0, [esi] ; should have yuyv yuyv yuyv yuyv
pshufd xmm3, xmm0, 0xE4 ; copy to xmm3
pxor xmm7, xmm7 ; 00000000000000000000000000000000
pcmpeqd xmm6, xmm6 ; ffffffffffffffffffffffffffffffff
punpcklbw xmm6, xmm7 ; interleave xmm7 into xmm6 ff00ff00ff00ff00ff00ff00ff00ff00
pand xmm0, xmm6 ; clear all but y values leaving y0y0 etc
psllw xmm6, 8 ; 00ff00ff00ff00ff00ff00ff00ff00ff
pand xmm3, xmm6 ; extract uv values 0u0v0u0v0u0v0u0v
psrlw xmm3, 8 ; convert to 16 bit u0v0u0v0u0v0u0v0
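; as a concrete example, for the input bytes y0 u0 y1 v0 y2 u1 y3 v1 ...
; xmm0 now holds the words y0,y1,y2,y3,... zero extended to 16 bit and
; xmm3 holds the words u0,v0,u1,v1,... ready for the subtract and pmaddwd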
sub ecx, 1 ; sub rather than dec: dec leaves CF unchanged, which can cause a partial flags stall on some cpus
cglobal Convert_YUV420P_RGBA32_SSE2
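; register use assumed from the loop body below: esi = fromYPtr, eax = fromUPtr,
; ebx = fromVPtr, edi = toPtr1, and ecx holds the loop count (width1 / 8)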
; loop width / 8 times
REPEATLOOP1: ; loop over width / 8
; YUV420 planar input
movq xmm0, [esi] ; fetch 8 y values (8 bit) yyyyyyyy00000000
movd xmm3, [eax] ; fetch 4 u values (8 bit) uuuu000000000000
movd xmm1, [ebx] ; fetch 4 v values (8 bit) vvvv000000000000

pxor xmm7, xmm7 ; 00000000000000000000000000000000
punpcklbw xmm0, xmm7 ; interleave xmm7 into xmm0 y0y0y0y0y0y0y0y0

punpcklbw xmm3, xmm1 ; uvuvuvuv00000000
punpcklbw xmm3, xmm7 ; u0v0u0v0u0v0u0v0
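; note: each iteration consumes 8 y values but only 4 u and 4 v values,
; since 4:2:0 chroma is shared by two horizontally adjacent pixels; the
; conversion shuffles then duplicate each chroma result across both pixels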
sub ecx, 1 ; sub rather than dec, as above
movdqa xmm0, [esi] ; should have yuyv yuyv yuyv yuyv
pshufd xmm1, xmm0, 0xE4 ; copy to xmm1
movdqa xmm3, xmm0 ; copy to xmm3

pxor xmm7,xmm7 ; 00000000000000000000000000000000
pcmpeqd xmm6,xmm6 ; ffffffffffffffffffffffffffffffff
punpcklbw xmm6,xmm7 ; interleave xmm7 into xmm6 ff00ff00ff00ff00ff00ff00ff00ff00
pand xmm0, xmm6 ; clear all but y values leaving y0y0 etc
; extract u and duplicate so each u in yuyv becomes 0u0u
psrld xmm6,8 ; 00ff0000 00ff0000 00ff0000 00ff0000
pand xmm1, xmm6 ; clear all y and v values leaving 0u00 etc
psrld xmm1,8 ; shift down to get u000

pslld xmm6,16 ; 000000ff 000000ff 000000ff 000000ff
pand xmm3, xmm6 ; clear all y and u values leaving 000v etc
psrld xmm3,8 ; shift down to get 00v0
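; worked per-dword example: an input dword holds (v<<24)|(y1<<16)|(u<<8)|y0;
; masking and shifting right by 8 leaves u in the low word of xmm1 and v in
; the high word of xmm3, so the two can then be or-combined into the u0v0
; word pairs that Const128 and RConst expect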
psubsw xmm3, [Const128] ; u = u - 128, v = v - 128

pmaddwd xmm3, [RConst] ; multiply and add
psrad xmm3, 12 ; scale back from 4.12 fixed point to the original range
pshufb xmm3, [shuffconst] ; duplicate results (pshufb requires SSSE3, not plain SSE2)
; paddsw xmm3, xmm0 ; add to y
; packuswb xmm3,xmm7 ; clamp to 0,255 and pack R to 8 bit per pixel
movntdq [edi], xmm3 ; output first 4 pixels bypassing cache
SECTION .note.GNU-stack noalloc noexec nowrite progbits