3rdparty/licenseReport: Add seperate LGPL checks
[haiku.git] / src / add-ons / media / plugins / ffmpeg / yuvrgb_sse2.nasm
blob46ce76e13f5e407f80c793519c88905c1dfc4ce5
2 ; Copyright (C) 2009-2010 David McPaul
4 ; All rights reserved. Distributed under the terms of the MIT License.
7 ; A rather unoptimised set of sse2 yuv to rgb converters
8 ; does 8 pixels per loop
10 ; inputer:
11 ; reads 128 bits of yuv 8 bit data and puts
12 ; the y values converted to 16 bit in xmm0
13 ; the u and v values converted to 16 bit and interleaved
14 ; (u0 v0 u1 v1 u2 v2 u3 v3) in xmm3
16 ; conversion:
17 ; does the yuv to rgb conversion using 16 bit fixed point and the
18 ; results are placed into the following registers as 8 bit clamped values
19 ; r values in xmm3
20 ; g values in xmm4
21 ; b values in xmm5
23 ; outputer:
24 ; writes out the pixels as 8 bit b, g, r, a bytes (in that order) with 0 for alpha
26 ; xmm6 used for scratch
27 ; xmm7 used for scratch
;-----------------------------------------------------------------------
; cglobal <name>
;
; Opens the definition of an exported routine:
;   * declares _<name> as a global symbol,
;   * makes <name> an alias for _<name> for the rest of the file,
;   * aligns the entry point to 16 bytes and emits the _<name>: label.
;-----------------------------------------------------------------------
%macro cglobal 1
        global  _%1
        %define %1 _%1
        align   16
%1:
%endmacro
SECTION .data align=16

; Every table below is exactly 16 bytes (one xmm register wide).  Because the
; section starts 16-byte aligned and the tables are laid out back to back,
; each label stays 16-byte aligned, which the SSE instructions that take
; these tables as memory operands require.

; 8 x 16-bit words of 16.  Only referenced by a disabled "y = y - 16" line in
; the conversion macro.
Const16:        times 8 dw 16

; 8 x 16-bit words of 128: the bias removed from u and v before multiplying.
Const128:       times 8 dw 128

; pmaddwd coefficient pairs, laid out as (u coefficient, v coefficient) to
; match the u,v interleaved input words.  The values are the floating point
; factors scaled by 4096; the conversion shifts right by 12 afterwards.
;   r offset =   0     * u + 1.402 * v
RConst:         dw      0, 5743, 0, 5743, 0, 5743, 0, 5743

;   g offset = -0.344 * u - 0.714 * v
GConst:         dw      -1409, -2925, -1409, -2925, -1409, -2925, -1409, -2925

;   b offset =  1.772 * u + 0     * v
BConst:         dw      7258, 0, 7258, 0, 7258, 0, 7258, 0

; Byte shuffle control used by Test_SSE2: copies the low 16-bit word of each
; 32-bit lane into both words of that lane.
shuffconst:     db      0x00, 0x01, 0x00, 0x01, 0x04, 0x05, 0x04, 0x05
                db      0x08, 0x09, 0x08, 0x09, 0x0c, 0x0d, 0x0c, 0x0d

; NOTE(review): YMask and UVMask are not referenced by any code visible in
; this file.  They look like byte shuffle controls (even / odd source bytes,
; with 0x80 in every other slot) -- confirm before removing.
YMask:          db      0x00, 0x80, 0x02, 0x80, 0x04, 0x80, 0x06, 0x80
                db      0x08, 0x80, 0x0a, 0x80, 0x0c, 0x80, 0x0e, 0x80

UVMask:         db      0x01, 0x80, 0x03, 0x80, 0x05, 0x80, 0x07, 0x80
                db      0x09, 0x80, 0x0b, 0x80, 0x0d, 0x80, 0x0f, 0x80
; conversion code
;-----------------------------------------------------------------------
; yuv2rgbsse2
;
; In:    xmm0 = 8 y values as 16-bit words
;        xmm3 = u,v as 16-bit words, interleaved: u0 v0 u1 v1 u2 v2 u3 v3
; Out:   xmm3 = 8 r values, xmm4 = 8 g values, xmm5 = 8 b values
;        (signed 16-bit words, not yet clamped to 0..255)
; Reads: Const128, RConst, GConst, BConst
;
; Fixed point maths (coefficients are scaled by 4096, hence the shift by 12):
;   u = u - 128
;   v = v - 128
;   r = y +  0     * u +  1.402 * v
;   g = y + -0.344 * u + -0.714 * v
;   b = y +  1.772 * u +  0     * v
;
; Each u,v pair yields one offset per colour channel; that offset is
; duplicated so it is applied to two adjacent y values.
; The "y = y - 16" step (psubsw xmm0, [Const16]) is intentionally not applied.
;-----------------------------------------------------------------------
%macro yuv2rgbsse2 0
        psubsw  xmm3, [Const128]        ; u -= 128, v -= 128

        ; every channel starts from its own copy of the biased u,v words
        movdqa  xmm4, xmm3              ; copy for green
        pshufd  xmm5, xmm3, 0xE4        ; copy for blue (0xE4 = identity shuffle)

        ; red: one 32-bit offset per u,v pair, scaled back, low word duplicated
        pmaddwd xmm3, [RConst]
        psrad   xmm3, 12
        pshuflw xmm3, xmm3, 0xa0        ; words 0,0,2,2 of the low half
        pshufhw xmm3, xmm3, 0xa0        ; words 0,0,2,2 of the high half
        paddsw  xmm3, xmm0              ; r = y + offset (signed saturation)

        ; green
        pmaddwd xmm4, [GConst]
        psrad   xmm4, 12
        pshuflw xmm4, xmm4, 0xa0
        pshufhw xmm4, xmm4, 0xa0
        paddsw  xmm4, xmm0              ; g = y + offset

        ; blue
        pmaddwd xmm5, [BConst]
        psrad   xmm5, 12
        pshuflw xmm5, xmm5, 0xa0
        pshufhw xmm5, xmm5, 0xa0
        paddsw  xmm5, xmm0              ; b = y + offset
%endmacro
; outputer
;-----------------------------------------------------------------------
; rgba32sse2output
;
; In:    xmm3 = 8 r values, xmm4 = 8 g values, xmm5 = 8 b values (16-bit words)
;        edi  = destination; must be 16-byte aligned (movntdq)
; Out:   32 bytes at [edi]: 8 pixels stored as b, g, r, 0 bytes
; Clobb: xmm0, xmm3, xmm4, xmm5, xmm7
;-----------------------------------------------------------------------
%macro rgba32sse2output 0
        pxor      xmm7, xmm7            ; all zero: pack filler and alpha bytes

        ; clamp every channel to 0..255 and narrow it to the low 8 bytes
        packuswb  xmm5, xmm7            ; b
        packuswb  xmm4, xmm7            ; g
        packuswb  xmm3, xmm7            ; r

        ; interleave into 16-bit pairs
        punpcklbw xmm5, xmm4            ; bgbgbgbgbgbgbgbg
        punpcklbw xmm3, xmm7            ; r0r0r0r0r0r0r0r0

        ; merge the pairs into 32-bit pixels
        movdqa    xmm0, xmm5            ; keep bg pairs for the upper 4 pixels
        punpcklwd xmm5, xmm3            ; pixels 0-3: bgr0bgr0bgr0bgr0
        punpckhwd xmm0, xmm3            ; pixels 4-7: bgr0bgr0bgr0bgr0

        ; non-temporal stores: write the pixels out bypassing the cache
        movntdq   [edi], xmm5
        movntdq   [edi+16], xmm0
%endmacro
; Stack argument locations.  Each routine sets up a frame with
; "push ebp / mov ebp, esp", so [ebp] holds the saved ebp, [ebp+4] the return
; address and the first argument lives at [ebp+8].

; void Convert_YUV422_RGBA32_SSE2(void *fromPtr, void *toPtr, int width)
%define fromPtr         ebp+8
%define toPtr           ebp+12
%define width           ebp+16

; void Convert_YUV420P_RGBA32_SSE2(void *fromYPtr, void *fromUPtr, void *fromVPtr, void *toPtr, int width)
%define fromYPtr        ebp+8
%define fromUPtr        ebp+12
%define fromVPtr        ebp+16
%define toPtr1          ebp+20
%define width1          ebp+24

SECTION .text align=16
;-----------------------------------------------------------------------
; void Convert_YUV422_RGBA32_SSE2(void *fromPtr, void *toPtr, int width)
;
; Converts packed YUV 4:2:2 (y at even bytes, u/v alternating at odd bytes)
; to 32-bit pixels stored as b, g, r, 0.  Works in groups of 8 pixels:
; 16 source bytes in, 32 destination bytes out per iteration.
;
; In:    fromPtr = source, must be 16-byte aligned (movdqa)
;        toPtr   = destination, must be 16-byte aligned (movntdq)
;        width   = pixel count; only width / 8 full groups are converted,
;                  widths <= 7 (including negative values) convert nothing
; Out:   nothing
; Regs:  esi = source cursor, edi = destination cursor, ecx = groups left
;        edi, esi and ecx are saved and restored; xmm0, xmm3-xmm7 are clobbered
;-----------------------------------------------------------------------
cglobal Convert_YUV422_RGBA32_SSE2
        push    ebp
        mov     ebp, esp
        push    edi
        push    esi
        push    ecx

        mov     esi, [fromPtr]
        mov     edi, [toPtr]
        mov     ecx, [width]

; loop width / 8 times
        sar     ecx, 3                  ; arithmetic shift keeps the sign, so a
        test    ecx, ecx                ; negative width is rejected instead of
        jle     .done                   ; turning into a huge loop count

.loop:                                  ; ecx = groups of 8 pixels left, > 0
        prefetchnta [esi+256]

; YUV422 packed inputer
        movdqa  xmm0, [esi]             ; 16 bytes: y u y v  y u y v ...
        pshufd  xmm3, xmm0, 0xE4        ; copy to xmm3 (identity shuffle)

; extract y
        pxor    xmm7, xmm7              ; all zero
        pcmpeqd xmm6, xmm6              ; all ones
        punpcklbw xmm6, xmm7            ; every word = 0x00ff (even-byte mask)
        pand    xmm0, xmm6              ; keep the y bytes -> 8 y words

; extract u and v
        psllw   xmm6, 8                 ; every word = 0xff00 (odd-byte mask)
        pand    xmm3, xmm6              ; keep the u/v bytes
        psrlw   xmm3, 8                 ; -> words u0 v0 u1 v1 u2 v2 u3 v3

        yuv2rgbsse2

        rgba32sse2output

; endloop
        add     edi, 32
        add     esi, 16
        sub     ecx, 1                  ; apparently sub is better than dec
        jnz     .loop

        sfence                          ; movntdq stores are weakly ordered:
                                        ; make them visible before returning
.done:
; Cleanup
        pop     ecx
        pop     esi
        pop     edi
        mov     esp, ebp
        pop     ebp
        ret
;-----------------------------------------------------------------------
; void Convert_YUV420P_RGBA32_SSE2(void *fromYPtr, void *fromUPtr,
;                                  void *fromVPtr, void *toPtr, int width)
;
; Converts planar y, u and v data to 32-bit pixels stored as b, g, r, 0.
; Works in groups of 8 pixels: 8 y bytes, 4 u bytes and 4 v bytes in,
; 32 destination bytes out per iteration.  Each u/v sample is applied to
; two horizontally adjacent pixels.
;
; In:    fromYPtr, fromUPtr, fromVPtr = source planes (no alignment needed)
;        toPtr = destination, must be 16-byte aligned (movntdq)
;        width = pixel count; only width / 8 full groups are converted,
;                widths <= 7 (including negative values) convert nothing
; Out:   nothing
; Regs:  esi = y cursor, eax = u cursor, ebx = v cursor,
;        edi = destination cursor, ecx = groups left
;        edi, esi, ecx, eax and ebx are saved and restored;
;        xmm0, xmm1, xmm3-xmm5 and xmm7 are clobbered
;-----------------------------------------------------------------------
cglobal Convert_YUV420P_RGBA32_SSE2
        push    ebp
        mov     ebp, esp
        push    edi
        push    esi
        push    ecx
        push    eax
        push    ebx

        mov     esi, [fromYPtr]
        mov     eax, [fromUPtr]
        mov     ebx, [fromVPtr]
        mov     edi, [toPtr1]
        mov     ecx, [width1]

; loop width / 8 times
        sar     ecx, 3                  ; arithmetic shift keeps the sign, so a
        test    ecx, ecx                ; negative width is rejected instead of
        jle     .done                   ; turning into a huge loop count

.loop:                                  ; ecx = groups of 8 pixels left, > 0
; YUV420 Planar inputer
        movq    xmm0, [esi]             ; 8 y bytes, upper half zeroed
        movd    xmm3, [eax]             ; 4 u bytes, rest zeroed
        movd    xmm1, [ebx]             ; 4 v bytes, rest zeroed

; extract y
        pxor    xmm7, xmm7              ; all zero
        punpcklbw xmm0, xmm7            ; widen y bytes -> 8 y words

; combine u and v
        punpcklbw xmm3, xmm1            ; bytes u v u v u v u v
        punpcklbw xmm3, xmm7            ; -> words u0 v0 u1 v1 u2 v2 u3 v3

        yuv2rgbsse2

        rgba32sse2output

; endloop
        add     edi, 32
        add     esi, 8
        add     eax, 4
        add     ebx, 4
        sub     ecx, 1                  ; apparently sub is better than dec
        jnz     .loop

        sfence                          ; movntdq stores are weakly ordered:
                                        ; make them visible before returning
.done:
; Cleanup
        pop     ebx
        pop     eax
        pop     ecx
        pop     esi
        pop     edi
        mov     esp, ebp
        pop     ebp
        ret
;-----------------------------------------------------------------------
; Test_SSE2(void *fromPtr, void *toPtr)
;
; Scratch routine exercising the u/v extraction and the red offset
; calculation on one 16-byte group of packed YUV 4:2:2 data.
;
; In:    fromPtr = source, must be 16-byte aligned (movdqa)
;        toPtr   = destination, must be 16-byte aligned (movntdq)
; Out:   16 bytes at [toPtr]: 8 signed 16-bit red offsets
;        ((v - 128) * 5743) >> 12, one value per u,v pair written twice.
;        The y values are extracted into xmm0 but not added, because the
;        add and the final clamp are disabled.
; Regs:  edi and esi are saved and restored; xmm0, xmm1, xmm3, xmm6 and
;        xmm7 are clobbered.  Only SSE2 instructions are used.
;-----------------------------------------------------------------------
cglobal Test_SSE2
        push    ebp
        mov     ebp, esp
        push    edi
        push    esi

        mov     esi, [fromPtr]
        mov     edi, [toPtr]

        movdqa  xmm0, [esi]             ; 16 bytes: y u y v  y u y v ...
        pshufd  xmm1, xmm0, 0xE4        ; copy to xmm1 (identity shuffle)
        movdqa  xmm3, xmm0              ; copy to xmm3

; extract y
        pxor    xmm7, xmm7              ; all zero
        pcmpeqd xmm6, xmm6              ; all ones
        punpcklbw xmm6, xmm7            ; every word = 0x00ff (even-byte mask)
        pand    xmm0, xmm6              ; keep the y bytes -> 8 y words

; extract u into the low word of each 32-bit lane
        psrld   xmm6, 8                 ; every dword = 0x0000ff00 (byte 1 = u)
        pand    xmm1, xmm6              ; keep u only
        psrld   xmm1, 8                 ; move u down to byte 0

; extract v into the high word of each 32-bit lane
        pslld   xmm6, 16                ; every dword = 0xff000000 (byte 3 = v)
        pand    xmm3, xmm6              ; keep v only
        psrld   xmm3, 8                 ; move v down to byte 2
        por     xmm3, xmm1              ; words u0 v0 u1 v1 u2 v2 u3 v3

        psubsw  xmm3, [Const128]        ; u = u - 128, v = v - 128

        pmaddwd xmm3, [RConst]          ; multiply and add
        psrad   xmm3, 12                ; scale back to the original range

        ; Duplicate the low word of every 32-bit lane.  This replaces
        ; "pshufb xmm3, [shuffconst]", which is an SSSE3 instruction and
        ; faults on a CPU that only has SSE2; the result is identical.
        pshuflw xmm3, xmm3, 0xa0
        pshufhw xmm3, xmm3, 0xa0
;       paddsw xmm3, xmm0               ; add to y

;       pxor xmm7,xmm7
;       packuswb xmm3,xmm7              ; clamp to 0,255 and pack R to 8 bit per pixel

        movntdq [edi], xmm3             ; store the 16 result bytes bypassing cache
        sfence                          ; movntdq stores are weakly ordered:
                                        ; make them visible before returning

; Cleanup
        pop     esi
        pop     edi
        mov     esp, ebp
        pop     ebp
        ret
357         
; Empty marker section: tells GNU toolchains that this object file does not
; need an executable stack.
SECTION .note.GNU-stack noalloc noexec nowrite progbits