src/add-ons/media/plugins/ffmpeg/yuvrgb_sse.nasm

   1 ;
   2 ; Copyright (C) 2009-2010 David McPaul
   3 ;
   4 ; All rights reserved. Distributed under the terms of the MIT License.
   5 ;
   6
   7 ; A rather unoptimised set of sse yuv to rgb converters
   8 ; does 4 pixels per loop
   9
; inputer:
; reads the 8 bit yuv data for 4 pixels and puts
; the y values converted to 16 bit in mm0
; the u values converted to 16 bit and duplicated into mm1
; the v values converted to 16 bit and duplicated into mm2
  15
; conversion:
; does the yuv to rgb conversion using 16 bit fixed point and the
; results are placed into the following registers as signed 16 bit values
; (the outputer clamps them to 8 bit)
; r values in mm3
; g values in mm4
; b values in mm5
  22
  23 ; outputer:
  24 ; writes out the rgba pixels as 8 bit values with 0 for alpha
  25
  26 ; mm6 used for scratch
  27 ; mm7 used for scratch
  28
  29 %macro  cglobal 1
  30         global  _%1
  31         %define %1 _%1
  32         align 16
  33 %1:
  34 %endmacro
  35
  36 ; conversion code
  37 %macro yuv2rgbsse 0
  38 ; u = u - 128
  39 ; v = v - 128
  40 ; r = y + v + v >> 2 + v >> 3 + v >> 5
  41 ; g = y - (u >> 2 + u >> 4 + u >> 5) - (v >> 1 + v >> 3 + v >> 4 + v >> 5)
  42 ; b = y + u + u >> 1 + u >> 2 + u >> 6
  43 ; subtract 16 from y
  44         movq mm7, [Const16]                             ; loads a constant using data cache (slower on first fetch but then cached)
  45 ;       psubsw mm0,mm7                                  ; y = y - 16
  46 ; subtract 128 from u and v
  47         movq mm7, [Const128]                    ; loads a constant using data cache (slower on first fetch but then cached)
  48         psubsw mm1,mm7                                  ; u = u - 128
  49         psubsw mm2,mm7                                  ; v = v - 128
  50 ; load r,g,b with y
  51         movq mm3,mm0                                    ; r = y
  52         pshufw mm5,mm0, 0xE4                    ; b = y
  53
  54 ; r = r + v + v >> 2 + v >> 3 + v >> 5
  55         paddsw mm3, mm2                                 ; add v to r
  56         movq mm7, mm1                                   ; move u to scratch
  57         pshufw mm6, mm2, 0xE4                   ; move v to scratch
  58
  59         psraw  mm6,2                                    ; divide v by 4
  60         paddsw mm3, mm6                                 ; and add to r
  61         psraw  mm6,1                                    ; divide v by 2
  62         paddsw mm3, mm6                                 ; and add to r
  63         psraw  mm6,2                                    ; divide v by 4
  64         paddsw mm3, mm6                                 ; and add to r
  65
  66 ; b = y + u + u >> 1 + u >> 2 + u >> 6
  67         paddsw mm5, mm1                                 ; add u to b
  68         psraw  mm7,1                                    ; divide u by 2
  69         paddsw mm5, mm7                                 ; and add to b
  70         psraw  mm7,1                                    ; divide u by 2
  71         paddsw mm5, mm7                                 ; and add to b
  72         psraw  mm7,4                                    ; divide u by 32
  73         paddsw mm5, mm7                                 ; and add to b
  74
  75 ; g = y - u >> 2 - u >> 4 - u >> 5 - v >> 1 - v >> 3 - v >> 4 - v >> 5
  76         movq mm7,mm2                                    ; move v to scratch
  77         pshufw mm6,mm1, 0xE4                    ; move u to scratch
  78         movq mm4,mm0                                    ; g = y
  79
  80         psraw  mm6,2                                    ; divide u by 4
  81         psubsw mm4,mm6                                  ; subtract from g
  82         psraw  mm6,2                                    ; divide u by 4
  83         psubsw mm4,mm6                                  ; subtract from g
  84         psraw  mm6,1                                    ; divide u by 2
  85         psubsw mm4,mm6                                  ; subtract from g
  86
  87         psraw  mm7,1                                    ; divide v by 2
  88         psubsw mm4,mm7                                  ; subtract from g
  89         psraw  mm7,2                                    ; divide v by 4
  90         psubsw mm4,mm7                                  ; subtract from g
  91         psraw  mm7,1                                    ; divide v by 2
  92         psubsw mm4,mm7                                  ; subtract from g
  93         psraw  mm7,1                                    ; divide v by 2
  94         psubsw mm4,mm7                                  ; subtract from g
  95 %endmacro
  96
  97 ; outputer
  98 %macro rgba32sseoutput 0
  99 ; clamp values
 100         pxor mm7,mm7
 101         packuswb mm3,mm7                                ; clamp to 0,255 and pack R to 8 bit per pixel
 102         packuswb mm4,mm7                                ; clamp to 0,255 and pack G to 8 bit per pixel
 103         packuswb mm5,mm7                                ; clamp to 0,255 and pack B to 8 bit per pixel
 104 ; convert to bgra32 packed
 105         punpcklbw mm5,mm4                               ; bgbgbgbgbgbgbgbg
 106         movq mm0, mm5                                   ; save bg values
 107         punpcklbw mm3,mm7                               ; r0r0r0r0
 108         punpcklwd mm5,mm3                               ; lower half bgr0bgr0
 109         punpckhwd mm0,mm3                               ; upper half bgr0bgr0
 110 ; write to output ptr
 111         movq [edi], mm5                                 ; output first 2 pixels
 112         movq [edi+8], mm0                               ; output second 2 pixels
 113 %endmacro
 114
 115 SECTION .data align=16
 116
 117 Const16 dw      16
 118         dw      16
 119         dw      16
 120         dw      16
 121         dw      16
 122         dw      16
 123         dw      16
 124         dw      16
 125
 126 Const128        dw      128
 127         dw      128
 128         dw      128
 129         dw      128
 130         dw      128
 131         dw      128
 132         dw      128
 133         dw      128
 134
 135 ; Packed Convert
 136 ; void Convert_YUV422_RGBA32_SSE(void *fromPtr, void *toPtr, int width)
 137 %define width   ebp+16
 138 %define toPtr   ebp+12
 139 %define fromPtr ebp+8
 140
 141 ; Planar Convert
 142 ; void Convert_YUV420P_RGBA32_SSE(void *fromYPtr, void *fromUPtr, void *fromVPtr, void *toPtr, int width)
 143 %define width1   ebp+24
 144 %define toPtr1   ebp+20
 145 %define fromVPtr ebp+16
 146 %define fromUPtr ebp+12
 147 %define fromYPtr ebp+8
 148
 149 SECTION .text align=16
 150
 151 ; YUY2 FOURCC
 152 cglobal Convert_YUV422_RGBA32_SSE
 153 ; reserve variables
 154         push ebp
 155         mov ebp, esp
 156         push edi
 157         push esi
 158         push ecx
 159
 160         mov esi, [fromPtr]
 161         mov ecx, [width]
 162         mov edi, [toPtr]
 163 ; loop width / 4 times
 164         shr ecx,2
 165         test ecx,ecx
 166         jng ENDLOOP2
 167 REPEATLOOP2:                                            ; loop over width / 4
 168
 169 ; YUV422 packed inputer
 170         movq mm0, [esi]                                 ; should have yuyv yuyv
 171         pshufw mm1, mm0, 0xE4                   ; copy to mm1
 172         movq mm2, mm0                                   ; copy to mm2
 173 ; extract y
 174         pxor mm7,mm7                                    ; 0000000000000000
 175         pcmpeqb mm6,mm6                                 ; ffffffffffffffff
 176         punpckhbw mm6,mm7                               ; interleave mm7 into mm6 ff00ff00ff00ff00
 177         pand mm0, mm6                                   ; clear all but y values leaving y0y0 etc
 178 ; extract u and duplicate so each u in yuyv becomes 0u0u
 179         psrld mm6,8                                             ; 00ff0000 00ff0000
 180         pand mm1, mm6                                   ; clear all yv values leaving 0u00 etc
 181         psrld mm1,8                                             ; rotate u to get u000
 182         pshufw mm1,mm1, 0xA0                    ; copy u values to get u0u0             (SSE not MMX)
 183 ; extract v
 184         pslld mm6,16                                    ; 000000ff000000ff
 185         pand mm2, mm6                                   ; clear all yu values leaving 000v etc
 186         psrld mm2,8                                             ; rotate v to get 00v0
 187         pshufw mm2,mm2, 0xF5                    ; copy v values to get v0v0    (SSE not MMX)
 188
 189 yuv2rgbsse
 190
 191 rgba32sseoutput
 192
 193         ; endloop
 194         add edi,16
 195         add esi,8
 196         sub ecx, 1                                              ; apparently sub is better than dec
 197         jnz REPEATLOOP2
 198 ENDLOOP2:
 199 ; Cleanup
 200         emms                                                    ; reset mmx regs back to float
 201         pop ecx
 202         pop esi
 203         pop edi
 204         mov esp, ebp
 205         pop ebp
 206         ret
 207
 208 cglobal Convert_YUV420P_RGBA32_SSE
 209 ; reserve variables
 210         push ebp
 211         mov ebp, esp
 212         push edi
 213         push esi
 214         push ecx
 215         push eax
 216         push ebx
 217
 218         mov esi, [fromYPtr]
 219         mov eax, [fromUPtr]
 220         mov ebx, [fromVPtr]
 221         mov edi, [toPtr1]
 222         mov ecx, [width1]
 223 ; loop width / 4 times
 224         shr ecx,2
 225         test ecx,ecx
 226         jng ENDLOOP3
 227 REPEATLOOP3:                                            ; loop over width / 4
 228 ; YUV420 Planar inputer
 229         movq mm0, [esi]                                 ; fetch 4 y values (8 bit) yyyy0000
 230         movd mm1, [eax]                                 ; fetch 2 u values (8 bit) uu000000
 231         movd mm2, [ebx]                                 ; fetch 2 v values (8 bit) vv000000
 232
 233 ; extract y
 234         pxor mm7,mm7                                    ; 0000000000000000
 235         punpcklbw mm0,mm7                               ; interleave xmm7 into xmm0 y0y0y0y
 236 ; extract u and duplicate so each becomes 0u0u
 237         punpcklbw mm1,mm7                               ; interleave xmm7 into xmm1 u0u00000
 238         punpcklwd mm1,mm7                               ; interleave again u000u000
 239         pshufw mm1,mm1, 0xA0                    ; copy u values to get u0u0
 240 ; extract v
 241         punpcklbw mm2,mm7                               ; interleave xmm7 into xmm1 v0v00000
 242         punpcklwd mm2,mm7                               ; interleave again v000v000
 243         pshufw mm2,mm2, 0xA0                    ; copy v values to get v0v0
 244
 245 yuv2rgbsse
 246
 247 rgba32sseoutput
 248
 249 ; endloop
 250         add edi,16
 251         add esi,4
 252         add eax,2
 253         add ebx,2
 254         sub ecx, 1                              ; apparently sub is better than dec
 255         jnz REPEATLOOP3
 256 ENDLOOP3:
 257 ; Cleanup
 258         emms
 259         pop ebx
 260         pop eax
 261         pop ecx
 262         pop esi
 263         pop edi
 264         mov esp, ebp
 265         pop ebp
 266         ret
 267
 268 SECTION .note.GNU-stack noalloc noexec nowrite progbits