vp8/common/x86/postproc_mmx.asm

   1 ;
   2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
   3 ;
   4 ;  Use of this source code is governed by a BSD-style license
   5 ;  that can be found in the LICENSE file in the root of the source
   6 ;  tree. An additional intellectual property rights grant can be found
   7 ;  in the file PATENTS.  All contributing project authors may
   8 ;  be found in the AUTHORS file in the root of the source tree.
   9 ;
  10
  11
  12 %include "vpx_ports/x86_abi_support.asm"
  13
  14 %define VP8_FILTER_WEIGHT 128
     ; Fixed-point parameters of the 5-tap blur: the taps in the Blur table
     ; (16, 16, 64, 16, 16) add up to VP8_FILTER_WEIGHT, and the filtered sum
     ; is scaled back with an arithmetic right shift by VP8_FILTER_SHIFT.
     ; VP8_FILTER_WEIGHT itself is not referenced in the visible part of this
     ; file; only VP8_FILTER_SHIFT is used (psraw).
  15 %define VP8_FILTER_SHIFT  7
  16
  17 ;void vp8_post_proc_down_and_across_mmx
  18 ;(
  19 ;    unsigned char *src_ptr,
  20 ;    unsigned char *dst_ptr,
  21 ;    int src_pixels_per_line,
  22 ;    int dst_pixels_per_line,
  23 ;    int rows,
  24 ;    int cols,
  25 ;    int flimit
  26 ;)
     ;
     ; Thresholded 5-tap blur, applied first vertically (src -> dst) and then
     ; horizontally (in place on the dst row), four pixels at a time.
     ;
     ; For every pixel the weighted sum of the pixel and its two neighbours on
     ; each side (taps from the Blur table, plus the rd rounding constant) is
     ; shifted right by VP8_FILTER_SHIFT. The blurred value is used only when
     ; none of the four |centre - neighbour| differences exceeds flimit;
     ; otherwise the unfiltered centre pixel is kept.
     ;
     ; Register roles inside the loops:
     ;   rsi = source row pointer        rdi = destination row pointer
     ;   rax = src_pixels_per_line (temporarily negated / reused as scratch)
     ;   rbx = address of Blur           rcx = rows still to process
     ;   rdx = column offset             mm0 = 0, mm2 = flimit in all 4 words
     ;
     ; Notes derived from the code below:
     ;   * Both loops test their counter after the body, so at least one group
     ;     of four columns and at least one row are always processed.
     ;   * The vertical pass reads rows -2..+2 around src_ptr; the horizontal
     ;     pass reads two bytes before the start of the dst row and loads
     ;     eight bytes at a time.
     ;   * The four bytes in front of each dst row are read and written back
     ;     unchanged (see the delayed store in the horizontal pass).
  27 global sym(vp8_post_proc_down_and_across_mmx)
  28 sym(vp8_post_proc_down_and_across_mmx):
  29     push        rbp
  30     mov         rbp, rsp
  31     SHADOW_ARGS_TO_STACK 7
  32     GET_GOT     rbx
  33     push        rsi
  34     push        rdi
  35     ; end prolog
  36
             ; Save rbx before reserving any rsp-relative scratch space, so that
             ; RD (defined as [rsp] in the 32-bit PIC case) keeps pointing at the
             ; rounding constant for the whole function body. push does not
             ; modify rbx, so GLOBAL() below still sees the value GET_GOT left.
             push        rbx

  37 %if ABI_IS_32BIT=1 && CONFIG_PIC=1
  38     ; move the global rd onto the stack, since we don't have enough registers
  39     ; to do PIC addressing
  40     movq        mm0, [GLOBAL(rd)]
  41     sub         rsp, 8
  42     movq        [rsp], mm0
     ; rsp must not move between here and the matching "add rsp, 8" below.
  43 %define RD [rsp]
  44 %else
  45 %define RD [GLOBAL(rd)]
  46 %endif
  47
  49         lea         rbx, [GLOBAL(Blur)]
  50         movd        mm2, dword ptr arg(6) ;flimit
  51         punpcklwd   mm2, mm2
  52         punpckldq   mm2, mm2              ; mm2 = flimit in all four words
  53
  54         mov         rsi,        arg(0) ;src_ptr
  55         mov         rdi,        arg(1) ;dst_ptr
  56
  57         movsxd      rcx, DWORD PTR arg(4) ;rows
  58         movsxd      rax, DWORD PTR arg(2) ;src_pixels_per_line (source pitch)
  59         pxor        mm0, mm0              ; mm0 = 00000000
  60
  61 nextrow:
  62
  63         xor         rdx,        rdx       ; clear out rdx for use as loop counter
  64 nextcol:
  65
  66         pxor        mm7, mm7              ; mm7 = 00000000
  67         movq        mm6, [rbx + 32 ]      ; mm6 = kernel 2 taps
  68         movq        mm3, [rsi]            ; mm3 = r0 p0..p7
  69         punpcklbw   mm3, mm0              ; mm3 = p0..p3
  70         movq        mm1, mm3              ; mm1 = p0..p3
  71         pmullw      mm3, mm6              ; mm3 *= kernel 2 modifiers
  72
  73         movq        mm6, [rbx + 48]       ; mm6 = kernel 3 taps
  74         movq        mm5, [rsi + rax]      ; mm5 = r1 p0..p7
  75         punpcklbw   mm5, mm0              ; mm5 = r1 p0..p3
  76         pmullw      mm6, mm5              ; mm6 = r1 p0..p3 * kernel 3 modifiers
  77         paddusw     mm3, mm6              ; mm3 += mm6
  78
  79         ; thresholding
  80         movq        mm7, mm1              ; mm7 = r0 p0..p3
  81         psubusw     mm7, mm5              ; mm7 = r0 p0..p3 - r1 p0..p3
  82         psubusw     mm5, mm1              ; mm5 = r1 p0..p3 - r0 p0..p3
  83         paddusw     mm7, mm5              ; mm7 = abs(r0 p0..p3 - r1 p0..p3)
  84         pcmpgtw     mm7, mm2              ; mm7 = mask of words where diff > flimit
  85
  86         movq        mm6, [rbx + 64 ]      ; mm6 = kernel 4 modifiers
  87         movq        mm5, [rsi + 2*rax]    ; mm5 = r2 p0..p7
  88         punpcklbw   mm5, mm0              ; mm5 = r2 p0..p3
  89         pmullw      mm6, mm5              ; mm6 = r2 p0..p3 * kernel 4 modifiers
  90         paddusw     mm3, mm6              ; mm3 += mm6
  91
  92         ; thresholding
  93         movq        mm6, mm1              ; mm6 = r0 p0..p3
  94         psubusw     mm6, mm5              ; mm6 = r0 p0..p3 - r2 p0..p3
  95         psubusw     mm5, mm1              ; mm5 = r2 p0..p3 - r0 p0..p3
  96         paddusw     mm6, mm5              ; mm6 = abs(r0 p0..p3 - r2 p0..p3)
  97         pcmpgtw     mm6, mm2
  98         por         mm7, mm6              ; accumulate thresholds
  99
 100
 101         neg         rax                   ; rax = -pitch, to address the rows above
 102         movq        mm6, [rbx ]           ; kernel 0 taps
 103         movq        mm5, [rsi+2*rax]      ; mm5 = r-2 p0..p7
 104         punpcklbw   mm5, mm0              ; mm5 = r-2 p0..p3
 105         pmullw      mm6, mm5              ; mm6 = r-2 p0..p3 * kernel 0 modifiers
 106         paddusw     mm3, mm6              ; mm3 += mm6
 107
 108         ; thresholding
 109         movq        mm6, mm1              ; mm6 = r0 p0..p3
 110         psubusw     mm6, mm5              ; mm6 = p0..p3 - r-2 p0..p3
 111         psubusw     mm5, mm1              ; mm5 = r-2 p0..p3 - p0..p3
 112         paddusw     mm6, mm5              ; mm6 = abs(r0 p0..p3 - r-2 p0..p3)
 113         pcmpgtw     mm6, mm2
 114         por         mm7, mm6              ; accumulate thresholds
 115
 116         movq        mm6, [rbx + 16]       ; kernel 1 taps
 117         movq        mm4, [rsi+rax]        ; mm4 = r-1 p0..p7
 118         punpcklbw   mm4, mm0              ; mm4 = r-1 p0..p3
 119         pmullw      mm6, mm4              ; mm6 = r-1 p0..p3 * kernel 1 modifiers
 120         paddusw     mm3, mm6              ; mm3 += mm6
 121
 122         ; thresholding
 123         movq        mm6, mm1              ; mm6 = r0 p0..p3
 124         psubusw     mm6, mm4              ; mm6 = p0..p3 - r-1 p0..p3
 125         psubusw     mm4, mm1              ; mm4 = r-1 p0..p3 - p0..p3
 126         paddusw     mm6, mm4              ; mm6 = abs(r0 p0..p3 - r-1 p0..p3)
 127         pcmpgtw     mm6, mm2
 128         por         mm7, mm6              ; accumulate thresholds
 129
 130
 131         paddusw     mm3, RD               ; mm3 += round value
 132         psraw       mm3, VP8_FILTER_SHIFT     ; mm3 /= 128
 133
 134         pand        mm1, mm7              ; mm1 select vals > thresh from source
 135         pandn       mm7, mm3              ; mm7 select vals < thresh from blurred result
 136         paddusw     mm1, mm7              ; combination
 137
 138         packuswb    mm1, mm0              ; pack to bytes
 139
 140         movd        [rdi], mm1            ; write four filtered pixels to dst
 141         neg         rax                   ; pitch is positive
 142
 143
 144         add         rsi, 4
 145         add         rdi, 4
 146         add         rdx, 4
 147
 148         cmp         edx, dword ptr arg(5) ;cols
 149         jl          nextcol
 150         ; done with all the cols, start the across filtering in place
 151         sub         rsi, rdx              ; rewind both pointers to the row start
 152         sub         rdi, rdx
 153
 154
             ; rax is reused below as the delayed-store scratch register. The
             ; pitch is reloaded from arg(2) after the loop instead of being
             ; pushed, so rsp (and therefore RD) stays fixed.
 156         xor         rdx,    rdx
 157         mov         rax,    [rdi-4]       ; eax = the four bytes in front of the row
 158
 159 acrossnextcol:
 160         pxor        mm7, mm7              ; mm7 = 00000000
 161         movq        mm6, [rbx + 32 ]      ; mm6 = kernel 2 taps
 162         movq        mm4, [rdi+rdx]        ; mm4 = p0..p7
 163         movq        mm3, mm4              ; mm3 = p0..p7
 164         punpcklbw   mm3, mm0              ; mm3 = p0..p3
 165         movq        mm1, mm3              ; mm1 = p0..p3
 166         pmullw      mm3, mm6              ; mm3 *= kernel 2 modifiers
 167
 168         movq        mm6, [rbx + 48]       ; mm6 = kernel 3 taps
 169         psrlq       mm4, 8                ; mm4 = p1..p7
 170         movq        mm5, mm4              ; mm5 = p1..p7
 171         punpcklbw   mm5, mm0              ; mm5 = p1..p4
 172         pmullw      mm6, mm5              ; mm6 = p1..p4 * kernel 3 modifiers
 173         paddusw     mm3, mm6              ; mm3 += mm6
 174
 175         ; thresholding
 176         movq        mm7, mm1              ; mm7 = p0..p3
 177         psubusw     mm7, mm5              ; mm7 = p0..p3 - p1..p4
 178         psubusw     mm5, mm1              ; mm5 = p1..p4 - p0..p3
 179         paddusw     mm7, mm5              ; mm7 = abs(p0..p3 - p1..p4)
 180         pcmpgtw     mm7, mm2
 181
 182         movq        mm6, [rbx + 64 ]      ; mm6 = kernel 4 taps
 183         psrlq       mm4, 8                ; mm4 = p2..p7
 184         movq        mm5, mm4              ; mm5 = p2..p7
 185         punpcklbw   mm5, mm0              ; mm5 = p2..p5
 186         pmullw      mm6, mm5              ; mm6 = p2..p5 * kernel 4 modifiers
 187         paddusw     mm3, mm6              ; mm3 += mm6
 188
 189         ; thresholding
 190         movq        mm6, mm1              ; mm6 = p0..p3
 191         psubusw     mm6, mm5              ; mm6 = p0..p3 - p2..p5
 192         psubusw     mm5, mm1              ; mm5 = p2..p5 - p0..p3
 193         paddusw     mm6, mm5              ; mm6 = abs(p0..p3 - p2..p5)
 194         pcmpgtw     mm6, mm2
 195         por         mm7, mm6              ; accumulate thresholds
 196
 197
 198         movq        mm6, [rbx ]           ; mm6 = kernel 0 taps
 199         movq        mm4, [rdi+rdx-2]      ; mm4 = p-2..p5
 200         movq        mm5, mm4              ; mm5 = p-2..p5
 201         punpcklbw   mm5, mm0              ; mm5 = p-2..p1
 202         pmullw      mm6, mm5              ; mm6 = p-2..p1 * kernel 0 modifiers
 203         paddusw     mm3, mm6              ; mm3 += mm6
 204
 205         ; thresholding
 206         movq        mm6, mm1              ; mm6 = p0..p3
 207         psubusw     mm6, mm5              ; mm6 = p0..p3 - p-2..p1
 208         psubusw     mm5, mm1              ; mm5 = p-2..p1 - p0..p3
 209         paddusw     mm6, mm5              ; mm6 = abs(p0..p3 - p-2..p1)
 210         pcmpgtw     mm6, mm2
 211         por         mm7, mm6              ; accumulate thresholds
 212
 213         movq        mm6, [rbx + 16]       ; mm6 = kernel 1 taps
 214         psrlq       mm4, 8                ; mm4 = p-1..p5
 215         punpcklbw   mm4, mm0              ; mm4 = p-1..p2
 216         pmullw      mm6, mm4              ; mm6 = p-1..p2 * kernel 1 modifiers
 217         paddusw     mm3, mm6              ; mm3 += mm6
 218
 219         ; thresholding
 220         movq        mm6, mm1              ; mm6 = p0..p3
 221         psubusw     mm6, mm4              ; mm6 = p0..p3 - p-1..p2
 222         psubusw     mm4, mm1              ; mm4 = p-1..p2 - p0..p3
 223         paddusw     mm6, mm4              ; mm6 = abs(p0..p3 - p-1..p2)
 224         pcmpgtw     mm6, mm2
 225         por         mm7, mm6              ; accumulate thresholds
 226
 227         paddusw     mm3, RD               ; mm3 += round value
 228         psraw       mm3, VP8_FILTER_SHIFT     ; mm3 /= 128
 229
 230         pand        mm1, mm7              ; mm1 select vals > thresh from source
 231         pandn       mm7, mm3              ; mm7 select vals < thresh from blurred result
 232         paddusw     mm1, mm7              ; combination
 233
 234         packuswb    mm1, mm0              ; pack to bytes
             ; The store is delayed by one group so the next group's p-2/p-1
             ; taps still see unfiltered pixels.
 235         mov         DWORD PTR [rdi+rdx-4],  eax   ; store previous four bytes
 236         movd        eax,    mm1           ; keep this group's result for the next pass
 237
 238         add         rdx, 4
 239         cmp         edx, dword ptr arg(5) ;cols
 240         jl          acrossnextcol;
 241
 242         mov         DWORD PTR [rdi+rdx-4],  eax   ; flush the last group
             movsxd      rax, DWORD PTR arg(2) ; rax = src_pixels_per_line again
 244
 245         ; done with this row
 246         add         rsi,rax               ; next line
 247         movsxd      rax, dword ptr arg(3) ;dst_pixels_per_line (destination pitch)
 248         add         rdi,rax               ; next destination
 249         movsxd      rax, dword ptr arg(2) ;src_pixels_per_line (source pitch)
 250
 251         dec         rcx                   ; decrement count
 252         jnz         nextrow               ; next row
     %if ABI_IS_32BIT=1 && CONFIG_PIC=1
             add         rsp, 8                ; release the stack copy of rd
     %endif
 253         pop         rbx
 254
 255     ; begin epilog
 256     pop rdi
 257     pop rsi
 258     RESTORE_GOT
 259     UNSHADOW_ARGS
 260     pop         rbp
 261     ret
 262 %undef RD
 263
 264
 265 ;void vp8_mbpost_proc_down_mmx(unsigned char *dst,
 266 ;                             int pitch, int rows, int cols,int flimit)
     ;
     ; Vertical variance-adaptive smoothing, done in place on dst in strips of
     ; four columns.
     ;
     ; For every strip a running sum (mm5, four words) and running sum of
     ; squares (mm6/mm7, four dwords) are kept over a 15-row window. A pixel
     ; is replaced by (pixel + sum + vp8_rv[row & 127 ...]) >> 4 when
     ; 15*sumsq - sum*sum < flimit, and left unchanged otherwise.
     ;
     ; Results go through a 16-entry ring buffer on the stack and are written
     ; back eight rows late, so the window always reads unfiltered pixels.
     ;
     ; Notes derived from the code below:
     ;   * Rows from dst - 8*pitch onwards are read; the loop runs rows + 8
     ;     times per strip.
     ;   * The rows, cols and dst argument slots are modified in place.
     ;   * cols is consumed four at a time and tested after the body, so at
     ;     least one strip is processed.
     ;   * ALIGN_STACK and the matching "pop rsp" come from the included ABI
     ;     header, which is not visible here.
 267 extern sym(vp8_rv)
 268 global sym(vp8_mbpost_proc_down_mmx)
 269 sym(vp8_mbpost_proc_down_mmx):
 270     push        rbp
 271     mov         rbp, rsp
 272     SHADOW_ARGS_TO_STACK 5
 273     GET_GOT     rbx
 274     push        rsi
 275     push        rdi
 276     ; end prolog
 277
 278     ALIGN_STACK 16, rax
 279     sub         rsp, 136
 280
 281     ; ring buffer d[16] of 4-byte rows at [rsp] (indexed as rsp + n*4)
 282     ; create flimit2 at [rsp+128]
 283     mov         eax, dword ptr arg(4) ;flimit
 284     mov         [rsp+128], eax
 285     mov         [rsp+128+4], eax
 286 %define flimit2 [rsp+128]
 287
 288 %if ABI_IS_32BIT=0
 289     lea         r8,       [GLOBAL(sym(vp8_rv))]
 290 %endif
 291
 292     ;rows +=8;
 293     add         dword ptr arg(2), 8
 294
 295     ;for(c=0; c<cols; c+=4)
 296 loop_col:
 297             mov         rsi,        arg(0)  ;s
 298             pxor        mm0,        mm0     ; mm0 = 0, used for unpacking
 299
 300             movsxd      rax,        dword ptr arg(1) ;pitch       ;
 301             neg         rax                                     ; rax = -pitch
 302
 303             lea         rsi,        [rsi + rax*8]               ; rsi = s - 8*pitch
 304             neg         rax                                     ; rax = +pitch again
 305
 306
 307             pxor        mm5,        mm5     ; mm5 = running sum (4 words)
 308             pxor        mm6,        mm6     ; mm6 = running sum of squares, pixels 0-1
 309
 310             pxor        mm7,        mm7     ; mm7 = running sum of squares, pixels 2-3
 311             mov         rdi,        rsi
 312
 313             mov         rcx,        15          ; prime the window with 15 rows
 314
 315 loop_initvar:
 316             movd        mm1,        DWORD PTR [rdi];
 317             punpcklbw   mm1,        mm0     ;
 318
 319             paddw       mm5,        mm1     ; sum += row
 320             pmullw      mm1,        mm1     ; row * row
 321
 322             movq        mm2,        mm1     ;
 323             punpcklwd   mm1,        mm0     ;
 324
 325             punpckhwd   mm2,        mm0     ;
 326             paddd       mm6,        mm1     ; sumsq += row * row
 327
 328             paddd       mm7,        mm2     ;
 329             lea         rdi,        [rdi+rax]   ; next row
 330
 331             dec         rcx
 332             jne         loop_initvar
 333             ; here rsi = s - 8*pitch and rdi = s + 7*pitch
 334             xor         rdx,        rdx     ; rdx = row counter
 335 loop_row:
 336             movd        mm1,        DWORD PTR [rsi]     ; [s-pitch*8]  (row leaving the window)
 337             movd        mm2,        DWORD PTR [rdi]     ; [s+pitch*7]  (row entering the window)
 338
 339             punpcklbw   mm1,        mm0
 340             punpcklbw   mm2,        mm0
 341
 342             paddw       mm5,        mm2
 343             psubw       mm5,        mm1     ; sum += entering - leaving
 344
 345             pmullw      mm2,        mm2
 346             movq        mm4,        mm2
 347
 348             punpcklwd   mm2,        mm0
 349             punpckhwd   mm4,        mm0
 350
 351             paddd       mm6,        mm2     ; sumsq += entering^2
 352             paddd       mm7,        mm4
 353
 354             pmullw      mm1,        mm1
 355             movq        mm2,        mm1
 356
 357             punpcklwd   mm1,        mm0
 358             psubd       mm6,        mm1     ; sumsq -= leaving^2
 359
 360             punpckhwd   mm2,        mm0
 361             psubd       mm7,        mm2
 362
 363
 364             movq        mm3,        mm6
 365             pslld       mm3,        4
 366
 367             psubd       mm3,        mm6     ; mm3 = 15 * sumsq (pixels 0-1)
 368             movq        mm1,        mm5
 369
 370             movq        mm4,        mm5
 371             pmullw      mm1,        mm1     ; low 16 bits of sum*sum
 372
 373             pmulhw      mm4,        mm4     ; high 16 bits of sum*sum
 374             movq        mm2,        mm1
 375
 376             punpcklwd   mm1,        mm4     ; mm1 = sum*sum as dwords, pixels 0-1
 377             punpckhwd   mm2,        mm4     ; mm2 = sum*sum as dwords, pixels 2-3
 378
 379             movq        mm4,        mm7
 380             pslld       mm4,        4
 381
 382             psubd       mm4,        mm7     ; mm4 = 15 * sumsq (pixels 2-3)
 383
 384             psubd       mm3,        mm1
 385             psubd       mm4,        mm2
 386
 387             psubd       mm3,        flimit2
 388             psubd       mm4,        flimit2
 389
 390             psrad       mm3,        31      ; all ones where 15*sumsq - sum*sum < flimit
 391             psrad       mm4,        31
 392
 393             packssdw    mm3,        mm4
 394             packsswb    mm3,        mm0     ; mm3 = per-pixel byte mask (low 4 bytes)
 395
 396             movd        mm1,        DWORD PTR [rsi+rax*8]   ; centre row of the window
 397
 398             movq        mm2,        mm1     ; mm2 = unfiltered pixels
 399             punpcklbw   mm1,        mm0
 400
 401             paddw       mm1,        mm5     ; pixel + sum
 402             mov         rcx,        rdx
 403
 404             and         rcx,        127
 405 %if ABI_IS_32BIT=1 && CONFIG_PIC=1
 406             push        rax
 407             lea         rax,        [GLOBAL(sym(vp8_rv))]
 408             movq        mm4,        [rax + rcx*2] ;vp8_rv[rcx*2]
 409             pop         rax
 410 %elif ABI_IS_32BIT=0
 411             movq        mm4,        [r8 + rcx*2] ;vp8_rv[rcx*2]
 412 %else
 413             movq        mm4,        [sym(vp8_rv) + rcx*2]
 414 %endif
 415             paddw       mm1,        mm4     ; + rounding values from vp8_rv
 416             ;paddw     xmm1,       eight8s
 417             psraw       mm1,        4
 418
 419             packuswb    mm1,        mm0
 420             pand        mm1,        mm3     ; filtered value where the mask is set
 421
 422             pandn       mm3,        mm2     ; original value elsewhere
 423             por         mm1,        mm3
 424
 425             and         rcx,        15
 426             movd        DWORD PTR   [rsp+rcx*4], mm1 ;d[rcx*4]
 427
                 ; During the first eight iterations rsi still points above dst
                 ; and d[(row-8)&15] has not been written for this strip, so
                 ; there is nothing valid to store yet.
                 cmp         edx,        8
                 jl          mbpost_skip_store

 428             mov         rcx,        rdx
 429             sub         rcx,        8
 430
 431             and         rcx,        15
 432             movd        mm1,        DWORD PTR [rsp+rcx*4] ;d[rcx*4]
 433
 434             movd        [rsi],      mm1     ; write back the row finished 8 iterations ago
     mbpost_skip_store:
 435             lea         rsi,        [rsi+rax]
 436
 437             lea         rdi,        [rdi+rax]
 438             add         rdx,        1
 439
 440             cmp         edx,        dword arg(2) ;rows
 441             jl          loop_row
 442
 443
             ; s += 4, done at pointer width so a carry out of the low 32 bits
             ; is not lost on 64-bit builds. rax is free here: loop_col reloads
             ; it and the exit path does not use it.
             mov         rax,        arg(0)
             add         rax,        4
             mov         arg(0),     rax
 445         sub         dword arg(3), 4 ; cols -= 4
 446         cmp         dword arg(3), 0
 447         jg          loop_col
 448
 449     add         rsp, 136
 450     pop         rsp
 451
 452     ; begin epilog
 453     pop rdi
 454     pop rsi
 455     RESTORE_GOT
 456     UNSHADOW_ARGS
 457     pop         rbp
 458     ret
 459 %undef flimit2
 460
 461
 462 ;void vp8_plane_add_noise_mmx (unsigned char *Start, unsigned char *noise,
 463 ;                            unsigned char blackclamp[16],
 464 ;                            unsigned char whiteclamp[16],
 465 ;                            unsigned char bothclamp[16],
 466 ;                            unsigned int Width, unsigned int Height, int Pitch)
     ;
     ; Adds a noise pattern to a plane in place, one row at a time.
     ;
     ; For each row a start offset of rand() & 0xff into the noise buffer is
     ; chosen. The row is then processed eight bytes at a time: each pixel is
     ; first squeezed into a safe range with saturating arithmetic
     ; (- blackclamp, + bothclamp, - whiteclamp) and the noise byte is added
     ; with a plain wrapping add.
     ;
     ; Notes derived from the code below:
     ;   * The whiteclamp and bothclamp arguments are not read directly; the
     ;     vectors are fetched at blackclamp + 16 and blackclamp + 32.
     ;   * Both loops test their counter after the body, so at least one
     ;     8-byte group of one row is always processed.
     ;   * The Start and Height argument slots are updated in place.
 467 extern sym(rand)
 468 global sym(vp8_plane_add_noise_mmx)
 469 sym(vp8_plane_add_noise_mmx):
 470     push        rbp
 471     mov         rbp, rsp
 472     SHADOW_ARGS_TO_STACK 8
 473     GET_GOT     rbx
 474     push        rsi
 475     push        rdi
 476     ; end prolog
 477
 478 addnoise_next_row:
 479     call sym(rand) WRT_PLT
         and     rax, 0xff               ; random start offset, 0..255

         ; Everything the inner loop needs is (re)loaded after the call,
         ; because rand() may clobber the scratch registers.
         mov     rdi, arg(1)             ; noise
         add     rdi, rax                ; rdi = noise + offset
         mov     rsi, arg(0)             ; rsi = current row (Pos)
         mov     rdx, arg(2)             ; rdx = blackclamp; white at +16, both at +32
         movsxd  rcx, dword arg(5)       ; rcx = Width
         xor         rax,rax             ; rax = byte offset within the row

 495 addnoise_next_qword:
             movq        mm2,[rdi+rax]         ; eight noise bytes for this position
             movq        mm1,[rsi+rax]         ; eight source pixels

             ; clamp both sides so that adding the noise cannot leave the range
             psubusb     mm1, [rdx]    ;blackclamp
             paddusb     mm1, [rdx+32] ;bothclamp
             psubusb     mm1, [rdx+16] ;whiteclamp

             paddb       mm1,mm2               ; add the noise
             movq        [rsi+rax],mm1         ; write the pixels back

             add         rax,8                 ; advance to the next eight pixels

             cmp         rax, rcx
             jl          addnoise_next_qword

 511     movsxd  rax, dword arg(7) ; Pitch
 512     add     arg(0), rax ; Start += Pitch
 513     sub     dword arg(6), 1   ; Height -= 1
 514     jg      addnoise_next_row
 515
 516     ; begin epilog
 517     pop rdi
 518     pop rsi
 519     RESTORE_GOT
 520     UNSHADOW_ARGS
 521     pop         rbp
 522     ret
 523
 524
 525 SECTION_RODATA
 526 align 16
     ; Taps of the 5-tap blur used by vp8_post_proc_down_and_across_mmx.
     ; Each kernel occupies one 16-byte slot (Blur + 16*k); the code loads the
     ; first four words of a slot with movq. The taps sum to 128.
 527 Blur:
         times  8 dw 16                  ; kernel 0 (Blur +  0): pixel -2
         times  8 dw 16                  ; kernel 1 (Blur + 16): pixel -1
 529     times  8 dw 64                  ; kernel 2 (Blur + 32): centre pixel
         times  8 dw 16                  ; kernel 3 (Blur + 48): pixel +1
         times  8 dw 16                  ; kernel 4 (Blur + 64): pixel +2
 531     times  8 dw  0                  ; trailing zero slot, not read by the code shown in this file
 532
     ; Rounding constant added before the shift by VP8_FILTER_SHIFT (half of 128).
 533 rd:
 534     times 4 dw 0x40