vp8/common/x86/postproc_sse2.asm

   1 ;
   2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
   3 ;
   4 ;  Use of this source code is governed by a BSD-style license
   5 ;  that can be found in the LICENSE file in the root of the source
   6 ;  tree. An additional intellectual property rights grant can be found
   7 ;  in the file PATENTS.  All contributing project authors may
   8 ;  be found in the AUTHORS file in the root of the source tree.
   9 ;
  10
  11
  12 %include "vpx_ports/x86_abi_support.asm"
  13
  14 ;void vp8_post_proc_down_and_across_xmm
  15 ;(
  16 ;    unsigned char *src_ptr,          first source row; rows -2..+2 around each row are read
  17 ;    unsigned char *dst_ptr,          first destination row; filtered down from src, then across in place
  18 ;    int src_pixels_per_line,         source pitch in bytes (sign-extended before use)
  19 ;    int dst_pixels_per_line,         destination pitch in bytes (sign-extended before use)
  20 ;    int rows,                        number of rows to process
  21 ;    int cols,                        columns per row; processed in groups of 8 while column < cols
  22 ;    int flimit                       threshold; only its low 16 bits are used
  23 ;)
  24 global sym(vp8_post_proc_down_and_across_xmm)
  25 sym(vp8_post_proc_down_and_across_xmm):
  26     push        rbp
  27     mov         rbp, rsp
  28     SHADOW_ARGS_TO_STACK 7
  29     SAVE_XMM
  30     GET_GOT     rbx
  31     push        rsi
  32     push        rdi
  33     ; end prolog
  34 ; Per row: 5-tap vertical filter src -> dst, then 5-tap horizontal filter of dst in place.
  35 %if ABI_IS_32BIT=1 && CONFIG_PIC=1
  36     ALIGN_STACK 16, rax
  37     ; move the global rd onto the stack, since we don't have enough registers
  38     ; to do PIC addressing
  39     movdqa      xmm0, [GLOBAL(rd42)]
  40     sub         rsp, 16
  41     movdqa      [rsp], xmm0
  42 %define RD42 [rsp]
  43 %else
  44 %define RD42 [GLOBAL(rd42)]
  45 %endif
  46 ; Filter: (a + b + 4*c + d + e + 4) >> 3; a pixel keeps its source value if any abs(c - neighbour) > flimit.
  47 ; Registers: rsi = src, rdi = dst, rax = source pitch, rcx = rows left, rdx = column, xmm0 = 0, xmm2 = flimit x8.
  48         movd        xmm2,       dword ptr arg(6) ;flimit
  49         punpcklwd   xmm2,       xmm2             ; duplicate the low word ...
  50         punpckldq   xmm2,       xmm2             ; ... into the low four words ...
  51         punpcklqdq  xmm2,       xmm2             ; ... and into all eight words
  52
  53         mov         rsi,        arg(0) ;src_ptr
  54         mov         rdi,        arg(1) ;dst_ptr
  55
  56         movsxd      rcx,        DWORD PTR arg(4) ;rows
  57         movsxd      rax,        DWORD PTR arg(2) ;src_pixels_per_line (source pitch)
  58         pxor        xmm0,       xmm0              ; xmm0 = 0, used to widen bytes to words
  59
  60 nextrow:
  61
  62         xor         rdx,        rdx       ; clear out rdx for use as loop counter
  63 nextcol:
  64         movq        xmm3,       QWORD PTR [rsi]         ; xmm3 = r0 p0..p7 (bytes)
  65         punpcklbw   xmm3,       xmm0                    ; xmm3 = r0 p0..p7 as words
  66         movdqa      xmm1,       xmm3                    ; xmm1 = r0, kept as the unfiltered centre
  67         psllw       xmm3,       2                       ; xmm3 = 4 * r0
  68
  69         movq        xmm5,       QWORD PTR [rsi + rax]   ; xmm5 = r1 p0..p7 (bytes)
  70         punpcklbw   xmm5,       xmm0                    ; xmm5 = r1 p0..p7 as words
  71         paddusw     xmm3,       xmm5                    ; xmm3 += r1
  72
  73         ; thresholding
  74         movdqa      xmm7,       xmm1                    ; xmm7 = r0
  75         psubusw     xmm7,       xmm5                    ; xmm7 = saturated r0 - r1
  76         psubusw     xmm5,       xmm1                    ; xmm5 = saturated r1 - r0
  77         paddusw     xmm7,       xmm5                    ; xmm7 = abs(r0 - r1)
  78         pcmpgtw     xmm7,       xmm2                    ; xmm7 = mask of words where abs(r0 - r1) > flimit
  79
  80         movq        xmm5,       QWORD PTR [rsi + 2*rax] ; xmm5 = r2 p0..p7 (bytes)
  81         punpcklbw   xmm5,       xmm0                    ; xmm5 = r2 p0..p7 as words
  82         paddusw     xmm3,       xmm5                    ; xmm3 += r2
  83
  84         ; thresholding
  85         movdqa      xmm6,       xmm1                    ; xmm6 = r0
  86         psubusw     xmm6,       xmm5                    ; xmm6 = saturated r0 - r2
  87         psubusw     xmm5,       xmm1                    ; xmm5 = saturated r2 - r0
  88         paddusw     xmm6,       xmm5                    ; xmm6 = abs(r0 - r2)
  89         pcmpgtw     xmm6,       xmm2
  90         por         xmm7,       xmm6                    ; accumulate thresholds
  91
  92
  93         neg         rax                                 ; rax = -pitch, to address the rows above
  94         movq        xmm5,       QWORD PTR [rsi+2*rax]   ; xmm5 = r-2 p0..p7 (bytes)
  95         punpcklbw   xmm5,       xmm0                    ; xmm5 = r-2 p0..p7 as words
  96         paddusw     xmm3,       xmm5                    ; xmm3 += r-2
  97
  98         ; thresholding
  99         movdqa      xmm6,       xmm1                    ; xmm6 = r0
 100         psubusw     xmm6,       xmm5                    ; xmm6 = saturated r0 - r-2
 101         psubusw     xmm5,       xmm1                    ; xmm5 = saturated r-2 - r0
 102         paddusw     xmm6,       xmm5                    ; xmm6 = abs(r0 - r-2)
 103         pcmpgtw     xmm6,       xmm2
 104         por         xmm7,       xmm6                    ; accumulate thresholds
 105
 106         movq        xmm4,       QWORD PTR [rsi+rax]     ; xmm4 = r-1 p0..p7 (bytes)
 107         punpcklbw   xmm4,       xmm0                    ; xmm4 = r-1 p0..p7 as words
 108         paddusw     xmm3,       xmm4                    ; xmm3 += r-1
 109
 110         ; thresholding
 111         movdqa      xmm6,       xmm1                    ; xmm6 = r0
 112         psubusw     xmm6,       xmm4                    ; xmm6 = saturated r0 - r-1
 113         psubusw     xmm4,       xmm1                    ; xmm4 = saturated r-1 - r0
 114         paddusw     xmm6,       xmm4                    ; xmm6 = abs(r0 - r-1)
 115         pcmpgtw     xmm6,       xmm2
 116         por         xmm7,       xmm6                    ; accumulate thresholds
 117
 118
 119         paddusw     xmm3,       RD42                    ; xmm3 += round value (4 per word)
 120         psraw       xmm3,       3                       ; xmm3 /= 8
 121
 122         pand        xmm1,       xmm7                    ; keep source values where a difference exceeded flimit
 123         pandn       xmm7,       xmm3                    ; take the blurred result everywhere else
 124         paddusw     xmm1,       xmm7                    ; combination
 125
 126         packuswb    xmm1,       xmm0                    ; pack to bytes
 127         movq        QWORD PTR [rdi], xmm1             ; store 8 vertically filtered pixels
 128
 129         neg         rax                   ; undo the neg above: rax = +pitch again
 130         add         rsi,        8
 131         add         rdi,        8
 132
 133         add         rdx,        8
 134         cmp         edx,        dword arg(5) ;cols
 135
 136         jl          nextcol
 137
 138         ; done with the all cols, start the across filtering in place
 139         sub         rsi,        rdx       ; rewind src to the start of the row
 140         sub         rdi,        rdx       ; rewind dst to the start of the row
 141
 142         xor         rdx,        rdx
 143         movq        mm0,        QWORD PTR [rdi-8]; mm0 = 8 bytes left of the row; the first store below writes them back unchanged
 144 ; Each group's result waits in mm0 for one iteration so the next group's loads still see unfiltered neighbours. No emms is issued in this function.
 145 acrossnextcol:
 146         movq        xmm7,       QWORD PTR [rdi +rdx -2] ; xmm7 = p-2..p5 (bytes)
 147         movd        xmm4,       DWORD PTR [rdi +rdx +6] ; xmm4 = p6..p9 (bytes)
 148
 149         pslldq      xmm4,       8                 ; move p6..p9 up to bytes 8..11
 150         por         xmm4,       xmm7              ; xmm4 = p-2..p9
 151
 152         movdqa      xmm3,       xmm4
 153         psrldq      xmm3,       2
 154         punpcklbw   xmm3,       xmm0              ; xmm3 = p0..p7 as words
 155         movdqa      xmm1,       xmm3              ; xmm1 = p0..p7, kept as the unfiltered centre
 156         psllw       xmm3,       2                 ; xmm3 = 4 * p0..p7
 157
 158
 159         movdqa      xmm5,       xmm4
 160         psrldq      xmm5,       3
 161         punpcklbw   xmm5,       xmm0              ; xmm5 = p1..p8 as words
 162         paddusw     xmm3,       xmm5              ; xmm3 += p1..p8
 163
 164         ; thresholding
 165         movdqa      xmm7,       xmm1              ; xmm7 = p0..p7
 166         psubusw     xmm7,       xmm5              ; xmm7 = saturated p0..p7 - p1..p8
 167         psubusw     xmm5,       xmm1              ; xmm5 = saturated p1..p8 - p0..p7
 168         paddusw     xmm7,       xmm5              ; xmm7 = abs(p0..p7 - p1..p8)
 169         pcmpgtw     xmm7,       xmm2
 170
 171         movdqa      xmm5,       xmm4
 172         psrldq      xmm5,       4
 173         punpcklbw   xmm5,       xmm0              ; xmm5 = p2..p9 as words
 174         paddusw     xmm3,       xmm5              ; xmm3 += p2..p9
 175
 176         ; thresholding
 177         movdqa      xmm6,       xmm1              ; xmm6 = p0..p7
 178         psubusw     xmm6,       xmm5              ; xmm6 = saturated p0..p7 - p2..p9
 179         psubusw     xmm5,       xmm1              ; xmm5 = saturated p2..p9 - p0..p7
 180         paddusw     xmm6,       xmm5              ; xmm6 = abs(p0..p7 - p2..p9)
 181         pcmpgtw     xmm6,       xmm2
 182         por         xmm7,       xmm6              ; accumulate thresholds
 183
 184
 185         movdqa      xmm5,       xmm4              ; xmm5 = p-2..p9 (bytes)
 186         punpcklbw   xmm5,       xmm0              ; xmm5 = p-2..p5 as words
 187         paddusw     xmm3,       xmm5              ; xmm3 += p-2..p5
 188
 189         ; thresholding
 190         movdqa      xmm6,       xmm1              ; xmm6 = p0..p7
 191         psubusw     xmm6,       xmm5              ; xmm6 = saturated p0..p7 - p-2..p5
 192         psubusw     xmm5,       xmm1              ; xmm5 = saturated p-2..p5 - p0..p7
 193         paddusw     xmm6,       xmm5              ; xmm6 = abs(p0..p7 - p-2..p5)
 194         pcmpgtw     xmm6,       xmm2
 195         por         xmm7,       xmm6              ; accumulate thresholds
 196
 197         psrldq      xmm4,       1                   ; xmm4 = p-1..p9 (bytes)
 198         punpcklbw   xmm4,       xmm0              ; xmm4 = p-1..p6 as words
 199         paddusw     xmm3,       xmm4              ; xmm3 += p-1..p6
 200
 201         ; thresholding
 202         movdqa      xmm6,       xmm1              ; xmm6 = p0..p7
 203         psubusw     xmm6,       xmm4              ; xmm6 = saturated p0..p7 - p-1..p6
 204         psubusw     xmm4,       xmm1              ; xmm4 = saturated p-1..p6 - p0..p7
 205         paddusw     xmm6,       xmm4              ; xmm6 = abs(p0..p7 - p-1..p6)
 206         pcmpgtw     xmm6,       xmm2
 207         por         xmm7,       xmm6              ; accumulate thresholds
 208
 209         paddusw     xmm3,       RD42              ; xmm3 += round value (4 per word)
 210         psraw       xmm3,       3                 ; xmm3 /= 8
 211
 212         pand        xmm1,       xmm7              ; keep source values where a difference exceeded flimit
 213         pandn       xmm7,       xmm3              ; take the blurred result everywhere else
 214         paddusw     xmm1,       xmm7              ; combination
 215
 216         packuswb    xmm1,       xmm0              ; pack to bytes
 217         movq        QWORD PTR [rdi+rdx-8],  mm0   ; store the previous group's eight bytes
 218         movdq2q     mm0,        xmm1              ; hold this group's result until the next iteration
 219
 220         add         rdx,        8
 221         cmp         edx,        dword arg(5) ;cols
 222         jl          acrossnextcol;
 223
 224         ; last 8 pixels
 225         movq        QWORD PTR [rdi+rdx-8],  mm0
 226
 227         ; done with this row
 228         add         rsi,rax               ; next line
 229         movsxd      rax, dword arg(3) ;dst_pixels_per_line, sign-extended so a negative pitch steps backwards
 230         add         rdi,rax               ; next destination
 231         movsxd      rax, dword arg(2) ;src_pixels_per_line, reloaded the same way as before nextrow
 232
 233         dec         rcx                   ; decrement count
 234         jnz         nextrow               ; next row
 235
 236 %if ABI_IS_32BIT=1 && CONFIG_PIC=1
 237     add rsp,16
 238     pop rsp
 239 %endif
 240     ; begin epilog
 241     pop rdi
 242     pop rsi
 243     RESTORE_GOT
 244     RESTORE_XMM
 245     UNSHADOW_ARGS
 246     pop         rbp
 247     ret
 248 %undef RD42
 249
 250
 251 ;void vp8_mbpost_proc_down_xmm(unsigned char *dst,
 252 ;                            int pitch, int rows, int cols,int flimit)
 253 extern sym(vp8_rv)
 254 global sym(vp8_mbpost_proc_down_xmm)
 255 sym(vp8_mbpost_proc_down_xmm):
 256     push        rbp
 257     mov         rbp, rsp
 258     SHADOW_ARGS_TO_STACK 5
 259     SAVE_XMM
 260     GET_GOT     rbx
 261     push        rsi
 262     push        rdi
 263     ; end prolog
 264 ; In-place vertical filter of dst over a 15-row sliding window, 8 columns per loop_col pass.
 265     ALIGN_STACK 16, rax
 266     sub         rsp, 128+16
 267 ; A pixel becomes (window sum + pixel + vp8_rv term) >> 4 where 15*sumsq - sum*sum < flimit; otherwise it is kept.
 268     ; unsigned char d[16][8] at [rsp]
 269     ; create flimit4 (flimit in four dwords) at [rsp+128]
 270     mov         eax, dword ptr arg(4) ;flimit
 271     mov         [rsp+128], eax
 272     mov         [rsp+128+4], eax
 273     mov         [rsp+128+8], eax
 274     mov         [rsp+128+12], eax
 275 %define flimit4 [rsp+128]
 276
 277 %if ABI_IS_32BIT=0
 278     lea         r8,       [GLOBAL(sym(vp8_rv))]
 279 %endif
 280
 281     ;rows +=8;
 282     add         dword arg(2), 8       ; 8 extra iterations flush the results delayed in d[]
 283
 284     ;for(c=0; c<cols; c+=8)
 285 loop_col:
 286             mov         rsi,        arg(0) ; s
 287             pxor        xmm0,       xmm0        ; xmm0 = 0, used to widen bytes/words
 288
 289             movsxd      rax,        dword ptr arg(1) ;pitch
 290             neg         rax                                     ; rax = -pitch
 291
 292             lea         rsi,        [rsi + rax*8]               ; rsi = s - 8*pitch
 293             neg         rax                                     ; rax = +pitch
 294
 295
 296             pxor        xmm5,       xmm5        ; xmm5 = sum, eight words
 297             pxor        xmm6,       xmm6        ; xmm6 = sumsq for columns 0..3, four dwords
 298
 299             pxor        xmm7,       xmm7        ; xmm7 = sumsq for columns 4..7, four dwords
 300             mov         rdi,        rsi
 301
 302             mov         rcx,        15          ; prime the window with rows -8 .. +6
 303
 304 loop_initvar:
 305             movq        xmm1,       QWORD PTR [rdi]; eight pixels of the row at rdi
 306             punpcklbw   xmm1,       xmm0        ; widen to words
 307
 308             paddw       xmm5,       xmm1        ; sum += pixel
 309             pmullw      xmm1,       xmm1        ; pixel * pixel (low 16 bits)
 310
 311             movdqa      xmm2,       xmm1        ;
 312             punpcklwd   xmm1,       xmm0        ; squares of columns 0..3 as dwords
 313
 314             punpckhwd   xmm2,       xmm0        ; squares of columns 4..7 as dwords
 315             paddd       xmm6,       xmm1        ;
 316
 317             paddd       xmm7,       xmm2        ;
 318             lea         rdi,        [rdi+rax]   ; next row
 319
 320             dec         rcx
 321             jne         loop_initvar
 322             ;save the var and sum
 323             xor         rdx,        rdx         ; rdx = row counter r
 324 loop_row:
 325             movq        xmm1,       QWORD PTR [rsi]     ; [s-pitch*8], the row leaving the window
 326             movq        xmm2,       QWORD PTR [rdi]     ; [s+pitch*7], the row entering the window
 327
 328             punpcklbw   xmm1,       xmm0
 329             punpcklbw   xmm2,       xmm0
 330
 331             paddw       xmm5,       xmm2        ; sum += entering row
 332             psubw       xmm5,       xmm1        ; sum -= leaving row
 333
 334             pmullw      xmm2,       xmm2
 335             movdqa      xmm4,       xmm2
 336
 337             punpcklwd   xmm2,       xmm0
 338             punpckhwd   xmm4,       xmm0
 339
 340             paddd       xmm6,       xmm2        ; sumsq += entering row squared
 341             paddd       xmm7,       xmm4
 342
 343             pmullw      xmm1,       xmm1
 344             movdqa      xmm2,       xmm1
 345
 346             punpcklwd   xmm1,       xmm0
 347             psubd       xmm6,       xmm1        ; sumsq -= leaving row squared
 348
 349             punpckhwd   xmm2,       xmm0
 350             psubd       xmm7,       xmm2
 351
 352
 353             movdqa      xmm3,       xmm6
 354             pslld       xmm3,       4
 355
 356             psubd       xmm3,       xmm6        ; xmm3 = 15 * sumsq (columns 0..3)
 357             movdqa      xmm1,       xmm5
 358
 359             movdqa      xmm4,       xmm5
 360             pmullw      xmm1,       xmm1        ; low words of sum * sum
 361
 362             pmulhw      xmm4,       xmm4        ; high words of sum * sum
 363             movdqa      xmm2,       xmm1
 364
 365             punpcklwd   xmm1,       xmm4        ; sum * sum as dwords, columns 0..3
 366             punpckhwd   xmm2,       xmm4        ; sum * sum as dwords, columns 4..7
 367
 368             movdqa      xmm4,       xmm7
 369             pslld       xmm4,       4
 370
 371             psubd       xmm4,       xmm7        ; xmm4 = 15 * sumsq (columns 4..7)
 372
 373             psubd       xmm3,       xmm1
 374             psubd       xmm4,       xmm2
 375
 376             psubd       xmm3,       flimit4
 377             psubd       xmm4,       flimit4
 378
 379             psrad       xmm3,       31          ; all ones where 15*sumsq - sum*sum - flimit is negative
 380             psrad       xmm4,       31
 381
 382             packssdw    xmm3,       xmm4
 383             packsswb    xmm3,       xmm0        ; xmm3 = per-pixel byte mask: filter this pixel
 384
 385             movq        xmm1,       QWORD PTR [rsi+rax*8]   ; current row r (rsi points at row r-8)
 386
 387             movq        xmm2,       xmm1        ; xmm2 = unfiltered copy of the current row
 388             punpcklbw   xmm1,       xmm0
 389
 390             paddw       xmm1,       xmm5        ; pixel + window sum
 391             mov         rcx,        rdx
 392
 393             and         rcx,        127         ; vp8_rv index = r & 127
 394 %if ABI_IS_32BIT=1 && CONFIG_PIC=1
 395             push        rax
 396             lea         rax,        [GLOBAL(sym(vp8_rv))]
 397             movdqu      xmm4,       [rax + rcx*2] ;vp8_rv[rcx*2]
 398             pop         rax
 399 %elif ABI_IS_32BIT=0
 400             movdqu      xmm4,       [r8 + rcx*2] ;vp8_rv[rcx*2]
 401 %else
 402             movdqu      xmm4,       [sym(vp8_rv) + rcx*2]
 403 %endif
 404
 405             paddw       xmm1,       xmm4        ; add the eight words read from vp8_rv
 406             ;paddw     xmm1,       eight8s
 407             psraw       xmm1,       4
 408
 409             packuswb    xmm1,       xmm0
 410             pand        xmm1,       xmm3        ; filtered value where the mask is set
 411
 412             pandn       xmm3,       xmm2        ; original value elsewhere
 413             por         xmm1,       xmm3
 414
 415             and         rcx,        15
 416             movq        QWORD PTR   [rsp + rcx*8], xmm1 ;d[r & 15] = result for row r
 417
 418             mov         rcx,        rdx
 419             sub         rcx,        8           ; rcx = r - 8; flags feed the jl below
 420             jl          loop_row_skip_store     ; r < 8: d[(r-8)&15] is not valid yet and [rsi] is above dst
 421             and         rcx,        15
 422             movq        mm0,        [rsp + rcx*8] ;d[(r-8) & 15]
 423             movq        [rsi],      mm0         ; write the delayed result to row r-8
 424 loop_row_skip_store:
 425             lea         rsi,        [rsi+rax]
 426
 427             lea         rdi,        [rdi+rax]
 428             add         rdx,        1
 429
 430             cmp         edx,        dword arg(2) ;rows
 431             jl          loop_row
 432         mov         rax,        8           ; rax is free here: loop_col reloads the pitch
 433         add         arg(0),     rax         ; s += 8 at full pointer width (a dword add would drop the carry)
 434         sub         dword arg(3), 8 ; cols -= 8
 435         cmp         dword arg(3), 0
 436         jg          loop_col
 437
 438     add         rsp, 128+16
 439     pop         rsp
 440
 441     ; begin epilog
 442     pop rdi
 443     pop rsi
 444     RESTORE_GOT
 445     RESTORE_XMM
 446     UNSHADOW_ARGS
 447     pop         rbp
 448     ret
 449 %undef flimit4
 450
 451
 452 ;void vp8_mbpost_proc_across_ip_xmm(unsigned char *src,
 453 ;                                int pitch, int rows, int cols,int flimit)
 454 global sym(vp8_mbpost_proc_across_ip_xmm)
 455 sym(vp8_mbpost_proc_across_ip_xmm):
 456     push        rbp
 457     mov         rbp, rsp
 458     SHADOW_ARGS_TO_STACK 5
 459     SAVE_XMM
 460     GET_GOT     rbx
 461     push        rsi
 462     push        rdi
 463     ; end prolog
 464 ; In-place horizontal filter of each row over a 15-pixel sliding window, 4 columns per iteration.
 465     ALIGN_STACK 16, rax
 466     sub         rsp, 16
 467 ; A pixel becomes (window sum + pixel + 8) >> 4 where 15*sumsq - sum*sum < flimit; otherwise it is kept.
 468     ; create flimit4 at [rsp]
 469     mov         eax, dword ptr arg(4) ;flimit
 470     mov         [rsp], eax
 471     mov         [rsp+4], eax
 472     mov         [rsp+8], eax
 473     mov         [rsp+12], eax
 474 %define flimit4 [rsp]
 475
 476
 477     ;for(r=0;r<rows;r++)
 478 ip_row_loop:
 479
 480         xor         rdx,    rdx ;sumsq=0;
 481         xor         rcx,    rcx ;sum=0;
 482         mov         rsi,    arg(0); s
 483         mov         rdi,    -8
 484 ip_var_loop:
 485         ;for(i=-8;i<=6;i++)
 486         ;{
 487         ;    sumsq += s[i]*s[i];
 488         ;    sum   += s[i];
 489         ;}
 490         movzx       eax, byte [rsi+rdi]   ; eax = s[i]
 491         add         ecx, eax              ; sum += s[i]
 492         mul         al                    ; ax = s[i]*s[i]; the upper half of eax is still zero
 493         add         edx, eax              ; sumsq += s[i]*s[i]
 494         add         rdi, 1
 495         cmp         rdi, 6
 496         jle         ip_var_loop
 497
 498
 499             ;mov         rax,    sumsq
 500             ;movd        xmm7,   rax
 501             movd        xmm7,   edx        ; xmm7 lane 0 = sumsq, other lanes zero
 502
 503             ;mov         rax,    sum
 504             ;movd        xmm6,   rax
 505             movd        xmm6,   ecx        ; xmm6 lane 0 = sum, other lanes zero
 506
 507             mov         rsi,    arg(0) ;s
 508             xor         rcx,    rcx        ; rcx = column c
 509
 510             movsxd      rdx,    dword arg(3) ;cols
 511             add         rdx,    8          ; run 8 columns further so the delayed stores below are flushed
 512             pxor        mm0,    mm0        ; mm0/mm1 form a two-stage delay line for results;
 513             pxor        mm1,    mm1        ; both start at zero, so the first two stores write zeros to s[-8..-1]
 514
 515             pxor        xmm0,   xmm0
 516 nextcol4:
 517
 518             movd        xmm1,   DWORD PTR [rsi+rcx-8]   ; -8 -7 -6 -5 : pixels leaving the window
 519             movd        xmm2,   DWORD PTR [rsi+rcx+7]   ; +7 +8 +9 +10 : pixels entering the window
 520
 521             punpcklbw   xmm1,   xmm0                    ; expanding
 522             punpcklbw   xmm2,   xmm0                    ; expanding
 523
 524             punpcklwd   xmm1,   xmm0                    ; expanding to dwords
 525             punpcklwd   xmm2,   xmm0                    ; expanding to dwords
 526
 527             psubd       xmm2,   xmm1                    ; sum deltas d0..d3 = entering - leaving: 7--8   8--7   9--6 10--5
 528             paddd       xmm1,   xmm1                    ; 2 * leaving: -8*2   -7*2   -6*2 -5*2
 529
 530             paddd       xmm1,   xmm2                    ; entering + leaving: 7+-8   8+-7   9+-6 10+-5
 531             pmaddwd     xmm1,   xmm2                    ; (entering+leaving)*(entering-leaving) = sumsq deltas q0..q3
 532
 533             paddd       xmm6,   xmm2                    ; lane 0 = sum for column c
 534             paddd       xmm7,   xmm1                    ; lane 0 = sumsq for column c
 535
 536             pshufd      xmm6,   xmm6,   0               ; broadcast lane 0 to all four lanes
 537             pshufd      xmm7,   xmm7,   0               ; broadcast lane 0 to all four lanes
 538
 539             psrldq      xmm1,       4                   ; q1 q2 q3 0
 540             psrldq      xmm2,       4                   ; d1 d2 d3 0
 541
 542             pshufd      xmm3,   xmm1,   3               ; 0 q1 q1 q1
 543             pshufd      xmm4,   xmm2,   3               ; 0 d1 d1 d1
 544
 545             paddd       xmm6,   xmm4
 546             paddd       xmm7,   xmm3
 547
 548             pshufd      xmm3,   xmm1,   01011111b       ; 0 0 q2 q2
 549             pshufd      xmm4,   xmm2,   01011111b       ; 0 0 d2 d2
 550
 551             paddd       xmm7,   xmm3
 552             paddd       xmm6,   xmm4
 553
 554             pshufd      xmm3,   xmm1,   10111111b       ; 0 0 0 q3
 555             pshufd      xmm4,   xmm2,   10111111b       ; 0 0 0 d3
 556
 557             paddd       xmm7,   xmm3                    ; xmm7 lane k = sumsq for column c+k
 558             paddd       xmm6,   xmm4                    ; xmm6 lane k = sum for column c+k
 559
 560             movdqa      xmm3,   xmm6
 561             pmaddwd     xmm3,   xmm3                    ; sum*sum per lane (assumes each sum fits in the low word)
 562
 563             movdqa      xmm5,   xmm7
 564             pslld       xmm5,   4
 565
 566             psubd       xmm5,   xmm7                    ; 15 * sumsq
 567             psubd       xmm5,   xmm3                    ; 15*sumsq - sum*sum
 568
 569             psubd       xmm5,   flimit4
 570             psrad       xmm5,   31                      ; all ones where 15*sumsq - sum*sum - flimit is negative
 571
 572             packssdw    xmm5,   xmm0
 573             packsswb    xmm5,   xmm0                    ; xmm5 = per-pixel byte mask: filter this pixel
 574
 575             movd        xmm1,   DWORD PTR [rsi+rcx]     ; the four pixels at columns c..c+3
 576             movq        xmm2,   xmm1                    ; xmm2 = unfiltered copy
 577
 578             punpcklbw   xmm1,   xmm0
 579             punpcklwd   xmm1,   xmm0
 580
 581             paddd       xmm1,   xmm6                    ; pixel + window sum
 582             paddd       xmm1,   [GLOBAL(four8s)]        ; + 8 for rounding
 583
 584             psrad       xmm1,   4
 585             packssdw    xmm1,   xmm0
 586
 587             packuswb    xmm1,   xmm0
 588             pand        xmm1,   xmm5                    ; filtered value where the mask is set
 589
 590             pandn       xmm5,   xmm2                    ; original value elsewhere
 591             por         xmm5,   xmm1                    ; xmm5 = four result bytes for columns c..c+3
 592
 593             movd        [rsi+rcx-8],  mm0               ; store the result computed two iterations ago (columns c-8..c-5)
 594             movq        mm0,    mm1                     ; advance the delay line
 595
 596             movdq2q     mm1,    xmm5                    ; queue this iteration's result
 597             psrldq      xmm7,   12                      ; carry lane 3 (column c+3) into lane 0 for the next iteration
 598
 599             psrldq      xmm6,   12                      ; same for the running sum
 600             add         rcx,    4
 601
 602             cmp         rcx,    rdx
 603             jl          nextcol4
 604
 605         ;s+=pitch;
 606         movsxd rax, dword arg(1)
 607         add    arg(0), rax
 608
 609         sub dword arg(2), 1 ;rows-=1
 610         cmp dword arg(2), 0
 611         jg ip_row_loop
 612
 613     add         rsp, 16
 614     pop         rsp                ; NOTE(review): presumably restores the rsp saved by ALIGN_STACK - confirm against the macro
 615
 616     ; begin epilog
 617     pop rdi
 618     pop rsi
 619     RESTORE_GOT
 620     RESTORE_XMM
 621     UNSHADOW_ARGS
 622     pop         rbp
 623     ret                            ; NOTE(review): mm0/mm1 were used and no emms is issued in this function
 624 %undef flimit4
 625
 626
 627 ;void vp8_plane_add_noise_wmt (unsigned char *Start, unsigned char *noise,
 628 ;                            unsigned char blackclamp[16],
 629 ;                            unsigned char whiteclamp[16],
 630 ;                            unsigned char bothclamp[16],
 631 ;                            unsigned int Width, unsigned int Height, int Pitch)
 632 extern sym(rand)
 633 global sym(vp8_plane_add_noise_wmt)
 634 sym(vp8_plane_add_noise_wmt):
 635     push        rbp
 636     mov         rbp, rsp
 637     SHADOW_ARGS_TO_STACK 8
 638     GET_GOT     rbx
 639     push        rsi
 640     push        rdi
 641     ; end prolog
 642 ; For each row: clamp the pixels, then add 16-byte groups of noise read from a random offset (rand() & 0xff) into the noise buffer.
 643 addnoise_loop:
 644     call sym(rand) WRT_PLT          ; NOTE(review): no extra stack space is reserved for the callee here - confirm this suits every target ABI
 645     mov     rcx, arg(1) ;noise
 646     and     rax, 0xff               ; offset = rand() & 0xff
 647     add     rcx, rax                ; rcx = noise + offset
 648
 649     ; we rely on the fact that the clamping vectors are stored contiguously
 650     ; in black/white/both order. Note that we have to reload this here because
 651     ; rdx could be trashed by rand()
 652     mov     rdx, arg(2) ; blackclamp; whiteclamp and bothclamp are reached as [rdx+16] and [rdx+32], arg(3)/arg(4) are never read
 653
 654
 655             mov     rdi, rcx               ; rdi = noise row for this line
 656             movsxd  rcx, dword arg(5) ;[Width]
 657             mov     rsi, arg(0) ;Pos
 658             xor         rax,rax            ; rax = byte offset within the row
 659
 660 addnoise_nextset:
 661             movdqu      xmm1,[rsi+rax]         ; get the source
 662
 663             psubusb     xmm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
 664             paddusb     xmm1, [rdx+32] ;bothclamp
 665             psubusb     xmm1, [rdx+16] ;whiteclamp  (these three memory operands must be 16-byte aligned)
 666
 667             movdqu      xmm2,[rdi+rax]         ; get the noise for this line
 668             paddb       xmm1,xmm2              ; add it in (wrapping byte add)
 669             movdqu      [rsi+rax],xmm1         ; store the result
 670
 671             add         rax,16                 ; move to the next 16 bytes of the row
 672
 673             cmp         rax, rcx               ; whole 16-byte groups are processed while rax < Width
 674             jl          addnoise_nextset
 675
 676     movsxd  rax, dword arg(7) ; Pitch
 677     add     arg(0), rax ; Start += Pitch
 678     sub     dword arg(6), 1   ; Height -= 1
 679     jg      addnoise_loop     ; loop while the remaining Height is > 0 (signed test on the flags from the sub)
 680
 681     ; begin epilog
 682     pop rdi
 683     pop rsi
 684     RESTORE_GOT
 685     UNSHADOW_ARGS
 686     pop         rbp
 687     ret
 688
 689
 690 SECTION_RODATA
 691 align 16                   ; both tables are used as 16-byte memory operands of SSE2 instructions
 692 rd42:                      ; rounding term for the >> 3 filters: eight words, each 4
 693     times 8 dw 0x04
 694 four8s:                    ; rounding term for the >> 4 filter: four dwords, each 8
 695     times 4 dd 8