;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"

; Use of pmaxub instead of psubusb to compute filter mask was seen
; in ffvp8
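
; LFH_FILTER_AND_HEV_MASK computes the filter mask and the high-edge-variance
; (hev) flags for a horizontal edge.  As a reading aid, a C-like sketch of
; the tests it performs, pieced together from the comments below (not
; generated from this file):
;
;   mask = abs(q3-q2) <= limit && abs(q2-q1) <= limit && abs(q1-q0) <= limit
;       && abs(p3-p2) <= limit && abs(p2-p1) <= limit && abs(p1-p0) <= limit
;       && abs(p0-q0)*2 + abs(p1-q1)/2 <= blimit
;   hev  = abs(p1-p0) > thresh || abs(q1-q0) > thresh
;
; %1 == 1 reads whole rows from rsi/rdi (Y plane); %1 == 0 gathers 8 pixels
; each from the U (rsi) and V (rdi) planes and spills q2/q1/p2/p1 to the
; stack for later use.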
%macro LFH_FILTER_AND_HEV_MASK 1
%if %1
        movdqa      xmm2, [rdi+2*rax]       ; q3
        movdqa      xmm1, [rsi+2*rax]       ; q2
        movdqa      xmm4, [rsi+rax]         ; q1
        movdqa      xmm5, [rsi]             ; q0
        neg         rax                     ; negate pitch to deal with above border
%else
        movlps      xmm2, [rsi + rcx*2]     ; q3
        movlps      xmm1, [rsi + rcx]       ; q2
        movlps      xmm4, [rsi]             ; q1
        movlps      xmm5, [rsi + rax]       ; q0

        movhps      xmm2, [rdi + rcx*2]
        movhps      xmm1, [rdi + rcx]
        movhps      xmm4, [rdi]
        movhps      xmm5, [rdi + rax]

        lea         rsi, [rsi + rax*4]
        lea         rdi, [rdi + rax*4]

        movdqa      XMMWORD PTR [rsp], xmm1      ; store q2
        movdqa      XMMWORD PTR [rsp + 16], xmm4 ; store q1
%endif

        movdqa      xmm6, xmm1              ; q2
        movdqa      xmm3, xmm4              ; q1

        psubusb     xmm1, xmm2              ; q2-=q3
        psubusb     xmm2, xmm6              ; q3-=q2

        psubusb     xmm4, xmm6              ; q1-=q2
        psubusb     xmm6, xmm3              ; q2-=q1

        por         xmm4, xmm6              ; abs(q2-q1)
        por         xmm1, xmm2              ; abs(q3-q2)

        movdqa      xmm0, xmm5              ; q0
        pmaxub      xmm1, xmm4

        psubusb     xmm5, xmm3              ; q0-=q1
        psubusb     xmm3, xmm0              ; q1-=q0

        por         xmm5, xmm3              ; abs(q0-q1)
        movdqa      t0, xmm5                ; save to t0

        pmaxub      xmm1, xmm5

%if %1
        movdqa      xmm2, [rsi+4*rax]       ; p3
        movdqa      xmm4, [rdi+4*rax]       ; p2
        movdqa      xmm6, [rsi+2*rax]       ; p1
%else
        movlps      xmm2, [rsi + rax]       ; p3
        movlps      xmm4, [rsi]             ; p2
        movlps      xmm6, [rsi + rcx]       ; p1

        movhps      xmm2, [rdi + rax]
        movhps      xmm4, [rdi]
        movhps      xmm6, [rdi + rcx]

        movdqa      XMMWORD PTR [rsp + 32], xmm4 ; store p2
        movdqa      XMMWORD PTR [rsp + 48], xmm6 ; store p1
%endif

        movdqa      xmm5, xmm4              ; p2
        movdqa      xmm3, xmm6              ; p1

        psubusb     xmm4, xmm2              ; p2-=p3
        psubusb     xmm2, xmm5              ; p3-=p2

        psubusb     xmm3, xmm5              ; p1-=p2
        pmaxub      xmm1, xmm4              ; abs(p3 - p2)

        psubusb     xmm5, xmm6              ; p2-=p1
        pmaxub      xmm1, xmm2              ; abs(p3 - p2)

        pmaxub      xmm1, xmm5              ; abs(p2 - p1)
        movdqa      xmm2, xmm6              ; p1

        pmaxub      xmm1, xmm3              ; abs(p2 - p1)
%if %1
        movdqa      xmm4, [rsi+rax]         ; p0
        movdqa      xmm3, [rdi]             ; q1
%else
        movlps      xmm4, [rsi + rcx*2]     ; p0
        movhps      xmm4, [rdi + rcx*2]
        movdqa      xmm3, q1                ; q1
%endif

        movdqa      xmm5, xmm4              ; p0
        psubusb     xmm4, xmm6              ; p0-=p1

        psubusb     xmm6, xmm5              ; p1-=p0

        por         xmm6, xmm4              ; abs(p1 - p0)
        mov         rdx, arg(2)             ; get blimit

        movdqa      t1, xmm6                ; save to t1

        movdqa      xmm4, xmm3              ; q1
        pmaxub      xmm1, xmm6

        psubusb     xmm3, xmm2              ; q1-=p1
        psubusb     xmm2, xmm4              ; p1-=q1

        psubusb     xmm1, xmm7
        por         xmm2, xmm3              ; abs(p1-q1)

        movdqa      xmm7, XMMWORD PTR [rdx] ; blimit

        movdqa      xmm3, xmm0              ; q0
        pand        xmm2, [GLOBAL(tfe)]     ; set lsb of each byte to zero

        mov         rdx, arg(4)             ; hev get thresh

        movdqa      xmm6, xmm5              ; p0
        psrlw       xmm2, 1                 ; abs(p1-q1)/2

        psubusb     xmm5, xmm3              ; p0-=q0

        psubusb     xmm3, xmm6              ; q0-=p0
        por         xmm5, xmm3              ; abs(p0 - q0)

        paddusb     xmm5, xmm5              ; abs(p0-q0)*2

        movdqa      xmm4, t0                ; hev get abs (q1 - q0)

        movdqa      xmm3, t1                ; get abs (p1 - p0)

        paddusb     xmm5, xmm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2

        movdqa      xmm2, XMMWORD PTR [rdx] ; hev

        psubusb     xmm5, xmm7              ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
        psubusb     xmm4, xmm2              ; hev

        psubusb     xmm3, xmm2              ; hev
        por         xmm1, xmm5

        pxor        xmm7, xmm7
        paddb       xmm4, xmm3              ; hev abs(q1 - q0) > thresh || abs(p1 - p0) > thresh

        pcmpeqb     xmm4, xmm5              ; hev
        pcmpeqb     xmm3, xmm3              ; hev

        pcmpeqb     xmm1, xmm7              ; mask xmm1
        pxor        xmm4, xmm3              ; hev
%endmacro
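
; B_FILTER applies the normal (non-macroblock) loop filter to p1/p0/q0/q1.
; A C-like sketch of the arithmetic, following the comments below; clamp()
; is the signed-byte saturation the paddsb/psubsb instructions provide:
;
;   F  = clamp(3 * (q0 - p0) + ((p1 - q1) & hev)) & mask
;   Filter1 = clamp(F + 4) >> 3;    q0 = clamp(q0 - Filter1)
;   Filter2 = clamp(F + 3) >> 3;    p0 = clamp(p0 + Filter2)
;   u  = (Filter1 + 1) >> 1;  u &= ~hev
;   q1 = clamp(q1 - u);             p1 = clamp(p1 + u)
;
; %1 selects the source: 0 = UV rows spilled on the stack, 1 = Y rows at
; rsi/rdi, 2 = transposed columns in srct (used by the vertical filters).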
%macro B_FILTER 1
%if %1 == 0
        movdqa      xmm2, p1                ; p1
        movdqa      xmm7, q1                ; q1
%elif %1 == 1
        movdqa      xmm2, [rsi+2*rax]       ; p1
        movdqa      xmm7, [rdi]             ; q1
%elif %1 == 2
        lea         rdx, srct

        movdqa      xmm2, [rdx]             ; p1
        movdqa      xmm7, [rdx+48]          ; q1
        movdqa      xmm6, [rdx+16]          ; p0
        movdqa      xmm0, [rdx+32]          ; q0
%endif

        pxor        xmm2, [GLOBAL(t80)]     ; p1 offset to convert to signed values
        pxor        xmm7, [GLOBAL(t80)]     ; q1 offset to convert to signed values

        psubsb      xmm2, xmm7              ; p1 - q1
        pxor        xmm6, [GLOBAL(t80)]     ; offset to convert to signed values

        pand        xmm2, xmm4              ; high var mask (hvm)(p1 - q1)
        pxor        xmm0, [GLOBAL(t80)]     ; offset to convert to signed values

        movdqa      xmm3, xmm0              ; q0
        psubsb      xmm0, xmm6              ; q0 - p0

        paddsb      xmm2, xmm0              ; 1 * (q0 - p0) + hvm(p1 - q1)

        paddsb      xmm2, xmm0              ; 2 * (q0 - p0) + hvm(p1 - q1)

        paddsb      xmm2, xmm0              ; 3 * (q0 - p0) + hvm(p1 - q1)

        pand        xmm1, xmm2              ; mask filter values we don't care about

        movdqa      xmm2, xmm1

        paddsb      xmm1, [GLOBAL(t4)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 4
        paddsb      xmm2, [GLOBAL(t3)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 3

        punpckhbw   xmm5, xmm2              ; axbxcxdx
        punpcklbw   xmm2, xmm2              ; exfxgxhx

        punpcklbw   xmm0, xmm1              ; exfxgxhx
        psraw       xmm5, 11                ; sign extended shift right by 3

        punpckhbw   xmm1, xmm1              ; axbxcxdx
        psraw       xmm2, 11                ; sign extended shift right by 3

        packsswb    xmm2, xmm5              ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
        psraw       xmm0, 11                ; sign extended shift right by 3

        psraw       xmm1, 11                ; sign extended shift right by 3
        movdqa      xmm5, xmm0              ; save results

        packsswb    xmm0, xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
        paddsw      xmm5, [GLOBAL(ones)]

        paddsw      xmm1, [GLOBAL(ones)]
        psraw       xmm5, 1                 ; partial shifted one more time for 2nd tap

        psraw       xmm1, 1                 ; partial shifted one more time for 2nd tap

        paddsb      xmm6, xmm2              ; p0+= p0 add
        packsswb    xmm5, xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4

%if %1 == 0
        movdqa      xmm1, p1                ; p1
%elif %1 == 1
        movdqa      xmm1, [rsi+2*rax]       ; p1
%elif %1 == 2
        movdqa      xmm1, [rdx]             ; p1
%endif
        pandn       xmm4, xmm5              ; high edge variance additive
        pxor        xmm6, [GLOBAL(t80)]     ; unoffset

        pxor        xmm1, [GLOBAL(t80)]     ; reoffset
        psubsb      xmm3, xmm0              ; q0-= q0 add

        paddsb      xmm1, xmm4              ; p1+= p1 add
        pxor        xmm3, [GLOBAL(t80)]     ; unoffset

        pxor        xmm1, [GLOBAL(t80)]     ; unoffset
        psubsb      xmm7, xmm4              ; q1-= q1 add

        pxor        xmm7, [GLOBAL(t80)]     ; unoffset
%if %1 == 0
        lea         rsi, [rsi + rcx*2]
        lea         rdi, [rdi + rcx*2]
        movq        MMWORD PTR [rsi], xmm6        ; p0
        movhps      MMWORD PTR [rdi], xmm6
        movq        MMWORD PTR [rsi + rax], xmm1  ; p1
        movhps      MMWORD PTR [rdi + rax], xmm1
        movq        MMWORD PTR [rsi + rcx], xmm3  ; q0
        movhps      MMWORD PTR [rdi + rcx], xmm3
        movq        MMWORD PTR [rsi + rcx*2], xmm7 ; q1
        movhps      MMWORD PTR [rdi + rcx*2], xmm7
%elif %1 == 1
        movdqa      [rsi+rax], xmm6         ; write back
        movdqa      [rsi+2*rax], xmm1       ; write back
        movdqa      [rsi], xmm3             ; write back
        movdqa      [rdi], xmm7             ; write back
%endif

%endmacro
;void vp8_loop_filter_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
global sym(vp8_loop_filter_horizontal_edge_sse2)
sym(vp8_loop_filter_horizontal_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 32                     ; reserve 32 bytes
    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[16];
    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[16];

        mov         rsi, arg(0)             ;src_ptr
        movsxd      rax, dword ptr arg(1)   ;src_pixel_step

        mov         rdx, arg(3)             ;limit
        movdqa      xmm7, XMMWORD PTR [rdx]

        lea         rdi, [rsi+rax]          ; rdi points to row +1 for indirect addressing

        ; calculate breakout conditions and high edge variance
        LFH_FILTER_AND_HEV_MASK 1
        ; filter and write back the result
        B_FILTER 1

    add rsp, 32
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop rbp
    ret
;void vp8_loop_filter_horizontal_edge_uv_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
global sym(vp8_loop_filter_horizontal_edge_uv_sse2)
sym(vp8_loop_filter_horizontal_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 96                     ; reserve 96 bytes
    %define q2 [rsp + 0]    ;__declspec(align(16)) char q2[16];
    %define q1 [rsp + 16]   ;__declspec(align(16)) char q1[16];
    %define p2 [rsp + 32]   ;__declspec(align(16)) char p2[16];
    %define p1 [rsp + 48]   ;__declspec(align(16)) char p1[16];
    %define t0 [rsp + 64]   ;__declspec(align(16)) char t0[16];
    %define t1 [rsp + 80]   ;__declspec(align(16)) char t1[16];

        mov         rsi, arg(0)             ; u
        mov         rdi, arg(5)             ; v
        movsxd      rax, dword ptr arg(1)   ; src_pixel_step
        mov         rcx, rax
        neg         rax                     ; negate pitch to deal with above border

        mov         rdx, arg(3)             ;limit
        movdqa      xmm7, XMMWORD PTR [rdx]

        lea         rsi, [rsi + rcx]
        lea         rdi, [rdi + rcx]

        ; calculate breakout conditions and high edge variance
        LFH_FILTER_AND_HEV_MASK 0
        ; filter and write back the result
        B_FILTER 0

    add rsp, 96
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop rbp
    ret
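
; MB_FILTER_AND_WRITEBACK applies the macroblock-edge filter, which also
; adjusts p2/q2.  Sketch of the arithmetic, following the comments below
; (vp8_signed_char_clamp is abbreviated to clamp):
;
;   F = clamp(3 * (q0 - p0) + (p1 - q1)) & mask
;   Filter2 = F & hev                          ; sharp 2-tap part
;   Filter1 = clamp(Filter2 + 4) >> 3;  q0 -= Filter1
;   Filter2 = clamp(Filter2 + 3) >> 3;  p0 += Filter2
;   F &= ~hev                                  ; wide taps where hev is off
;   u1 = clamp((63 + F *  9) >> 7);  p2 += u1;  q2 -= u1
;   u2 = clamp((63 + F * 18) >> 7);  p1 += u2;  q1 -= u2
;   u3 = clamp((63 + F * 27) >> 7);  p0 += u3;  q0 -= u3
;
; %1: 0 = UV rows (stack spill), 1 = Y rows at rsi/rdi, 2 = transposed
; columns in srct.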
%macro MB_FILTER_AND_WRITEBACK 1
%if %1 == 0
        movdqa      xmm2, p1                ; p1
        movdqa      xmm7, q1                ; q1
%elif %1 == 1
        movdqa      xmm2, [rsi+2*rax]       ; p1
        movdqa      xmm7, [rdi]             ; q1

        mov         rcx, rax
        neg         rcx
%elif %1 == 2
        lea         rdx, srct

        movdqa      xmm2, [rdx+32]          ; p1
        movdqa      xmm7, [rdx+80]          ; q1
        movdqa      xmm6, [rdx+48]          ; p0
        movdqa      xmm0, [rdx+64]          ; q0
%endif

        pxor        xmm2, [GLOBAL(t80)]     ; p1 offset to convert to signed values
        pxor        xmm7, [GLOBAL(t80)]     ; q1 offset to convert to signed values
        pxor        xmm6, [GLOBAL(t80)]     ; offset to convert to signed values
        pxor        xmm0, [GLOBAL(t80)]     ; offset to convert to signed values

        psubsb      xmm2, xmm7              ; p1 - q1
        movdqa      xmm3, xmm0              ; q0

        psubsb      xmm0, xmm6              ; q0 - p0

        paddsb      xmm2, xmm0              ; 1 * (q0 - p0) + (p1 - q1)
        paddsb      xmm2, xmm0              ; 2 * (q0 - p0) + (p1 - q1)
        paddsb      xmm2, xmm0              ; 3 * (q0 - p0) + (p1 - q1)

        pand        xmm1, xmm2              ; mask filter values we don't care about

        movdqa      xmm2, xmm1              ; vp8_filter

        pand        xmm2, xmm4              ; Filter2 = vp8_filter & hev
        pxor        xmm0, xmm0

        pandn       xmm4, xmm1              ; vp8_filter&=~hev
        pxor        xmm1, xmm1

        punpcklbw   xmm0, xmm4              ; Filter 2 (hi)
        movdqa      xmm5, xmm2

        punpckhbw   xmm1, xmm4              ; Filter 2 (lo)
        paddsb      xmm5, [GLOBAL(t3)]      ; vp8_signed_char_clamp(Filter2 + 3)

        pmulhw      xmm1, [GLOBAL(s9)]      ; Filter 2 (lo) * 9

        pmulhw      xmm0, [GLOBAL(s9)]      ; Filter 2 (hi) * 9

        punpckhbw   xmm7, xmm5              ; axbxcxdx
        paddsb      xmm2, [GLOBAL(t4)]      ; vp8_signed_char_clamp(Filter2 + 4)

        punpcklbw   xmm5, xmm5              ; exfxgxhx
        psraw       xmm7, 11                ; sign extended shift right by 3

        psraw       xmm5, 11                ; sign extended shift right by 3
        punpckhbw   xmm4, xmm2              ; axbxcxdx

        punpcklbw   xmm2, xmm2              ; exfxgxhx
        psraw       xmm4, 11                ; sign extended shift right by 3

        packsswb    xmm5, xmm7              ; Filter2 >>=3;
        psraw       xmm2, 11                ; sign extended shift right by 3

        packsswb    xmm2, xmm4              ; Filter1 >>=3;
        movdqa      xmm7, xmm1
        paddsb      xmm6, xmm5              ; ps0 = ps0 + Filter2
        movdqa      xmm4, xmm1

        psubsb      xmm3, xmm2              ; qs0 = qs0 - Filter1
        movdqa      xmm5, xmm0

        movdqa      xmm2, xmm5
        paddw       xmm0, [GLOBAL(s63)]     ; Filter 2 (hi) * 9 + 63

        paddw       xmm1, [GLOBAL(s63)]     ; Filter 2 (lo) * 9 + 63
        paddw       xmm5, xmm5              ; Filter 2 (hi) * 18

        paddw       xmm7, xmm7              ; Filter 2 (lo) * 18
        paddw       xmm5, xmm0              ; Filter 2 (hi) * 27 + 63

        paddw       xmm7, xmm1              ; Filter 2 (lo) * 27 + 63
        paddw       xmm2, xmm0              ; Filter 2 (hi) * 18 + 63

        paddw       xmm4, xmm1              ; Filter 2 (lo) * 18 + 63
        psraw       xmm0, 7                 ; (Filter 2 (hi) * 9 + 63) >> 7

        psraw       xmm1, 7                 ; (Filter 2 (lo) * 9 + 63) >> 7
        psraw       xmm2, 7                 ; (Filter 2 (hi) * 18 + 63) >> 7

        packsswb    xmm0, xmm1              ; u1 = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
        psraw       xmm4, 7                 ; (Filter 2 (lo) * 18 + 63) >> 7

        psraw       xmm5, 7                 ; (Filter 2 (hi) * 27 + 63) >> 7
        packsswb    xmm2, xmm4              ; u2 = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)

        psraw       xmm7, 7                 ; (Filter 2 (lo) * 27 + 63) >> 7

        packsswb    xmm5, xmm7              ; u3 = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)

        psubsb      xmm3, xmm5              ; sq = vp8_signed_char_clamp(qs0 - u3)
        paddsb      xmm6, xmm5              ; sp = vp8_signed_char_clamp(ps0 + u3)
%if %1 == 0
        movdqa      xmm5, q2                ; q2
        movdqa      xmm1, q1                ; q1
        movdqa      xmm4, p1                ; p1
        movdqa      xmm7, p2                ; p2

%elif %1 == 1
        movdqa      xmm5, XMMWORD PTR [rdi+rcx]   ; q2
        movdqa      xmm1, XMMWORD PTR [rdi]       ; q1
        movdqa      xmm4, XMMWORD PTR [rsi+rax*2] ; p1
        movdqa      xmm7, XMMWORD PTR [rdi+rax*4] ; p2
%elif %1 == 2
        movdqa      xmm5, XMMWORD PTR [rdx+96]    ; q2
        movdqa      xmm1, XMMWORD PTR [rdx+80]    ; q1
        movdqa      xmm4, XMMWORD PTR [rdx+32]    ; p1
        movdqa      xmm7, XMMWORD PTR [rdx+16]    ; p2
%endif

        pxor        xmm3, [GLOBAL(t80)]     ; *oq0 = sq^0x80
        pxor        xmm6, [GLOBAL(t80)]     ; *op0 = sp^0x80
        pxor        xmm1, [GLOBAL(t80)]
        pxor        xmm4, [GLOBAL(t80)]

        psubsb      xmm1, xmm2              ; sq = vp8_signed_char_clamp(qs1 - u2)
        paddsb      xmm4, xmm2              ; sp = vp8_signed_char_clamp(ps1 + u2)
        pxor        xmm1, [GLOBAL(t80)]     ; *oq1 = sq^0x80;
        pxor        xmm4, [GLOBAL(t80)]     ; *op1 = sp^0x80;

        pxor        xmm7, [GLOBAL(t80)]
        pxor        xmm5, [GLOBAL(t80)]
        paddsb      xmm7, xmm0              ; sp = vp8_signed_char_clamp(ps2 + u)
        psubsb      xmm5, xmm0              ; sq = vp8_signed_char_clamp(qs2 - u)

        pxor        xmm7, [GLOBAL(t80)]     ; *op2 = sp^0x80;
        pxor        xmm5, [GLOBAL(t80)]     ; *oq2 = sq^0x80;

%if %1 == 0
        lea         rsi, [rsi+rcx*2]
        lea         rdi, [rdi+rcx*2]

        movq        MMWORD PTR [rsi], xmm6        ; p0
        movhps      MMWORD PTR [rdi], xmm6
        movq        MMWORD PTR [rsi + rcx], xmm3  ; q0
        movhps      MMWORD PTR [rdi + rcx], xmm3

        movq        MMWORD PTR [rsi+rcx*2], xmm1  ; q1
        movhps      MMWORD PTR [rdi+rcx*2], xmm1

        movq        MMWORD PTR [rsi + rax], xmm4  ; p1
        movhps      MMWORD PTR [rdi + rax], xmm4

        movq        MMWORD PTR [rsi+rax*2], xmm7  ; p2
        movhps      MMWORD PTR [rdi+rax*2], xmm7

        lea         rsi, [rsi + rcx]
        lea         rdi, [rdi + rcx]
        movq        MMWORD PTR [rsi+rcx*2], xmm5  ; q2
        movhps      MMWORD PTR [rdi+rcx*2], xmm5
%elif %1 == 1
        movdqa      XMMWORD PTR [rdi+rcx], xmm5   ; q2
        movdqa      XMMWORD PTR [rdi], xmm1       ; q1
        movdqa      XMMWORD PTR [rsi], xmm3       ; q0
        movdqa      XMMWORD PTR [rsi+rax], xmm6   ; p0
        movdqa      XMMWORD PTR [rsi+rax*2], xmm4 ; p1
        movdqa      XMMWORD PTR [rdi+rax*4], xmm7 ; p2
%elif %1 == 2
        movdqa      XMMWORD PTR [rdx+80], xmm1    ; q1
        movdqa      XMMWORD PTR [rdx+64], xmm3    ; q0
        movdqa      XMMWORD PTR [rdx+48], xmm6    ; p0
        movdqa      XMMWORD PTR [rdx+32], xmm4    ; p1
%endif

%endmacro
;void vp8_mbloop_filter_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
global sym(vp8_mbloop_filter_horizontal_edge_sse2)
sym(vp8_mbloop_filter_horizontal_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 32                     ; reserve 32 bytes
    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[16];
    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[16];

        mov         rsi, arg(0)             ;src_ptr
        movsxd      rax, dword ptr arg(1)   ;src_pixel_step

        mov         rdx, arg(3)             ;limit
        movdqa      xmm7, XMMWORD PTR [rdx]

        lea         rdi, [rsi+rax]          ; rdi points to row +1 for indirect addressing

        ; calculate breakout conditions and high edge variance
        LFH_FILTER_AND_HEV_MASK 1
        ; filter and write back the results
        MB_FILTER_AND_WRITEBACK 1

    add rsp, 32
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop rbp
    ret
;void vp8_mbloop_filter_horizontal_edge_uv_sse2
;(
;    unsigned char *u,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
global sym(vp8_mbloop_filter_horizontal_edge_uv_sse2)
sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 96                     ; reserve 96 bytes
    %define q2 [rsp + 0]    ;__declspec(align(16)) char q2[16];
    %define q1 [rsp + 16]   ;__declspec(align(16)) char q1[16];
    %define p2 [rsp + 32]   ;__declspec(align(16)) char p2[16];
    %define p1 [rsp + 48]   ;__declspec(align(16)) char p1[16];
    %define t0 [rsp + 64]   ;__declspec(align(16)) char t0[16];
    %define t1 [rsp + 80]   ;__declspec(align(16)) char t1[16];

        mov         rsi, arg(0)             ; u
        mov         rdi, arg(5)             ; v
        movsxd      rax, dword ptr arg(1)   ; src_pixel_step
        mov         rcx, rax
        neg         rax                     ; negate pitch to deal with above border

        mov         rdx, arg(3)             ;limit
        movdqa      xmm7, XMMWORD PTR [rdx]

        lea         rsi, [rsi + rcx]
        lea         rdi, [rdi + rcx]

        ; calculate breakout conditions and high edge variance
        LFH_FILTER_AND_HEV_MASK 0
        ; filter and write back the results
        MB_FILTER_AND_WRITEBACK 0

    add rsp, 96
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop rbp
    ret
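
; TRANSPOSE_16X8 reads 16 rows of the 8 pixels straddling a vertical edge
; and transposes them into 8 rows of 16 bytes so the horizontal filter
; logic can be reused.  %1: 1 = stay on one plane (step rsi/rdi down by
; 8 rows), 0 = fetch the second 8 rows from the v_ptr in arg(5).
; %2: 1 = spill only rows 2-5 (p1 p0 q0 q1) to srct for the normal filter,
; 0 = spill all 8 transposed rows to srct for the macroblock filter.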
%macro TRANSPOSE_16X8 2
        movq        xmm4, QWORD PTR [rsi]        ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
        movq        xmm1, QWORD PTR [rdi]        ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
        movq        xmm0, QWORD PTR [rsi+2*rax]  ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20
        movq        xmm7, QWORD PTR [rdi+2*rax]  ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30
        movq        xmm5, QWORD PTR [rsi+4*rax]  ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40
        movq        xmm2, QWORD PTR [rdi+4*rax]  ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50

        punpcklbw   xmm4, xmm1                   ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00

        movq        xmm1, QWORD PTR [rdi+2*rcx]  ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70

        movdqa      xmm3, xmm4                   ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
        punpcklbw   xmm0, xmm7                   ; 37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20
        movq        xmm7, QWORD PTR [rsi+2*rcx]  ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60

        punpcklbw   xmm5, xmm2                   ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
%if %1
        lea         rsi, [rsi+rax*8]
%else
        mov         rsi, arg(5)                  ; v_ptr
%endif

        movdqa      xmm6, xmm5                   ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
        punpcklbw   xmm7, xmm1                   ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60

        punpcklwd   xmm5, xmm7                   ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40

        punpckhwd   xmm6, xmm7                   ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44
%if %1
        lea         rdi, [rdi+rax*8]
%else
        lea         rsi, [rsi - 4]
%endif

        punpcklwd   xmm3, xmm0                   ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
%if %1
        lea         rdx, srct
%else
        lea         rdi, [rsi + rax]             ; rdi points to row +1 for indirect addressing
%endif

        movdqa      xmm2, xmm3                   ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
        punpckhwd   xmm4, xmm0                   ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04

        movdqa      xmm7, xmm4                   ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
        punpckhdq   xmm3, xmm5                   ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02

        punpckhdq   xmm7, xmm6                   ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06

        punpckldq   xmm4, xmm6                   ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04

        punpckldq   xmm2, xmm5                   ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00

        movdqa      t0, xmm2                     ; save to free XMM2
        movq        xmm2, QWORD PTR [rsi]        ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80
        movq        xmm6, QWORD PTR [rdi]        ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90
        movq        xmm0, QWORD PTR [rsi+2*rax]  ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0
        movq        xmm5, QWORD PTR [rdi+2*rax]  ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0
        movq        xmm1, QWORD PTR [rsi+4*rax]  ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0

        punpcklbw   xmm2, xmm6                   ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80

        movq        xmm6, QWORD PTR [rdi+4*rax]  ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0

        punpcklbw   xmm0, xmm5                   ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0

        movq        xmm5, QWORD PTR [rsi+2*rcx]  ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0
        punpcklbw   xmm1, xmm6                   ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 c1 d0 c0
        movq        xmm6, QWORD PTR [rdi+2*rcx]  ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0

        punpcklbw   xmm5, xmm6                   ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0

        movdqa      xmm6, xmm1
        punpckhwd   xmm6, xmm5                   ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4

        punpcklwd   xmm1, xmm5                   ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
        movdqa      xmm5, xmm2                   ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80

        punpcklwd   xmm5, xmm0                   ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80

        punpckhwd   xmm2, xmm0                   ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84

        movdqa      xmm0, xmm5
        punpckldq   xmm0, xmm1                   ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80

        punpckhdq   xmm5, xmm1                   ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
        movdqa      xmm1, xmm2                   ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84

        punpckldq   xmm1, xmm6                   ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84

        punpckhdq   xmm2, xmm6                   ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86
        movdqa      xmm6, xmm7                   ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06

        punpcklqdq  xmm6, xmm2                   ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06

        punpckhqdq  xmm7, xmm2                   ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07
%if %2
        movdqa      xmm2, xmm3                   ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
        punpcklqdq  xmm2, xmm5                   ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02

        punpckhqdq  xmm3, xmm5                   ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03

        movdqa      [rdx], xmm2                  ; save 2

        movdqa      xmm5, xmm4                   ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
        punpcklqdq  xmm4, xmm1                   ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04

        movdqa      [rdx+16], xmm3               ; save 3

        punpckhqdq  xmm5, xmm1                   ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05

        movdqa      [rdx+32], xmm4               ; save 4
        movdqa      [rdx+48], xmm5               ; save 5
        movdqa      xmm1, t0                     ; get

        movdqa      xmm2, xmm1
        punpckhqdq  xmm1, xmm0                   ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01

        punpcklqdq  xmm2, xmm0                   ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
%else
        movdqa      [rdx+112], xmm7              ; save 7

        movdqa      [rdx+96], xmm6               ; save 6

        movdqa      xmm2, xmm3                   ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
        punpckhqdq  xmm3, xmm5                   ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03

        punpcklqdq  xmm2, xmm5                   ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02

        movdqa      [rdx+32], xmm2               ; save 2

        movdqa      xmm5, xmm4                   ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
        punpcklqdq  xmm4, xmm1                   ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04

        movdqa      [rdx+48], xmm3               ; save 3

        punpckhqdq  xmm5, xmm1                   ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05

        movdqa      [rdx+64], xmm4               ; save 4
        movdqa      [rdx+80], xmm5               ; save 5
        movdqa      xmm1, t0                     ; get

        movdqa      xmm2, xmm1
        punpckhqdq  xmm1, xmm0                   ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01

        punpcklqdq  xmm2, xmm0                   ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00

        movdqa      [rdx+16], xmm1

        movdqa      [rdx], xmm2
%endif
%endmacro
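
; LFV_FILTER_MASK_HEV_MASK computes the same mask and hev tests as
; LFH_FILTER_AND_HEV_MASK, but on the transposed rows left in registers
; and in srct by TRANSPOSE_16X8.  %1 selects the srct layout it was stored
; with: 1 = normal-filter offsets, 0 = macroblock-filter offsets.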
%macro LFV_FILTER_MASK_HEV_MASK 1
        movdqa      xmm0, xmm6              ; q2
        psubusb     xmm0, xmm7              ; q2-q3

        psubusb     xmm7, xmm6              ; q3-q2
        movdqa      xmm4, xmm5              ; q1

        por         xmm7, xmm0              ; abs (q3-q2)
        psubusb     xmm4, xmm6              ; q1-q2

        movdqa      xmm0, xmm1
        psubusb     xmm6, xmm5              ; q2-q1

        por         xmm6, xmm4              ; abs (q2-q1)
        psubusb     xmm0, xmm2              ; p2 - p3;

        psubusb     xmm2, xmm1              ; p3 - p2;
        por         xmm0, xmm2              ; abs(p2-p3)
%if %1
        movdqa      xmm2, [rdx]             ; p1
%else
        movdqa      xmm2, [rdx+32]          ; p1
%endif
        movdqa      xmm5, xmm2              ; p1
        pmaxub      xmm0, xmm7

        psubusb     xmm5, xmm1              ; p1-p2
        psubusb     xmm1, xmm2              ; p2-p1

        movdqa      xmm7, xmm3              ; p0
        psubusb     xmm7, xmm2              ; p0-p1

        por         xmm1, xmm5              ; abs(p2-p1)
        pmaxub      xmm0, xmm6

        pmaxub      xmm0, xmm1
        movdqa      xmm1, xmm2              ; p1

        psubusb     xmm2, xmm3              ; p1-p0
        lea         rdx, srct

        por         xmm2, xmm7              ; abs(p1-p0)

        movdqa      t0, xmm2                ; save abs(p1-p0)

        pmaxub      xmm0, xmm2

%if %1
        movdqa      xmm5, [rdx+32]          ; q0
        movdqa      xmm7, [rdx+48]          ; q1
%else
        movdqa      xmm5, [rdx+64]          ; q0
        movdqa      xmm7, [rdx+80]          ; q1
%endif
        mov         rdx, arg(3)             ; limit

        movdqa      xmm6, xmm5              ; q0
        movdqa      xmm2, xmm7              ; q1

        psubusb     xmm5, xmm7              ; q0-q1
        psubusb     xmm7, xmm6              ; q1-q0

        por         xmm7, xmm5              ; abs(q1-q0)

        movdqa      t1, xmm7                ; save abs(q1-q0)

        movdqa      xmm4, XMMWORD PTR [rdx] ; limit

        pmaxub      xmm0, xmm7
        mov         rdx, arg(2)             ; blimit

        psubusb     xmm0, xmm4
        movdqa      xmm5, xmm2              ; q1

        psubusb     xmm5, xmm1              ; q1-=p1
        psubusb     xmm1, xmm2              ; p1-=q1

        por         xmm5, xmm1              ; abs(p1-q1)
        movdqa      xmm1, xmm3              ; p0

        pand        xmm5, [GLOBAL(tfe)]     ; set lsb of each byte to zero
        psubusb     xmm1, xmm6              ; p0-q0

        psrlw       xmm5, 1                 ; abs(p1-q1)/2
        psubusb     xmm6, xmm3              ; q0-p0

        movdqa      xmm4, XMMWORD PTR [rdx] ; blimit

        mov         rdx, arg(4)             ; get thresh

        por         xmm1, xmm6              ; abs(q0-p0)
        movdqa      xmm6, t0                ; get abs (p1 - p0)

        paddusb     xmm1, xmm1              ; abs(q0-p0)*2

        movdqa      xmm3, t1                ; get abs (q1 - q0)

        movdqa      xmm7, XMMWORD PTR [rdx]

        paddusb     xmm1, xmm5              ; abs (p0 - q0) *2 + abs(p1-q1)/2
        psubusb     xmm6, xmm7              ; abs(p1 - p0) > thresh

        psubusb     xmm3, xmm7              ; abs(q1 - q0) > thresh
        psubusb     xmm1, xmm4              ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
        por         xmm6, xmm3              ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh

        por         xmm1, xmm0              ; mask
        pcmpeqb     xmm6, xmm0

        pxor        xmm0, xmm0
        pcmpeqb     xmm4, xmm4

        pcmpeqb     xmm1, xmm0
        pxor        xmm4, xmm6
%endmacro
%macro BV_TRANSPOSE 0
        ; xmm1 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        ; xmm6 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
        ; xmm3 = f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        ; xmm7 = f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
        movdqa      xmm2, xmm1              ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        punpcklbw   xmm2, xmm6              ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02

        movdqa      xmm4, xmm3              ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        punpckhbw   xmm1, xmm6              ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        punpcklbw   xmm4, xmm7              ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04

        punpckhbw   xmm3, xmm7              ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84

        movdqa      xmm6, xmm2              ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
        punpcklwd   xmm2, xmm4              ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02

        punpckhwd   xmm6, xmm4              ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
        movdqa      xmm5, xmm1              ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        punpcklwd   xmm1, xmm3              ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82

        punpckhwd   xmm5, xmm3              ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
        ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
        ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
        ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
        ; xmm5 = f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
%endmacro
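
; BV_WRITEBACK scatters the re-transposed columns back into the image: the
; +2 byte offset skips p3/p2 of each 8-pixel row (rsi points at the row
; start, src - 4), so each movd stores the filtered p1 p0 q0 q1.  %1 holds
; four rows starting at rsi, %2 the following four.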
%macro BV_WRITEBACK 2
        movd        [rsi+2], %1
        psrldq      %1, 4

        movd        [rdi+2], %1
        psrldq      %1, 4

        movd        [rsi+2*rax+2], %1
        psrldq      %1, 4

        movd        [rdi+2*rax+2], %1

        movd        [rsi+4*rax+2], %2
        psrldq      %2, 4

        movd        [rdi+4*rax+2], %2
        psrldq      %2, 4

        movd        [rsi+2*rcx+2], %2
        psrldq      %2, 4

        movd        [rdi+2*rcx+2], %2
%endmacro
;void vp8_loop_filter_vertical_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
global sym(vp8_loop_filter_vertical_edge_sse2)
sym(vp8_loop_filter_vertical_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 96                     ; reserve 96 bytes
    %define t0   [rsp + 0]  ;__declspec(align(16)) char t0[16];
    %define t1   [rsp + 16] ;__declspec(align(16)) char t1[16];
    %define srct [rsp + 32] ;__declspec(align(16)) char srct[64];

        mov         rsi, arg(0)             ; src_ptr
        movsxd      rax, dword ptr arg(1)   ; src_pixel_step

        lea         rsi, [rsi - 4]
        lea         rdi, [rsi + rax]        ; rdi points to row +1 for indirect addressing
        lea         rcx, [rax*2+rax]

        ; transpose 16x8 to 8x16, and store the 8-line result on stack.
        TRANSPOSE_16X8 1, 1

        ; calculate filter mask and high edge variance
        LFV_FILTER_MASK_HEV_MASK 1

        ; start work on filters
        B_FILTER 2
        ; transpose and write back - only work on q1, q0, p0, p1
        BV_TRANSPOSE
        ; store 16-line result

        lea         rdx, [rax]
        neg         rdx

        BV_WRITEBACK xmm1, xmm5

        lea         rsi, [rsi+rdx*8]
        lea         rdi, [rdi+rdx*8]
        BV_WRITEBACK xmm2, xmm6

    add rsp, 96
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop rbp
    ret
;void vp8_loop_filter_vertical_edge_uv_sse2
;(
;    unsigned char *u,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
global sym(vp8_loop_filter_vertical_edge_uv_sse2)
sym(vp8_loop_filter_vertical_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 96                     ; reserve 96 bytes
    %define t0   [rsp + 0]  ;__declspec(align(16)) char t0[16];
    %define t1   [rsp + 16] ;__declspec(align(16)) char t1[16];
    %define srct [rsp + 32] ;__declspec(align(16)) char srct[64];

        mov         rsi, arg(0)             ; u_ptr
        movsxd      rax, dword ptr arg(1)   ; src_pixel_step

        lea         rsi, [rsi - 4]
        lea         rdi, [rsi + rax]        ; rdi points to row +1 for indirect addressing
        lea         rcx, [rax+2*rax]

        lea         rdx, srct

        ; transpose 16x8 to 8x16, and store the 8-line result on stack.
        TRANSPOSE_16X8 0, 1

        ; calculate filter mask and high edge variance
        LFV_FILTER_MASK_HEV_MASK 1

        ; start work on filters
        B_FILTER 2
        ; transpose and write back - only work on q1, q0, p0, p1
        BV_TRANSPOSE

        lea         rdi, [rsi + rax]        ; rdi points to row +1 for indirect addressing

        ; store 16-line result
        BV_WRITEBACK xmm1, xmm5

        mov         rsi, arg(0)             ; u_ptr
        lea         rsi, [rsi - 4]
        lea         rdi, [rsi + rax]        ; rdi points to row +1 for indirect addressing
        BV_WRITEBACK xmm2, xmm6

    add rsp, 96
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop rbp
    ret
%macro MBV_TRANSPOSE 0
        movdqa      xmm0, [rdx]             ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
        movdqa      xmm1, xmm0              ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00

        punpcklbw   xmm0, xmm7              ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
        punpckhbw   xmm1, xmm7              ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80

        movdqa      xmm2, [rdx+32]          ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        movdqa      xmm6, xmm2              ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02

        punpcklbw   xmm2, [rdx+48]          ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
        punpckhbw   xmm6, [rdx+48]          ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        movdqa      xmm3, xmm0              ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
        punpcklwd   xmm0, xmm2              ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00

        punpckhwd   xmm3, xmm2              ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
        movdqa      xmm4, xmm1              ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80

        punpcklwd   xmm1, xmm6              ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
        punpckhwd   xmm4, xmm6              ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0

        movdqa      xmm2, [rdx+64]          ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        punpcklbw   xmm2, [rdx+80]          ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04

        movdqa      xmm6, xmm5              ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
        punpcklbw   xmm6, [rdx+112]         ; 77 76 67 66 57 56 47 46 37 36 27 26 17 16 07 06

        movdqa      xmm7, xmm2              ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
        punpcklwd   xmm2, xmm6              ; 37 36 35 34 27 26 25 24 17 16 15 14 07 06 05 04

        punpckhwd   xmm7, xmm6              ; 77 76 75 74 67 66 65 64 57 56 55 54 47 46 45 44
        movdqa      xmm6, xmm0              ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00

        punpckldq   xmm0, xmm2              ; 17 16 15 14 13 12 11 10 07 06 05 04 03 02 01 00
        punpckhdq   xmm6, xmm2              ; 37 36 35 34 33 32 31 30 27 26 25 24 23 22 21 20
%endmacro
%macro MBV_WRITEBACK_1 0
        movq        QWORD PTR [rsi], xmm0
        movhps      MMWORD PTR [rdi], xmm0

        movq        QWORD PTR [rsi+2*rax], xmm6
        movhps      MMWORD PTR [rdi+2*rax], xmm6

        movdqa      xmm0, xmm3              ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
        punpckldq   xmm0, xmm7              ; 57 56 55 54 53 52 51 50 47 46 45 44 43 42 41 40

        punpckhdq   xmm3, xmm7              ; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60

        movq        QWORD PTR [rsi+4*rax], xmm0
        movhps      MMWORD PTR [rdi+4*rax], xmm0

        movq        QWORD PTR [rsi+2*rcx], xmm3
        movhps      MMWORD PTR [rdi+2*rcx], xmm3

        movdqa      xmm2, [rdx+64]          ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        punpckhbw   xmm2, [rdx+80]          ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84

        punpckhbw   xmm5, [rdx+112]         ; f7 f6 e7 e6 d7 d6 c7 c6 b7 b6 a7 a6 97 96 87 86
        movdqa      xmm0, xmm2
        punpcklwd   xmm0, xmm5              ; b7 b6 b5 b4 a7 a6 a5 a4 97 96 95 94 87 86 85 84
        punpckhwd   xmm2, xmm5              ; f7 f6 f5 f4 e7 e6 e5 e4 d7 d6 d5 d4 c7 c6 c5 c4

        movdqa      xmm5, xmm1              ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
        punpckldq   xmm1, xmm0              ; 97 96 95 94 93 92 91 90 87 86 85 84 83 82 81 80
        punpckhdq   xmm5, xmm0              ; b7 b6 b5 b4 b3 b2 b1 b0 a7 a6 a5 a4 a3 a2 a1 a0
%endmacro

%macro MBV_WRITEBACK_2 0
        movq        QWORD PTR [rsi], xmm1
        movhps      MMWORD PTR [rdi], xmm1

        movq        QWORD PTR [rsi+2*rax], xmm5
        movhps      MMWORD PTR [rdi+2*rax], xmm5

        movdqa      xmm1, xmm4              ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
        punpckldq   xmm1, xmm2              ; d7 d6 d5 d4 d3 d2 d1 d0 c7 c6 c5 c4 c3 c2 c1 c0
        punpckhdq   xmm4, xmm2              ; f7 f6 f5 f4 f3 f2 f1 f0 e7 e6 e5 e4 e3 e2 e1 e0
        movq        QWORD PTR [rsi+4*rax], xmm1
        movhps      MMWORD PTR [rdi+4*rax], xmm1

        movq        QWORD PTR [rsi+2*rcx], xmm4
        movhps      MMWORD PTR [rdi+2*rcx], xmm4
%endmacro
;void vp8_mbloop_filter_vertical_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
global sym(vp8_mbloop_filter_vertical_edge_sse2)
sym(vp8_mbloop_filter_vertical_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 160                    ; reserve 160 bytes
    %define t0   [rsp + 0]  ;__declspec(align(16)) char t0[16];
    %define t1   [rsp + 16] ;__declspec(align(16)) char t1[16];
    %define srct [rsp + 32] ;__declspec(align(16)) char srct[128];

        mov         rsi, arg(0)             ; src_ptr
        movsxd      rax, dword ptr arg(1)   ; src_pixel_step

        lea         rsi, [rsi - 4]
        lea         rdi, [rsi + rax]        ; rdi points to row +1 for indirect addressing
        lea         rcx, [rax*2+rax]

        ; Transpose
        TRANSPOSE_16X8 1, 0

        ; calculate filter mask and high edge variance
        LFV_FILTER_MASK_HEV_MASK 0

        neg         rax
        ; start work on filters
        MB_FILTER_AND_WRITEBACK 2

        lea         rsi, [rsi+rax*8]
        lea         rdi, [rdi+rax*8]

        ; transpose and write back
        MBV_TRANSPOSE

        neg         rax

        MBV_WRITEBACK_1

        lea         rsi, [rsi+rax*8]
        lea         rdi, [rdi+rax*8]
        MBV_WRITEBACK_2

    add rsp, 160
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop rbp
    ret
;void vp8_mbloop_filter_vertical_edge_uv_sse2
;(
;    unsigned char *u,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
global sym(vp8_mbloop_filter_vertical_edge_uv_sse2)
sym(vp8_mbloop_filter_vertical_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 160                    ; reserve 160 bytes
    %define t0   [rsp + 0]  ;__declspec(align(16)) char t0[16];
    %define t1   [rsp + 16] ;__declspec(align(16)) char t1[16];
    %define srct [rsp + 32] ;__declspec(align(16)) char srct[128];

        mov         rsi, arg(0)             ; u_ptr
        movsxd      rax, dword ptr arg(1)   ; src_pixel_step

        lea         rsi, [rsi - 4]
        lea         rdi, [rsi + rax]        ; rdi points to row +1 for indirect addressing
        lea         rcx, [rax+2*rax]

        lea         rdx, srct

        ; Transpose
        TRANSPOSE_16X8 0, 0

        ; calculate filter mask and high edge variance
        LFV_FILTER_MASK_HEV_MASK 0

        ; start work on filters
        MB_FILTER_AND_WRITEBACK 2

        ; transpose and write back
        MBV_TRANSPOSE

        mov         rsi, arg(0)             ;u_ptr
        lea         rsi, [rsi - 4]
        lea         rdi, [rsi + rax]
        MBV_WRITEBACK_1

        mov         rsi, arg(5)             ;v_ptr
        lea         rsi, [rsi - 4]
        lea         rdi, [rsi + rax]
        MBV_WRITEBACK_2

    add rsp, 160
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop rbp
    ret
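
; The "simple" loop filter below adjusts only p0/q0 and uses a single
; blimit threshold.  Sketch of the arithmetic, mirroring the comments in
; the body (clamp() is signed-byte saturation):
;
;   mask = abs(p0-q0)*2 + abs(p1-q1)/2 <= blimit
;   F    = clamp(3 * (q0 - p0) + (p1 - q1)) & mask
;   q0   = clamp(q0 - (clamp(F + 4) >> 3))
;   p0   = clamp(p0 + (clamp(F + 3) >> 3))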
;void vp8_loop_filter_simple_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit
;)
global sym(vp8_loop_filter_simple_horizontal_edge_sse2)
sym(vp8_loop_filter_simple_horizontal_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog
        mov         rsi, arg(0)             ;src_ptr
        movsxd      rax, dword ptr arg(1)   ;src_pixel_step     ; destination pitch?
        mov         rdx, arg(2)             ;blimit
        movdqa      xmm3, XMMWORD PTR [rdx]

        mov         rdi, rsi                ; rdi points to row +1 for indirect addressing
        add         rdi, rax
        neg         rax

        ; calculate mask
        movdqa      xmm1, [rsi+2*rax]       ; p1
        movdqa      xmm0, [rdi]             ; q1
        movdqa      xmm2, xmm1
        movdqa      xmm7, xmm0
        movdqa      xmm4, xmm0
        psubusb     xmm0, xmm1              ; q1-=p1
        psubusb     xmm1, xmm4              ; p1-=q1
        por         xmm1, xmm0              ; abs(p1-q1)
        pand        xmm1, [GLOBAL(tfe)]     ; set lsb of each byte to zero
        psrlw       xmm1, 1                 ; abs(p1-q1)/2

        movdqa      xmm5, [rsi+rax]         ; p0
        movdqa      xmm4, [rsi]             ; q0
        movdqa      xmm0, xmm4              ; q0
        movdqa      xmm6, xmm5              ; p0
        psubusb     xmm5, xmm4              ; p0-=q0
        psubusb     xmm4, xmm6              ; q0-=p0
        por         xmm5, xmm4              ; abs(p0 - q0)
        paddusb     xmm5, xmm5              ; abs(p0-q0)*2
        paddusb     xmm5, xmm1              ; abs (p0 - q0) *2 + abs(p1-q1)/2

        psubusb     xmm5, xmm3              ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
        pxor        xmm3, xmm3
        pcmpeqb     xmm5, xmm3

        ; start work on filters
        pxor        xmm2, [GLOBAL(t80)]     ; p1 offset to convert to signed values
        pxor        xmm7, [GLOBAL(t80)]     ; q1 offset to convert to signed values
        psubsb      xmm2, xmm7              ; p1 - q1

        pxor        xmm6, [GLOBAL(t80)]     ; offset to convert to signed values
        pxor        xmm0, [GLOBAL(t80)]     ; offset to convert to signed values
        movdqa      xmm3, xmm0              ; q0
        psubsb      xmm0, xmm6              ; q0 - p0
        paddsb      xmm2, xmm0              ; p1 - q1 + 1 * (q0 - p0)
        paddsb      xmm2, xmm0              ; p1 - q1 + 2 * (q0 - p0)
        paddsb      xmm2, xmm0              ; p1 - q1 + 3 * (q0 - p0)
        pand        xmm5, xmm2              ; mask filter values we don't care about

        ; do + 4 side
        paddsb      xmm5, [GLOBAL(t4)]      ; 3* (q0 - p0) + (p1 - q1) + 4
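
        ; SSE2 has no per-byte arithmetic shift, so the F >> 3 below is
        ; synthesized from word shifts: one copy moves each low byte into
        ; the high half (psllw 8), sign-shifts (psraw 3) and moves it back
        ; (psrlw 8); the other copy handles the high bytes with psraw 11 /
        ; psllw 8, and por merges the two halves.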
        movdqa      xmm0, xmm5              ; get a copy of filters
        psllw       xmm0, 8                 ; shift left 8
        psraw       xmm0, 3                 ; arithmetic shift right 3
        psrlw       xmm0, 8
        movdqa      xmm1, xmm5              ; get a copy of filters
        psraw       xmm1, 11                ; arithmetic shift right 11
        psllw       xmm1, 8                 ; shift left 8 to put it back

        por         xmm0, xmm1              ; put the two together to get result

        psubsb      xmm3, xmm0              ; q0-= q0 add
        pxor        xmm3, [GLOBAL(t80)]     ; unoffset
        movdqa      [rsi], xmm3             ; write back

        ; now do +3 side
        psubsb      xmm5, [GLOBAL(t1s)]     ; +3 instead of +4

        movdqa      xmm0, xmm5              ; get a copy of filters
        psllw       xmm0, 8                 ; shift left 8
        psraw       xmm0, 3                 ; arithmetic shift right 3
        psrlw       xmm0, 8
        psraw       xmm5, 11                ; arithmetic shift right 11
        psllw       xmm5, 8                 ; shift left 8 to put it back
        por         xmm0, xmm5              ; put the two together to get result

        paddsb      xmm6, xmm0              ; p0+= p0 add
        pxor        xmm6, [GLOBAL(t80)]     ; unoffset
        movdqa      [rsi+rax], xmm6         ; write back

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop rbp
    ret
;void vp8_loop_filter_simple_vertical_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit
;)
global sym(vp8_loop_filter_simple_vertical_edge_sse2)
sym(vp8_loop_filter_simple_vertical_edge_sse2):
    push        rbp                         ; save old base pointer value.
    mov         rbp, rsp                    ; set new base pointer value.
    SHADOW_ARGS_TO_STACK 3
    SAVE_XMM 7
    GET_GOT     rbx                         ; save callee-saved reg
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 32                     ; reserve 32 bytes
    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[16];
    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[16];
        mov         rsi, arg(0)             ;src_ptr
        movsxd      rax, dword ptr arg(1)   ;src_pixel_step     ; destination pitch?

        lea         rsi, [rsi - 2]
        lea         rdi, [rsi + rax]
        lea         rdx, [rsi + rax*4]
        lea         rcx, [rdx + rax]

        movd        xmm0, [rsi]             ; (high 96 bits unused) 03 02 01 00
        movd        xmm1, [rdx]             ; (high 96 bits unused) 43 42 41 40
        movd        xmm2, [rdi]             ; 13 12 11 10
        movd        xmm3, [rcx]             ; 53 52 51 50
        punpckldq   xmm0, xmm1              ; (high 64 bits unused) 43 42 41 40 03 02 01 00
        punpckldq   xmm2, xmm3              ; 53 52 51 50 13 12 11 10

        movd        xmm4, [rsi + rax*2]     ; 23 22 21 20
        movd        xmm5, [rdx + rax*2]     ; 63 62 61 60
        movd        xmm6, [rdi + rax*2]     ; 33 32 31 30
        movd        xmm7, [rcx + rax*2]     ; 73 72 71 70
        punpckldq   xmm4, xmm5              ; 63 62 61 60 23 22 21 20
        punpckldq   xmm6, xmm7              ; 73 72 71 70 33 32 31 30

        punpcklbw   xmm0, xmm2              ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
        punpcklbw   xmm4, xmm6              ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20

        movdqa      xmm1, xmm0
        punpcklwd   xmm0, xmm4              ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
        punpckhwd   xmm1, xmm4              ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40

        movdqa      xmm2, xmm0
        punpckldq   xmm0, xmm1              ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
        punpckhdq   xmm2, xmm1              ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02

        movdqa      t0, xmm0                ; save to t0
        movdqa      t1, xmm2                ; save to t1

        lea         rsi, [rsi + rax*8]
        lea         rdi, [rsi + rax]
        lea         rdx, [rsi + rax*4]
        lea         rcx, [rdx + rax]

        movd        xmm4, [rsi]             ; 83 82 81 80
        movd        xmm1, [rdx]             ; c3 c2 c1 c0
        movd        xmm6, [rdi]             ; 93 92 91 90
        movd        xmm3, [rcx]             ; d3 d2 d1 d0
        punpckldq   xmm4, xmm1              ; c3 c2 c1 c0 83 82 81 80
        punpckldq   xmm6, xmm3              ; d3 d2 d1 d0 93 92 91 90

        movd        xmm0, [rsi + rax*2]     ; a3 a2 a1 a0
        movd        xmm5, [rdx + rax*2]     ; e3 e2 e1 e0
        movd        xmm2, [rdi + rax*2]     ; b3 b2 b1 b0
        movd        xmm7, [rcx + rax*2]     ; f3 f2 f1 f0
        punpckldq   xmm0, xmm5              ; e3 e2 e1 e0 a3 a2 a1 a0
        punpckldq   xmm2, xmm7              ; f3 f2 f1 f0 b3 b2 b1 b0

        punpcklbw   xmm4, xmm6              ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80
        punpcklbw   xmm0, xmm2              ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0

        movdqa      xmm1, xmm4
        punpcklwd   xmm4, xmm0              ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
        punpckhwd   xmm1, xmm0              ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0

        movdqa      xmm6, xmm4
        punpckldq   xmm4, xmm1              ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
        punpckhdq   xmm6, xmm1              ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82

        movdqa      xmm0, t0                ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
        movdqa      xmm2, t1                ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
        movdqa      xmm1, xmm0
        movdqa      xmm3, xmm2

        punpcklqdq  xmm0, xmm4              ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
        punpckhqdq  xmm1, xmm4              ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
        punpcklqdq  xmm2, xmm6              ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        punpckhqdq  xmm3, xmm6              ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03

        ; calculate mask
        movdqa      xmm6, xmm0              ; p1
        movdqa      xmm7, xmm3              ; q1
        psubusb     xmm7, xmm0              ; q1-=p1
        psubusb     xmm6, xmm3              ; p1-=q1
        por         xmm6, xmm7              ; abs(p1-q1)
        pand        xmm6, [GLOBAL(tfe)]     ; set lsb of each byte to zero
        psrlw       xmm6, 1                 ; abs(p1-q1)/2

        movdqa      xmm5, xmm1              ; p0
        movdqa      xmm4, xmm2              ; q0
        psubusb     xmm5, xmm2              ; p0-=q0
        psubusb     xmm4, xmm1              ; q0-=p0
        por         xmm5, xmm4              ; abs(p0 - q0)
        paddusb     xmm5, xmm5              ; abs(p0-q0)*2
        paddusb     xmm5, xmm6              ; abs (p0 - q0) *2 + abs(p1-q1)/2

        mov         rdx, arg(2)             ;blimit
        movdqa      xmm7, XMMWORD PTR [rdx]

        psubusb     xmm5, xmm7              ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
        pxor        xmm7, xmm7
        pcmpeqb     xmm5, xmm7              ; xmm5 = mask

        ; start work on filters
        movdqa      t0, xmm0
        movdqa      t1, xmm3

        pxor        xmm0, [GLOBAL(t80)]     ; p1 offset to convert to signed values
        pxor        xmm3, [GLOBAL(t80)]     ; q1 offset to convert to signed values

        psubsb      xmm0, xmm3              ; p1 - q1
        movdqa      xmm6, xmm1              ; p0

        movdqa      xmm7, xmm2              ; q0
        pxor        xmm6, [GLOBAL(t80)]     ; offset to convert to signed values

        pxor        xmm7, [GLOBAL(t80)]     ; offset to convert to signed values
        movdqa      xmm3, xmm7              ; offset q0
        psubsb      xmm7, xmm6              ; q0 - p0
        paddsb      xmm0, xmm7              ; p1 - q1 + 1 * (q0 - p0)

        paddsb      xmm0, xmm7              ; p1 - q1 + 2 * (q0 - p0)
        paddsb      xmm0, xmm7              ; p1 - q1 + 3 * (q0 - p0)

        pand        xmm5, xmm0              ; mask filter values we don't care about

        paddsb      xmm5, [GLOBAL(t4)]      ; 3* (q0 - p0) + (p1 - q1) + 4

        movdqa      xmm0, xmm5              ; get a copy of filters
        psllw       xmm0, 8                 ; shift left 8
        psraw       xmm0, 3                 ; arithmetic shift right 3
        psrlw       xmm0, 8

        movdqa      xmm7, xmm5              ; get a copy of filters
        psraw       xmm7, 11                ; arithmetic shift right 11

        psllw       xmm7, 8                 ; shift left 8 to put it back
        por         xmm0, xmm7              ; put the two together to get result

        psubsb      xmm3, xmm0              ; q0-= q0sz add
        pxor        xmm3, [GLOBAL(t80)]     ; unoffset q0

        ; now do +3 side
        psubsb      xmm5, [GLOBAL(t1s)]     ; +3 instead of +4
        movdqa      xmm0, xmm5              ; get a copy of filters

        psllw       xmm0, 8                 ; shift left 8
        psraw       xmm0, 3                 ; arithmetic shift right 3
        psrlw       xmm0, 8
        psraw       xmm5, 11                ; arithmetic shift right 11

        psllw       xmm5, 8                 ; shift left 8 to put it back
        por         xmm0, xmm5              ; put the two together to get result

        paddsb      xmm6, xmm0              ; p0+= p0 add
        pxor        xmm6, [GLOBAL(t80)]     ; unoffset p0

        movdqa      xmm0, t0                ; p1
        movdqa      xmm4, t1                ; q1

        ; transpose back to write out
        ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
        ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
        ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
        movdqa      xmm1, xmm0
        punpcklbw   xmm0, xmm6              ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
        punpckhbw   xmm1, xmm6              ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80

        movdqa      xmm5, xmm3
        punpcklbw   xmm3, xmm4              ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
        punpckhbw   xmm5, xmm4              ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        movdqa      xmm2, xmm0
        punpcklwd   xmm0, xmm3              ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
        punpckhwd   xmm2, xmm3              ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40

        movdqa      xmm3, xmm1
        punpcklwd   xmm1, xmm5              ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
        punpckhwd   xmm3, xmm5              ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0

        ; write out order: xmm0 xmm2 xmm1 xmm3
        lea         rdx, [rsi + rax*4]

        movd        [rsi], xmm1             ; write the second 8-line result
        psrldq      xmm1, 4
        movd        [rdi], xmm1
        psrldq      xmm1, 4
        movd        [rsi + rax*2], xmm1
        psrldq      xmm1, 4
        movd        [rdi + rax*2], xmm1

        movd        [rdx], xmm3
        psrldq      xmm3, 4
        movd        [rcx], xmm3
        psrldq      xmm3, 4
        movd        [rdx + rax*2], xmm3
        psrldq      xmm3, 4
        movd        [rcx + rax*2], xmm3

        neg         rax
        lea         rsi, [rsi + rax*8]
        neg         rax
        lea         rdi, [rsi + rax]
        lea         rdx, [rsi + rax*4]
        lea         rcx, [rdx + rax]

        movd        [rsi], xmm0             ; write the first 8-line result
        psrldq      xmm0, 4
        movd        [rdi], xmm0
        psrldq      xmm0, 4
        movd        [rsi + rax*2], xmm0
        psrldq      xmm0, 4
        movd        [rdi + rax*2], xmm0

        movd        [rdx], xmm2
        psrldq      xmm2, 4
        movd        [rcx], xmm2
        psrldq      xmm2, 4
        movd        [rdx + rax*2], xmm2
        psrldq      xmm2, 4
        movd        [rcx + rax*2], xmm2

    add rsp, 32
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop rbp
    ret
SECTION_RODATA
align 16
tfe:
    times 16 db 0xfe
align 16
t80:
    times 16 db 0x80
align 16
t1s:
    times 16 db 0x01
align 16
t3:
    times 16 db 0x03
align 16
t4:
    times 16 db 0x04
align 16
ones:
    times 8 dw 0x0001
align 16
s9:
    times 8 dw 0x0900
align 16
s63:
    times 8 dw 0x003f