Merge "vp8_rd_pick_best_mbsegmentation code restructure"
[libvpx.git] / vp8 / common / x86 / loopfilter_sse2.asm
blob849133dc42ae1b5aee3214ff8cab46ddab9c18e3
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"

; Use of pmaxub instead of psubusb to compute filter mask was seen
; in ffvp8
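
; In scalar terms the mask built below is roughly (a sketch, not code from
; this file):
;
;     m = max(abs(p3-p2), abs(p2-p1), abs(p1-p0),
;             abs(q1-q0), abs(q2-q1), abs(q3-q2));
;     filter this edge only if m <= limit
;
; Folding every difference into one running byte-wise maximum with pmaxub
; lets a single saturating subtract against the limit decide the whole
; mask, instead of one psubusb/por pair per difference.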

%macro LFH_FILTER_AND_HEV_MASK 1
%if %1
        movdqa      xmm2, [rdi+2*rax]       ; q3
        movdqa      xmm1, [rsi+2*rax]       ; q2
        movdqa      xmm4, [rsi+rax]         ; q1
        movdqa      xmm5, [rsi]             ; q0
        neg         rax                     ; negate pitch to deal with above border
%else
        movlps      xmm2, [rsi + rcx*2]     ; q3
        movlps      xmm1, [rsi + rcx]       ; q2
        movlps      xmm4, [rsi]             ; q1
        movlps      xmm5, [rsi + rax]       ; q0

        movhps      xmm2, [rdi + rcx*2]
        movhps      xmm1, [rdi + rcx]
        movhps      xmm4, [rdi]
        movhps      xmm5, [rdi + rax]

        lea         rsi, [rsi + rax*4]
        lea         rdi, [rdi + rax*4]

        movdqa      XMMWORD PTR [rsp], xmm1         ; store q2
        movdqa      XMMWORD PTR [rsp + 16], xmm4    ; store q1
%endif

        movdqa      xmm6, xmm1              ; q2
        movdqa      xmm3, xmm4              ; q1

        psubusb     xmm1, xmm2              ; q2-=q3
        psubusb     xmm2, xmm6              ; q3-=q2

        psubusb     xmm4, xmm6              ; q1-=q2
        psubusb     xmm6, xmm3              ; q2-=q1

        por         xmm4, xmm6              ; abs(q2-q1)
        por         xmm1, xmm2              ; abs(q3-q2)

        movdqa      xmm0, xmm5              ; q0
        pmaxub      xmm1, xmm4

        psubusb     xmm5, xmm3              ; q0-=q1
        psubusb     xmm3, xmm0              ; q1-=q0

        por         xmm5, xmm3              ; abs(q0-q1)
        movdqa      t0, xmm5                ; save to t0

        pmaxub      xmm1, xmm5

%if %1
        movdqa      xmm2, [rsi+4*rax]       ; p3
        movdqa      xmm4, [rdi+4*rax]       ; p2
        movdqa      xmm6, [rsi+2*rax]       ; p1
%else
        movlps      xmm2, [rsi + rax]       ; p3
        movlps      xmm4, [rsi]             ; p2
        movlps      xmm6, [rsi + rcx]       ; p1

        movhps      xmm2, [rdi + rax]
        movhps      xmm4, [rdi]
        movhps      xmm6, [rdi + rcx]

        movdqa      XMMWORD PTR [rsp + 32], xmm4    ; store p2
        movdqa      XMMWORD PTR [rsp + 48], xmm6    ; store p1
%endif

        movdqa      xmm5, xmm4              ; p2
        movdqa      xmm3, xmm6              ; p1

        psubusb     xmm4, xmm2              ; p2-=p3
        psubusb     xmm2, xmm5              ; p3-=p2

        psubusb     xmm3, xmm5              ; p1-=p2
        pmaxub      xmm1, xmm4              ; abs(p3 - p2)

        psubusb     xmm5, xmm6              ; p2-=p1
        pmaxub      xmm1, xmm2              ; abs(p3 - p2)

        pmaxub      xmm1, xmm5              ; abs(p2 - p1)
        movdqa      xmm2, xmm6              ; p1

        pmaxub      xmm1, xmm3              ; abs(p2 - p1)
%if %1
        movdqa      xmm4, [rsi+rax]         ; p0
        movdqa      xmm3, [rdi]             ; q1
%else
        movlps      xmm4, [rsi + rcx*2]     ; p0
        movhps      xmm4, [rdi + rcx*2]
        movdqa      xmm3, q1                ; q1
%endif

        movdqa      xmm5, xmm4              ; p0
        psubusb     xmm4, xmm6              ; p0-=p1

        psubusb     xmm6, xmm5              ; p1-=p0

        por         xmm6, xmm4              ; abs(p1 - p0)
        mov         rdx, arg(2)             ; get flimit

        movdqa      t1, xmm6                ; save to t1

        movdqa      xmm4, xmm3              ; q1
        pmaxub      xmm1, xmm6

        psubusb     xmm3, xmm2              ; q1-=p1
        psubusb     xmm2, xmm4              ; p1-=q1

        psubusb     xmm1, xmm7
        por         xmm2, xmm3              ; abs(p1-q1)

        movdqa      xmm4, XMMWORD PTR [rdx] ; flimit

        movdqa      xmm3, xmm0              ; q0
        pand        xmm2, [GLOBAL(tfe)]     ; set lsb of each byte to zero

        mov         rdx, arg(4)             ; hev get thresh

        movdqa      xmm6, xmm5              ; p0
        psrlw       xmm2, 1                 ; abs(p1-q1)/2

        psubusb     xmm5, xmm3              ; p0-=q0
        paddb       xmm4, xmm4              ; flimit*2 (less than 255)

        psubusb     xmm3, xmm6              ; q0-=p0
        por         xmm5, xmm3              ; abs(p0 - q0)

        paddusb     xmm5, xmm5              ; abs(p0-q0)*2
        paddb       xmm7, xmm4              ; flimit * 2 + limit (less than 255)

        movdqa      xmm4, t0                ; hev get abs (q1 - q0)

        movdqa      xmm3, t1                ; get abs (p1 - p0)

        paddusb     xmm5, xmm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2

        movdqa      xmm2, XMMWORD PTR [rdx] ; hev

        psubusb     xmm5, xmm7              ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
        psubusb     xmm4, xmm2              ; hev

        psubusb     xmm3, xmm2              ; hev
        por         xmm1, xmm5

        pxor        xmm7, xmm7
        paddb       xmm4, xmm3              ; hev abs(q1 - q0) > thresh || abs(p1 - p0) > thresh

        pcmpeqb     xmm4, xmm5              ; hev
        pcmpeqb     xmm3, xmm3              ; hev

        pcmpeqb     xmm1, xmm7              ; mask xmm1
        pxor        xmm4, xmm3              ; hev
%endmacro
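
; For reference, the macro above corresponds roughly to this scalar decision
; (a sketch of the VP8 filter-mask logic, not code from this file; blimit
; stands for flimit*2 + limit):
;
;     mask = (abs(p3-p2) <= limit) && (abs(p2-p1) <= limit)
;         && (abs(p1-p0) <= limit) && (abs(q1-q0) <= limit)
;         && (abs(q2-q1) <= limit) && (abs(q3-q2) <= limit)
;         && (abs(p0-q0)*2 + abs(p1-q1)/2 <= blimit);
;     hev  = (abs(p1-p0) > thresh) || (abs(q1-q0) > thresh);
;
; On exit each byte of xmm1 is 0xff where the edge should be filtered, and
; each byte of xmm4 is 0xff where the high-edge-variance path applies.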

%macro B_FILTER 1
%if %1 == 0
        movdqa      xmm2, p1                ; p1
        movdqa      xmm7, q1                ; q1
%elif %1 == 1
        movdqa      xmm2, [rsi+2*rax]       ; p1
        movdqa      xmm7, [rdi]             ; q1
%elif %1 == 2
        lea         rdx, srct

        movdqa      xmm2, [rdx]             ; p1
        movdqa      xmm7, [rdx+48]          ; q1
        movdqa      xmm6, [rdx+16]          ; p0
        movdqa      xmm0, [rdx+32]          ; q0
%endif

        pxor        xmm2, [GLOBAL(t80)]     ; p1 offset to convert to signed values
        pxor        xmm7, [GLOBAL(t80)]     ; q1 offset to convert to signed values

        psubsb      xmm2, xmm7              ; p1 - q1
        pxor        xmm6, [GLOBAL(t80)]     ; offset to convert to signed values

        pand        xmm2, xmm4              ; high var mask (hvm)(p1 - q1)
        pxor        xmm0, [GLOBAL(t80)]     ; offset to convert to signed values

        movdqa      xmm3, xmm0              ; q0
        psubsb      xmm0, xmm6              ; q0 - p0

        paddsb      xmm2, xmm0              ; 1 * (q0 - p0) + hvm(p1 - q1)

        paddsb      xmm2, xmm0              ; 2 * (q0 - p0) + hvm(p1 - q1)

        paddsb      xmm2, xmm0              ; 3 * (q0 - p0) + hvm(p1 - q1)

        pand        xmm1, xmm2              ; mask filter values we don't care about

        movdqa      xmm2, xmm1

        paddsb      xmm1, [GLOBAL(t4)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 4
        paddsb      xmm2, [GLOBAL(t3)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 3

        punpckhbw   xmm5, xmm2              ; axbxcxdx
        punpcklbw   xmm2, xmm2              ; exfxgxhx

        punpcklbw   xmm0, xmm1              ; exfxgxhx
        psraw       xmm5, 11                ; sign extended shift right by 3

        punpckhbw   xmm1, xmm1              ; axbxcxdx
        psraw       xmm2, 11                ; sign extended shift right by 3

        packsswb    xmm2, xmm5              ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
        psraw       xmm0, 11                ; sign extended shift right by 3

        psraw       xmm1, 11                ; sign extended shift right by 3
        movdqa      xmm5, xmm0              ; save results

        packsswb    xmm0, xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
        paddsw      xmm5, [GLOBAL(ones)]

        paddsw      xmm1, [GLOBAL(ones)]
        psraw       xmm5, 1                 ; partial shifted one more time for 2nd tap

        psraw       xmm1, 1                 ; partial shifted one more time for 2nd tap
        paddsb      xmm6, xmm2              ; p0+= p0 add

        packsswb    xmm5, xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4

%if %1 == 0
        movdqa      xmm1, p1                ; p1
%elif %1 == 1
        movdqa      xmm1, [rsi+2*rax]       ; p1
%elif %1 == 2
        movdqa      xmm1, [rdx]             ; p1
%endif
        pandn       xmm4, xmm5              ; high edge variance additive
        pxor        xmm6, [GLOBAL(t80)]     ; unoffset

        pxor        xmm1, [GLOBAL(t80)]     ; reoffset
        psubsb      xmm3, xmm0              ; q0-= q0 add

        paddsb      xmm1, xmm4              ; p1+= p1 add
        pxor        xmm3, [GLOBAL(t80)]     ; unoffset

        pxor        xmm1, [GLOBAL(t80)]     ; unoffset
        psubsb      xmm7, xmm4              ; q1-= q1 add

        pxor        xmm7, [GLOBAL(t80)]     ; unoffset
%if %1 == 0
        lea         rsi, [rsi + rcx*2]
        lea         rdi, [rdi + rcx*2]
        movq        MMWORD PTR [rsi], xmm6              ; p0
        movhps      MMWORD PTR [rdi], xmm6
        movq        MMWORD PTR [rsi + rax], xmm1        ; p1
        movhps      MMWORD PTR [rdi + rax], xmm1
        movq        MMWORD PTR [rsi + rcx], xmm3        ; q0
        movhps      MMWORD PTR [rdi + rcx], xmm3
        movq        MMWORD PTR [rsi + rcx*2], xmm7      ; q1
        movhps      MMWORD PTR [rdi + rcx*2], xmm7
%elif %1 == 1
        movdqa      [rsi+rax], xmm6         ; write back
        movdqa      [rsi+2*rax], xmm1       ; write back
        movdqa      [rsi], xmm3             ; write back
        movdqa      [rdi], xmm7             ; write back
%endif

%endmacro
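
; The filter arithmetic above is, in rough scalar form (a sketch, not code
; from this file):
;
;     F  = clamp(3*(q0-p0) + (hev ? p1-q1 : 0)) & mask;   signed-char clamp
;     F1 = clamp(F + 4) >> 3;    q0 -= F1
;     F2 = clamp(F + 3) >> 3;    p0 += F2
;     u  = (F1 + 1) >> 1;        p1 += u, q1 -= u   (only where !hev)
;
; Bytes are widened to words by interleaving each register with itself so
; psraw can do the sign-extended >>3, then packsswb re-narrows the lanes.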

;void vp8_loop_filter_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
global sym(vp8_loop_filter_horizontal_edge_sse2)
sym(vp8_loop_filter_horizontal_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 32                     ; reserve 32 bytes
    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[16];
    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[16];

    mov         rsi, arg(0)                 ;src_ptr
    movsxd      rax, dword ptr arg(1)       ;src_pixel_step

    mov         rdx, arg(3)                 ;limit
    movdqa      xmm7, XMMWORD PTR [rdx]

    lea         rdi, [rsi+rax]              ; rdi points to row +1 for indirect addressing

    ; calculate breakout conditions and high edge variance
    LFH_FILTER_AND_HEV_MASK 1
    ; filter and write back the result
    B_FILTER 1

    add rsp, 32
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_loop_filter_horizontal_edge_uv_sse2
;(
;    unsigned char *u,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
global sym(vp8_loop_filter_horizontal_edge_uv_sse2)
sym(vp8_loop_filter_horizontal_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 96                     ; reserve 96 bytes
    %define q2 [rsp + 0]    ;__declspec(align(16)) char q2[16];
    %define q1 [rsp + 16]   ;__declspec(align(16)) char q1[16];
    %define p2 [rsp + 32]   ;__declspec(align(16)) char p2[16];
    %define p1 [rsp + 48]   ;__declspec(align(16)) char p1[16];
    %define t0 [rsp + 64]   ;__declspec(align(16)) char t0[16];
    %define t1 [rsp + 80]   ;__declspec(align(16)) char t1[16];

    mov         rsi, arg(0)                 ; u
    mov         rdi, arg(5)                 ; v
    movsxd      rax, dword ptr arg(1)       ; src_pixel_step
    mov         rcx, rax
    neg         rax                         ; negate pitch to deal with above border

    mov         rdx, arg(3)                 ;limit
    movdqa      xmm7, XMMWORD PTR [rdx]

    lea         rsi, [rsi + rcx]
    lea         rdi, [rdi + rcx]

    ; calculate breakout conditions and high edge variance
    LFH_FILTER_AND_HEV_MASK 0
    ; filter and write back the result
    B_FILTER 0

    add rsp, 96
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

%macro MB_FILTER_AND_WRITEBACK 1
%if %1 == 0
        movdqa      xmm2, p1                ; p1
        movdqa      xmm7, q1                ; q1
%elif %1 == 1
        movdqa      xmm2, [rsi+2*rax]       ; p1
        movdqa      xmm7, [rdi]             ; q1

        mov         rcx, rax
        neg         rcx
%elif %1 == 2
        lea         rdx, srct

        movdqa      xmm2, [rdx+32]          ; p1
        movdqa      xmm7, [rdx+80]          ; q1
        movdqa      xmm6, [rdx+48]          ; p0
        movdqa      xmm0, [rdx+64]          ; q0
%endif

        pxor        xmm2, [GLOBAL(t80)]     ; p1 offset to convert to signed values
        pxor        xmm7, [GLOBAL(t80)]     ; q1 offset to convert to signed values
        pxor        xmm6, [GLOBAL(t80)]     ; offset to convert to signed values
        pxor        xmm0, [GLOBAL(t80)]     ; offset to convert to signed values

        psubsb      xmm2, xmm7              ; p1 - q1
        movdqa      xmm3, xmm0              ; q0

        psubsb      xmm0, xmm6              ; q0 - p0

        paddsb      xmm2, xmm0              ; 1 * (q0 - p0) + (p1 - q1)

        paddsb      xmm2, xmm0              ; 2 * (q0 - p0) + (p1 - q1)

        paddsb      xmm2, xmm0              ; 3 * (q0 - p0) + (p1 - q1)

        pand        xmm1, xmm2              ; mask filter values we don't care about

        movdqa      xmm2, xmm1              ; vp8_filter

        pand        xmm2, xmm4              ; Filter2 = vp8_filter & hev
        pxor        xmm0, xmm0

        pandn       xmm4, xmm1              ; vp8_filter&=~hev
        pxor        xmm1, xmm1

        punpcklbw   xmm0, xmm4              ; Filter 2 (hi)
        movdqa      xmm5, xmm2

        punpckhbw   xmm1, xmm4              ; Filter 2 (lo)
        paddsb      xmm5, [GLOBAL(t3)]      ; vp8_signed_char_clamp(Filter2 + 3)

        pmulhw      xmm1, [GLOBAL(s9)]      ; Filter 2 (lo) * 9

        pmulhw      xmm0, [GLOBAL(s9)]      ; Filter 2 (hi) * 9

        punpckhbw   xmm7, xmm5              ; axbxcxdx
        paddsb      xmm2, [GLOBAL(t4)]      ; vp8_signed_char_clamp(Filter2 + 4)

        punpcklbw   xmm5, xmm5              ; exfxgxhx
        psraw       xmm7, 11                ; sign extended shift right by 3

        psraw       xmm5, 11                ; sign extended shift right by 3
        punpckhbw   xmm4, xmm2              ; axbxcxdx

        punpcklbw   xmm2, xmm2              ; exfxgxhx
        psraw       xmm4, 11                ; sign extended shift right by 3

        packsswb    xmm5, xmm7              ; Filter2 >>=3;
        psraw       xmm2, 11                ; sign extended shift right by 3

        packsswb    xmm2, xmm4              ; Filter1 >>=3;
        movdqa      xmm7, xmm1

        paddsb      xmm6, xmm5              ; ps0 = ps0 + Filter2
        movdqa      xmm4, xmm1

        psubsb      xmm3, xmm2              ; qs0 = qs0 - Filter1
        movdqa      xmm5, xmm0

        movdqa      xmm2, xmm5
        paddw       xmm0, [GLOBAL(s63)]     ; Filter 2 (hi) * 9 + 63

        paddw       xmm1, [GLOBAL(s63)]     ; Filter 2 (lo) * 9 + 63
        paddw       xmm5, xmm5              ; Filter 2 (hi) * 18

        paddw       xmm7, xmm7              ; Filter 2 (lo) * 18
        paddw       xmm5, xmm0              ; Filter 2 (hi) * 27 + 63

        paddw       xmm7, xmm1              ; Filter 2 (lo) * 27 + 63
        paddw       xmm2, xmm0              ; Filter 2 (hi) * 18 + 63

        paddw       xmm4, xmm1              ; Filter 2 (lo) * 18 + 63
        psraw       xmm0, 7                 ; (Filter 2 (hi) * 9 + 63) >> 7

        psraw       xmm1, 7                 ; (Filter 2 (lo) * 9 + 63) >> 7
        psraw       xmm2, 7                 ; (Filter 2 (hi) * 18 + 63) >> 7

        packsswb    xmm0, xmm1              ; u1 = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
        psraw       xmm4, 7                 ; (Filter 2 (lo) * 18 + 63) >> 7

        psraw       xmm5, 7                 ; (Filter 2 (hi) * 27 + 63) >> 7
        packsswb    xmm2, xmm4              ; u2 = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)

        psraw       xmm7, 7                 ; (Filter 2 (lo) * 27 + 63) >> 7

        packsswb    xmm5, xmm7              ; u3 = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)

        psubsb      xmm3, xmm5              ; sq = vp8_signed_char_clamp(qs0 - u3)
        paddsb      xmm6, xmm5              ; sp = vp8_signed_char_clamp(ps0 + u3)

%if %1 == 0
        movdqa      xmm5, q2                ; q2
        movdqa      xmm1, q1                ; q1
        movdqa      xmm4, p1                ; p1
        movdqa      xmm7, p2                ; p2

%elif %1 == 1
        movdqa      xmm5, XMMWORD PTR [rdi+rcx]     ; q2
        movdqa      xmm1, XMMWORD PTR [rdi]         ; q1
        movdqa      xmm4, XMMWORD PTR [rsi+rax*2]   ; p1
        movdqa      xmm7, XMMWORD PTR [rdi+rax*4]   ; p2
%elif %1 == 2
        movdqa      xmm5, XMMWORD PTR [rdx+96]      ; q2
        movdqa      xmm1, XMMWORD PTR [rdx+80]      ; q1
        movdqa      xmm4, XMMWORD PTR [rdx+32]      ; p1
        movdqa      xmm7, XMMWORD PTR [rdx+16]      ; p2
%endif

        pxor        xmm3, [GLOBAL(t80)]     ; *oq0 = sq^0x80
        pxor        xmm6, [GLOBAL(t80)]     ; *op0 = sp^0x80

        pxor        xmm1, [GLOBAL(t80)]
        pxor        xmm4, [GLOBAL(t80)]

        psubsb      xmm1, xmm2              ; sq = vp8_signed_char_clamp(qs1 - u2)
        paddsb      xmm4, xmm2              ; sp = vp8_signed_char_clamp(ps1 + u2)

        pxor        xmm1, [GLOBAL(t80)]     ; *oq1 = sq^0x80;
        pxor        xmm4, [GLOBAL(t80)]     ; *op1 = sp^0x80;

        pxor        xmm7, [GLOBAL(t80)]
        pxor        xmm5, [GLOBAL(t80)]

        paddsb      xmm7, xmm0              ; sp = vp8_signed_char_clamp(ps2 + u1)
        psubsb      xmm5, xmm0              ; sq = vp8_signed_char_clamp(qs2 - u1)

        pxor        xmm7, [GLOBAL(t80)]     ; *op2 = sp^0x80;
        pxor        xmm5, [GLOBAL(t80)]     ; *oq2 = sq^0x80;

%if %1 == 0
        lea         rsi, [rsi+rcx*2]
        lea         rdi, [rdi+rcx*2]

        movq        MMWORD PTR [rsi], xmm6          ; p0
        movhps      MMWORD PTR [rdi], xmm6
        movq        MMWORD PTR [rsi + rcx], xmm3    ; q0
        movhps      MMWORD PTR [rdi + rcx], xmm3

        movq        MMWORD PTR [rsi+rcx*2], xmm1    ; q1
        movhps      MMWORD PTR [rdi+rcx*2], xmm1

        movq        MMWORD PTR [rsi + rax], xmm4    ; p1
        movhps      MMWORD PTR [rdi + rax], xmm4

        movq        MMWORD PTR [rsi+rax*2], xmm7    ; p2
        movhps      MMWORD PTR [rdi+rax*2], xmm7

        lea         rsi, [rsi + rcx]
        lea         rdi, [rdi + rcx]
        movq        MMWORD PTR [rsi+rcx*2], xmm5    ; q2
        movhps      MMWORD PTR [rdi+rcx*2], xmm5
%elif %1 == 1
        movdqa      XMMWORD PTR [rdi+rcx], xmm5     ; q2
        movdqa      XMMWORD PTR [rdi], xmm1         ; q1
        movdqa      XMMWORD PTR [rsi], xmm3         ; q0
        movdqa      XMMWORD PTR [rsi+rax], xmm6     ; p0
        movdqa      XMMWORD PTR [rsi+rax*2], xmm4   ; p1
        movdqa      XMMWORD PTR [rdi+rax*4], xmm7   ; p2
%elif %1 == 2
        movdqa      XMMWORD PTR [rdx+80], xmm1      ; q1
        movdqa      XMMWORD PTR [rdx+64], xmm3      ; q0
        movdqa      XMMWORD PTR [rdx+48], xmm6      ; p0
        movdqa      XMMWORD PTR [rdx+32], xmm4      ; p1
%endif

%endmacro
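
; In scalar form the macroblock filter above is roughly (a sketch, not code
; from this file):
;
;     F  = clamp(3*(q0-p0) + (p1-q1)) & mask;
;     Fh = F & hev;
;     Filter1 = clamp(Fh + 4) >> 3;    q0 -= Filter1
;     Filter2 = clamp(Fh + 3) >> 3;    p0 += Filter2
;     F &= ~hev;                       wide taps only where !hev
;     u3 = clamp((27*F + 63) >> 7);    p0 += u3, q0 -= u3
;     u2 = clamp((18*F + 63) >> 7);    p1 += u2, q1 -= u2
;     u1 = clamp(( 9*F + 63) >> 7);    p2 += u1, q2 -= u1
;
; The 9/18/27 multiples come from one pmulhw against s9 (9 << 8, so the
; high half of the product is 9*F) plus word additions, avoiding three
; separate multiplies.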

;void vp8_mbloop_filter_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
global sym(vp8_mbloop_filter_horizontal_edge_sse2)
sym(vp8_mbloop_filter_horizontal_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 32                     ; reserve 32 bytes
    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[16];
    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[16];

    mov         rsi, arg(0)                 ;src_ptr
    movsxd      rax, dword ptr arg(1)       ;src_pixel_step

    mov         rdx, arg(3)                 ;limit
    movdqa      xmm7, XMMWORD PTR [rdx]

    lea         rdi, [rsi+rax]              ; rdi points to row +1 for indirect addressing

    ; calculate breakout conditions and high edge variance
    LFH_FILTER_AND_HEV_MASK 1
    ; filter and write back the results
    MB_FILTER_AND_WRITEBACK 1

    add rsp, 32
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_mbloop_filter_horizontal_edge_uv_sse2
;(
;    unsigned char *u,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
global sym(vp8_mbloop_filter_horizontal_edge_uv_sse2)
sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 96                     ; reserve 96 bytes
    %define q2 [rsp + 0]    ;__declspec(align(16)) char q2[16];
    %define q1 [rsp + 16]   ;__declspec(align(16)) char q1[16];
    %define p2 [rsp + 32]   ;__declspec(align(16)) char p2[16];
    %define p1 [rsp + 48]   ;__declspec(align(16)) char p1[16];
    %define t0 [rsp + 64]   ;__declspec(align(16)) char t0[16];
    %define t1 [rsp + 80]   ;__declspec(align(16)) char t1[16];

    mov         rsi, arg(0)                 ; u
    mov         rdi, arg(5)                 ; v
    movsxd      rax, dword ptr arg(1)       ; src_pixel_step
    mov         rcx, rax
    neg         rax                         ; negate pitch to deal with above border

    mov         rdx, arg(3)                 ;limit
    movdqa      xmm7, XMMWORD PTR [rdx]

    lea         rsi, [rsi + rcx]
    lea         rdi, [rdi + rcx]

    ; calculate breakout conditions and high edge variance
    LFH_FILTER_AND_HEV_MASK 0
    ; filter and write back the results
    MB_FILTER_AND_WRITEBACK 0

    add rsp, 96
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

%macro TRANSPOSE_16X8 2
        movq        xmm4, QWORD PTR [rsi]           ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
        movq        xmm1, QWORD PTR [rdi]           ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
        movq        xmm0, QWORD PTR [rsi+2*rax]     ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20
        movq        xmm7, QWORD PTR [rdi+2*rax]     ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30
        movq        xmm5, QWORD PTR [rsi+4*rax]     ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40
        movq        xmm2, QWORD PTR [rdi+4*rax]     ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50

        punpcklbw   xmm4, xmm1                      ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00

        movq        xmm1, QWORD PTR [rdi+2*rcx]     ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70

        movdqa      xmm3, xmm4                      ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
        punpcklbw   xmm0, xmm7                      ; 37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20

        movq        xmm7, QWORD PTR [rsi+2*rcx]     ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60

        punpcklbw   xmm5, xmm2                      ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
%if %1
        lea         rsi, [rsi+rax*8]
%else
        mov         rsi, arg(5)                     ; v_ptr
%endif

        movdqa      xmm6, xmm5                      ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
        punpcklbw   xmm7, xmm1                      ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60

        punpcklwd   xmm5, xmm7                      ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40

        punpckhwd   xmm6, xmm7                      ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44
%if %1
        lea         rdi, [rdi+rax*8]
%else
        lea         rsi, [rsi - 4]
%endif

        punpcklwd   xmm3, xmm0                      ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
%if %1
        lea         rdx, srct
%else
        lea         rdi, [rsi + rax]                ; rdi points to row +1 for indirect addressing
%endif

        movdqa      xmm2, xmm3                      ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
        punpckhwd   xmm4, xmm0                      ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04

        movdqa      xmm7, xmm4                      ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
        punpckhdq   xmm3, xmm5                      ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02

        punpckhdq   xmm7, xmm6                      ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06

        punpckldq   xmm4, xmm6                      ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04

        punpckldq   xmm2, xmm5                      ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00

        movdqa      t0, xmm2                        ; save to free XMM2
        movq        xmm2, QWORD PTR [rsi]           ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80
        movq        xmm6, QWORD PTR [rdi]           ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90
        movq        xmm0, QWORD PTR [rsi+2*rax]     ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0
        movq        xmm5, QWORD PTR [rdi+2*rax]     ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0
        movq        xmm1, QWORD PTR [rsi+4*rax]     ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0

        punpcklbw   xmm2, xmm6                      ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80

        movq        xmm6, QWORD PTR [rdi+4*rax]     ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0

        punpcklbw   xmm0, xmm5                      ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0

        movq        xmm5, QWORD PTR [rsi+2*rcx]     ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0

        punpcklbw   xmm1, xmm6                      ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 c1 d0 c0

        movq        xmm6, QWORD PTR [rdi+2*rcx]     ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0

        punpcklbw   xmm5, xmm6                      ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0

        movdqa      xmm6, xmm1
        punpckhwd   xmm6, xmm5                      ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4

        punpcklwd   xmm1, xmm5                      ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
        movdqa      xmm5, xmm2                      ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80

        punpcklwd   xmm5, xmm0                      ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80

        punpckhwd   xmm2, xmm0                      ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84

        movdqa      xmm0, xmm5
        punpckldq   xmm0, xmm1                      ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80

        punpckhdq   xmm5, xmm1                      ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
        movdqa      xmm1, xmm2                      ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84

        punpckldq   xmm1, xmm6                      ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84

        punpckhdq   xmm2, xmm6                      ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86
        movdqa      xmm6, xmm7                      ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06

        punpcklqdq  xmm6, xmm2                      ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06

        punpckhqdq  xmm7, xmm2                      ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07
%if %2
        movdqa      xmm2, xmm3                      ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
        punpcklqdq  xmm2, xmm5                      ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02

        punpckhqdq  xmm3, xmm5                      ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03

        movdqa      [rdx], xmm2                     ; save 2

        movdqa      xmm5, xmm4                      ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
        punpcklqdq  xmm4, xmm1                      ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04

        movdqa      [rdx+16], xmm3                  ; save 3

        punpckhqdq  xmm5, xmm1                      ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05

        movdqa      [rdx+32], xmm4                  ; save 4
        movdqa      [rdx+48], xmm5                  ; save 5
        movdqa      xmm1, t0                        ; get saved columns 0 and 1

        movdqa      xmm2, xmm1
        punpckhqdq  xmm1, xmm0                      ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01

        punpcklqdq  xmm2, xmm0                      ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
%else
        movdqa      [rdx+112], xmm7                 ; save 7

        movdqa      [rdx+96], xmm6                  ; save 6

        movdqa      xmm2, xmm3                      ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
        punpckhqdq  xmm3, xmm5                      ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03

        punpcklqdq  xmm2, xmm5                      ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02

        movdqa      [rdx+32], xmm2                  ; save 2

        movdqa      xmm5, xmm4                      ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
        punpcklqdq  xmm4, xmm1                      ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04

        movdqa      [rdx+48], xmm3                  ; save 3

        punpckhqdq  xmm5, xmm1                      ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05

        movdqa      [rdx+64], xmm4                  ; save 4
        movdqa      [rdx+80], xmm5                  ; save 5
        movdqa      xmm1, t0                        ; get saved columns 0 and 1

        movdqa      xmm2, xmm1
        punpckhqdq  xmm1, xmm0                      ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01

        punpcklqdq  xmm2, xmm0                      ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00

        movdqa      [rdx+16], xmm1

        movdqa      [rdx], xmm2
%endif
%endmacro
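
; The transpose works by doubling the interleave width at each stage:
; punpcklbw pairs bytes from adjacent rows, punpck[lh]wd pairs those
; 16-bit groups, punpck[lh]dq pairs 32-bit groups, and punpck[lh]qdq
; finally assembles whole 16-byte columns, turning a 16x8 block of rows
; into eight 16-wide column registers (spilled to srct as needed).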

%macro LFV_FILTER_MASK_HEV_MASK 1
        movdqa      xmm0, xmm6              ; q2
        psubusb     xmm0, xmm7              ; q2-q3

        psubusb     xmm7, xmm6              ; q3-q2
        movdqa      xmm4, xmm5              ; q1

        por         xmm7, xmm0              ; abs (q3-q2)
        psubusb     xmm4, xmm6              ; q1-q2

        movdqa      xmm0, xmm1
        psubusb     xmm6, xmm5              ; q2-q1

        por         xmm6, xmm4              ; abs (q2-q1)
        psubusb     xmm0, xmm2              ; p2 - p3;

        psubusb     xmm2, xmm1              ; p3 - p2;
        por         xmm0, xmm2              ; abs(p2-p3)
%if %1
        movdqa      xmm2, [rdx]             ; p1
%else
        movdqa      xmm2, [rdx+32]          ; p1
%endif
        movdqa      xmm5, xmm2              ; p1
        pmaxub      xmm0, xmm7

        psubusb     xmm5, xmm1              ; p1-p2
        psubusb     xmm1, xmm2              ; p2-p1

        movdqa      xmm7, xmm3              ; p0
        psubusb     xmm7, xmm2              ; p0-p1

        por         xmm1, xmm5              ; abs(p2-p1)
        pmaxub      xmm0, xmm6

        pmaxub      xmm0, xmm1
        movdqa      xmm1, xmm2              ; p1

        psubusb     xmm2, xmm3              ; p1-p0
        lea         rdx, srct

        por         xmm2, xmm7              ; abs(p1-p0)

        movdqa      t0, xmm2                ; save abs(p1-p0)

        pmaxub      xmm0, xmm2

%if %1
        movdqa      xmm5, [rdx+32]          ; q0
        movdqa      xmm7, [rdx+48]          ; q1
%else
        movdqa      xmm5, [rdx+64]          ; q0
        movdqa      xmm7, [rdx+80]          ; q1
%endif
        mov         rdx, arg(3)             ; limit

        movdqa      xmm6, xmm5              ; q0
        movdqa      xmm2, xmm7              ; q1

        psubusb     xmm5, xmm7              ; q0-q1
        psubusb     xmm7, xmm6              ; q1-q0

        por         xmm7, xmm5              ; abs(q1-q0)

        movdqa      t1, xmm7                ; save abs(q1-q0)

        movdqa      xmm4, XMMWORD PTR [rdx] ; limit

        pmaxub      xmm0, xmm7
        mov         rdx, arg(2)             ; flimit

        psubusb     xmm0, xmm4
        movdqa      xmm5, xmm2              ; q1

        psubusb     xmm5, xmm1              ; q1-=p1
        psubusb     xmm1, xmm2              ; p1-=q1

        por         xmm5, xmm1              ; abs(p1-q1)
        movdqa      xmm1, xmm3              ; p0

        pand        xmm5, [GLOBAL(tfe)]     ; set lsb of each byte to zero
        psubusb     xmm1, xmm6              ; p0-q0

        psrlw       xmm5, 1                 ; abs(p1-q1)/2
        psubusb     xmm6, xmm3              ; q0-p0

        movdqa      xmm2, XMMWORD PTR [rdx] ; flimit

        mov         rdx, arg(4)             ; get thresh

        por         xmm1, xmm6              ; abs(q0-p0)
        paddb       xmm2, xmm2              ; flimit*2 (less than 255)

        movdqa      xmm6, t0                ; get abs (p1 - p0)

        paddusb     xmm1, xmm1              ; abs(q0-p0)*2

        movdqa      xmm3, t1                ; get abs (q1 - q0)

        movdqa      xmm7, XMMWORD PTR [rdx]

        paddusb     xmm1, xmm5              ; abs (p0 - q0) *2 + abs(p1-q1)/2
        psubusb     xmm6, xmm7              ; abs(p1 - p0) > thresh

        paddb       xmm4, xmm2              ; flimit * 2 + limit (less than 255)
        psubusb     xmm3, xmm7              ; abs(q1 - q0) > thresh

        psubusb     xmm1, xmm4              ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
        por         xmm6, xmm3              ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh

        por         xmm1, xmm0              ; mask
        pcmpeqb     xmm6, xmm0

        pxor        xmm0, xmm0
        pcmpeqb     xmm4, xmm4

        pcmpeqb     xmm1, xmm0
        pxor        xmm4, xmm6
%endmacro
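
; This is the same mask/hev decision as LFH_FILTER_AND_HEV_MASK, but applied
; to the output of TRANSPOSE_16X8: p3..q3 arrive as 16-pixel columns (in
; registers and the srct scratch area), so one pass covers a 16-row
; vertical edge.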

%macro BV_TRANSPOSE 0
        ; xmm1 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        ; xmm6 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
        ; xmm3 = f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        ; xmm7 = f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
        movdqa      xmm2, xmm1              ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        punpcklbw   xmm2, xmm6              ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02

        movdqa      xmm4, xmm3              ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        punpckhbw   xmm1, xmm6              ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        punpcklbw   xmm4, xmm7              ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04

        punpckhbw   xmm3, xmm7              ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84

        movdqa      xmm6, xmm2              ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
        punpcklwd   xmm2, xmm4              ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02

        punpckhwd   xmm6, xmm4              ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
        movdqa      xmm5, xmm1              ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        punpcklwd   xmm1, xmm3              ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82

        punpckhwd   xmm5, xmm3              ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
        ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
        ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
        ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
        ; xmm5 = f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
%endmacro

%macro BV_WRITEBACK 2
        movd        [rsi+2], %1
        psrldq      %1, 4

        movd        [rdi+2], %1
        psrldq      %1, 4

        movd        [rsi+2*rax+2], %1
        psrldq      %1, 4

        movd        [rdi+2*rax+2], %1

        movd        [rsi+4*rax+2], %2
        psrldq      %2, 4

        movd        [rdi+4*rax+2], %2
        psrldq      %2, 4

        movd        [rsi+2*rcx+2], %2
        psrldq      %2, 4

        movd        [rdi+2*rcx+2], %2
%endmacro
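
; Only p1, p0, q0 and q1 change in the normal filter, so each row writes
; back just the middle four bytes of its transposed column; the +2 in every
; address skips the untouched p3/p2 bytes of the 8-pixel line that was read
; starting at src_ptr - 4.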

;void vp8_loop_filter_vertical_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
global sym(vp8_loop_filter_vertical_edge_sse2)
sym(vp8_loop_filter_vertical_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 96                     ; reserve 96 bytes
    %define t0   [rsp + 0]   ;__declspec(align(16)) char t0[16];
    %define t1   [rsp + 16]  ;__declspec(align(16)) char t1[16];
    %define srct [rsp + 32]  ;__declspec(align(16)) char srct[64];

    mov         rsi, arg(0)                 ; src_ptr
    movsxd      rax, dword ptr arg(1)       ; src_pixel_step

    lea         rsi, [rsi - 4]
    lea         rdi, [rsi + rax]            ; rdi points to row +1 for indirect addressing
    lea         rcx, [rax*2+rax]

    ;transpose 16x8 to 8x16, and store the 8-line result on stack.
    TRANSPOSE_16X8 1, 1

    ; calculate filter mask and high edge variance
    LFV_FILTER_MASK_HEV_MASK 1

    ; start work on filters
    B_FILTER 2

    ; transpose and write back - only work on q1, q0, p0, p1
    BV_TRANSPOSE
    ; store 16-line result

    lea         rdx, [rax]
    neg         rdx

    BV_WRITEBACK xmm1, xmm5

    lea         rsi, [rsi+rdx*8]
    lea         rdi, [rdi+rdx*8]
    BV_WRITEBACK xmm2, xmm6

    add rsp, 96
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_loop_filter_vertical_edge_uv_sse2
;(
;    unsigned char *u,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
global sym(vp8_loop_filter_vertical_edge_uv_sse2)
sym(vp8_loop_filter_vertical_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 96                     ; reserve 96 bytes
    %define t0   [rsp + 0]   ;__declspec(align(16)) char t0[16];
    %define t1   [rsp + 16]  ;__declspec(align(16)) char t1[16];
    %define srct [rsp + 32]  ;__declspec(align(16)) char srct[64];

    mov         rsi, arg(0)                 ; u_ptr
    movsxd      rax, dword ptr arg(1)       ; src_pixel_step

    lea         rsi, [rsi - 4]
    lea         rdi, [rsi + rax]            ; rdi points to row +1 for indirect addressing
    lea         rcx, [rax+2*rax]

    lea         rdx, srct

    ;transpose 16x8 to 8x16, and store the 8-line result on stack.
    TRANSPOSE_16X8 0, 1

    ; calculate filter mask and high edge variance
    LFV_FILTER_MASK_HEV_MASK 1

    ; start work on filters
    B_FILTER 2

    ; transpose and write back - only work on q1, q0, p0, p1
    BV_TRANSPOSE

    lea         rdi, [rsi + rax]            ; rdi points to row +1 for indirect addressing

    ; store 16-line result
    BV_WRITEBACK xmm1, xmm5

    mov         rsi, arg(0)                 ; u_ptr
    lea         rsi, [rsi - 4]
    lea         rdi, [rsi + rax]            ; rdi points to row +1 for indirect addressing
    BV_WRITEBACK xmm2, xmm6

    add rsp, 96
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

%macro MBV_TRANSPOSE 0
        movdqa      xmm0, [rdx]             ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
        movdqa      xmm1, xmm0              ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00

        punpcklbw   xmm0, xmm7              ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
        punpckhbw   xmm1, xmm7              ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80

        movdqa      xmm2, [rdx+32]          ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        movdqa      xmm6, xmm2              ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02

        punpcklbw   xmm2, [rdx+48]          ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
        punpckhbw   xmm6, [rdx+48]          ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        movdqa      xmm3, xmm0              ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
        punpcklwd   xmm0, xmm2              ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00

        punpckhwd   xmm3, xmm2              ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
        movdqa      xmm4, xmm1              ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80

        punpcklwd   xmm1, xmm6              ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
        punpckhwd   xmm4, xmm6              ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0

        movdqa      xmm2, [rdx+64]          ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        punpcklbw   xmm2, [rdx+80]          ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04

        movdqa      xmm6, xmm5              ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
        punpcklbw   xmm6, [rdx+112]         ; 77 76 67 66 57 56 47 46 37 36 27 26 17 16 07 06

        movdqa      xmm7, xmm2              ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
        punpcklwd   xmm2, xmm6              ; 37 36 35 34 27 26 25 24 17 16 15 14 07 06 05 04

        punpckhwd   xmm7, xmm6              ; 77 76 75 74 67 66 65 64 57 56 55 54 47 46 45 44
        movdqa      xmm6, xmm0              ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00

        punpckldq   xmm0, xmm2              ; 17 16 15 14 13 12 11 10 07 06 05 04 03 02 01 00
        punpckhdq   xmm6, xmm2              ; 37 36 35 34 33 32 31 30 27 26 25 24 23 22 21 20
%endmacro

%macro MBV_WRITEBACK_1 0
        movq        QWORD PTR [rsi], xmm0
        movhps      MMWORD PTR [rdi], xmm0

        movq        QWORD PTR [rsi+2*rax], xmm6
        movhps      MMWORD PTR [rdi+2*rax], xmm6

        movdqa      xmm0, xmm3              ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
        punpckldq   xmm0, xmm7              ; 57 56 55 54 53 52 51 50 47 46 45 44 43 42 41 40

        punpckhdq   xmm3, xmm7              ; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60

        movq        QWORD PTR [rsi+4*rax], xmm0
        movhps      MMWORD PTR [rdi+4*rax], xmm0

        movq        QWORD PTR [rsi+2*rcx], xmm3
        movhps      MMWORD PTR [rdi+2*rcx], xmm3

        movdqa      xmm2, [rdx+64]          ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        punpckhbw   xmm2, [rdx+80]          ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84

        punpckhbw   xmm5, [rdx+112]         ; f7 f6 e7 e6 d7 d6 c7 c6 b7 b6 a7 a6 97 96 87 86
        movdqa      xmm0, xmm2

        punpcklwd   xmm0, xmm5              ; b7 b6 b5 b4 a7 a6 a5 a4 97 96 95 94 87 86 85 84
        punpckhwd   xmm2, xmm5              ; f7 f6 f5 f4 e7 e6 e5 e4 d7 d6 d5 d4 c7 c6 c5 c4

        movdqa      xmm5, xmm1              ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
        punpckldq   xmm1, xmm0              ; 97 96 95 94 93 92 91 90 87 86 85 84 83 82 81 80

        punpckhdq   xmm5, xmm0              ; b7 b6 b5 b4 b3 b2 b1 b0 a7 a6 a5 a4 a3 a2 a1 a0
%endmacro

%macro MBV_WRITEBACK_2 0
        movq        QWORD PTR [rsi], xmm1
        movhps      MMWORD PTR [rdi], xmm1

        movq        QWORD PTR [rsi+2*rax], xmm5
        movhps      MMWORD PTR [rdi+2*rax], xmm5

        movdqa      xmm1, xmm4              ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
        punpckldq   xmm1, xmm2              ; d7 d6 d5 d4 d3 d2 d1 d0 c7 c6 c5 c4 c3 c2 c1 c0
        punpckhdq   xmm4, xmm2              ; f7 f6 f5 f4 f3 f2 f1 f0 e7 e6 e5 e4 e3 e2 e1 e0

        movq        QWORD PTR [rsi+4*rax], xmm1
        movhps      MMWORD PTR [rdi+4*rax], xmm1

        movq        QWORD PTR [rsi+2*rcx], xmm4
        movhps      MMWORD PTR [rdi+2*rcx], xmm4
%endmacro

;void vp8_mbloop_filter_vertical_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
global sym(vp8_mbloop_filter_vertical_edge_sse2)
sym(vp8_mbloop_filter_vertical_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 160                    ; reserve 160 bytes
    %define t0   [rsp + 0]   ;__declspec(align(16)) char t0[16];
    %define t1   [rsp + 16]  ;__declspec(align(16)) char t1[16];
    %define srct [rsp + 32]  ;__declspec(align(16)) char srct[128];

    mov         rsi, arg(0)                 ; src_ptr
    movsxd      rax, dword ptr arg(1)       ; src_pixel_step

    lea         rsi, [rsi - 4]
    lea         rdi, [rsi + rax]            ; rdi points to row +1 for indirect addressing
    lea         rcx, [rax*2+rax]

    ; Transpose
    TRANSPOSE_16X8 1, 0

    ; calculate filter mask and high edge variance
    LFV_FILTER_MASK_HEV_MASK 0

    neg         rax
    ; start work on filters
    MB_FILTER_AND_WRITEBACK 2

    lea         rsi, [rsi+rax*8]
    lea         rdi, [rdi+rax*8]

    ; transpose and write back
    MBV_TRANSPOSE

    neg         rax

    MBV_WRITEBACK_1

    lea         rsi, [rsi+rax*8]
    lea         rdi, [rdi+rax*8]
    MBV_WRITEBACK_2

    add rsp, 160
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_mbloop_filter_vertical_edge_uv_sse2
;(
;    unsigned char *u,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
global sym(vp8_mbloop_filter_vertical_edge_uv_sse2)
sym(vp8_mbloop_filter_vertical_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 160                    ; reserve 160 bytes
    %define t0   [rsp + 0]   ;__declspec(align(16)) char t0[16];
    %define t1   [rsp + 16]  ;__declspec(align(16)) char t1[16];
    %define srct [rsp + 32]  ;__declspec(align(16)) char srct[128];

    mov         rsi, arg(0)                 ; u_ptr
    movsxd      rax, dword ptr arg(1)       ; src_pixel_step

    lea         rsi, [rsi - 4]
    lea         rdi, [rsi + rax]            ; rdi points to row +1 for indirect addressing
    lea         rcx, [rax+2*rax]

    lea         rdx, srct

    ; Transpose
    TRANSPOSE_16X8 0, 0

    ; calculate filter mask and high edge variance
    LFV_FILTER_MASK_HEV_MASK 0

    ; start work on filters
    MB_FILTER_AND_WRITEBACK 2

    ; transpose and write back
    MBV_TRANSPOSE

    mov         rsi, arg(0)                 ;u_ptr
    lea         rsi, [rsi - 4]
    lea         rdi, [rsi + rax]
    MBV_WRITEBACK_1

    mov         rsi, arg(5)                 ;v_ptr
    lea         rsi, [rsi - 4]
    lea         rdi, [rsi + rax]
    MBV_WRITEBACK_2

    add rsp, 160
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_loop_filter_simple_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
global sym(vp8_loop_filter_simple_horizontal_edge_sse2)
sym(vp8_loop_filter_simple_horizontal_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ;src_ptr
    movsxd      rax, dword ptr arg(1)       ;src_pixel_step
    mov         rdx, arg(2)                 ;flimit
    movdqa      xmm3, XMMWORD PTR [rdx]
    mov         rdx, arg(3)                 ;limit
    movdqa      xmm7, XMMWORD PTR [rdx]

    paddb       xmm3, xmm3                  ; flimit*2 (less than 255)
    paddb       xmm3, xmm7                  ; flimit * 2 + limit (less than 255)

    mov         rdi, rsi                    ; rdi points to row +1 for indirect addressing
    add         rdi, rax
    neg         rax

    ; calculate mask
    movdqu      xmm1, [rsi+2*rax]           ; p1
    movdqu      xmm0, [rdi]                 ; q1
    movdqa      xmm2, xmm1
    movdqa      xmm7, xmm0
    movdqa      xmm4, xmm0
    psubusb     xmm0, xmm1                  ; q1-=p1
    psubusb     xmm1, xmm4                  ; p1-=q1
    por         xmm1, xmm0                  ; abs(p1-q1)
    pand        xmm1, [GLOBAL(tfe)]         ; set lsb of each byte to zero
    psrlw       xmm1, 1                     ; abs(p1-q1)/2

    movdqu      xmm5, [rsi+rax]             ; p0
    movdqu      xmm4, [rsi]                 ; q0
    movdqa      xmm0, xmm4                  ; q0
    movdqa      xmm6, xmm5                  ; p0
    psubusb     xmm5, xmm4                  ; p0-=q0
    psubusb     xmm4, xmm6                  ; q0-=p0
    por         xmm5, xmm4                  ; abs(p0 - q0)
    paddusb     xmm5, xmm5                  ; abs(p0-q0)*2
    paddusb     xmm5, xmm1                  ; abs (p0 - q0) *2 + abs(p1-q1)/2

    psubusb     xmm5, xmm3                  ; abs(p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
    pxor        xmm3, xmm3
    pcmpeqb     xmm5, xmm3

    ; start work on filters
    pxor        xmm2, [GLOBAL(t80)]         ; p1 offset to convert to signed values
    pxor        xmm7, [GLOBAL(t80)]         ; q1 offset to convert to signed values
    psubsb      xmm2, xmm7                  ; p1 - q1

    pxor        xmm6, [GLOBAL(t80)]         ; offset to convert to signed values
    pxor        xmm0, [GLOBAL(t80)]         ; offset to convert to signed values
    movdqa      xmm3, xmm0                  ; q0
    psubsb      xmm0, xmm6                  ; q0 - p0
    paddsb      xmm2, xmm0                  ; p1 - q1 + 1 * (q0 - p0)
    paddsb      xmm2, xmm0                  ; p1 - q1 + 2 * (q0 - p0)
    paddsb      xmm2, xmm0                  ; p1 - q1 + 3 * (q0 - p0)
    pand        xmm5, xmm2                  ; mask filter values we don't care about

    ; do + 4 side
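    ; SSE2 has no per-byte arithmetic shift, so the sequence below emulates
    ; one per 16-bit lane (a sketch of the idea): psllw 8 / psraw 3 / psrlw 8
    ; yields the low byte's signed >>3, psraw 11 / psllw 8 yields the high
    ; byte's, and por merges the two results back into whole bytes.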
    paddsb      xmm5, [GLOBAL(t4)]          ; 3* (q0 - p0) + (p1 - q1) + 4

    movdqa      xmm0, xmm5                  ; get a copy of filters
    psllw       xmm0, 8                     ; shift left 8
    psraw       xmm0, 3                     ; arithmetic shift right 3
    psrlw       xmm0, 8
    movdqa      xmm1, xmm5                  ; get a copy of filters
    psraw       xmm1, 11                    ; arithmetic shift right 11
    psllw       xmm1, 8                     ; shift left 8 to put it back

    por         xmm0, xmm1                  ; put the two together to get result

    psubsb      xmm3, xmm0                  ; q0-= q0 add
    pxor        xmm3, [GLOBAL(t80)]         ; unoffset
    movdqu      [rsi], xmm3                 ; write back

    ; now do +3 side
    psubsb      xmm5, [GLOBAL(t1s)]         ; +3 instead of +4

    movdqa      xmm0, xmm5                  ; get a copy of filters
    psllw       xmm0, 8                     ; shift left 8
    psraw       xmm0, 3                     ; arithmetic shift right 3
    psrlw       xmm0, 8
    psraw       xmm5, 11                    ; arithmetic shift right 11
    psllw       xmm5, 8                     ; shift left 8 to put it back
    por         xmm0, xmm5                  ; put the two together to get result

    paddsb      xmm6, xmm0                  ; p0+= p0 add
    pxor        xmm6, [GLOBAL(t80)]         ; unoffset
    movdqu      [rsi+rax], xmm6             ; write back

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_loop_filter_simple_vertical_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
global sym(vp8_loop_filter_simple_vertical_edge_sse2)
sym(vp8_loop_filter_simple_vertical_edge_sse2):
    push        rbp                         ; save old base pointer value.
    mov         rbp, rsp                    ; set new base pointer value.
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx                         ; save callee-saved reg
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 32                     ; reserve 32 bytes
    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[16];
    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[16];

    mov         rsi, arg(0)                 ;src_ptr
    movsxd      rax, dword ptr arg(1)       ;src_pixel_step

    lea         rsi, [rsi - 2]
    lea         rdi, [rsi + rax]
    lea         rdx, [rsi + rax*4]
    lea         rcx, [rdx + rax]

    movdqu      xmm0, [rsi]                 ; (high 96 bits unused) 03 02 01 00
    movdqu      xmm1, [rdx]                 ; (high 96 bits unused) 43 42 41 40
    movdqu      xmm2, [rdi]                 ; 13 12 11 10
    movdqu      xmm3, [rcx]                 ; 53 52 51 50
    punpckldq   xmm0, xmm1                  ; (high 64 bits unused) 43 42 41 40 03 02 01 00
    punpckldq   xmm2, xmm3                  ; 53 52 51 50 13 12 11 10

    movdqu      xmm4, [rsi + rax*2]         ; 23 22 21 20
    movdqu      xmm5, [rdx + rax*2]         ; 63 62 61 60
    movdqu      xmm6, [rdi + rax*2]         ; 33 32 31 30
    movdqu      xmm7, [rcx + rax*2]         ; 73 72 71 70
    punpckldq   xmm4, xmm5                  ; 63 62 61 60 23 22 21 20
    punpckldq   xmm6, xmm7                  ; 73 72 71 70 33 32 31 30

    punpcklbw   xmm0, xmm2                  ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
    punpcklbw   xmm4, xmm6                  ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20

    movdqa      xmm1, xmm0
    punpcklwd   xmm0, xmm4                  ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
    punpckhwd   xmm1, xmm4                  ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40

    movdqa      xmm2, xmm0
    punpckldq   xmm0, xmm1                  ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
    punpckhdq   xmm2, xmm1                  ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02

    movdqa      t0, xmm0                    ; save to t0
    movdqa      t1, xmm2                    ; save to t1

    lea         rsi, [rsi + rax*8]
    lea         rdi, [rsi + rax]
    lea         rdx, [rsi + rax*4]
    lea         rcx, [rdx + rax]

    movdqu      xmm4, [rsi]                 ; 83 82 81 80
    movdqu      xmm1, [rdx]                 ; c3 c2 c1 c0
    movdqu      xmm6, [rdi]                 ; 93 92 91 90
    movdqu      xmm3, [rcx]                 ; d3 d2 d1 d0
    punpckldq   xmm4, xmm1                  ; c3 c2 c1 c0 83 82 81 80
    punpckldq   xmm6, xmm3                  ; d3 d2 d1 d0 93 92 91 90

    movdqu      xmm0, [rsi + rax*2]         ; a3 a2 a1 a0
    movdqu      xmm5, [rdx + rax*2]         ; e3 e2 e1 e0
    movdqu      xmm2, [rdi + rax*2]         ; b3 b2 b1 b0
    movdqu      xmm7, [rcx + rax*2]         ; f3 f2 f1 f0
    punpckldq   xmm0, xmm5                  ; e3 e2 e1 e0 a3 a2 a1 a0
    punpckldq   xmm2, xmm7                  ; f3 f2 f1 f0 b3 b2 b1 b0

    punpcklbw   xmm4, xmm6                  ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80
    punpcklbw   xmm0, xmm2                  ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0

    movdqa      xmm1, xmm4
    punpcklwd   xmm4, xmm0                  ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
    punpckhwd   xmm1, xmm0                  ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0

    movdqa      xmm6, xmm4
    punpckldq   xmm4, xmm1                  ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
    punpckhdq   xmm6, xmm1                  ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82

    movdqa      xmm0, t0                    ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
    movdqa      xmm2, t1                    ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
    movdqa      xmm1, xmm0
    movdqa      xmm3, xmm2

    punpcklqdq  xmm0, xmm4                  ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
    punpckhqdq  xmm1, xmm4                  ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
    punpcklqdq  xmm2, xmm6                  ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
    punpckhqdq  xmm3, xmm6                  ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03

    ; calculate mask
    movdqa      xmm6, xmm0                  ; p1
    movdqa      xmm7, xmm3                  ; q1
    psubusb     xmm7, xmm0                  ; q1-=p1
    psubusb     xmm6, xmm3                  ; p1-=q1
    por         xmm6, xmm7                  ; abs(p1-q1)
    pand        xmm6, [GLOBAL(tfe)]         ; set lsb of each byte to zero
    psrlw       xmm6, 1                     ; abs(p1-q1)/2

    movdqa      xmm5, xmm1                  ; p0
    movdqa      xmm4, xmm2                  ; q0
    psubusb     xmm5, xmm2                  ; p0-=q0
    psubusb     xmm4, xmm1                  ; q0-=p0
    por         xmm5, xmm4                  ; abs(p0 - q0)
    paddusb     xmm5, xmm5                  ; abs(p0-q0)*2
    paddusb     xmm5, xmm6                  ; abs (p0 - q0) *2 + abs(p1-q1)/2

    mov         rdx, arg(2)                 ;flimit
    movdqa      xmm7, XMMWORD PTR [rdx]
    mov         rdx, arg(3)                 ; get limit
    movdqa      xmm6, XMMWORD PTR [rdx]
    paddb       xmm7, xmm7                  ; flimit*2 (less than 255)
    paddb       xmm7, xmm6                  ; flimit * 2 + limit (less than 255)

    psubusb     xmm5, xmm7                  ; abs(p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
    pxor        xmm7, xmm7
    pcmpeqb     xmm5, xmm7                  ; xmm5 = mask

    ; start work on filters
    movdqa      t0, xmm0
    movdqa      t1, xmm3

    pxor        xmm0, [GLOBAL(t80)]         ; p1 offset to convert to signed values
    pxor        xmm3, [GLOBAL(t80)]         ; q1 offset to convert to signed values

    psubsb      xmm0, xmm3                  ; p1 - q1
    movdqa      xmm6, xmm1                  ; p0

    movdqa      xmm7, xmm2                  ; q0
    pxor        xmm6, [GLOBAL(t80)]         ; offset to convert to signed values

    pxor        xmm7, [GLOBAL(t80)]         ; offset to convert to signed values
    movdqa      xmm3, xmm7                  ; copy of offset q0

    psubsb      xmm7, xmm6                  ; q0 - p0
    paddsb      xmm0, xmm7                  ; p1 - q1 + 1 * (q0 - p0)

    paddsb      xmm0, xmm7                  ; p1 - q1 + 2 * (q0 - p0)
    paddsb      xmm0, xmm7                  ; p1 - q1 + 3 * (q0 - p0)

    pand        xmm5, xmm0                  ; mask filter values we don't care about

    ; do + 4 side
    paddsb      xmm5, [GLOBAL(t4)]          ; 3* (q0 - p0) + (p1 - q1) + 4

    movdqa      xmm0, xmm5                  ; get a copy of filters
    psllw       xmm0, 8                     ; shift left 8

    psraw       xmm0, 3                     ; arithmetic shift right 3
    psrlw       xmm0, 8

    movdqa      xmm7, xmm5                  ; get a copy of filters
    psraw       xmm7, 11                    ; arithmetic shift right 11

    psllw       xmm7, 8                     ; shift left 8 to put it back
    por         xmm0, xmm7                  ; put the two together to get result

    psubsb      xmm3, xmm0                  ; q0-= q0sz add
    pxor        xmm3, [GLOBAL(t80)]         ; unoffset q0

    ; now do +3 side
    psubsb      xmm5, [GLOBAL(t1s)]         ; +3 instead of +4
    movdqa      xmm0, xmm5                  ; get a copy of filters

    psllw       xmm0, 8                     ; shift left 8
    psraw       xmm0, 3                     ; arithmetic shift right 3

    psrlw       xmm0, 8
    psraw       xmm5, 11                    ; arithmetic shift right 11

    psllw       xmm5, 8                     ; shift left 8 to put it back
    por         xmm0, xmm5                  ; put the two together to get result

    paddsb      xmm6, xmm0                  ; p0+= p0 add
    pxor        xmm6, [GLOBAL(t80)]         ; unoffset p0

    movdqa      xmm0, t0                    ; p1
    movdqa      xmm4, t1                    ; q1

    ; transpose back to write out
    ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
    ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
    ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
    ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
    movdqa      xmm1, xmm0
    punpcklbw   xmm0, xmm6                  ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
    punpckhbw   xmm1, xmm6                  ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80

    movdqa      xmm5, xmm3
    punpcklbw   xmm3, xmm4                  ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
    punpckhbw   xmm5, xmm4                  ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

    movdqa      xmm2, xmm0
    punpcklwd   xmm0, xmm3                  ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
    punpckhwd   xmm2, xmm3                  ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40

    movdqa      xmm3, xmm1
    punpcklwd   xmm1, xmm5                  ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
    punpckhwd   xmm3, xmm5                  ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0

    ; write out order: xmm0 xmm2 xmm1 xmm3
    lea         rdx, [rsi + rax*4]

    movd        [rsi], xmm1                 ; write the second 8-line result
    psrldq      xmm1, 4
    movd        [rdi], xmm1
    psrldq      xmm1, 4
    movd        [rsi + rax*2], xmm1
    psrldq      xmm1, 4
    movd        [rdi + rax*2], xmm1

    movd        [rdx], xmm3
    psrldq      xmm3, 4
    movd        [rcx], xmm3
    psrldq      xmm3, 4
    movd        [rdx + rax*2], xmm3
    psrldq      xmm3, 4
    movd        [rcx + rax*2], xmm3

    neg         rax
    lea         rsi, [rsi + rax*8]
    neg         rax
    lea         rdi, [rsi + rax]
    lea         rdx, [rsi + rax*4]
    lea         rcx, [rdx + rax]

    movd        [rsi], xmm0                 ; write the first 8-line result
    psrldq      xmm0, 4
    movd        [rdi], xmm0
    psrldq      xmm0, 4
    movd        [rsi + rax*2], xmm0
    psrldq      xmm0, 4
    movd        [rdi + rax*2], xmm0

    movd        [rdx], xmm2
    psrldq      xmm2, 4
    movd        [rcx], xmm2
    psrldq      xmm2, 4
    movd        [rdx + rax*2], xmm2
    psrldq      xmm2, 4
    movd        [rcx + rax*2], xmm2

    add rsp, 32
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
align 16
tfe:
    times 16 db 0xfe
align 16
t80:
    times 16 db 0x80
align 16
t1s:
    times 16 db 0x01
align 16
t3:
    times 16 db 0x03
align 16
t4:
    times 16 db 0x04
align 16
ones:
    times 8 dw 0x0001
align 16
s9:
    times 8 dw 0x0900
align 16
s63:
    times 8 dw 0x003f