vp8/common/x86/subpixel_sse2.asm

   1 ;
   2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
   3 ;
   4 ;  Use of this source code is governed by a BSD-style license
   5 ;  that can be found in the LICENSE file in the root of the source
   6 ;  tree. An additional intellectual property rights grant can be found
   7 ;  in the file PATENTS.  All contributing project authors may
   8 ;  be found in the AUTHORS file in the root of the source tree.
   9 ;
  10
  11
  12 %include "vpx_ports/x86_abi_support.asm"
  13
  14 %define BLOCK_HEIGHT_WIDTH 4
  15 %define VP8_FILTER_WEIGHT 128
  16 %define VP8_FILTER_SHIFT  7
  17 ; VP8_FILTER_WEIGHT == 1 << VP8_FILTER_SHIFT. NOTE(review): the routines visible below hard-code the shift (psraw ..., 7) instead of using these names.
  18
  19 ;/************************************************************************************
  20 ; Notes: vp8_filter_block1d8_h6_sse2 applies a 6 tap filter horizontally to 8 pixels per
  21 ; row and produces ONE row per loop iteration. Each sum has the rd constant added, is
  22 ; shifted right by 7, clamped to 0..255 and stored as a 16-bit word (16 bytes per row).
  23 ; NOTE(review): an earlier note required an even output_height; nothing in this loop depends on that.
  24 ;*************************************************************************************/
  25 ;void vp8_filter_block1d8_h6_sse2
  26 ;(
  27 ;    unsigned char  *src_ptr,               ; bytes src_ptr[-2 .. 13] are read for every row
  28 ;    unsigned short *output_ptr,            ; written with movdqa, so it must be 16-byte aligned
  29 ;    unsigned int    src_pixels_per_line,   ; added to src_ptr after each row (source stride in bytes)
  30 ;    unsigned int    pixel_step,            ; not read by this routine
  31 ;    unsigned int    output_height,         ; row count; tested only after a row is done, so must be >= 1
  32 ;    unsigned int    output_width,          ; added to output_ptr after each row (as a byte count)
  33 ;    short           *vp8_filter            ; six 16-byte tap vectors at byte offsets 0, 16, ..., 80
  34 ;)
  35 global sym(vp8_filter_block1d8_h6_sse2)
  36 sym(vp8_filter_block1d8_h6_sse2):
  37     push        rbp
  38     mov         rbp, rsp
  39     SHADOW_ARGS_TO_STACK 7
  40     SAVE_XMM
  41     GET_GOT     rbx
  42     push        rsi
  43     push        rdi
  44     ; end prolog
  45         ; Register roles: rsi = source row, rdi = destination row, rdx = tap vectors, rcx = rows left, rax = source stride
  46         mov         rdx,        arg(6) ;vp8_filter
  47         mov         rsi,        arg(0) ;src_ptr
  48
  49         mov         rdi,        arg(1) ;output_ptr
  50
  51         movsxd      rcx,        dword ptr arg(4) ;output_height
  52         movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source
  53 %if ABI_IS_32BIT=0
  54         movsxd      r8,         dword ptr arg(5) ;output_width
  55 %endif
  56         pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
  57
  58 filter_block1d8_h6_rowloop:
  59         movq        xmm3,       MMWORD PTR [rsi - 2]        ; xmm3 = src[-2 .. 5]
  60         movq        xmm1,       MMWORD PTR [rsi + 6]        ; xmm1 = src[ 6 .. 13]
  61
  62         prefetcht2  [rsi+rax-2]                             ; prefetch hint for the start of the next source row
  63
  64         pslldq      xmm1,       8                           ; move src[6 .. 13] into the high 8 bytes
  65         por         xmm1,       xmm3                        ; xmm1 = src[-2 .. 13], the full 16-byte window
  66
  67         movdqa      xmm4,       xmm1                        ; copies of the window, byte-shifted below for taps 2..5
  68         movdqa      xmm5,       xmm1
  69
  70         movdqa      xmm6,       xmm1
  71         movdqa      xmm7,       xmm1
  72
  73         punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
  74         psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
  75
  76         pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
  77         punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
  78
  79         psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
  80         pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
  81
  82
  83         punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
  84         psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
  85
  86         pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
  87
  88         punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
  89         psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
  90
  91         pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
  92
  93         punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
  94         psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
  95
  96
  97         pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
  98
  99         punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
 100         pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
 101
 102         ; sum the six products with signed saturation; summation order is taps 2, 5, 3, 1, 4, 6
 103         paddsw      xmm4,       xmm7
 104         paddsw      xmm4,       xmm5
 105
 106         paddsw      xmm4,       xmm3
 107         paddsw      xmm4,       xmm6
 108
 109         paddsw      xmm4,       xmm1
 110         paddsw      xmm4,       [GLOBAL(rd)]                ; rd is defined outside this view; presumably the rounding term -- confirm
 111
 112         psraw       xmm4,       7                           ; arithmetic >> 7 (same value as VP8_FILTER_SHIFT)
 113
 114         packuswb    xmm4,       xmm0                        ; saturate the 8 words to unsigned bytes (0..255)
 115         punpcklbw   xmm4,       xmm0                        ; zero-extend the 8 bytes back to 8 words
 116
 117         movdqa      XMMWORD Ptr [rdi],         xmm4         ; aligned 16-byte store of the 8 result words
 118         lea         rsi,        [rsi + rax]                 ; next source row
 119
 120 %if ABI_IS_32BIT
 121         add         rdi,        DWORD Ptr arg(5) ;[output_width]
 122 %else
 123         add         rdi,        r8                          ; advance destination by output_width bytes
 124 %endif
 125         dec         rcx
 126
 127         jnz         filter_block1d8_h6_rowloop                ; next row
 128
 129     ; begin epilog
 130     pop rdi
 131     pop rsi
 132     RESTORE_GOT
 133     RESTORE_XMM
 134     UNSHADOW_ARGS
 135     pop         rbp
 136     ret
 137
 138
 139 ;void vp8_filter_block1d16_h6_sse2
 140 ;(
 141 ;    unsigned char  *src_ptr,               ; bytes src_ptr[-2 .. 21] are read for every row
 142 ;    unsigned short *output_ptr,            ; written with movdqa, so it must be 16-byte aligned
 143 ;    unsigned int    src_pixels_per_line,   ; added to src_ptr after each row (source stride in bytes)
 144 ;    unsigned int    pixel_step,            ; not read by this routine
 145 ;    unsigned int    output_height,         ; row count; tested only after a row is done, so must be >= 1
 146 ;    unsigned int    output_width,          ; added to output_ptr after each row (as a byte count)
 147 ;    short           *vp8_filter            ; six 16-byte tap vectors at byte offsets 0, 16, ..., 80
 148 ;)
 149 ;/************************************************************************************
 150 ; Notes: vp8_filter_block1d16_h6_sse2 applies a 6 tap filter horizontally to 16 pixels per
 151 ; row, as two 8-pixel halves, and produces ONE row per loop iteration. Each sum has the rd
 152 ; constant added, is shifted right by 7, clamped to 0..255 and stored as a 16-bit word (32 bytes per row).
 153 ; NOTE(review): an earlier note required an even output_height; nothing in this loop depends on that.
 154 ;*************************************************************************************/
 155 global sym(vp8_filter_block1d16_h6_sse2)
 156 sym(vp8_filter_block1d16_h6_sse2):
 157     push        rbp
 158     mov         rbp, rsp
 159     SHADOW_ARGS_TO_STACK 7
 160     SAVE_XMM
 161     GET_GOT     rbx
 162     push        rsi
 163     push        rdi
 164     ; end prolog
 165         ; Register roles: rsi = source row, rdi = destination row, rdx = tap vectors, rcx = rows left, rax = source stride
 166         mov         rdx,        arg(6) ;vp8_filter
 167         mov         rsi,        arg(0) ;src_ptr
 168
 169         mov         rdi,        arg(1) ;output_ptr
 170
 171         movsxd      rcx,        dword ptr arg(4) ;output_height
 172         movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source
 173 %if ABI_IS_32BIT=0
 174         movsxd      r8,         dword ptr arg(5) ;output_width
 175 %endif
 176
 177         pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
 178
 179 filter_block1d16_h6_sse2_rowloop:
 180         movq        xmm3,       MMWORD PTR [rsi - 2]        ; xmm3 = src[-2 .. 5]
 181         movq        xmm1,       MMWORD PTR [rsi + 6]        ; xmm1 = src[ 6 .. 13]
 182
 183         movq        xmm2,       MMWORD PTR [rsi +14]        ; xmm2 = src[14 .. 21]
 184         pslldq      xmm2,       8                           ; move src[14 .. 21] into the high 8 bytes
 185
 186         por         xmm2,       xmm1                        ; xmm2 = src[6 .. 21], window for pixels 8..15 (used after the first store)
 187         prefetcht2  [rsi+rax-2]                             ; prefetch hint for the start of the next source row
 188
 189         pslldq      xmm1,       8                           ; move src[6 .. 13] into the high 8 bytes
 190         por         xmm1,       xmm3                        ; xmm1 = src[-2 .. 13], window for pixels 0..7
 191
 192         movdqa      xmm4,       xmm1                        ; copies of the window, byte-shifted below for taps 2..5
 193         movdqa      xmm5,       xmm1
 194
 195         movdqa      xmm6,       xmm1
 196         movdqa      xmm7,       xmm1
 197
 198         punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
 199         psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
 200
 201         pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
 202         punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
 203
 204         psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
 205         pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
 206
 207
 208         punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
 209         psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
 210
 211         pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
 212
 213         punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
 214         psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
 215
 216         pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
 217
 218         punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
 219         psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
 220
 221
 222         pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
 223
 224         punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
 225         pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
 226         ; sum the six products with signed saturation; summation order is taps 2, 5, 3, 1, 4, 6
 227         paddsw      xmm4,       xmm7
 228         paddsw      xmm4,       xmm5
 229
 230         paddsw      xmm4,       xmm3
 231         paddsw      xmm4,       xmm6
 232
 233         paddsw      xmm4,       xmm1
 234         paddsw      xmm4,       [GLOBAL(rd)]                ; rd is defined outside this view; presumably the rounding term -- confirm
 235
 236         psraw       xmm4,       7                           ; arithmetic >> 7 (same value as VP8_FILTER_SHIFT)
 237
 238         packuswb    xmm4,       xmm0                        ; saturate the 8 words to unsigned bytes (0..255)
 239         punpcklbw   xmm4,       xmm0                        ; zero-extend the 8 bytes back to 8 words
 240
 241         movdqa      XMMWORD Ptr [rdi],         xmm4         ; store results for pixels 0..7
 242         ; second half: pixels 8..15 from xmm2 = src[6 .. 21]; offsets in the byte maps below are relative to pixel 8
 243         movdqa      xmm3,       xmm2
 244         movdqa      xmm4,       xmm2
 245
 246         movdqa      xmm5,       xmm2
 247         movdqa      xmm6,       xmm2
 248
 249         movdqa      xmm7,       xmm2
 250
 251         punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
 252         psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
 253
 254         pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
 255         punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
 256
 257         psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
 258         pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
 259
 260
 261         punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
 262         psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
 263
 264         pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
 265
 266         punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
 267         psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
 268
 269         pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
 270
 271         punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
 272         psrldq      xmm2,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
 273
 274         pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
 275
 276         punpcklbw   xmm2,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
 277         pmullw      xmm2,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
 278
 279         ; same saturating summation order as the first half
 280         paddsw      xmm4,       xmm7
 281         paddsw      xmm4,       xmm5
 282
 283         paddsw      xmm4,       xmm3
 284         paddsw      xmm4,       xmm6
 285
 286         paddsw      xmm4,       xmm2
 287         paddsw      xmm4,       [GLOBAL(rd)]
 288
 289         psraw       xmm4,       7
 290
 291         packuswb    xmm4,       xmm0                        ; saturate the 8 words to unsigned bytes (0..255)
 292         punpcklbw   xmm4,       xmm0                        ; zero-extend the 8 bytes back to 8 words
 293
 294         movdqa      XMMWORD Ptr [rdi+16],      xmm4         ; store results for pixels 8..15
 295
 296         lea         rsi,        [rsi + rax]                 ; next source row
 297 %if ABI_IS_32BIT
 298         add         rdi,        DWORD Ptr arg(5) ;[output_width]
 299 %else
 300         add         rdi,        r8                          ; advance destination by output_width bytes
 301 %endif
 302
 303         dec         rcx
 304         jnz         filter_block1d16_h6_sse2_rowloop                ; next row
 305
 306     ; begin epilog
 307     pop rdi
 308     pop rsi
 309     RESTORE_GOT
 310     RESTORE_XMM
 311     UNSHADOW_ARGS
 312     pop         rbp
 313     ret
 314
 315
 316 ;void vp8_filter_block1d8_v6_sse2
 317 ;(
 318 ;    short *src_ptr,                 ; 16-bit input samples, read with movdqa (16-byte aligned loads)
 319 ;    unsigned char *output_ptr,      ; destination; 8 bytes are written per row
 320 ;    int dst_pitch,                  ; added to output_ptr after each row
 321 ;    unsigned int pixels_per_line,   ; used as the byte offset between consecutive input rows
 322 ;    unsigned int pixel_step,        ; not read by this routine
 323 ;    unsigned int output_height,     ; row count; tested only after a row is done, so must be >= 1
 324 ;    unsigned int output_width,      ; not read by this routine
 325 ;    short * vp8_filter              ; six 16-byte tap vectors at byte offsets 0, 16, ..., 80
 326 ;)
 327 ;/************************************************************************************
 328 ; Notes: filter_block1d8_v6 applies a 6 tap filter vertically to 8 columns of 16-bit input.
 329 ; Each output row reads the six input rows starting two rows above src_ptr's current row.
 330 ;*************************************************************************************/
 331 global sym(vp8_filter_block1d8_v6_sse2)
 332 sym(vp8_filter_block1d8_v6_sse2):
 333     push        rbp
 334     mov         rbp, rsp
 335     SHADOW_ARGS_TO_STACK 8
 336     SAVE_XMM
 337     GET_GOT     rbx
 338     push        rsi
 339     push        rdi
 340     ; end prolog
 341         ; Register roles: rax = tap vectors, rdx = input row offset, rsi = first input row of the window, rdi = destination, rcx = rows left
 342         mov         rax,        arg(7) ;vp8_filter
 343         movsxd      rdx,        dword ptr arg(3) ;pixels_per_line
 344
 345         mov         rdi,        arg(1) ;output_ptr
 346         mov         rsi,        arg(0) ;src_ptr
 347
 348         sub         rsi,        rdx                         ; step back two input rows so that
 349         sub         rsi,        rdx                         ; [rsi] is the row multiplied by tap 1
 350
 351         movsxd      rcx,        DWORD PTR arg(5) ;[output_height]
 352         pxor        xmm0,       xmm0                        ; clear xmm0
 353
 354         movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]    ; rd is defined outside this view; presumably the rounding term -- confirm
 355 %if ABI_IS_32BIT=0
 356         movsxd      r8,         dword ptr arg(2) ; dst_pitch
 357 %endif
 358
 359 vp8_filter_block1d8_v6_sse2_loop:
 360         movdqa      xmm1,       XMMWORD PTR [rsi]           ; input row 1 of the window
 361         pmullw      xmm1,       [rax]                       ; * tap 1
 362
 363         movdqa      xmm2,       XMMWORD PTR [rsi + rdx]     ; input row 2
 364         pmullw      xmm2,       [rax + 16]                  ; * tap 2
 365
 366         movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 2] ; input row 3
 367         pmullw      xmm3,       [rax + 32]                  ; * tap 3
 368
 369         movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 4] ; input row 5
 370         pmullw      xmm5,       [rax + 64]                  ; * tap 5
 371
 372         add         rsi,        rdx                         ; advance one row: reaches rows 4 and 6 below and is the per-iteration step
 373         movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 2] ; input row 4 (3 rows past the old rsi)
 374
 375         pmullw      xmm4,       [rax + 48]                  ; * tap 4
 376         movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 4] ; input row 6 (5 rows past the old rsi)
 377
 378         pmullw      xmm6,       [rax + 80]                  ; * tap 6
 379         ; sum the six products with signed saturation; summation order is taps 2, 5, 3, 1, 4, 6
 380         paddsw      xmm2,       xmm5
 381         paddsw      xmm2,       xmm3
 382
 383         paddsw      xmm2,       xmm1
 384         paddsw      xmm2,       xmm4
 385
 386         paddsw      xmm2,       xmm6
 387         paddsw      xmm2,       xmm7                        ; add the rd vector loaded before the loop
 388
 389         psraw       xmm2,       7                           ; arithmetic >> 7 (same value as VP8_FILTER_SHIFT)
 390         packuswb    xmm2,       xmm0              ; pack and saturate
 391
 392         movq        QWORD PTR [rdi], xmm2         ; store the results in the destination
 393 %if ABI_IS_32BIT
 394         add         rdi,        DWORD PTR arg(2) ;[dst_pitch]
 395 %else
 396         add         rdi,        r8
 397 %endif
 398         dec         rcx         ; decrement count
 399         jnz         vp8_filter_block1d8_v6_sse2_loop               ; next row
 400
 401     ; begin epilog
 402     pop rdi
 403     pop rsi
 404     RESTORE_GOT
 405     RESTORE_XMM
 406     UNSHADOW_ARGS
 407     pop         rbp
 408     ret
 409
 410
 411 ;void vp8_filter_block1d16_v6_sse2
 412 ;(
 413 ;    unsigned short *src_ptr,        ; 16-bit input samples, read with movdqa (16-byte aligned loads)
 414 ;    unsigned char *output_ptr,      ; destination; 16 bytes are written per row with movdqa (16-byte aligned)
 415 ;    int dst_pitch,                  ; added to output_ptr after each row
 416 ;    unsigned int pixels_per_line,   ; used as the byte offset between consecutive input rows
 417 ;    unsigned int pixel_step,        ; not read by this routine
 418 ;    unsigned int output_height,     ; row count; tested only after a row is done, so must be >= 1
 419 ;    unsigned int output_width,      ; not read by this routine
 420 ;    const short    *vp8_filter      ; six 16-byte tap vectors at byte offsets 0, 16, ..., 80
 421 ;)
 422 ;/************************************************************************************
 423 ; Notes: filter_block1d16_v6 applies a 6 tap filter vertically to 16 columns of 16-bit input
 424 ; (two 8-word vectors per row). Each output row reads six input rows starting two rows above src_ptr's current row.
 425 ;*************************************************************************************/
 426 global sym(vp8_filter_block1d16_v6_sse2)
 427 sym(vp8_filter_block1d16_v6_sse2):
 428     push        rbp
 429     mov         rbp, rsp
 430     SHADOW_ARGS_TO_STACK 8
 431     SAVE_XMM
 432     GET_GOT     rbx
 433     push        rsi
 434     push        rdi
 435     ; end prolog
 436         ; Register roles: rax = tap vectors, rdx = input row offset, rsi = first input row of the window, rdi = destination, rcx = rows left
 437         mov         rax,        arg(7) ;vp8_filter
 438         movsxd      rdx,        dword ptr arg(3) ;pixels_per_line
 439
 440         mov         rdi,        arg(1) ;output_ptr
 441         mov         rsi,        arg(0) ;src_ptr
 442
 443         sub         rsi,        rdx                         ; step back two input rows so that
 444         sub         rsi,        rdx                         ; [rsi] is the row multiplied by tap 1
 445
 446         movsxd      rcx,        DWORD PTR arg(5) ;[output_height]
 447 %if ABI_IS_32BIT=0
 448         movsxd      r8,         dword ptr arg(2) ; dst_pitch
 449 %endif
 450
 451 vp8_filter_block1d16_v6_sse2_loop:
 452 ; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order.
 453         movdqa      xmm1,       XMMWORD PTR [rsi + rdx]       ; line 2
 454         movdqa      xmm2,       XMMWORD PTR [rsi + rdx + 16]  ; line 2, columns 8..15
 455         pmullw      xmm1,       [rax + 16]                    ; * tap 2
 456         pmullw      xmm2,       [rax + 16]
 457
 458         movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 4]       ; line 5
 459         movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 4 + 16]
 460         pmullw      xmm3,       [rax + 64]                    ; * tap 5
 461         pmullw      xmm4,       [rax + 64]
 462
 463         movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 2]       ; line 3
 464         movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 2 + 16]
 465         pmullw      xmm5,       [rax + 32]                    ; * tap 3
 466         pmullw      xmm6,       [rax + 32]
 467
 468         movdqa      xmm7,       XMMWORD PTR [rsi]       ; line 1
 469         movdqa      xmm0,       XMMWORD PTR [rsi + 16]        ; xmm0 holds data here, not zero
 470         pmullw      xmm7,       [rax]                         ; * tap 1
 471         pmullw      xmm0,       [rax]
 472         ; xmm1 accumulates columns 0..7, xmm2 accumulates columns 8..15 (signed saturating adds)
 473         paddsw      xmm1,       xmm3
 474         paddsw      xmm2,       xmm4
 475         paddsw      xmm1,       xmm5
 476         paddsw      xmm2,       xmm6
 477         paddsw      xmm1,       xmm7
 478         paddsw      xmm2,       xmm0
 479
 480         add         rsi,        rdx                           ; advance one row: reaches lines 4 and 6 below and is the per-iteration step
 481
 482         movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 2]       ; line 4
 483         movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 2 + 16]
 484         pmullw      xmm3,       [rax + 48]                    ; * tap 4
 485         pmullw      xmm4,       [rax + 48]
 486
 487         movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 4]       ; line 6
 488         movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 4 + 16]
 489         pmullw      xmm5,       [rax + 80]                    ; * tap 6
 490         pmullw      xmm6,       [rax + 80]
 491
 492         movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]      ; rd is defined outside this view; presumably the rounding term -- confirm
 493         pxor        xmm0,       xmm0                        ; clear xmm0 -- NOTE(review): xmm0 is not read again in this loop before it is reloaded
 494
 495         paddsw      xmm1,       xmm3
 496         paddsw      xmm2,       xmm4
 497         paddsw      xmm1,       xmm5
 498         paddsw      xmm2,       xmm6
 499
 500         paddsw      xmm1,       xmm7
 501         paddsw      xmm2,       xmm7
 502
 503         psraw       xmm1,       7                           ; arithmetic >> 7 (same value as VP8_FILTER_SHIFT)
 504         psraw       xmm2,       7
 505
 506         packuswb    xmm1,       xmm2              ; pack and saturate
 507         movdqa      XMMWORD PTR [rdi], xmm1       ; store the results in the destination
 508 %if ABI_IS_32BIT
 509         add         rdi,        DWORD PTR arg(2) ;[dst_pitch]
 510 %else
 511         add         rdi,        r8
 512 %endif
 513         dec         rcx         ; decrement count
 514         jnz         vp8_filter_block1d16_v6_sse2_loop               ; next row
 515
 516     ; begin epilog
 517     pop rdi
 518     pop rsi
 519     RESTORE_GOT
 520     RESTORE_XMM
 521     UNSHADOW_ARGS
 522     pop         rbp
 523     ret
 524
 525
 526 ;void vp8_filter_block1d8_h6_only_sse2
 527 ;(
 528 ;    unsigned char  *src_ptr,               ; bytes src_ptr[-2 .. 13] are read for every row
 529 ;    unsigned int    src_pixels_per_line,   ; added to src_ptr after each row (source stride in bytes)
 530 ;    unsigned char  *output_ptr,            ; destination; 8 bytes are written per row
 531 ;    int dst_pitch,                         ; added to output_ptr after each row
 532 ;    unsigned int    output_height,         ; row count; tested only after a row is done, so must be >= 1
 533 ;    const short    *vp8_filter             ; six 16-byte tap vectors at byte offsets 0, 16, ..., 80
 534 ;)
 535 ; First-pass filter only when yoffset==0. Filters 8 pixels per row and stores the clamped results directly as bytes.
 536 global sym(vp8_filter_block1d8_h6_only_sse2)
 537 sym(vp8_filter_block1d8_h6_only_sse2):
 538     push        rbp
 539     mov         rbp, rsp
 540     SHADOW_ARGS_TO_STACK 6
 541     SAVE_XMM
 542     GET_GOT     rbx
 543     push        rsi
 544     push        rdi
 545     ; end prolog
 546         ; Register roles: rsi = source row, rdi = destination row, rdx = tap vectors, rcx = rows left, rax = source stride
 547         mov         rdx,        arg(5) ;vp8_filter
 548         mov         rsi,        arg(0) ;src_ptr
 549
 550         mov         rdi,        arg(2) ;output_ptr
 551
 552         movsxd      rcx,        dword ptr arg(4) ;output_height
 553         movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line            ; Pitch for Source
 554 %if ABI_IS_32BIT=0
 555         movsxd      r8,         dword ptr arg(3) ;dst_pitch
 556 %endif
 557         pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
 558
 559 filter_block1d8_h6_only_rowloop:
 560         movq        xmm3,       MMWORD PTR [rsi - 2]        ; xmm3 = src[-2 .. 5]
 561         movq        xmm1,       MMWORD PTR [rsi + 6]        ; xmm1 = src[ 6 .. 13]
 562
 563         prefetcht2  [rsi+rax-2]                             ; prefetch hint for the start of the next source row
 564
 565         pslldq      xmm1,       8                           ; move src[6 .. 13] into the high 8 bytes
 566         por         xmm1,       xmm3                        ; xmm1 = src[-2 .. 13], the full 16-byte window
 567
 568         movdqa      xmm4,       xmm1                        ; copies of the window, byte-shifted below for taps 2..5
 569         movdqa      xmm5,       xmm1
 570
 571         movdqa      xmm6,       xmm1
 572         movdqa      xmm7,       xmm1
 573
 574         punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
 575         psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
 576
 577         pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
 578         punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
 579
 580         psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
 581         pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
 582
 583
 584         punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
 585         psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
 586
 587         pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
 588
 589         punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
 590         psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
 591
 592         pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
 593
 594         punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
 595         psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
 596
 597
 598         pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
 599
 600         punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
 601         pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
 602
 603         ; sum the six products with signed saturation; summation order is taps 2, 5, 3, 1, 4, 6
 604         paddsw      xmm4,       xmm7
 605         paddsw      xmm4,       xmm5
 606
 607         paddsw      xmm4,       xmm3
 608         paddsw      xmm4,       xmm6
 609
 610         paddsw      xmm4,       xmm1
 611         paddsw      xmm4,       [GLOBAL(rd)]                ; rd is defined outside this view; presumably the rounding term -- confirm
 612
 613         psraw       xmm4,       7                           ; arithmetic >> 7 (same value as VP8_FILTER_SHIFT)
 614
 615         packuswb    xmm4,       xmm0                        ; saturate the 8 words to unsigned bytes (0..255)
 616
 617         movq        QWORD PTR [rdi],   xmm4       ; store the results in the destination
 618         lea         rsi,        [rsi + rax]                 ; next source row
 619
 620 %if ABI_IS_32BIT
 621         add         rdi,        DWORD Ptr arg(3) ;dst_pitch
 622 %else
 623         add         rdi,        r8                          ; advance destination by dst_pitch bytes
 624 %endif
 625         dec         rcx
 626
 627         jnz         filter_block1d8_h6_only_rowloop                ; next row
 628
 629     ; begin epilog
 630     pop rdi
 631     pop rsi
 632     RESTORE_GOT
 633     RESTORE_XMM
 634     UNSHADOW_ARGS
 635     pop         rbp
 636     ret
 637
 638
 639 ;void vp8_filter_block1d16_h6_only_sse2
 640 ;(
 641 ;    unsigned char  *src_ptr,               ; bytes src_ptr[-2 .. 21] are read for every row
 642 ;    unsigned int    src_pixels_per_line,   ; added to src_ptr after each row (source stride in bytes)
 643 ;    unsigned char  *output_ptr,            ; destination; 16 bytes are written per row as two 8-byte stores
 644 ;    int dst_pitch,                         ; added to output_ptr after each row
 645 ;    unsigned int    output_height,         ; row count; tested only after a row is done, so must be >= 1
 646 ;    const short    *vp8_filter             ; six 16-byte tap vectors at byte offsets 0, 16, ..., 80
 647 ;)
 648 ; First-pass filter only when yoffset==0. Filters 16 pixels per row as two 8-pixel halves and stores the clamped results directly as bytes.
 649 global sym(vp8_filter_block1d16_h6_only_sse2)
 650 sym(vp8_filter_block1d16_h6_only_sse2):
 651     push        rbp
 652     mov         rbp, rsp
 653     SHADOW_ARGS_TO_STACK 6
 654     SAVE_XMM
 655     GET_GOT     rbx
 656     push        rsi
 657     push        rdi
 658     ; end prolog
 659         ; Register roles: rsi = source row, rdi = destination row, rdx = tap vectors, rcx = rows left, rax = source stride
 660         mov         rdx,        arg(5) ;vp8_filter
 661         mov         rsi,        arg(0) ;src_ptr
 662
 663         mov         rdi,        arg(2) ;output_ptr
 664
 665         movsxd      rcx,        dword ptr arg(4) ;output_height
 666         movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line            ; Pitch for Source
 667 %if ABI_IS_32BIT=0
 668         movsxd      r8,         dword ptr arg(3) ;dst_pitch
 669 %endif
 670
 671         pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
 672
 673 filter_block1d16_h6_only_sse2_rowloop:
 674         movq        xmm3,       MMWORD PTR [rsi - 2]        ; xmm3 = src[-2 .. 5]
 675         movq        xmm1,       MMWORD PTR [rsi + 6]        ; xmm1 = src[ 6 .. 13]
 676
 677         movq        xmm2,       MMWORD PTR [rsi +14]        ; xmm2 = src[14 .. 21]
 678         pslldq      xmm2,       8                           ; move src[14 .. 21] into the high 8 bytes
 679
 680         por         xmm2,       xmm1                        ; xmm2 = src[6 .. 21], window for pixels 8..15 (used after the first store)
 681         prefetcht2  [rsi+rax-2]                             ; prefetch hint for the start of the next source row
 682
 683         pslldq      xmm1,       8                           ; move src[6 .. 13] into the high 8 bytes
 684         por         xmm1,       xmm3                        ; xmm1 = src[-2 .. 13], window for pixels 0..7
 685
 686         movdqa      xmm4,       xmm1                        ; copies of the window, byte-shifted below for taps 2..5
 687         movdqa      xmm5,       xmm1
 688
 689         movdqa      xmm6,       xmm1
 690         movdqa      xmm7,       xmm1
 691
 692         punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
 693         psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
 694
 695         pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
 696         punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
 697
 698         psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
 699         pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
 700
 701         punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
 702         psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
 703
 704         pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
 705
 706         punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
 707         psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
 708
 709         pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
 710
 711         punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
 712         psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
 713
 714         pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
 715
 716         punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
 717         pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
 718         ; sum the six products with signed saturation; summation order is taps 2, 5, 3, 1, 4, 6
 719         paddsw      xmm4,       xmm7
 720         paddsw      xmm4,       xmm5
 721
 722         paddsw      xmm4,       xmm3
 723         paddsw      xmm4,       xmm6
 724
 725         paddsw      xmm4,       xmm1
 726         paddsw      xmm4,       [GLOBAL(rd)]                ; rd is defined outside this view; presumably the rounding term -- confirm
 727
 728         psraw       xmm4,       7                           ; arithmetic >> 7 (same value as VP8_FILTER_SHIFT)
 729
 730         packuswb    xmm4,       xmm0                        ; lower 8 bytes
 731
 732         movq        QWORD Ptr [rdi],         xmm4           ; store the results in the destination
 733         ; second half: pixels 8..15 from xmm2 = src[6 .. 21]; offsets in the byte maps below are relative to pixel 8
 734         movdqa      xmm3,       xmm2
 735         movdqa      xmm4,       xmm2
 736
 737         movdqa      xmm5,       xmm2
 738         movdqa      xmm6,       xmm2
 739
 740         movdqa      xmm7,       xmm2
 741
 742         punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
 743         psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
 744
 745         pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
 746         punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
 747
 748         psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
 749         pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
 750
 751         punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
 752         psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
 753
 754         pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
 755
 756         punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
 757         psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
 758
 759         pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
 760
 761         punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
 762         psrldq      xmm2,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
 763
 764         pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
 765
 766         punpcklbw   xmm2,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
 767         pmullw      xmm2,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
 768         ; same saturating summation order as the first half
 769         paddsw      xmm4,       xmm7
 770         paddsw      xmm4,       xmm5
 771
 772         paddsw      xmm4,       xmm3
 773         paddsw      xmm4,       xmm6
 774
 775         paddsw      xmm4,       xmm2
 776         paddsw      xmm4,       [GLOBAL(rd)]
 777
 778         psraw       xmm4,       7
 779
 780         packuswb    xmm4,       xmm0                        ; higher 8 bytes
 781
 782         movq        QWORD Ptr [rdi+8],      xmm4            ; store the results in the destination
 783
 784         lea         rsi,        [rsi + rax]                 ; next source row
 785 %if ABI_IS_32BIT
 786         add         rdi,        DWORD Ptr arg(3) ;dst_pitch
 787 %else
 788         add         rdi,        r8                          ; advance destination by dst_pitch bytes
 789 %endif
 790
 791         dec         rcx
 792         jnz         filter_block1d16_h6_only_sse2_rowloop                ; next row
 793
 794     ; begin epilog
 795     pop rdi
 796     pop rsi
 797     RESTORE_GOT
 798     RESTORE_XMM
 799     UNSHADOW_ARGS
 800     pop         rbp
 801     ret
 802
 803
 804 ;void vp8_filter_block1d8_v6_only_sse2
 805 ;(
 806 ;    unsigned char  *src_ptr,              first of the six source rows that feed output row 0
 807 ;    unsigned int    src_pixels_per_line,  byte stride between source rows
 808 ;    unsigned char  *output_ptr,           destination; 8 bytes are written per row
 809 ;    int             dst_pitch,            byte stride between destination rows
 810 ;    unsigned int    output_height,        rows to produce; the counter is tested after each row, so at least one row runs
 811 ;    const short    *vp8_filter            six taps, read as 16-byte word vectors at offsets 0, 16, 32, 48, 64 and 80
 812 ;)
 813 ; Vertical 6-tap filter over an 8-pixel-wide column (second-pass filter used on its own when xoffset==0).
 814 global sym(vp8_filter_block1d8_v6_only_sse2)
 815 sym(vp8_filter_block1d8_v6_only_sse2):
 816     push        rbp
 817     mov         rbp, rsp
 818     SHADOW_ARGS_TO_STACK 6
 819     SAVE_XMM
 820     GET_GOT     rbx
 821     push        rsi
 822     push        rdi
 823     ; end prolog
 824
 825         mov         rax,        arg(5)                      ; rax = vp8_filter (tap table)
 826         mov         rdi,        arg(2)                      ; rdi = output_ptr
 827         mov         rsi,        arg(0)                      ; rsi = src_ptr
 828
 829         movsxd      rdx,        dword ptr arg(1)            ; rdx = src_pixels_per_line
 830         movsxd      rcx,        dword ptr arg(4)            ; rcx = output_height (row counter)
 831 %if ABI_IS_32BIT=0
 832         movsxd      r8,         dword ptr arg(3)            ; r8 = dst_pitch (64-bit builds only)
 833 %endif
 834
 835         movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]    ; xmm7 = rounding constant, 8 x 0x40
 836         pxor        xmm0,       xmm0                        ; xmm0 = 0, used to widen bytes to words
 837
 838     ; each pass turns six consecutive source rows into one 8-pixel output row
 839
 840 vp8_filter_block1d8_v6_only_sse2_loop:
 841     ; rows 0, 1, 2 and 4 are addressed from the current rsi
 842         movq        xmm1,       MMWORD PTR [rsi]                ; row 0
 843         movq        xmm2,       MMWORD PTR [rsi + rdx]          ; row 1
 844         movq        xmm3,       MMWORD PTR [rsi + rdx * 2]      ; row 2
 845         movq        xmm5,       MMWORD PTR [rsi + rdx * 4]      ; row 4
 846
 847     ; step down one row (this is also the per-iteration advance); rows 3 and 5 follow
 848         add         rsi,        rdx
 849         movq        xmm4,       MMWORD PTR [rsi + rdx * 2]      ; row 3
 850         movq        xmm6,       MMWORD PTR [rsi + rdx * 4]      ; row 5
 851
 852     ; widen every row from 8 bytes to 8 words
 853         punpcklbw   xmm1,       xmm0
 854         punpcklbw   xmm2,       xmm0
 855         punpcklbw   xmm3,       xmm0
 856         punpcklbw   xmm4,       xmm0
 857         punpcklbw   xmm5,       xmm0
 858         punpcklbw   xmm6,       xmm0
 859
 860     ; scale each row by its tap
 861         pmullw      xmm1,       [rax]                           ; row 0 * tap 1
 862         pmullw      xmm2,       [rax + 16]                      ; row 1 * tap 2
 863         pmullw      xmm3,       [rax + 32]                      ; row 2 * tap 3
 864         pmullw      xmm4,       [rax + 48]                      ; row 3 * tap 4
 865         pmullw      xmm5,       [rax + 64]                      ; row 4 * tap 5
 866         pmullw      xmm6,       [rax + 80]                      ; row 5 * tap 6
 867
 868     ; saturating accumulation into xmm2; the order of these adds must not change
 869         paddsw      xmm2,       xmm5
 870         paddsw      xmm2,       xmm3
 871         paddsw      xmm2,       xmm1
 872         paddsw      xmm2,       xmm4
 873         paddsw      xmm2,       xmm6
 874         paddsw      xmm2,       xmm7                            ; + rounding constant
 875
 876         psraw       xmm2,       7                               ; arithmetic shift right by 7
 877         packuswb    xmm2,       xmm0                            ; words -> unsigned-saturated bytes
 878         movq        QWORD PTR [rdi], xmm2                       ; write 8 output pixels
 879
 880 %if ABI_IS_32BIT
 881         add         rdi,        DWORD PTR arg(3)                ; advance by dst_pitch
 882 %else
 883         add         rdi,        r8                              ; advance by dst_pitch
 884 %endif
 885         dec         rcx                                         ; one row done
 886         jnz         vp8_filter_block1d8_v6_only_sse2_loop
 887
 888     ; begin epilog
 889     pop rdi
 890     pop rsi
 891     RESTORE_GOT
 892     RESTORE_XMM
 893     UNSHADOW_ARGS
 894     pop         rbp
 895     ret
 896
 897
 898 ;void vp8_unpack_block1d16_h6_sse2  -- widens 16 source bytes per row to 16 zero-extended words
 899 ;(
 900 ;    unsigned char  *src_ptr,              16 bytes are read from each source row
 901 ;    unsigned short *output_ptr,           32 bytes are written per row using aligned 16-byte stores
 902 ;    unsigned int    src_pixels_per_line,  byte stride between source rows
 903 ;    unsigned int    output_height,        rows to convert; the counter is tested after each row
 904 ;    unsigned int    output_width          amount added to output_ptr after each row
 905 ;)
 906 global sym(vp8_unpack_block1d16_h6_sse2)
 907 sym(vp8_unpack_block1d16_h6_sse2):
 908     push        rbp
 909     mov         rbp, rsp
 910     SHADOW_ARGS_TO_STACK 5
 911     ;SAVE_XMM                          ; left out on purpose: xmm6 and xmm7 are never touched here
 912     GET_GOT     rbx
 913     push        rsi
 914     push        rdi
 915     ; end prolog
 916
 917         mov         rdi,        arg(1)                      ; rdi = output_ptr
 918         mov         rsi,        arg(0)                      ; rsi = src_ptr
 919
 920         movsxd      rax,        dword ptr arg(2)            ; rax = src_pixels_per_line
 921         movsxd      rcx,        dword ptr arg(3)            ; rcx = output_height (row counter)
 922 %if ABI_IS_32BIT=0
 923         movsxd      r8,         dword ptr arg(4)            ; r8 = output_width (64-bit builds only)
 924 %endif
 925         pxor        xmm0,       xmm0                        ; xmm0 = 0, becomes the high byte of each word
 926
 927     ; one 16-pixel row per pass; both halves are loaded before anything is stored
 928 unpack_block1d16_h6_sse2_rowloop:
 929         movq        xmm1,       MMWORD PTR [rsi]            ; source bytes 0..7
 930         movq        xmm3,       MMWORD PTR [rsi+8]          ; source bytes 8..15
 931
 932         punpcklbw   xmm1,       xmm0                        ; low half  -> 8 words
 933         punpcklbw   xmm3,       xmm0                        ; high half -> 8 words
 934
 935         movdqa      XMMWORD Ptr [rdi],         xmm1         ; output words 0..7
 936         movdqa      XMMWORD Ptr [rdi + 16],    xmm3         ; output words 8..15
 937
 938         add         rsi,        rax                         ; next source row
 939 %if ABI_IS_32BIT
 940         add         rdi,        DWORD Ptr arg(4)            ; next output row (+ output_width)
 941 %else
 942         add         rdi,        r8                          ; next output row (+ output_width)
 943 %endif
 944         dec         rcx
 945         jnz         unpack_block1d16_h6_sse2_rowloop        ; more rows left
 946
 947     ; begin epilog
 948     pop rdi
 949     pop rsi
 950     RESTORE_GOT
 951     ;RESTORE_XMM                       ; matches the omitted SAVE_XMM in the prolog
 952     UNSHADOW_ARGS
 953     pop         rbp
 954     ret
 955
 956
 957 ;void vp8_bilinear_predict16x16_sse2  -- 16x16 two-tap (bilinear) prediction: horizontal pass, vertical pass, or both
 958 ;(
 959 ;    unsigned char  *src_ptr,              top-left source pixel
 960 ;    int             src_pixels_per_line,  byte stride between source rows
 961 ;    int             xoffset,              horizontal filter index; 0 selects the vertical-only path
 962 ;    int             yoffset,              vertical filter index; 0 (with xoffset != 0) selects the horizontal-only path
 963 ;    unsigned char  *dst_ptr,              destination; every row is written with an aligned 16-byte store
 964 ;    int             dst_pitch             byte stride between destination rows
 965 ;)
 966 extern sym(vp8_bilinear_filters_mmx)
 967 global sym(vp8_bilinear_predict16x16_sse2)
 968 sym(vp8_bilinear_predict16x16_sse2):
 969     push        rbp
 970     mov         rbp, rsp
 971     SHADOW_ARGS_TO_STACK 6
 972     SAVE_XMM
 973     GET_GOT     rbx
 974     push        rsi
 975     push        rdi
 976     ; end prolog
 977
 978     ;const short *HFilter = bilinear_filters_mmx[xoffset]
 979     ;const short *VFilter = bilinear_filters_mmx[yoffset]
 980     ;each table entry is 32 bytes (index << 5) and is used as two 16-byte word vectors, one per tap
 981         lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_mmx))]
 982         movsxd      rax,        dword ptr arg(2) ;xoffset
 983
 984         test        rax,        rax    ;skip first_pass filter if xoffset=0
 985         je          b16x16_sp_only
 986
 987         shl         rax,        5
 988         add         rax,        rcx    ;HFilter
 989
 990         mov         rdi,        arg(4) ;dst_ptr
 991         mov         rsi,        arg(0) ;src_ptr
 992         movsxd      rdx,        dword ptr arg(5) ;dst_pitch
 993
 994         movdqa      xmm1,       [rax]               ; xmm1 = HFilter tap applied to pixel x
 995         movdqa      xmm2,       [rax+16]            ; xmm2 = HFilter tap applied to pixel x+1
 996
 997         movsxd      rax,        dword ptr arg(3) ;yoffset
 998
 999         test        rax,        rax    ;skip second_pass filter if yoffset=0
1000         je          b16x16_fp_only
1001
1002         shl         rax,        5
1003         add         rax,        rcx    ;VFilter
1004
1005         lea         rcx,        [rdi+rdx*8]
1006         lea         rcx,        [rcx+rdx*8]         ; rcx = dst_ptr + 16*dst_pitch, the loop end marker
1007         movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
1008
1009         pxor        xmm0,       xmm0                ; zero, used to widen bytes to words
1010
1011 %if ABI_IS_32BIT=0
1012         movsxd      r8,         dword ptr arg(5) ;dst_pitch (rdx now holds the source stride)
1013 %endif
1014         ; get the first horizontal line done
1015         movdqu      xmm3,       [rsi]               ; source bytes 00..15
1016         movdqa      xmm4,       xmm3                 ; make a copy of current line
1017
1018         punpcklbw   xmm3,       xmm0                 ; pixels 00..07 as words
1019         punpckhbw   xmm4,       xmm0                 ; pixels 08..15 as words
1020
1021         pmullw      xmm3,       xmm1
1022         pmullw      xmm4,       xmm1
1023
1024         movdqu      xmm5,       [rsi+1]              ; source bytes 01..16
1025         movdqa      xmm6,       xmm5
1026
1027         punpcklbw   xmm5,       xmm0                 ; pixels 01..08 as words
1028         punpckhbw   xmm6,       xmm0                 ; pixels 09..16 as words
1029
1030         pmullw      xmm5,       xmm2
1031         pmullw      xmm6,       xmm2
1032
1033         paddw       xmm3,       xmm5
1034         paddw       xmm4,       xmm6
1035
1036         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
1037         psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 >>= VP8_FILTER_SHIFT
1038
1039         paddw       xmm4,       [GLOBAL(rd)]
1040         psraw       xmm4,       VP8_FILTER_SHIFT
1041
1042         movdqa      xmm7,       xmm3
1043         packuswb    xmm7,       xmm4                 ; xmm7 = horizontally filtered row 0, 16 bytes
1044
1045         add         rsi,        rdx                 ; next line
1046 next_row:
1047         movdqu      xmm3,       [rsi]               ; source bytes 00..15 of the current row
1048         movdqa      xmm4,       xmm3                 ; make a copy of current line
1049
1050         punpcklbw   xmm3,       xmm0                 ; pixels 00..07 as words
1051         punpckhbw   xmm4,       xmm0                 ; pixels 08..15 as words
1052
1053         pmullw      xmm3,       xmm1
1054         pmullw      xmm4,       xmm1
1055
1056         movdqu      xmm5,       [rsi+1]              ; source bytes 01..16 of the current row
1057         movdqa      xmm6,       xmm5
1058
1059         punpcklbw   xmm5,       xmm0
1060         punpckhbw   xmm6,       xmm0
1061
1062         pmullw      xmm5,       xmm2
1063         pmullw      xmm6,       xmm2
1064
1065         paddw       xmm3,       xmm5
1066         paddw       xmm4,       xmm6
1067
1068         movdqa      xmm5,       xmm7                 ; previous filtered row (bytes) ...
1069         movdqa      xmm6,       xmm7
1070
1071         punpcklbw   xmm5,       xmm0                 ; ... widened back to words
1072         punpckhbw   xmm6,       xmm0
1073
1074         pmullw      xmm5,       [rax]                ; previous row * first VFilter tap
1075         pmullw      xmm6,       [rax]
1076
1077         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
1078         psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 >>= VP8_FILTER_SHIFT
1079
1080         paddw       xmm4,       [GLOBAL(rd)]
1081         psraw       xmm4,       VP8_FILTER_SHIFT
1082
1083         movdqa      xmm7,       xmm3
1084         packuswb    xmm7,       xmm4                 ; keep this filtered row for the next iteration
1085
1086         pmullw      xmm3,       [rax+16]             ; current row * second VFilter tap
1087         pmullw      xmm4,       [rax+16]
1088
1089         paddw       xmm3,       xmm5
1090         paddw       xmm4,       xmm6
1091
1092         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
1093         psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 >>= VP8_FILTER_SHIFT
1094
1095         paddw       xmm4,       [GLOBAL(rd)]
1096         psraw       xmm4,       VP8_FILTER_SHIFT
1097
1098         packuswb    xmm3,       xmm4
1099         movdqa      [rdi],      xmm3                 ; store the results in the destination
1100
1101         add         rsi,        rdx                 ; next line
1102 %if ABI_IS_32BIT
1103         add         rdi,        DWORD PTR arg(5) ;dst_pitch
1104 %else
1105         add         rdi,        r8
1106 %endif
1107
1108         cmp         rdi,        rcx                 ; stop once 16 destination rows are written
1109         jne         next_row
1110
1111         jmp         done
1112
1113 b16x16_sp_only:                                     ; xoffset == 0: vertical filter applied straight to source rows
1114         movsxd      rax,        dword ptr arg(3) ;yoffset
1115         shl         rax,        5
1116         add         rax,        rcx    ;VFilter
1117
1118         mov         rdi,        arg(4) ;dst_ptr
1119         mov         rsi,        arg(0) ;src_ptr
1120         movsxd      rdx,        dword ptr arg(5) ;dst_pitch
1121
1122         movdqa      xmm1,       [rax]               ; xmm1 = VFilter tap applied to the previous row
1123         movdqa      xmm2,       [rax+16]            ; xmm2 = VFilter tap applied to the current row
1124
1125         lea         rcx,        [rdi+rdx*8]
1126         lea         rcx,        [rcx+rdx*8]         ; rcx = dst_ptr + 16*dst_pitch, the loop end marker
1127         movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line
1128
1129         pxor        xmm0,       xmm0
1130
1131         ; load the first source row; it is the "previous row" of the first output row
1132         movdqu      xmm7,       [rsi]               ; source bytes 00..15
1133
1134         add         rsi,        rax                 ; next line
1135 next_row_spo:
1136         movdqu      xmm3,       [rsi]               ; source bytes 00..15 of the current row
1137
1138         movdqa      xmm5,       xmm7                 ; previous row, two copies for low/high halves
1139         movdqa      xmm6,       xmm7
1140
1141         movdqa      xmm4,       xmm3                 ; make a copy of current line
1142         movdqa      xmm7,       xmm3                 ; current row becomes next iteration's previous row
1143
1144         punpcklbw   xmm5,       xmm0
1145         punpckhbw   xmm6,       xmm0
1146         punpcklbw   xmm3,       xmm0                 ; pixels 00..07 as words
1147         punpckhbw   xmm4,       xmm0                 ; pixels 08..15 as words
1148
1149         pmullw      xmm5,       xmm1
1150         pmullw      xmm6,       xmm1
1151         pmullw      xmm3,       xmm2
1152         pmullw      xmm4,       xmm2
1153
1154         paddw       xmm3,       xmm5
1155         paddw       xmm4,       xmm6
1156
1157         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
1158         psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 >>= VP8_FILTER_SHIFT
1159
1160         paddw       xmm4,       [GLOBAL(rd)]
1161         psraw       xmm4,       VP8_FILTER_SHIFT
1162
1163         packuswb    xmm3,       xmm4
1164         movdqa      [rdi],      xmm3                 ; store the results in the destination
1165
1166         add         rsi,        rax                 ; next line
1167         add         rdi,        rdx                 ;dst_pitch
1168         cmp         rdi,        rcx
1169         jne         next_row_spo
1170
1171         jmp         done
1172
1173 b16x16_fp_only:                                     ; yoffset == 0: horizontal filter only; rdi, rsi, rdx, xmm1, xmm2 are already set
1174         lea         rcx,        [rdi+rdx*8]
1175         lea         rcx,        [rcx+rdx*8]         ; rcx = dst_ptr + 16*dst_pitch, the loop end marker
1176         movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line
1177         pxor        xmm0,       xmm0
1178
1179 next_row_fpo:
1180         movdqu      xmm3,       [rsi]               ; source bytes 00..15
1181         movdqa      xmm4,       xmm3                 ; make a copy of current line
1182
1183         punpcklbw   xmm3,       xmm0                 ; pixels 00..07 as words
1184         punpckhbw   xmm4,       xmm0                 ; pixels 08..15 as words
1185
1186         pmullw      xmm3,       xmm1
1187         pmullw      xmm4,       xmm1
1188
1189         movdqu      xmm5,       [rsi+1]              ; source bytes 01..16
1190         movdqa      xmm6,       xmm5
1191
1192         punpcklbw   xmm5,       xmm0
1193         punpckhbw   xmm6,       xmm0
1194
1195         pmullw      xmm5,       xmm2
1196         pmullw      xmm6,       xmm2
1197
1198         paddw       xmm3,       xmm5
1199         paddw       xmm4,       xmm6
1200
1201         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
1202         psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 >>= VP8_FILTER_SHIFT
1203
1204         paddw       xmm4,       [GLOBAL(rd)]
1205         psraw       xmm4,       VP8_FILTER_SHIFT
1206
1207         packuswb    xmm3,       xmm4
1208         movdqa      [rdi],      xmm3                 ; store the results in the destination
1209
1210         add         rsi,        rax                 ; next line
1211         add         rdi,        rdx                 ; dst_pitch
1212         cmp         rdi,        rcx
1213         jne         next_row_fpo
1214
1215 done:
1216     ; begin epilog
1217     pop rdi
1218     pop rsi
1219     RESTORE_GOT
1220     RESTORE_XMM
1221     UNSHADOW_ARGS
1222     pop         rbp
1223     ret
1224
1225
1226 ;void vp8_bilinear_predict8x8_sse2  -- 8x8 bilinear prediction; both filter passes always run (no offset==0 shortcut here)
1227 ;(
1228 ;    unsigned char  *src_ptr,              top-left source pixel; 9 rows are loaded, 16 bytes from each
1229 ;    int   src_pixels_per_line,            byte stride between source rows
1230 ;    int  xoffset,                         index of the horizontal filter in vp8_bilinear_filters_mmx
1231 ;    int  yoffset,                         index of the vertical filter in vp8_bilinear_filters_mmx
1232 ;    unsigned char *dst_ptr,               destination; 8 bytes are stored per row
1233 ;    int dst_pitch                         byte stride between destination rows
1234 ;)
1235 extern sym(vp8_bilinear_filters_mmx)
1236 global sym(vp8_bilinear_predict8x8_sse2)
1237 sym(vp8_bilinear_predict8x8_sse2):
1238     push        rbp
1239     mov         rbp, rsp
1240     SHADOW_ARGS_TO_STACK 6
1241     SAVE_XMM
1242     GET_GOT     rbx
1243     push        rsi
1244     push        rdi
1245     ; end prolog
1246
1247     ALIGN_STACK 16, rax                              ; macro from the included ABI header; presumably leaves the old rsp on top of the stack (see "pop rsp" below) -- confirm
1248     sub         rsp, 144                         ; reserve 144 bytes = 9 rows x 16 bytes
1249
1250     ;const short *HFilter = bilinear_filters_mmx[xoffset]
1251     ;const short *VFilter = bilinear_filters_mmx[yoffset]
1252         lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_mmx))]
1253
1254         mov         rsi,        arg(0) ;src_ptr
1255         movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
1256
1257     ;Read 9-line unaligned data in and put them on stack. This gives a big
1258     ;performance boost.
1259         movdqu      xmm0,       [rsi]                ; row 0
1260         lea         rax,        [rdx + rdx*2]        ; rax = 3 * source stride
1261         movdqu      xmm1,       [rsi+rdx]            ; row 1
1262         movdqu      xmm2,       [rsi+rdx*2]          ; row 2
1263         add         rsi,        rax                  ; rsi -> row 3
1264         movdqu      xmm3,       [rsi]                ; row 3
1265         movdqu      xmm4,       [rsi+rdx]            ; row 4
1266         movdqu      xmm5,       [rsi+rdx*2]          ; row 5
1267         add         rsi,        rax                  ; rsi -> row 6
1268         movdqu      xmm6,       [rsi]                ; row 6
1269         movdqu      xmm7,       [rsi+rdx]            ; row 7
1270     ; all eight xmm registers are full: spill row 0 first so xmm0 can receive row 8
1271         movdqa      XMMWORD PTR [rsp],            xmm0
1272
1273         movdqu      xmm0,       [rsi+rdx*2]          ; row 8
1274
1275         movdqa      XMMWORD PTR [rsp+16],         xmm1
1276         movdqa      XMMWORD PTR [rsp+32],         xmm2
1277         movdqa      XMMWORD PTR [rsp+48],         xmm3
1278         movdqa      XMMWORD PTR [rsp+64],         xmm4
1279         movdqa      XMMWORD PTR [rsp+80],         xmm5
1280         movdqa      XMMWORD PTR [rsp+96],         xmm6
1281         movdqa      XMMWORD PTR [rsp+112],        xmm7
1282         movdqa      XMMWORD PTR [rsp+128],        xmm0
1283
1284         movsxd      rax,        dword ptr arg(2) ;xoffset
1285         shl         rax,        5                    ; 32 bytes per filter table entry
1286         add         rax,        rcx    ;HFilter
1287
1288         mov         rdi,        arg(4) ;dst_ptr
1289         movsxd      rdx,        dword ptr arg(5) ;dst_pitch
1290
1291         movdqa      xmm1,       [rax]                ; xmm1 = HFilter tap applied to pixel x
1292         movdqa      xmm2,       [rax+16]             ; xmm2 = HFilter tap applied to pixel x+1
1293
1294         movsxd      rax,        dword ptr arg(3) ;yoffset
1295         shl         rax,        5
1296         add         rax,        rcx    ;VFilter
1297
1298         lea         rcx,        [rdi+rdx*8]          ; rcx = dst_ptr + 8*dst_pitch, the loop end marker
1299
1300         movdqa      xmm5,       [rax]                ; xmm5 = VFilter tap applied to the previous row
1301         movdqa      xmm6,       [rax+16]             ; xmm6 = VFilter tap applied to the current row
1302
1303         pxor        xmm0,       xmm0                 ; zero, used to widen bytes to words
1304
1305         ; get the first horizontal line done
1306         movdqa      xmm3,       XMMWORD PTR [rsp]
1307         movdqa      xmm4,       xmm3                 ; make a copy of current line
1308         psrldq      xmm4,       1                    ; shift down one byte so lane i holds pixel i+1
1309
1310         punpcklbw   xmm3,       xmm0                 ; 00 01 02 03 04 05 06 07
1311         punpcklbw   xmm4,       xmm0                 ; 01 02 03 04 05 06 07 08
1312
1313         pmullw      xmm3,       xmm1
1314         pmullw      xmm4,       xmm2
1315
1316         paddw       xmm3,       xmm4
1317
1318         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
1319         psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
1320
1321         movdqa      xmm7,       xmm3                 ; xmm7 = horizontally filtered row 0, kept as words
1322         add         rsp,        16                 ; next line (rsp itself is the row pointer into the stacked copy)
1323 next_row8x8:
1324         movdqa      xmm3,       XMMWORD PTR [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
1325         movdqa      xmm4,       xmm3                 ; make a copy of current line
1326         psrldq      xmm4,       1
1327
1328         punpcklbw   xmm3,       xmm0                 ; 00 01 02 03 04 05 06 07
1329         punpcklbw   xmm4,       xmm0                 ; 01 02 03 04 05 06 07 08
1330
1331         pmullw      xmm3,       xmm1
1332         pmullw      xmm4,       xmm2
1333
1334         paddw       xmm3,       xmm4
1335         pmullw      xmm7,       xmm5                 ; previous filtered row * first VFilter tap
1336
1337         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
1338         psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
1339
1340         movdqa      xmm4,       xmm3                 ; save this filtered row before it is scaled
1341
1342         pmullw      xmm3,       xmm6                 ; current filtered row * second VFilter tap
1343         paddw       xmm3,       xmm7
1344
1345         movdqa      xmm7,       xmm4                 ; becomes the previous row of the next iteration
1346
1347         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
1348         psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
1349
1350         packuswb    xmm3,       xmm0                 ; words -> unsigned-saturated bytes in the low 8 bytes
1351         movq        [rdi],      xmm3                 ; store the results in the destination
1352
1353         add         rsp,        16                 ; next line
1354         add         rdi,        rdx
1355
1356         cmp         rdi,        rcx                  ; stop once 8 destination rows are written
1357         jne         next_row8x8
1358
1359     ;add rsp, 144  -- not needed: rsp was advanced once before the loop and 8 times inside it, 9 x 16 = 144 bytes
1360     pop rsp                                          ; reload rsp from the slot just above the row buffer (presumably the value saved by ALIGN_STACK -- confirm)
1361     ; begin epilog
1362     pop rdi
1363     pop rsi
1364     RESTORE_GOT
1365     RESTORE_XMM
1366     UNSHADOW_ARGS
1367     pop         rbp
1368     ret
1369
1370
1371 SECTION_RODATA
1372 align 16
1373 rd:                                     ; rounding term added before each shift right by VP8_FILTER_SHIFT; eight words of 0x40
1374     times 8 dw (1 << (VP8_FILTER_SHIFT - 1))