vp8/encoder/x86/variance_impl_mmx.asm

   1 ;
   2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
   3 ;
   4 ;  Use of this source code is governed by a BSD-style license
   5 ;  that can be found in the LICENSE file in the root of the source
   6 ;  tree. An additional intellectual property rights grant can be found
   7 ;  in the file PATENTS.  All contributing project authors may
   8 ;  be found in the AUTHORS file in the root of the source tree.
   9 ;
  10
  11
  12 %include "vpx_ports/x86_abi_support.asm"
  13
  14 ;unsigned int vp8_get_mb_ss_mmx( short *src_ptr )
  15 global sym(vp8_get_mb_ss_mmx)
  16 sym(vp8_get_mb_ss_mmx):
  17     push        rbp
  18     mov         rbp, rsp
  19     SHADOW_ARGS_TO_STACK 7
  20     GET_GOT     rbx
  21     push rsi
  22     push rdi
  23     sub         rsp, 8
  24     ; end prolog
  25
  26         mov         rax, arg(0) ;src_ptr
  27         mov         rcx, 16
  28         pxor        mm4, mm4
  29
  30 NEXTROW:
  31         movq        mm0, [rax]
  32         movq        mm1, [rax+8]
  33         movq        mm2, [rax+16]
  34         movq        mm3, [rax+24]
  35         pmaddwd     mm0, mm0
  36         pmaddwd     mm1, mm1
  37         pmaddwd     mm2, mm2
  38         pmaddwd     mm3, mm3
  39
  40         paddd       mm4, mm0
  41         paddd       mm4, mm1
  42         paddd       mm4, mm2
  43         paddd       mm4, mm3
  44
  45         add         rax, 32
  46         dec         rcx
  47         ja          NEXTROW
  48         movq        QWORD PTR [rsp], mm4
  49
  50         ;return sum[0]+sum[1];
  51         movsxd      rax, dword ptr [rsp]
  52         movsxd      rcx, dword ptr [rsp+4]
  53         add         rax, rcx
  54
  55
  56     ; begin epilog
  57     add rsp, 8
  58     pop rdi
  59     pop rsi
  60     RESTORE_GOT
  61     UNSHADOW_ARGS
  62     pop         rbp
  63     ret
  64
  65
  66 ;unsigned int vp8_get8x8var_mmx
  67 ;(
  68 ;    unsigned char *src_ptr,
  69 ;    int  source_stride,
  70 ;    unsigned char *ref_ptr,
  71 ;    int  recon_stride,
  72 ;    unsigned int *SSE,
  73 ;    int *Sum
  74 ;)
  75 global sym(vp8_get8x8var_mmx)
  76 sym(vp8_get8x8var_mmx):
  77     push        rbp
  78     mov         rbp, rsp
  79     SHADOW_ARGS_TO_STACK 6
  80     push rsi
  81     push rdi
  82     push rbx
  83     sub         rsp, 16
  84     ; end prolog
  85
  86
  87         pxor        mm5, mm5                    ; Blank mmx6
  88         pxor        mm6, mm6                    ; Blank mmx7
  89         pxor        mm7, mm7                    ; Blank mmx7
  90
  91         mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
  92         mov         rbx, arg(2) ;[ref_ptr]
  93         movsxd      rcx, dword ptr arg(1) ;[source_stride]
  94         movsxd      rdx, dword ptr arg(3) ;[recon_stride]
  95
  96         ; Row 1
  97         movq        mm0, [rax]                  ; Copy eight bytes to mm0
  98         movq        mm1, [rbx]                  ; Copy eight bytes to mm1
  99         movq        mm2, mm0                    ; Take copies
 100         movq        mm3, mm1                    ; Take copies
 101
 102         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
 103         punpcklbw   mm1, mm6
 104         punpckhbw   mm2, mm6                    ; unpack to higher prrcision
 105         punpckhbw   mm3, mm6
 106         psubsw      mm0, mm1                    ; A-B (low order) to MM0
 107         psubsw      mm2, mm3                    ; A-B (high order) to MM2
 108
 109         paddw       mm5, mm0                    ; accumulate differences in mm5
 110         paddw       mm5, mm2                    ; accumulate differences in mm5
 111
 112         pmaddwd     mm0, mm0                    ; square and accumulate
 113         pmaddwd     mm2, mm2                    ; square and accumulate
 114         add         rbx,rdx                     ; Inc pointer into ref data
 115         add         rax,rcx                     ; Inc pointer into the new data
 116         movq        mm1, [rbx]                  ; Copy eight bytes to mm1
 117         paddd       mm7, mm0                    ; accumulate in mm7
 118         paddd       mm7, mm2                    ; accumulate in mm7
 119
 120
 121         ; Row 2
 122         movq        mm0, [rax]                  ; Copy eight bytes to mm0
 123         movq        mm2, mm0                    ; Take copies
 124         movq        mm3, mm1                    ; Take copies
 125
 126         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
 127         punpcklbw   mm1, mm6
 128         punpckhbw   mm2, mm6                    ; unpack to higher prrcision
 129         punpckhbw   mm3, mm6
 130         psubsw      mm0, mm1                    ; A-B (low order) to MM0
 131         psubsw      mm2, mm3                    ; A-B (high order) to MM2
 132
 133         paddw       mm5, mm0                    ; accumulate differences in mm5
 134         paddw       mm5, mm2                    ; accumulate differences in mm5
 135
 136         pmaddwd     mm0, mm0                    ; square and accumulate
 137         pmaddwd     mm2, mm2                    ; square and accumulate
 138         add         rbx,rdx                     ; Inc pointer into ref data
 139         add         rax,rcx                     ; Inc pointer into the new data
 140         movq        mm1, [rbx]                  ; Copy eight bytes to mm1
 141         paddd       mm7, mm0                    ; accumulate in mm7
 142         paddd       mm7, mm2                    ; accumulate in mm7
 143
 144         ; Row 3
 145         movq        mm0, [rax]                  ; Copy eight bytes to mm0
 146         movq        mm2, mm0                    ; Take copies
 147         movq        mm3, mm1                    ; Take copies
 148
 149         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
 150         punpcklbw   mm1, mm6
 151         punpckhbw   mm2, mm6                    ; unpack to higher prrcision
 152         punpckhbw   mm3, mm6
 153         psubsw      mm0, mm1                    ; A-B (low order) to MM0
 154         psubsw      mm2, mm3                    ; A-B (high order) to MM2
 155
 156         paddw       mm5, mm0                    ; accumulate differences in mm5
 157         paddw       mm5, mm2                    ; accumulate differences in mm5
 158
 159         pmaddwd     mm0, mm0                    ; square and accumulate
 160         pmaddwd     mm2, mm2                    ; square and accumulate
 161         add         rbx,rdx                     ; Inc pointer into ref data
 162         add         rax,rcx                     ; Inc pointer into the new data
 163         movq        mm1, [rbx]                  ; Copy eight bytes to mm1
 164         paddd       mm7, mm0                    ; accumulate in mm7
 165         paddd       mm7, mm2                    ; accumulate in mm7
 166
 167         ; Row 4
 168         movq        mm0, [rax]                  ; Copy eight bytes to mm0
 169         movq        mm2, mm0                    ; Take copies
 170         movq        mm3, mm1                    ; Take copies
 171
 172         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
 173         punpcklbw   mm1, mm6
 174         punpckhbw   mm2, mm6                    ; unpack to higher prrcision
 175         punpckhbw   mm3, mm6
 176         psubsw      mm0, mm1                    ; A-B (low order) to MM0
 177         psubsw      mm2, mm3                    ; A-B (high order) to MM2
 178
 179         paddw       mm5, mm0                    ; accumulate differences in mm5
 180         paddw       mm5, mm2                    ; accumulate differences in mm5
 181
 182         pmaddwd     mm0, mm0                    ; square and accumulate
 183         pmaddwd     mm2, mm2                    ; square and accumulate
 184         add         rbx,rdx                     ; Inc pointer into ref data
 185         add         rax,rcx                     ; Inc pointer into the new data
 186         movq        mm1, [rbx]                  ; Copy eight bytes to mm1
 187         paddd       mm7, mm0                    ; accumulate in mm7
 188         paddd       mm7, mm2                    ; accumulate in mm7
 189
 190         ; Row 5
 191         movq        mm0, [rax]                  ; Copy eight bytes to mm0
 192         movq        mm2, mm0                    ; Take copies
 193         movq        mm3, mm1                    ; Take copies
 194
 195         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
 196         punpcklbw   mm1, mm6
 197         punpckhbw   mm2, mm6                    ; unpack to higher prrcision
 198         punpckhbw   mm3, mm6
 199         psubsw      mm0, mm1                    ; A-B (low order) to MM0
 200         psubsw      mm2, mm3                    ; A-B (high order) to MM2
 201
 202         paddw       mm5, mm0                    ; accumulate differences in mm5
 203         paddw       mm5, mm2                    ; accumulate differences in mm5
 204
 205         pmaddwd     mm0, mm0                    ; square and accumulate
 206         pmaddwd     mm2, mm2                    ; square and accumulate
 207         add         rbx,rdx                     ; Inc pointer into ref data
 208         add         rax,rcx                     ; Inc pointer into the new data
 209         movq        mm1, [rbx]                  ; Copy eight bytes to mm1
 210         ;              movq        mm4, [rbx + rdx]
 211         paddd       mm7, mm0                    ; accumulate in mm7
 212         paddd       mm7, mm2                    ; accumulate in mm7
 213
 214         ; Row 6
 215         movq        mm0, [rax]                  ; Copy eight bytes to mm0
 216         movq        mm2, mm0                    ; Take copies
 217         movq        mm3, mm1                    ; Take copies
 218
 219         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
 220         punpcklbw   mm1, mm6
 221         punpckhbw   mm2, mm6                    ; unpack to higher prrcision
 222         punpckhbw   mm3, mm6
 223         psubsw      mm0, mm1                    ; A-B (low order) to MM0
 224         psubsw      mm2, mm3                    ; A-B (high order) to MM2
 225
 226         paddw       mm5, mm0                    ; accumulate differences in mm5
 227         paddw       mm5, mm2                    ; accumulate differences in mm5
 228
 229         pmaddwd     mm0, mm0                    ; square and accumulate
 230         pmaddwd     mm2, mm2                    ; square and accumulate
 231         add         rbx,rdx                     ; Inc pointer into ref data
 232         add         rax,rcx                     ; Inc pointer into the new data
 233         movq        mm1, [rbx]                  ; Copy eight bytes to mm1
 234         paddd       mm7, mm0                    ; accumulate in mm7
 235         paddd       mm7, mm2                    ; accumulate in mm7
 236
 237         ; Row 7
 238         movq        mm0, [rax]                  ; Copy eight bytes to mm0
 239         movq        mm2, mm0                    ; Take copies
 240         movq        mm3, mm1                    ; Take copies
 241
 242         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
 243         punpcklbw   mm1, mm6
 244         punpckhbw   mm2, mm6                    ; unpack to higher prrcision
 245         punpckhbw   mm3, mm6
 246         psubsw      mm0, mm1                    ; A-B (low order) to MM0
 247         psubsw      mm2, mm3                    ; A-B (high order) to MM2
 248
 249         paddw       mm5, mm0                    ; accumulate differences in mm5
 250         paddw       mm5, mm2                    ; accumulate differences in mm5
 251
 252         pmaddwd     mm0, mm0                    ; square and accumulate
 253         pmaddwd     mm2, mm2                    ; square and accumulate
 254         add         rbx,rdx                     ; Inc pointer into ref data
 255         add         rax,rcx                     ; Inc pointer into the new data
 256         movq        mm1, [rbx]                  ; Copy eight bytes to mm1
 257         paddd       mm7, mm0                    ; accumulate in mm7
 258         paddd       mm7, mm2                    ; accumulate in mm7
 259
 260         ; Row 8
 261         movq        mm0, [rax]                  ; Copy eight bytes to mm0
 262         movq        mm2, mm0                    ; Take copies
 263         movq        mm3, mm1                    ; Take copies
 264
 265         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
 266         punpcklbw   mm1, mm6
 267         punpckhbw   mm2, mm6                    ; unpack to higher prrcision
 268         punpckhbw   mm3, mm6
 269         psubsw      mm0, mm1                    ; A-B (low order) to MM0
 270         psubsw      mm2, mm3                    ; A-B (high order) to MM2
 271
 272         paddw       mm5, mm0                    ; accumulate differences in mm5
 273         paddw       mm5, mm2                    ; accumulate differences in mm5
 274
 275         pmaddwd     mm0, mm0                    ; square and accumulate
 276         pmaddwd     mm2, mm2                    ; square and accumulate
 277         add         rbx,rdx                     ; Inc pointer into ref data
 278         add         rax,rcx                     ; Inc pointer into the new data
 279         paddd       mm7, mm0                    ; accumulate in mm7
 280         paddd       mm7, mm2                    ; accumulate in mm7
 281
 282         ; Now accumulate the final results.
 283         movq        QWORD PTR [rsp+8], mm5      ; copy back accumulated results into normal memory
 284         movq        QWORD PTR [rsp], mm7        ; copy back accumulated results into normal memory
 285         movsx       rdx, WORD PTR [rsp+8]
 286         movsx       rcx, WORD PTR [rsp+10]
 287         movsx       rbx, WORD PTR [rsp+12]
 288         movsx       rax, WORD PTR [rsp+14]
 289         add         rdx, rcx
 290         add         rbx, rax
 291         add         rdx, rbx    ;XSum
 292         movsxd      rax, DWORD PTR [rsp]
 293         movsxd      rcx, DWORD PTR [rsp+4]
 294         add         rax, rcx    ;XXSum
 295         mov         rsi, arg(4) ;SSE
 296         mov         rdi, arg(5) ;Sum
 297         mov         dword ptr [rsi], eax
 298         mov         dword ptr [rdi], edx
 299         xor         rax, rax    ; return 0
 300
 301
 302     ; begin epilog
 303     add rsp, 16
 304     pop rbx
 305     pop rdi
 306     pop rsi
 307     UNSHADOW_ARGS
 308     pop         rbp
 309     ret
 310
 311
 312
 313 ;unsigned int
 314 ;vp8_get4x4var_mmx
 315 ;(
 316 ;    unsigned char *src_ptr,
 317 ;    int  source_stride,
 318 ;    unsigned char *ref_ptr,
 319 ;    int  recon_stride,
 320 ;    unsigned int *SSE,
 321 ;    int *Sum
 322 ;)
 323 global sym(vp8_get4x4var_mmx)
 324 sym(vp8_get4x4var_mmx):
 325     push        rbp
 326     mov         rbp, rsp
 327     SHADOW_ARGS_TO_STACK 6
 328     push rsi
 329     push rdi
 330     push rbx
 331     sub         rsp, 16
 332     ; end prolog
 333
 334
 335         pxor        mm5, mm5                    ; Blank mmx6
 336         pxor        mm6, mm6                    ; Blank mmx7
 337         pxor        mm7, mm7                    ; Blank mmx7
 338
 339         mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
 340         mov         rbx, arg(2) ;[ref_ptr]
 341         movsxd      rcx, dword ptr arg(1) ;[source_stride]
 342         movsxd      rdx, dword ptr arg(3) ;[recon_stride]
 343
 344         ; Row 1
 345         movq        mm0, [rax]                  ; Copy eight bytes to mm0
 346         movq        mm1, [rbx]                  ; Copy eight bytes to mm1
 347         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
 348         punpcklbw   mm1, mm6
 349         psubsw      mm0, mm1                    ; A-B (low order) to MM0
 350         paddw       mm5, mm0                    ; accumulate differences in mm5
 351         pmaddwd     mm0, mm0                    ; square and accumulate
 352         add         rbx,rdx                     ; Inc pointer into ref data
 353         add         rax,rcx                     ; Inc pointer into the new data
 354         movq        mm1, [rbx]                  ; Copy eight bytes to mm1
 355         paddd       mm7, mm0                    ; accumulate in mm7
 356
 357
 358         ; Row 2
 359         movq        mm0, [rax]                  ; Copy eight bytes to mm0
 360         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
 361         punpcklbw   mm1, mm6
 362         psubsw      mm0, mm1                    ; A-B (low order) to MM0
 363         paddw       mm5, mm0                    ; accumulate differences in mm5
 364
 365         pmaddwd     mm0, mm0                    ; square and accumulate
 366         add         rbx,rdx                     ; Inc pointer into ref data
 367         add         rax,rcx                     ; Inc pointer into the new data
 368         movq        mm1, [rbx]                  ; Copy eight bytes to mm1
 369         paddd       mm7, mm0                    ; accumulate in mm7
 370
 371         ; Row 3
 372         movq        mm0, [rax]                  ; Copy eight bytes to mm0
 373         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
 374         punpcklbw   mm1, mm6
 375         psubsw      mm0, mm1                    ; A-B (low order) to MM0
 376         paddw       mm5, mm0                    ; accumulate differences in mm5
 377
 378         pmaddwd     mm0, mm0                    ; square and accumulate
 379         add         rbx,rdx                     ; Inc pointer into ref data
 380         add         rax,rcx                     ; Inc pointer into the new data
 381         movq        mm1, [rbx]                  ; Copy eight bytes to mm1
 382         paddd       mm7, mm0                    ; accumulate in mm7
 383
 384         ; Row 4
 385         movq        mm0, [rax]                  ; Copy eight bytes to mm0
 386
 387         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
 388         punpcklbw   mm1, mm6
 389         psubsw      mm0, mm1                    ; A-B (low order) to MM0
 390
 391         paddw       mm5, mm0                    ; accumulate differences in mm5
 392
 393         pmaddwd     mm0, mm0                    ; square and accumulate
 394         paddd       mm7, mm0                    ; accumulate in mm7
 395
 396
 397         ; Now accumulate the final results.
 398         movq        QWORD PTR [rsp+8], mm5      ; copy back accumulated results into normal memory
 399         movq        QWORD PTR [rsp], mm7        ; copy back accumulated results into normal memory
 400         movsx       rdx, WORD PTR [rsp+8]
 401         movsx       rcx, WORD PTR [rsp+10]
 402         movsx       rbx, WORD PTR [rsp+12]
 403         movsx       rax, WORD PTR [rsp+14]
 404         add         rdx, rcx
 405         add         rbx, rax
 406         add         rdx, rbx    ;XSum
 407         movsxd      rax, DWORD PTR [rsp]
 408         movsxd      rcx, DWORD PTR [rsp+4]
 409         add         rax, rcx    ;XXSum
 410         mov         rsi, arg(4) ;SSE
 411         mov         rdi, arg(5) ;Sum
 412         mov         dword ptr [rsi], eax
 413         mov         dword ptr [rdi], edx
 414         xor         rax, rax    ; return 0
 415
 416
 417     ; begin epilog
 418     add rsp, 16
 419     pop rbx
 420     pop rdi
 421     pop rsi
 422     UNSHADOW_ARGS
 423     pop         rbp
 424     ret
 425
 426
 427
 428 ;unsigned int
 429 ;vp8_get4x4sse_cs_mmx
 430 ;(
 431 ;    unsigned char *src_ptr,
 432 ;    int  source_stride,
 433 ;    unsigned char *ref_ptr,
 434 ;    int  recon_stride
 435 ;)
 436 global sym(vp8_get4x4sse_cs_mmx)
 437 sym(vp8_get4x4sse_cs_mmx):
 438     push        rbp
 439     mov         rbp, rsp
 440     SHADOW_ARGS_TO_STACK 4
 441     push rsi
 442     push rdi
 443     push rbx
 444     ; end prolog
 445
 446
 447         pxor        mm6, mm6                    ; Blank mmx7
 448         pxor        mm7, mm7                    ; Blank mmx7
 449
 450         mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
 451         mov         rbx, arg(2) ;[ref_ptr]
 452         movsxd      rcx, dword ptr arg(1) ;[source_stride]
 453         movsxd      rdx, dword ptr arg(3) ;[recon_stride]
 454         ; Row 1
 455         movd        mm0, [rax]                  ; Copy eight bytes to mm0
 456         movd        mm1, [rbx]                  ; Copy eight bytes to mm1
 457         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
 458         punpcklbw   mm1, mm6
 459         psubsw      mm0, mm1                    ; A-B (low order) to MM0
 460         pmaddwd     mm0, mm0                    ; square and accumulate
 461         add         rbx,rdx                     ; Inc pointer into ref data
 462         add         rax,rcx                     ; Inc pointer into the new data
 463         movd        mm1, [rbx]                  ; Copy eight bytes to mm1
 464         paddd       mm7, mm0                    ; accumulate in mm7
 465
 466         ; Row 2
 467         movd        mm0, [rax]                  ; Copy eight bytes to mm0
 468         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
 469         punpcklbw   mm1, mm6
 470         psubsw      mm0, mm1                    ; A-B (low order) to MM0
 471         pmaddwd     mm0, mm0                    ; square and accumulate
 472         add         rbx,rdx                     ; Inc pointer into ref data
 473         add         rax,rcx                     ; Inc pointer into the new data
 474         movd        mm1, [rbx]                  ; Copy eight bytes to mm1
 475         paddd       mm7, mm0                    ; accumulate in mm7
 476
 477         ; Row 3
 478         movd        mm0, [rax]                  ; Copy eight bytes to mm0
 479         punpcklbw   mm1, mm6
 480         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
 481         psubsw      mm0, mm1                    ; A-B (low order) to MM0
 482
 483         pmaddwd     mm0, mm0                    ; square and accumulate
 484         add         rbx,rdx                     ; Inc pointer into ref data
 485         add         rax,rcx                     ; Inc pointer into the new data
 486         movd        mm1, [rbx]                  ; Copy eight bytes to mm1
 487         paddd       mm7, mm0                    ; accumulate in mm7
 488
 489         ; Row 4
 490         movd        mm0, [rax]                  ; Copy eight bytes to mm0
 491         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
 492         punpcklbw   mm1, mm6
 493         psubsw      mm0, mm1                    ; A-B (low order) to MM0
 494         pmaddwd     mm0, mm0                    ; square and accumulate
 495         paddd       mm7, mm0                    ; accumulate in mm7
 496
 497         movq        mm0,    mm7                 ;
 498         psrlq       mm7,    32
 499
 500         paddd       mm0,    mm7
 501         movq        rax,    mm0
 502
 503
 504     ; begin epilog
 505     pop rbx
 506     pop rdi
 507     pop rsi
 508     UNSHADOW_ARGS
 509     pop         rbp
 510     ret
 511
 512 %define mmx_filter_shift            7
 513
 514 ;void vp8_filter_block2d_bil4x4_var_mmx
 515 ;(
 516 ;    unsigned char *ref_ptr,
 517 ;    int ref_pixels_per_line,
 518 ;    unsigned char *src_ptr,
 519 ;    int src_pixels_per_line,
 520 ;    unsigned short *HFilter,
 521 ;    unsigned short *VFilter,
 522 ;    int *sum,
 523 ;    unsigned int *sumsquared
 524 ;)
 525 global sym(vp8_filter_block2d_bil4x4_var_mmx)
 526 sym(vp8_filter_block2d_bil4x4_var_mmx):
 527     push        rbp
 528     mov         rbp, rsp
 529     SHADOW_ARGS_TO_STACK 8
 530     GET_GOT     rbx
 531     push rsi
 532     push rdi
 533     sub         rsp, 16
 534     ; end prolog
 535
 536
 537         pxor            mm6,            mm6                 ;
 538         pxor            mm7,            mm7                 ;
 539
 540         mov             rax,            arg(4) ;HFilter             ;
 541         mov             rdx,            arg(5) ;VFilter             ;
 542
 543         mov             rsi,            arg(0) ;ref_ptr              ;
 544         mov             rdi,            arg(2) ;src_ptr              ;
 545
 546         mov             rcx,            4                   ;
 547         pxor            mm0,            mm0                 ;
 548
 549         movd            mm1,            [rsi]               ;
 550         movd            mm3,            [rsi+1]             ;
 551
 552         punpcklbw       mm1,            mm0                 ;
 553         pmullw          mm1,            [rax]               ;
 554
 555         punpcklbw       mm3,            mm0                 ;
 556         pmullw          mm3,            [rax+8]             ;
 557
 558         paddw           mm1,            mm3                 ;
 559         paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
 560
 561         psraw           mm1,            mmx_filter_shift    ;
 562         movq            mm5,            mm1
 563
 564 %if ABI_IS_32BIT
 565         add             rsi, dword ptr  arg(1) ;ref_pixels_per_line    ;
 566 %else
 567         movsxd          r8, dword ptr  arg(1) ;ref_pixels_per_line    ;
 568         add             rsi, r8
 569 %endif
 570
 571 filter_block2d_bil4x4_var_mmx_loop:
 572
 573         movd            mm1,            [rsi]               ;
 574         movd            mm3,            [rsi+1]             ;
 575
 576         punpcklbw       mm1,            mm0                 ;
 577         pmullw          mm1,            [rax]               ;
 578
 579         punpcklbw       mm3,            mm0                 ;
 580         pmullw          mm3,            [rax+8]             ;
 581
 582         paddw           mm1,            mm3                 ;
 583         paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
 584
 585         psraw           mm1,            mmx_filter_shift    ;
 586         movq            mm3,            mm5                 ;
 587
 588         movq            mm5,            mm1                 ;
 589         pmullw          mm3,            [rdx]               ;
 590
 591         pmullw          mm1,            [rdx+8]             ;
 592         paddw           mm1,            mm3                 ;
 593
 594
 595         paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
 596         psraw           mm1,            mmx_filter_shift    ;
 597
 598         movd            mm3,            [rdi]               ;
 599         punpcklbw       mm3,            mm0                 ;
 600
 601         psubw           mm1,            mm3                 ;
 602         paddw           mm6,            mm1                 ;
 603
 604         pmaddwd         mm1,            mm1                 ;
 605         paddd           mm7,            mm1                 ;
 606
 607 %if ABI_IS_32BIT
 608         add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;
 609         add             rdi,            dword ptr arg(3) ;src_pixels_per_line    ;
 610 %else
 611         movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
 612         movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
 613         add             rsi,            r8
 614         add             rdi,            r9
 615 %endif
 616         sub             rcx,            1                   ;
 617         jnz             filter_block2d_bil4x4_var_mmx_loop       ;
 618
 619
 620         pxor            mm3,            mm3                 ;
 621         pxor            mm2,            mm2                 ;
 622
 623         punpcklwd       mm2,            mm6                 ;
 624         punpckhwd       mm3,            mm6                 ;
 625
 626         paddd           mm2,            mm3                 ;
 627         movq            mm6,            mm2                 ;
 628
 629         psrlq           mm6,            32                  ;
 630         paddd           mm2,            mm6                 ;
 631
 632         psrad           mm2,            16                  ;
 633         movq            mm4,            mm7                 ;
 634
 635         psrlq           mm4,            32                  ;
 636         paddd           mm4,            mm7                 ;
 637
 638         mov             rdi,            arg(6) ;sum
 639         mov             rsi,            arg(7) ;sumsquared
 640
 641         movd            dword ptr [rdi],          mm2                 ;
 642         movd            dword ptr [rsi],          mm4                 ;
 643
 644
 645
 646     ; begin epilog
 647     add rsp, 16
 648     pop rdi
 649     pop rsi
 650     RESTORE_GOT
 651     UNSHADOW_ARGS
 652     pop         rbp
 653     ret
 654
 655
 656
 657
 658 ;void vp8_filter_block2d_bil_var_mmx
 659 ;(
 660 ;    unsigned char *ref_ptr,
 661 ;    int ref_pixels_per_line,
 662 ;    unsigned char *src_ptr,
 663 ;    int src_pixels_per_line,
 664 ;    unsigned int Height,
 665 ;    unsigned short *HFilter,
 666 ;    unsigned short *VFilter,
 667 ;    int *sum,
 668 ;    unsigned int *sumsquared
 669 ;)
 670 global sym(vp8_filter_block2d_bil_var_mmx)
 671 sym(vp8_filter_block2d_bil_var_mmx):
 672     push        rbp
 673     mov         rbp, rsp
 674     SHADOW_ARGS_TO_STACK 9
 675     GET_GOT     rbx
 676     push rsi
 677     push rdi
 678     sub         rsp, 16
 679     ; end prolog
 680
 681         pxor            mm6,            mm6                 ;
 682         pxor            mm7,            mm7                 ;
 683         mov             rax,            arg(5) ;HFilter             ;
 684
 685         mov             rdx,            arg(6) ;VFilter             ;
 686         mov             rsi,            arg(0) ;ref_ptr              ;
 687
 688         mov             rdi,            arg(2) ;src_ptr              ;
 689         movsxd          rcx,            dword ptr arg(4) ;Height              ;
 690
 691         pxor            mm0,            mm0                 ;
 692         movq            mm1,            [rsi]               ;
 693
 694         movq            mm3,            [rsi+1]             ;
 695         movq            mm2,            mm1                 ;
 696
 697         movq            mm4,            mm3                 ;
 698         punpcklbw       mm1,            mm0                 ;
 699
 700         punpckhbw       mm2,            mm0                 ;
 701         pmullw          mm1,            [rax]               ;
 702
 703         pmullw          mm2,            [rax]               ;
 704         punpcklbw       mm3,            mm0                 ;
 705
 706         punpckhbw       mm4,            mm0                 ;
 707         pmullw          mm3,            [rax+8]             ;
 708
 709         pmullw          mm4,            [rax+8]             ;
 710         paddw           mm1,            mm3                 ;
 711
 712         paddw           mm2,            mm4                 ;
 713         paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
 714
 715         psraw           mm1,            mmx_filter_shift    ;
 716         paddw           mm2,            [GLOBAL(mmx_bi_rd)] ;
 717
 718         psraw           mm2,            mmx_filter_shift    ;
 719         movq            mm5,            mm1
 720
 721         packuswb        mm5,            mm2                 ;
 722 %if ABI_IS_32BIT
 723         add             rsi,            dword ptr arg(1) ;ref_pixels_per_line
 724 %else
 725         movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
 726         add             rsi,            r8
 727 %endif
 728
 729 filter_block2d_bil_var_mmx_loop:
 730
 731         movq            mm1,            [rsi]               ;
 732         movq            mm3,            [rsi+1]             ;
 733
 734         movq            mm2,            mm1                 ;
 735         movq            mm4,            mm3                 ;
 736
 737         punpcklbw       mm1,            mm0                 ;
 738         punpckhbw       mm2,            mm0                 ;
 739
 740         pmullw          mm1,            [rax]               ;
 741         pmullw          mm2,            [rax]               ;
 742
 743         punpcklbw       mm3,            mm0                 ;
 744         punpckhbw       mm4,            mm0                 ;
 745
 746         pmullw          mm3,            [rax+8]             ;
 747         pmullw          mm4,            [rax+8]             ;
 748
 749         paddw           mm1,            mm3                 ;
 750         paddw           mm2,            mm4                 ;
 751
 752         paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
 753         psraw           mm1,            mmx_filter_shift    ;
 754
 755         paddw           mm2,            [GLOBAL(mmx_bi_rd)] ;
 756         psraw           mm2,            mmx_filter_shift    ;
 757
 758         movq            mm3,            mm5                 ;
 759         movq            mm4,            mm5                 ;
 760
 761         punpcklbw       mm3,            mm0                 ;
 762         punpckhbw       mm4,            mm0                 ;
 763
 764         movq            mm5,            mm1                 ;
 765         packuswb        mm5,            mm2                 ;
 766
 767         pmullw          mm3,            [rdx]               ;
 768         pmullw          mm4,            [rdx]               ;
 769
 770         pmullw          mm1,            [rdx+8]             ;
 771         pmullw          mm2,            [rdx+8]             ;
 772
 773         paddw           mm1,            mm3                 ;
 774         paddw           mm2,            mm4                 ;
 775
 776         paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
 777         paddw           mm2,            [GLOBAL(mmx_bi_rd)] ;
 778
 779         psraw           mm1,            mmx_filter_shift    ;
 780         psraw           mm2,            mmx_filter_shift    ;
 781
 782         movq            mm3,            [rdi]               ;
 783         movq            mm4,            mm3                 ;
 784
 785         punpcklbw       mm3,            mm0                 ;
 786         punpckhbw       mm4,            mm0                 ;
 787
 788         psubw           mm1,            mm3                 ;
 789         psubw           mm2,            mm4                 ;
 790
 791         paddw           mm6,            mm1                 ;
 792         pmaddwd         mm1,            mm1                 ;
 793
 794         paddw           mm6,            mm2                 ;
 795         pmaddwd         mm2,            mm2                 ;
 796
 797         paddd           mm7,            mm1                 ;
 798         paddd           mm7,            mm2                 ;
 799
 800 %if ABI_IS_32BIT
 801         add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;
 802         add             rdi,            dword ptr arg(3) ;src_pixels_per_line    ;
 803 %else
 804         movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line    ;
 805         movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line    ;
 806         add             rsi,            r8
 807         add             rdi,            r9
 808 %endif
 809         sub             rcx,            1                   ;
 810         jnz             filter_block2d_bil_var_mmx_loop       ;
 811
 812
 813         pxor            mm3,            mm3                 ;
 814         pxor            mm2,            mm2                 ;
 815
 816         punpcklwd       mm2,            mm6                 ;
 817         punpckhwd       mm3,            mm6                 ;
 818
 819         paddd           mm2,            mm3                 ;
 820         movq            mm6,            mm2                 ;
 821
 822         psrlq           mm6,            32                  ;
 823         paddd           mm2,            mm6                 ;
 824
 825         psrad           mm2,            16                  ;
 826         movq            mm4,            mm7                 ;
 827
 828         psrlq           mm4,            32                  ;
 829         paddd           mm4,            mm7                 ;
 830
 831         mov             rdi,            arg(7) ;sum
 832         mov             rsi,            arg(8) ;sumsquared
 833
 834         movd            dword ptr [rdi],          mm2                 ;
 835         movd            dword ptr [rsi],          mm4                 ;
 836
 837     ; begin epilog
 838     add rsp, 16
 839     pop rdi
 840     pop rsi
 841     RESTORE_GOT
 842     UNSHADOW_ARGS
 843     pop         rbp
 844     ret
 845
 846 ;unsigned int vp8_get16x16pred_error_mmx -- returns SSE - ((Sum*Sum)>>8) for a 16x16 byte block pair
 847 ;(
 848 ;    unsigned char *src_ptr,   ; arg(0): first block; 16 bytes are read per row, for 16 rows
 849 ;    int src_stride,           ; arg(1): byte step between src rows (sign-extended before use)
 850 ;    unsigned char *ref_ptr,   ; arg(2): second block, read the same way
 851 ;    int ref_stride            ; arg(3): byte step between ref rows (sign-extended before use)
 852 ;) Sum = total of (src - ref), SSE = total of (src - ref)^2 over the 256 bytes; result left in rax.
 853 global sym(vp8_get16x16pred_error_mmx)
 854 sym(vp8_get16x16pred_error_mmx):
 855     push        rbp
 856     mov         rbp, rsp
 857     SHADOW_ARGS_TO_STACK 4
 858     GET_GOT     rbx
 859     push rsi
 860     push rdi
 861     sub         rsp, 16                 ; scratch: [rsp] receives Sum, [rsp+4] receives SSE
 862     ; end prolog
 863     ; Register roles: rsi/rdi = current src/ref row, rax/rdx = row strides, rcx = rows left.
 864         mov         rsi,            arg(0) ;src_ptr
 865         mov         rdi,            arg(2) ;ref_ptr
 866
 867         movsxd      rax,            DWORD PTR arg(1) ;src_stride
 868         movsxd      rdx,            DWORD PTR arg(3) ;ref_stride
 869
 870         pxor        mm0,            mm0                     ; mm0 = 0, zero source for byte->word unpack
 871         pxor        mm7,            mm7                     ; mm7 = 0, four 16-bit lanes accumulating diffs
 872
 873         pxor        mm6,            mm6                     ; mm6 = 0, two 32-bit lanes accumulating sse
 874         mov         rcx,            16                      ; 16 rows to process
 875
 876 var16loop:
 877         ; --- bytes 0..7 of the current row ---
 878         movq        mm1,            [rsi]
 879         movq        mm2,            [rdi]
 880
 881         movq        mm3,            mm1
 882         movq        mm4,            mm2
 883
 884         punpcklbw   mm1,            mm0                     ; src bytes 0..3 -> words
 885         punpckhbw   mm3,            mm0                     ; src bytes 4..7 -> words
 886
 887         punpcklbw   mm2,            mm0                     ; ref bytes 0..3 -> words
 888         punpckhbw   mm4,            mm0                     ; ref bytes 4..7 -> words
 889
 890         psubw       mm1,            mm2                     ; signed word differences src - ref
 891         psubw       mm3,            mm4
 892
 893         paddw       mm7,            mm1                     ; add diffs to Sum before pmaddwd overwrites mm1
 894         pmaddwd     mm1,            mm1                     ; square each diff, add adjacent pairs -> 2 dwords
 895
 896         paddw       mm7,            mm3
 897         pmaddwd     mm3,            mm3
 898
 899         paddd       mm6,            mm1
 900         paddd       mm6,            mm3
 901
 902         ; --- bytes 8..15 of the same row, identical sequence ---
 903         movq        mm1,            [rsi+8]
 904         movq        mm2,            [rdi+8]
 905
 906         movq        mm3,            mm1
 907         movq        mm4,            mm2
 908
 909         punpcklbw   mm1,            mm0
 910         punpckhbw   mm3,            mm0
 911
 912         punpcklbw   mm2,            mm0
 913         punpckhbw   mm4,            mm0
 914
 915         psubw       mm1,            mm2
 916         psubw       mm3,            mm4
 917
 918         paddw       mm7,            mm1
 919         pmaddwd     mm1,            mm1
 920
 921         paddw       mm7,            mm3
 922         pmaddwd     mm3,            mm3
 923
 924         paddd       mm6,            mm1
 925         paddd       mm6,            mm3
 926
 927         add         rsi,            rax                     ; advance both pointers by their own stride
 928         add         rdi,            rdx
 929
 930         sub         rcx,            1
 931         jnz         var16loop
 932
 933         ; Horizontal reduction. Each mm7 word lane holds 64 diffs (|lane| <= 64*255), so no 16-bit overflow.
 934         movq        mm1,            mm6                     ; mm1 = the two SSE dwords
 935         pxor        mm6,            mm6
 936
 937         pxor        mm5,            mm5
 938         punpcklwd   mm6,            mm7                     ; Sum words 0,1 placed in the high halves of two dwords
 939
 940         punpckhwd   mm5,            mm7                     ; Sum words 2,3 placed likewise
 941         psrad       mm5,            16                      ; arithmetic shift = sign-extend word to dword
 942
 943         psrad       mm6,            16
 944         paddd       mm6,            mm5
 945
 946         movq        mm2,            mm1
 947         psrlq       mm1,            32
 948
 949         paddd       mm2,            mm1                     ; low dword of mm2 = total SSE
 950         movq        mm7,            mm6
 951
 952         psrlq       mm6,            32
 953         paddd       mm6,            mm7                     ; low dword of mm6 = total Sum
 954
 955         movd DWORD PTR [rsp],       mm6  ;Sum
 956         movd DWORD PTR [rsp+4],     mm2  ;SSE
 957
 958         ; return (SSE-((Sum*Sum)>>8));
 959         movsxd      rdx, dword ptr [rsp]
 960         imul        rdx, rdx                ; Sum*Sum >= 0; |Sum| <= 65280 so the product can reach 4261478400 (> 2^31)
 961         shr         rdx, 8                  ; logical shift: with a 32-bit rdx, sar would smear bit 31 of a large product. NOTE(review): assumes rdx aliases edx under ABI_IS_32BIT -- confirm
 962         movsxd      rax, dword ptr [rsp + 4]
 963         sub         rax, rdx
 964
 965
 966     ; begin epilog
 967     add rsp, 16
 968     pop rdi
 969     pop rsi
 970     RESTORE_GOT
 971     UNSHADOW_ARGS
 972     pop         rbp
 973     ret
 974
 975
 976
 977 SECTION_RODATA
 978 ; mmx_bi_rd: four 16-bit words of 64 (C view: short mmx_bi_rd[4]); paddw'ed onto filter sums before the psraw by mmx_filter_shift.
 979 align 16
 980 mmx_bi_rd:
 981     dw 64, 64, 64, 64