2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
12 %include "vpx_ports/x86_abi_support.asm"
; PROCESS_16X2X3 <1 parameter>
; Visible portion: for each of two rows, loads 16 source bytes from rsi with
; movdqa (an aligned load, so rsi / rsi+rax must be 16-byte aligned) and three
; unaligned 16-byte reference vectors with lddqu from rdi at byte offsets
; +0, +1 and +2. The second row is addressed as [rsi+rax] and [rdi+rdx].
; NOTE(review): the instructions that consume these loads, any use of the
; macro parameter, and the closing %endmacro are not visible in this excerpt;
; confirm against the complete file before relying on this description.
14 %macro PROCESS_16X2X3
1
; First variant of the row-0 loads: reference vectors go to xmm5/xmm6/xmm7.
16 movdqa xmm0
, XMMWORD
PTR [rsi
]
17 lddqu xmm5
, XMMWORD
PTR [rdi
]
18 lddqu xmm6
, XMMWORD
PTR [rdi
+1]
19 lddqu xmm7
, XMMWORD
PTR [rdi
+2]
; Second variant of the row-0 loads: reference vectors go to xmm1/xmm2/xmm3.
25 movdqa xmm0
, XMMWORD
PTR [rsi
]
26 lddqu xmm1
, XMMWORD
PTR [rdi
]
27 lddqu xmm2
, XMMWORD
PTR [rdi
+1]
28 lddqu xmm3
, XMMWORD
PTR [rdi
+2]
; Row-1 loads: source at rsi+rax, reference at rdi+rdx (+0, +1, +2).
38 movdqa xmm0
, XMMWORD
PTR [rsi
+rax
]
39 lddqu xmm1
, XMMWORD
PTR [rdi
+rdx
]
40 lddqu xmm2
, XMMWORD
PTR [rdi
+rdx
+1]
41 lddqu xmm3
, XMMWORD
PTR [rdi
+rdx
+2]
; PROCESS_8X2X3 <1 parameter>
; Visible portion: for each of two rows, loads 8 source bytes from rsi into
; mm0 and three 8-byte reference vectors from rdi at byte offsets +0, +1, +2
; using MMX movq. The second row is addressed as [rsi+rax] and [rdi+rdx].
; NOTE(review): the instructions that consume these loads, any use of the
; macro parameter, and the closing %endmacro are not visible in this excerpt.
55 %macro PROCESS_8X2X3
1
; First variant of the row-0 loads: reference vectors go to mm5/mm6/mm7.
57 movq mm0
, QWORD PTR [rsi
]
58 movq mm5
, QWORD PTR [rdi
]
59 movq mm6
, QWORD PTR [rdi
+1]
60 movq mm7
, QWORD PTR [rdi
+2]
; Second variant of the row-0 loads: reference vectors go to mm1/mm2/mm3.
66 movq mm0
, QWORD PTR [rsi
]
67 movq mm1
, QWORD PTR [rdi
]
68 movq mm2
, QWORD PTR [rdi
+1]
69 movq mm3
, QWORD PTR [rdi
+2]
; Row-1 loads: source at rsi+rax, reference at rdi+rdx (+0, +1, +2).
79 movq mm0
, QWORD PTR [rsi
+rax
]
80 movq mm1
, QWORD PTR [rdi
+rdx
]
81 movq mm2
, QWORD PTR [rdi
+rdx
+1]
82 movq mm3
, QWORD PTR [rdi
+rdx
+2]
; LOAD_X4_ADDRESSES base, dst0, dst1, dst2, dst3
; Loads four consecutive REG_SZ_BYTES-sized entries from the table at [%1]
; into %2, %3, %4 and %5 (entries 0..3, in that order).
; Callers in this file pass the same register (rdi) as both %1 and %5; that
; works only because %5 is written by the last load, after every read of %1.
; NOTE(review): the closing %endmacro is not visible in this excerpt.
96 %macro LOAD_X4_ADDRESSES
5
97 mov %2, [%1+REG_SZ_BYTES
*0]
98 mov %3, [%1+REG_SZ_BYTES
*1]
100 mov %4, [%1+REG_SZ_BYTES
*2]
101 mov %5, [%1+REG_SZ_BYTES
*3]
; PROCESS_16X2X4 <1 parameter>
; Visible portion: loads 16 source bytes from rsi with movdqa (aligned load)
; and one unaligned 16-byte vector (lddqu) from each of four reference
; pointers held in rcx, rdx, rbx and rdi. The second row uses rax as the
; source row offset ([rsi+rax]) and rbp as the reference row offset
; ([rcx+rbp], [rdx+rbp], [rbx+rbp], [rdi+rbp]).
; NOTE(review): the instructions that consume these loads, any use of the
; macro parameter, and the closing %endmacro are not visible in this excerpt.
104 %macro PROCESS_16X2X4
1
; First variant of the row-0 loads: the four references go to xmm4..xmm7.
106 movdqa xmm0
, XMMWORD
PTR [rsi
]
107 lddqu xmm4
, XMMWORD
PTR [rcx
]
108 lddqu xmm5
, XMMWORD
PTR [rdx
]
109 lddqu xmm6
, XMMWORD
PTR [rbx
]
110 lddqu xmm7
, XMMWORD
PTR [rdi
]
; Second variant of the row-0 loads: only xmm1..xmm3 are used for references,
; so the fourth reference (rdi) is loaded into xmm1 in a separate step.
117 movdqa xmm0
, XMMWORD
PTR [rsi
]
118 lddqu xmm1
, XMMWORD
PTR [rcx
]
119 lddqu xmm2
, XMMWORD
PTR [rdx
]
120 lddqu xmm3
, XMMWORD
PTR [rbx
]
127 lddqu xmm1
, XMMWORD
PTR [rdi
]
; Row-1 loads: source at rsi+rax, each reference at its pointer + rbp.
134 movdqa xmm0
, XMMWORD
PTR [rsi
+rax
]
135 lddqu xmm1
, XMMWORD
PTR [rcx
+rbp
]
136 lddqu xmm2
, XMMWORD
PTR [rdx
+rbp
]
137 lddqu xmm3
, XMMWORD
PTR [rbx
+rbp
]
144 lddqu xmm1
, XMMWORD
PTR [rdi
+rbp
]
; PROCESS_8X2X4 <1 parameter>
; Visible portion: loads 8 source bytes from rsi into mm0 and one 8-byte
; vector (movq) from each of four reference pointers held in rcx, rdx, rbx
; and rdi. The second row uses rax as the source row offset ([rsi+rax]) and
; rbp as the reference row offset.
; NOTE(review): the instructions that consume these loads, any use of the
; macro parameter, and the closing %endmacro are not visible in this excerpt.
161 %macro PROCESS_8X2X4
1
; First variant of the row-0 loads: the four references go to mm4..mm7.
163 movq mm0
, QWORD PTR [rsi
]
164 movq mm4
, QWORD PTR [rcx
]
165 movq mm5
, QWORD PTR [rdx
]
166 movq mm6
, QWORD PTR [rbx
]
167 movq mm7
, QWORD PTR [rdi
]
; Second variant of the row-0 loads: only mm1..mm3 are used for references,
; so the fourth reference (rdi) is loaded into mm1 in a separate step.
174 movq mm0
, QWORD PTR [rsi
]
175 movq mm1
, QWORD PTR [rcx
]
176 movq mm2
, QWORD PTR [rdx
]
177 movq mm3
, QWORD PTR [rbx
]
184 movq mm1
, QWORD PTR [rdi
]
; Row-1 loads: source at rsi+rax, each reference at its pointer + rbp.
191 movq mm0
, QWORD PTR [rsi
+rax
]
192 movq mm1
, QWORD PTR [rcx
+rbp
]
193 movq mm2
, QWORD PTR [rdx
+rbp
]
194 movq mm3
, QWORD PTR [rbx
+rbp
]
201 movq mm1
, QWORD PTR [rdi
+rbp
]
; Exported entry point vp8_sad16x16x3_sse3.
; Visible register setup: rsi = src_ptr (arg 0), rdi = ref_ptr (arg 2),
; rax = sign-extended src_stride (arg 1), rdx = sign-extended ref_stride
; (arg 3). rdi is later reloaded with the Results pointer (arg 4).
; NOTE(review): the remaining prototype lines, the prologue/epilogue, the
; row processing (presumably PROCESS_16X2X3, which uses this register
; layout) and the result stores are not visible in this excerpt.
218 ;void vp8_sad16x16x3_sse3(
219 ; unsigned char *src_ptr,
221 ; unsigned char *ref_ptr,
224 global sym
(vp8_sad16x16x3_sse3
)
225 sym
(vp8_sad16x16x3_sse3
):
228 SHADOW_ARGS_TO_STACK
5
233 mov rsi
, arg
(0) ;src_ptr
234 mov rdi
, arg
(2) ;ref_ptr
236 movsxd rax
, dword ptr arg
(1) ;src_stride
237 movsxd rdx
, dword ptr arg
(3) ;ref_stride
; rdi is reused here: from this point it addresses the Results array.
248 mov rdi
, arg
(4) ;Results
; Exported entry point vp8_sad16x8x3_sse3.
; Visible register setup: rsi = src_ptr (arg 0), rdi = ref_ptr (arg 2),
; rax = sign-extended src_stride (arg 1), rdx = sign-extended ref_stride
; (arg 3). rdi is later reloaded with the Results pointer (arg 4).
; NOTE(review): the remaining prototype lines, the prologue/epilogue, the
; row processing (presumably PROCESS_16X2X3, which uses this register
; layout) and the result stores are not visible in this excerpt.
275 ;void vp8_sad16x8x3_sse3(
276 ; unsigned char *src_ptr,
278 ; unsigned char *ref_ptr,
281 global sym
(vp8_sad16x8x3_sse3
)
282 sym
(vp8_sad16x8x3_sse3
):
285 SHADOW_ARGS_TO_STACK
5
290 mov rsi
, arg
(0) ;src_ptr
291 mov rdi
, arg
(2) ;ref_ptr
293 movsxd rax
, dword ptr arg
(1) ;src_stride
294 movsxd rdx
, dword ptr arg
(3) ;ref_stride
; rdi is reused here: from this point it addresses the Results array.
301 mov rdi
, arg
(4) ;Results
; Exported entry point vp8_sad8x16x3_sse3.
; Visible register setup: rsi = src_ptr (arg 0), rdi = ref_ptr (arg 2),
; rax = sign-extended src_stride (arg 1), rdx = sign-extended ref_stride
; (arg 3). rdi is later reloaded with the Results pointer (arg 4).
; NOTE(review): the remaining prototype lines, the prologue/epilogue, the
; row processing (presumably PROCESS_8X2X3, which uses this register
; layout) and the result stores are not visible in this excerpt.
328 ;void vp8_sad8x16x3_sse3(
329 ; unsigned char *src_ptr,
331 ; unsigned char *ref_ptr,
334 global sym
(vp8_sad8x16x3_sse3
)
335 sym
(vp8_sad8x16x3_sse3
):
338 SHADOW_ARGS_TO_STACK
5
343 mov rsi
, arg
(0) ;src_ptr
344 mov rdi
, arg
(2) ;ref_ptr
346 movsxd rax
, dword ptr arg
(1) ;src_stride
347 movsxd rdx
, dword ptr arg
(3) ;ref_stride
; rdi is reused here: from this point it addresses the Results array.
358 mov rdi
, arg
(4) ;Results
; Exported entry point vp8_sad8x8x3_sse3.
; Visible register setup: rsi = src_ptr (arg 0), rdi = ref_ptr (arg 2),
; rax = sign-extended src_stride (arg 1), rdx = sign-extended ref_stride
; (arg 3). rdi is later reloaded with the Results pointer (arg 4).
; NOTE(review): the remaining prototype lines, the prologue/epilogue, the
; row processing (presumably PROCESS_8X2X3, which uses this register
; layout) and the result stores are not visible in this excerpt.
371 ;void vp8_sad8x8x3_sse3(
372 ; unsigned char *src_ptr,
374 ; unsigned char *ref_ptr,
377 global sym
(vp8_sad8x8x3_sse3
)
378 sym
(vp8_sad8x8x3_sse3
):
381 SHADOW_ARGS_TO_STACK
5
386 mov rsi
, arg
(0) ;src_ptr
387 mov rdi
, arg
(2) ;ref_ptr
389 movsxd rax
, dword ptr arg
(1) ;src_stride
390 movsxd rdx
, dword ptr arg
(3) ;ref_stride
; rdi is reused here: from this point it addresses the Results array.
397 mov rdi
, arg
(4) ;Results
; Exported entry point vp8_sad4x4x3_sse3.
; Visible register setup: rsi = src_ptr (arg 0), rdi = ref_ptr (arg 2),
; rax = sign-extended src_stride (arg 1), rdx = sign-extended ref_stride
; (arg 3). Rows are read 4 bytes at a time with movd: source rows from
; [rsi] / [rsi+rax], reference rows from [rdi] / [rdi+rdx] at byte offsets
; +0, +1 and +2. rdi is finally reloaded with the Results pointer (arg 4).
; NOTE(review): the remaining prototype lines, the prologue/epilogue, the
; instructions that combine these loads, the pointer advance between the two
; row pairs and the result stores are not visible in this excerpt.
410 ;void vp8_sad4x4x3_sse3(
411 ; unsigned char *src_ptr,
413 ; unsigned char *ref_ptr,
416 global sym
(vp8_sad4x4x3_sse3
)
417 sym
(vp8_sad4x4x3_sse3
):
420 SHADOW_ARGS_TO_STACK
5
425 mov rsi
, arg
(0) ;src_ptr
426 mov rdi
, arg
(2) ;ref_ptr
428 movsxd rax
, dword ptr arg
(1) ;src_stride
429 movsxd rdx
, dword ptr arg
(3) ;ref_stride
; First group of loads: two source rows and two reference rows at offset +0.
431 movd mm0
, DWORD PTR [rsi
]
432 movd mm1
, DWORD PTR [rdi
]
434 movd mm2
, DWORD PTR [rsi
+rax
]
435 movd mm3
, DWORD PTR [rdi
+rdx
]
; Same two reference rows at byte offsets +1 and +2.
440 movd mm4
, DWORD PTR [rdi
+1]
441 movd mm5
, DWORD PTR [rdi
+2]
443 movd mm2
, DWORD PTR [rdi
+rdx
+1]
444 movd mm3
, DWORD PTR [rdi
+rdx
+2]
; Second group of loads, same addressing pattern as the first group.
; NOTE(review): presumably rsi/rdi were advanced by two rows in lines not
; shown; confirm against the complete file.
459 movd mm0
, DWORD PTR [rsi
]
460 movd mm2
, DWORD PTR [rdi
]
462 movd mm3
, DWORD PTR [rsi
+rax
]
463 movd mm6
, DWORD PTR [rdi
+rdx
]
468 movd mm3
, DWORD PTR [rdi
+1]
469 movd mm7
, DWORD PTR [rdi
+2]
475 movd mm2
, DWORD PTR [rdi
+rdx
+1]
476 movd mm6
, DWORD PTR [rdi
+rdx
+2]
; rdi is reused here: from this point it addresses the Results array.
487 mov rdi
, arg
(4) ;Results
; Exported entry point vp8_sad16x16_sse3.
; Visible register setup: rsi = src_ptr (arg 0), rdi = ref_ptr (arg 2),
; rbx = sign-extended src_stride (arg 1), rdx = sign-extended ref_stride
; (arg 3). Each loop iteration reads two rows, 8 bytes at a time with MMX
; movq (offsets +0 and +8), from [rsi] / [rsi+rbx] and [rdi] / [rdi+rdx].
; The loop can leave early through vp8_sad16x16_early_exit (jg) or ends when
; the jne at the bottom falls through.
; NOTE(review): the comparisons feeding jg/jne, the accumulation, the pointer
; advance, the return value and the prologue/epilogue are not visible in this
; excerpt. rbx is callee-saved in both x86-64 ABIs; its save/restore is not
; visible here -- confirm it exists in the complete file.
501 ;unsigned int vp8_sad16x16_sse3(
502 ; unsigned char *src_ptr,
504 ; unsigned char *ref_ptr,
; The next line is a disabled alias that would map lddqu onto movdqu.
507 ;%define lddqu movdqu
508 global sym
(vp8_sad16x16_sse3
)
509 sym
(vp8_sad16x16_sse3
):
512 SHADOW_ARGS_TO_STACK
5
518 mov rsi
, arg
(0) ;src_ptr
519 mov rdi
, arg
(2) ;ref_ptr
521 movsxd rbx
, dword ptr arg
(1) ;src_stride
522 movsxd rdx
, dword ptr arg
(3) ;ref_stride
529 vp8_sad16x16_sse3_loop:
533 jg vp8_sad16x16_early_exit
; Row 0: both 8-byte halves of the source and reference rows.
535 movq mm0
, QWORD PTR [rsi
]
536 movq mm2
, QWORD PTR [rsi
+8]
538 movq mm1
, QWORD PTR [rdi
]
539 movq mm3
, QWORD PTR [rdi
+8]
; Row 1: low halves first, then the high halves at +8.
541 movq mm4
, QWORD PTR [rsi
+rbx
]
542 movq mm5
, QWORD PTR [rdi
+rdx
]
547 movq mm1
, QWORD PTR [rsi
+rbx
+8]
548 movq mm3
, QWORD PTR [rdi
+rdx
+8]
563 jne vp8_sad16x16_sse3_loop
567 vp8_sad16x16_early_exit:
; Exported entry point vp8_sad16x16x4d_sse3.
; Visible register setup: rdi = ref_ptr_base (arg 2), from which
; LOAD_X4_ADDRESSES fetches four pointers into rcx, rdx, rax and rdi;
; rsi = src_ptr (arg 0); rbx = sign-extended src_stride (arg 1);
; rbp = sign-extended ref_stride (arg 3). rdi is later reloaded with the
; Results pointer (arg 4).
; NOTE(review): PROCESS_16X2X4 addresses the source row as [rsi+rax] and a
; reference through rbx, i.e. the opposite of the rax/rbx assignment visible
; here -- presumably the two registers are exchanged in lines not shown.
; rbx and rbp are callee-saved in both x86-64 ABIs; their save/restore, the
; row processing, the result stores and the epilogue are not visible in this
; excerpt -- confirm against the complete file.
577 ;void vp8_sad16x16x4d_sse3(
578 ; unsigned char *src_ptr,
580 ; unsigned char *ref_ptr_base,
583 global sym
(vp8_sad16x16x4d_sse3
)
584 sym
(vp8_sad16x16x4d_sse3
):
587 SHADOW_ARGS_TO_STACK
5
594 mov rdi
, arg
(2) ; ref_ptr_base
; rdi is both the table base and the last destination; it is overwritten by
; the final load of the macro.
596 LOAD_X4_ADDRESSES rdi
, rcx
, rdx
, rax
, rdi
598 mov rsi
, arg
(0) ;src_ptr
600 movsxd rbx
, dword ptr arg
(1) ;src_stride
601 movsxd rbp
, dword ptr arg
(3) ;ref_stride
; rdi is reused here: from this point it addresses the Results array.
615 mov rdi
, arg
(4) ;Results
; Exported entry point vp8_sad16x8x4d_sse3.
; Visible register setup: rdi = ref_ptr_base (arg 2), from which
; LOAD_X4_ADDRESSES fetches four pointers into rcx, rdx, rax and rdi;
; rsi = src_ptr (arg 0); rbx = sign-extended src_stride (arg 1);
; rbp = sign-extended ref_stride (arg 3). rdi is later reloaded with the
; Results pointer (arg 4).
; NOTE(review): PROCESS_16X2X4 addresses the source row as [rsi+rax] and a
; reference through rbx, i.e. the opposite of the rax/rbx assignment visible
; here -- presumably the two registers are exchanged in lines not shown.
; rbx and rbp are callee-saved in both x86-64 ABIs; their save/restore, the
; row processing, the result stores and the epilogue are not visible in this
; excerpt -- confirm against the complete file.
649 ;void vp8_sad16x8x4d_sse3(
650 ; unsigned char *src_ptr,
652 ; unsigned char *ref_ptr_base,
655 global sym
(vp8_sad16x8x4d_sse3
)
656 sym
(vp8_sad16x8x4d_sse3
):
659 SHADOW_ARGS_TO_STACK
5
666 mov rdi
, arg
(2) ; ref_ptr_base
; rdi is both the table base and the last destination; it is overwritten by
; the final load of the macro.
668 LOAD_X4_ADDRESSES rdi
, rcx
, rdx
, rax
, rdi
670 mov rsi
, arg
(0) ;src_ptr
672 movsxd rbx
, dword ptr arg
(1) ;src_stride
673 movsxd rbp
, dword ptr arg
(3) ;ref_stride
; rdi is reused here: from this point it addresses the Results array.
683 mov rdi
, arg
(4) ;Results
; Exported entry point vp8_sad8x16x4d_sse3.
; Visible register setup: rdi = ref_ptr_base (arg 2), from which
; LOAD_X4_ADDRESSES fetches four pointers into rcx, rdx, rax and rdi;
; rsi = src_ptr (arg 0); rbx = sign-extended src_stride (arg 1);
; rbp = sign-extended ref_stride (arg 3). rdi is later reloaded with the
; Results pointer (arg 4).
; NOTE(review): PROCESS_8X2X4 addresses the source row as [rsi+rax] and a
; reference through rbx, i.e. the opposite of the rax/rbx assignment visible
; here -- presumably the two registers are exchanged in lines not shown.
; rbx and rbp are callee-saved in both x86-64 ABIs; their save/restore, the
; row processing, the result stores and the epilogue are not visible in this
; excerpt -- confirm against the complete file.
717 ;void vp8_sad8x16x4d_sse3(
718 ; unsigned char *src_ptr,
720 ; unsigned char *ref_ptr,
723 global sym
(vp8_sad8x16x4d_sse3
)
724 sym
(vp8_sad8x16x4d_sse3
):
727 SHADOW_ARGS_TO_STACK
5
734 mov rdi
, arg
(2) ; ref_ptr_base
; rdi is both the table base and the last destination; it is overwritten by
; the final load of the macro.
736 LOAD_X4_ADDRESSES rdi
, rcx
, rdx
, rax
, rdi
738 mov rsi
, arg
(0) ;src_ptr
740 movsxd rbx
, dword ptr arg
(1) ;src_stride
741 movsxd rbp
, dword ptr arg
(3) ;ref_stride
; rdi is reused here: from this point it addresses the Results array.
755 mov rdi
, arg
(4) ;Results
; Exported entry point vp8_sad8x8x4d_sse3.
; Visible register setup: rdi = ref_ptr_base (arg 2), from which
; LOAD_X4_ADDRESSES fetches four pointers into rcx, rdx, rax and rdi;
; rsi = src_ptr (arg 0); rbx = sign-extended src_stride (arg 1);
; rbp = sign-extended ref_stride (arg 3). rdi is later reloaded with the
; Results pointer (arg 4).
; NOTE(review): PROCESS_8X2X4 addresses the source row as [rsi+rax] and a
; reference through rbx, i.e. the opposite of the rax/rbx assignment visible
; here -- presumably the two registers are exchanged in lines not shown.
; rbx and rbp are callee-saved in both x86-64 ABIs; their save/restore, the
; row processing, the result stores and the epilogue are not visible in this
; excerpt -- confirm against the complete file.
770 ;void vp8_sad8x8x4d_sse3(
771 ; unsigned char *src_ptr,
773 ; unsigned char *ref_ptr,
776 global sym
(vp8_sad8x8x4d_sse3
)
777 sym
(vp8_sad8x8x4d_sse3
):
780 SHADOW_ARGS_TO_STACK
5
787 mov rdi
, arg
(2) ; ref_ptr_base
; rdi is both the table base and the last destination; it is overwritten by
; the final load of the macro.
789 LOAD_X4_ADDRESSES rdi
, rcx
, rdx
, rax
, rdi
791 mov rsi
, arg
(0) ;src_ptr
793 movsxd rbx
, dword ptr arg
(1) ;src_stride
794 movsxd rbp
, dword ptr arg
(3) ;ref_stride
; rdi is reused here: from this point it addresses the Results array.
804 mov rdi
, arg
(4) ;Results
; Exported entry point vp8_sad4x4x4d_sse3.
; Visible register setup: rdi = ref_ptr_base (arg 2), from which
; LOAD_X4_ADDRESSES fetches four pointers into rcx, rdx, rax and rdi;
; rsi = src_ptr (arg 0); rbx = sign-extended src_stride (arg 1);
; rbp = sign-extended ref_stride (arg 3). Rows are read 4 bytes at a time
; with movd. Unlike the other functions in this file, the Results pointer
; (arg 4) is loaded into rsi rather than rdi.
; NOTE(review): the loads below index the source with rax and use rbx as a
; reference pointer, i.e. the opposite of the rax/rbx assignment visible in
; the setup -- presumably the two registers are exchanged in lines not shown.
; rbx and rbp are callee-saved in both x86-64 ABIs; their save/restore, the
; instructions that combine the loads, the result stores and the epilogue are
; not visible in this excerpt, and the function likely continues past the
; last visible line -- confirm against the complete file.
819 ;void vp8_sad4x4x4d_sse3(
820 ; unsigned char *src_ptr,
822 ; unsigned char *ref_ptr,
825 global sym
(vp8_sad4x4x4d_sse3
)
826 sym
(vp8_sad4x4x4d_sse3
):
829 SHADOW_ARGS_TO_STACK
5
836 mov rdi
, arg
(2) ; ref_ptr_base
; rdi is both the table base and the last destination; it is overwritten by
; the final load of the macro.
838 LOAD_X4_ADDRESSES rdi
, rcx
, rdx
, rax
, rdi
840 mov rsi
, arg
(0) ;src_ptr
842 movsxd rbx
, dword ptr arg
(1) ;src_stride
843 movsxd rbp
, dword ptr arg
(3) ;ref_stride
; First row pair: source rows at [rsi] / [rsi+rax], first reference (rcx)
; at [rcx] / [rcx+rbp].
847 movd mm0
, DWORD PTR [rsi
]
848 movd mm1
, DWORD PTR [rcx
]
850 movd mm2
, DWORD PTR [rsi
+rax
]
851 movd mm3
, DWORD PTR [rcx
+rbp
]
; First row pair of the remaining three references (rdx, rbx, rdi); the
; second row of each is addressed with +rbp.
856 movd mm4
, DWORD PTR [rdx
]
857 movd mm5
, DWORD PTR [rbx
]
859 movd mm6
, DWORD PTR [rdi
]
860 movd mm2
, DWORD PTR [rdx
+rbp
]
862 movd mm3
, DWORD PTR [rbx
+rbp
]
863 movd mm7
, DWORD PTR [rdi
+rbp
]
; Second row pair: source and first reference, same addressing as above.
886 movd mm0
, DWORD PTR [rsi
]
887 movd mm2
, DWORD PTR [rcx
]
889 movd mm3
, DWORD PTR [rsi
+rax
]
890 movd mm7
, DWORD PTR [rcx
+rbp
]
895 movd mm3
, DWORD PTR [rdx
]
896 movd mm7
, DWORD PTR [rbx
]
; rsi stops being the source pointer here and addresses the Results array.
902 mov rsi
, arg
(4) ;Results
; NOTE(review): these last reference-row loads add rax, whereas every earlier
; reference-row load in this function adds rbp. If rax holds the source
; stride at this point, the reference rows are stepped by the wrong stride
; whenever src_stride != ref_stride -- verify against the complete file.
907 movd mm2
, DWORD PTR [rdx
+rax
]
908 movd mm1
, DWORD PTR [rbx
+rax
]
916 movd mm2
, DWORD PTR [rdi
]
917 movd mm1
, DWORD PTR [rdi
+rax
]