2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
11 %include "vpx_ports/x86_abi_support.asm"
; STACK_FRAME_CREATE_X3 (no parameters)
; Establishes the symbolic names (src_stride, ref_stride, result_ptr, max_err)
; used by the routines that expand this macro.
; Three sets of definitions are visible:
;   - one that loads src_ptr/ref_ptr from arg(0)/arg(2) into rsi/rdi and
;     sign-extends the 32-bit strides from arg(1)/arg(3) into rax/rdx;
;   - one for the x64 output format, where result_ptr/max_err are read from
;     the stack at [rsp+8+4*8] and src_stride is rdx;
;   - one where the strides are rsi/rcx.
; result_ptr and max_err name the same argument slot in every visible variant.
; NOTE(review): the conditional directives that choose between the variants,
; and the end of the macro, are not visible in this excerpt.
13 %macro STACK_FRAME_CREATE_X3
0
16 %define src_stride rax
18 %define ref_stride rdx
21 %define result_ptr arg
(4)
22 %define max_err arg
(4)
29 mov rsi
, arg
(0) ; src_ptr
30 mov rdi
, arg
(2) ; ref_ptr
32 movsxd rax
, dword ptr arg
(1) ; src_stride
33 movsxd rdx
, dword ptr arg
(3) ; ref_stride
35 %ifidn __OUTPUT_FORMAT__
,x64
37 %define src_stride rdx
42 %define result_ptr
[rsp
+8+4*8]
43 %define max_err
[rsp
+8+4*8]
46 %define src_stride rsi
48 %define ref_stride rcx
; STACK_FRAME_DESTROY_X3 (no parameters)
; Expanded at the end of every routine that begins with STACK_FRAME_CREATE_X3.
; Only the x64 output-format test is visible here; the body is not shown.
; NOTE(review): presumably undoes the aliases/frame set up by
; STACK_FRAME_CREATE_X3 and returns -- confirm against the full file.
58 %macro STACK_FRAME_DESTROY_X3
0
74 %ifidn __OUTPUT_FORMAT__
,x64
; STACK_FRAME_CREATE_X4 (no parameters)
; Establishes the symbolic names used by the x4d routines and loads the four
; reference pointers through LOAD_X4_ADDRESSES.
; Visible variants:
;   - one that reads ref_ptr_base from arg(2) into rdi, loads the four
;     addresses into rcx/rdx/rax/rdi, reads src_ptr from arg(0) into rsi and
;     sign-extends the strides from arg(1)/arg(3) into rbx/rbp;
;   - one for the x64 output format (address table in r8, ref_stride r9,
;     result_ptr at [rsp+16+4*8]);
;   - one with the address table in rdx, ref_stride rcx and result_ptr r8.
; NOTE(review): in the first variant src_stride is defined as rax, yet arg(1)
; is loaded into rbx while rax receives the third reference address.
; Presumably an instruction not visible in this excerpt swaps them -- confirm.
; NOTE(review): the conditional directives selecting a variant and the end of
; the macro are not visible here.
80 %macro STACK_FRAME_CREATE_X4
0
83 %define src_stride rax
88 %define ref_stride rbp
89 %define result_ptr arg
(4)
97 mov rdi
, arg
(2) ; ref_ptr_base
99 LOAD_X4_ADDRESSES rdi
, rcx
, rdx
, rax
, rdi
101 mov rsi
, arg
(0) ; src_ptr
103 movsxd rbx
, dword ptr arg
(1) ; src_stride
104 movsxd rbp
, dword ptr arg
(3) ; ref_stride
108 %ifidn __OUTPUT_FORMAT__
,x64
110 %define src_stride rdx
115 %define ref_stride r9
116 %define result_ptr
[rsp
+16+4*8]
119 LOAD_X4_ADDRESSES r8
, r0_ptr
, r1_ptr
, r2_ptr
, r3_ptr
122 %define src_stride rsi
127 %define ref_stride rcx
128 %define result_ptr r8
130 LOAD_X4_ADDRESSES rdx
, r0_ptr
, r1_ptr
, r2_ptr
, r3_ptr
; STACK_FRAME_DESTROY_X4 (no parameters)
; Expanded at the end of every routine that begins with STACK_FRAME_CREATE_X4.
; Only the x64 output-format test is visible here; the body is not shown.
; NOTE(review): presumably the counterpart that tears down what
; STACK_FRAME_CREATE_X4 set up -- confirm against the full file.
136 %macro STACK_FRAME_DESTROY_X4
0
152 %ifidn __OUTPUT_FORMAT__
,x64
; PROCESS_16X2X3 (5 parameters)
;   %1  selector; call sites pass 0 for the first invocation, 1 for the
;       middle ones and 2 for the last.  NOTE(review): the directives that
;       branch on %1 are not visible in this excerpt.
;   %2  source pointer  (loaded with movdqa, so it must be 16-byte aligned)
;   %3  reference pointer (loaded with lddqu, alignment not required)
;   %4  source stride   (used to address the second row: [%2+%4])
;   %5  reference stride (used to address the second row: [%3+%5])
; For each source row the reference is read at three byte offsets
; (+0, +1, +2).  The first visible group loads them into xmm5/xmm6/xmm7, the
; later groups into xmm1/xmm2/xmm3.
; NOTE(review): the arithmetic between the loads and the pointer updates are
; not visible here.
159 %macro PROCESS_16X2X3
5
161 movdqa xmm0
, XMMWORD
PTR [%2]
162 lddqu xmm5
, XMMWORD
PTR [%3]
163 lddqu xmm6
, XMMWORD
PTR [%3+1]
164 lddqu xmm7
, XMMWORD
PTR [%3+2]
170 movdqa xmm0
, XMMWORD
PTR [%2]
171 lddqu xmm1
, XMMWORD
PTR [%3]
172 lddqu xmm2
, XMMWORD
PTR [%3+1]
173 lddqu xmm3
, XMMWORD
PTR [%3+2]
183 movdqa xmm0
, XMMWORD
PTR [%2+%4]
184 lddqu xmm1
, XMMWORD
PTR [%3+%5]
185 lddqu xmm2
, XMMWORD
PTR [%3+%5+1]
186 lddqu xmm3
, XMMWORD
PTR [%3+%5+2]
; PROCESS_8X2X3 (5 parameters)
; 8-byte-wide (MMX register) counterpart of PROCESS_16X2X3.
;   %1  selector; call sites pass 0 first, 1 in the middle, 2 last.
;       NOTE(review): the directives that branch on %1 are not visible here.
;   %2  source pointer        %4  source stride    (second row: [%2+%4])
;   %3  reference pointer     %5  reference stride (second row: [%3+%5])
; For each source row the reference is read at byte offsets +0, +1 and +2.
; The first visible group loads them into mm5/mm6/mm7, the later groups into
; mm1/mm2/mm3.
; NOTE(review): the arithmetic between the loads is not visible in this excerpt.
202 %macro PROCESS_8X2X3
5
204 movq mm0
, QWORD PTR [%2]
205 movq mm5
, QWORD PTR [%3]
206 movq mm6
, QWORD PTR [%3+1]
207 movq mm7
, QWORD PTR [%3+2]
213 movq mm0
, QWORD PTR [%2]
214 movq mm1
, QWORD PTR [%3]
215 movq mm2
, QWORD PTR [%3+1]
216 movq mm3
, QWORD PTR [%3+2]
226 movq mm0
, QWORD PTR [%2+%4]
227 movq mm1
, QWORD PTR [%3+%5]
228 movq mm2
, QWORD PTR [%3+%5+1]
229 movq mm3
, QWORD PTR [%3+%5+2]
; LOAD_X4_ADDRESSES (5 parameters)
;   %1      register holding the address of a table of four pointers
;   %2..%5  destination registers for table entries 0..3
; Entries are REG_SZ_BYTES apart.  %5 is written last, so it may be the same
; register as %1 (one invocation in STACK_FRAME_CREATE_X4 passes rdi as both).
245 %macro LOAD_X4_ADDRESSES
5
246 mov %2, [%1+REG_SZ_BYTES
*0]
247 mov %3, [%1+REG_SZ_BYTES
*1]
249 mov %4, [%1+REG_SZ_BYTES
*2]
250 mov %5, [%1+REG_SZ_BYTES
*3]
; PROCESS_16X2X4 (8 parameters)
;   %1      selector; call sites pass 0 first, 1 in the middle, 2 last.
;           NOTE(review): the directives that branch on %1 are not visible.
;   %2      source pointer (loaded with movdqa, so it must be 16-byte aligned)
;   %3..%6  the four reference pointers (loaded with lddqu, unaligned allowed)
;   %7      source stride    (second row: [%2+%7])
;   %8      reference stride (second row: [%n+%8])
; The first visible group loads the four references into xmm4..xmm7.  The
; later groups load %3..%5 into xmm1..xmm3 and then reuse xmm1 for %6.
; NOTE(review): the arithmetic between the loads and the pointer updates are
; not visible in this excerpt.
253 %macro PROCESS_16X2X4
8
255 movdqa xmm0
, XMMWORD
PTR [%2]
256 lddqu xmm4
, XMMWORD
PTR [%3]
257 lddqu xmm5
, XMMWORD
PTR [%4]
258 lddqu xmm6
, XMMWORD
PTR [%5]
259 lddqu xmm7
, XMMWORD
PTR [%6]
266 movdqa xmm0
, XMMWORD
PTR [%2]
267 lddqu xmm1
, XMMWORD
PTR [%3]
268 lddqu xmm2
, XMMWORD
PTR [%4]
269 lddqu xmm3
, XMMWORD
PTR [%5]
276 lddqu xmm1
, XMMWORD
PTR [%6]
283 movdqa xmm0
, XMMWORD
PTR [%2+%7]
284 lddqu xmm1
, XMMWORD
PTR [%3+%8]
285 lddqu xmm2
, XMMWORD
PTR [%4+%8]
286 lddqu xmm3
, XMMWORD
PTR [%5+%8]
293 lddqu xmm1
, XMMWORD
PTR [%6+%8]
; PROCESS_8X2X4 (8 parameters)
; 8-byte-wide (MMX register) counterpart of PROCESS_16X2X4.
;   %1      selector; call sites pass 0 first, 1 in the middle, 2 last.
;           NOTE(review): the directives that branch on %1 are not visible.
;   %2      source pointer
;   %3..%6  the four reference pointers
;   %7      source stride    (second row: [%2+%7])
;   %8      reference stride (second row: [%n+%8])
; The first visible group loads the four references into mm4..mm7.  The later
; groups load %3..%5 into mm1..mm3 and then reuse mm1 for %6.
; NOTE(review): the arithmetic between the loads is not visible in this excerpt.
311 %macro PROCESS_8X2X4
8
313 movq mm0
, QWORD PTR [%2]
314 movq mm4
, QWORD PTR [%3]
315 movq mm5
, QWORD PTR [%4]
316 movq mm6
, QWORD PTR [%5]
317 movq mm7
, QWORD PTR [%6]
324 movq mm0
, QWORD PTR [%2]
325 movq mm1
, QWORD PTR [%3]
326 movq mm2
, QWORD PTR [%4]
327 movq mm3
, QWORD PTR [%5]
334 movq mm1
, QWORD PTR [%6]
341 movq mm0
, QWORD PTR [%2+%7]
342 movq mm1
, QWORD PTR [%3+%8]
343 movq mm2
, QWORD PTR [%4+%8]
344 movq mm3
, QWORD PTR [%5+%8]
351 movq mm1
, QWORD PTR [%6+%8]
; vp8_sad16x16x3_sse3 -- exported entry point.
; Expands STACK_FRAME_CREATE_X3, then PROCESS_16X2X3 eight times (selector 0
; on the first, 1 on the middle six, 2 on the last) with
; src_ptr/ref_ptr/src_stride/ref_stride, then STACK_FRAME_DESTROY_X3.
; NOTE(review): "void int" in the prototype comment below is not a valid C
; return type -- confirm the intended type and correct the comment.
; NOTE(review): the code between the last PROCESS_16X2X3 and the frame
; teardown is not visible in this excerpt.
369 ;void int vp8_sad16x16x3_sse3(
370 ; unsigned char *src_ptr,
372 ; unsigned char *ref_ptr,
375 global sym
(vp8_sad16x16x3_sse3
)
376 sym
(vp8_sad16x16x3_sse3
):
378 STACK_FRAME_CREATE_X3
380 PROCESS_16X2X3
0, src_ptr
, ref_ptr
, src_stride
, ref_stride
381 PROCESS_16X2X3
1, src_ptr
, ref_ptr
, src_stride
, ref_stride
382 PROCESS_16X2X3
1, src_ptr
, ref_ptr
, src_stride
, ref_stride
383 PROCESS_16X2X3
1, src_ptr
, ref_ptr
, src_stride
, ref_stride
384 PROCESS_16X2X3
1, src_ptr
, ref_ptr
, src_stride
, ref_stride
385 PROCESS_16X2X3
1, src_ptr
, ref_ptr
, src_stride
, ref_stride
386 PROCESS_16X2X3
1, src_ptr
, ref_ptr
, src_stride
, ref_stride
387 PROCESS_16X2X3
2, src_ptr
, ref_ptr
, src_stride
, ref_stride
409 STACK_FRAME_DESTROY_X3
; vp8_sad16x8x3_sse3 -- exported entry point.
; Expands STACK_FRAME_CREATE_X3, then PROCESS_16X2X3 four times (selector 0,
; 1, 1, 2) with src_ptr/ref_ptr/src_stride/ref_stride, then
; STACK_FRAME_DESTROY_X3.
; NOTE(review): "void int" in the prototype comment below is not a valid C
; return type -- confirm the intended type and correct the comment.
; NOTE(review): the code between the last PROCESS_16X2X3 and the frame
; teardown is not visible in this excerpt.
411 ;void int vp8_sad16x8x3_sse3(
412 ; unsigned char *src_ptr,
414 ; unsigned char *ref_ptr,
417 global sym
(vp8_sad16x8x3_sse3
)
418 sym
(vp8_sad16x8x3_sse3
):
420 STACK_FRAME_CREATE_X3
422 PROCESS_16X2X3
0, src_ptr
, ref_ptr
, src_stride
, ref_stride
423 PROCESS_16X2X3
1, src_ptr
, ref_ptr
, src_stride
, ref_stride
424 PROCESS_16X2X3
1, src_ptr
, ref_ptr
, src_stride
, ref_stride
425 PROCESS_16X2X3
2, src_ptr
, ref_ptr
, src_stride
, ref_stride
447 STACK_FRAME_DESTROY_X3
; vp8_sad8x16x3_sse3 -- exported entry point.
; Expands STACK_FRAME_CREATE_X3, then PROCESS_8X2X3 eight times (selector 0
; on the first, 1 on the middle six, 2 on the last) with
; src_ptr/ref_ptr/src_stride/ref_stride, then STACK_FRAME_DESTROY_X3.
; NOTE(review): "void int" in the prototype comment below is not a valid C
; return type -- confirm the intended type and correct the comment.
; NOTE(review): the code between the last PROCESS_8X2X3 and the frame
; teardown is not visible in this excerpt.
449 ;void int vp8_sad8x16x3_sse3(
450 ; unsigned char *src_ptr,
452 ; unsigned char *ref_ptr,
455 global sym
(vp8_sad8x16x3_sse3
)
456 sym
(vp8_sad8x16x3_sse3
):
458 STACK_FRAME_CREATE_X3
460 PROCESS_8X2X3
0, src_ptr
, ref_ptr
, src_stride
, ref_stride
461 PROCESS_8X2X3
1, src_ptr
, ref_ptr
, src_stride
, ref_stride
462 PROCESS_8X2X3
1, src_ptr
, ref_ptr
, src_stride
, ref_stride
463 PROCESS_8X2X3
1, src_ptr
, ref_ptr
, src_stride
, ref_stride
464 PROCESS_8X2X3
1, src_ptr
, ref_ptr
, src_stride
, ref_stride
465 PROCESS_8X2X3
1, src_ptr
, ref_ptr
, src_stride
, ref_stride
466 PROCESS_8X2X3
1, src_ptr
, ref_ptr
, src_stride
, ref_stride
467 PROCESS_8X2X3
2, src_ptr
, ref_ptr
, src_stride
, ref_stride
476 STACK_FRAME_DESTROY_X3
; vp8_sad8x8x3_sse3 -- exported entry point.
; Expands STACK_FRAME_CREATE_X3, then PROCESS_8X2X3 four times (selector 0,
; 1, 1, 2) with src_ptr/ref_ptr/src_stride/ref_stride, then
; STACK_FRAME_DESTROY_X3.
; NOTE(review): "void int" in the prototype comment below is not a valid C
; return type -- confirm the intended type and correct the comment.
; NOTE(review): the code between the last PROCESS_8X2X3 and the frame
; teardown is not visible in this excerpt.
478 ;void int vp8_sad8x8x3_sse3(
479 ; unsigned char *src_ptr,
481 ; unsigned char *ref_ptr,
484 global sym
(vp8_sad8x8x3_sse3
)
485 sym
(vp8_sad8x8x3_sse3
):
487 STACK_FRAME_CREATE_X3
489 PROCESS_8X2X3
0, src_ptr
, ref_ptr
, src_stride
, ref_stride
490 PROCESS_8X2X3
1, src_ptr
, ref_ptr
, src_stride
, ref_stride
491 PROCESS_8X2X3
1, src_ptr
, ref_ptr
, src_stride
, ref_stride
492 PROCESS_8X2X3
2, src_ptr
, ref_ptr
, src_stride
, ref_stride
501 STACK_FRAME_DESTROY_X3
; vp8_sad4x4x3_sse3 -- exported entry point.
; Hand-unrolled 4-byte-wide variant: no PROCESS_* macro is used.
; Visible structure:
;   1. load 4 bytes from rows 0 and 1 of the source and of the reference at
;      offsets +0, +1 and +2;
;   2. advance src_ptr and ref_ptr by two rows (lea ..., [ptr+stride*2]);
;   3. repeat the same loads for the next two rows.
; NOTE(review): "void int" in the prototype comment below is not a valid C
; return type -- confirm the intended type and correct the comment.
; NOTE(review): the instructions that combine the loaded values and store the
; results are not visible in this excerpt.
503 ;void int vp8_sad4x4x3_sse3(
504 ; unsigned char *src_ptr,
506 ; unsigned char *ref_ptr,
509 global sym
(vp8_sad4x4x3_sse3
)
510 sym
(vp8_sad4x4x3_sse3
):
512 STACK_FRAME_CREATE_X3
514 movd mm0
, DWORD PTR [src_ptr
]
515 movd mm1
, DWORD PTR [ref_ptr
]
517 movd mm2
, DWORD PTR [src_ptr
+src_stride
]
518 movd mm3
, DWORD PTR [ref_ptr
+ref_stride
]
523 movd mm4
, DWORD PTR [ref_ptr
+1]
524 movd mm5
, DWORD PTR [ref_ptr
+2]
526 movd mm2
, DWORD PTR [ref_ptr
+ref_stride
+1]
527 movd mm3
, DWORD PTR [ref_ptr
+ref_stride
+2]
537 lea src_ptr
, [src_ptr
+src_stride
*2]
538 lea ref_ptr
, [ref_ptr
+ref_stride
*2]
540 movd mm0
, DWORD PTR [src_ptr
]
541 movd mm2
, DWORD PTR [ref_ptr
]
543 movd mm3
, DWORD PTR [src_ptr
+src_stride
]
544 movd mm6
, DWORD PTR [ref_ptr
+ref_stride
]
549 movd mm3
, DWORD PTR [ref_ptr
+1]
550 movd mm7
, DWORD PTR [ref_ptr
+2]
556 movd mm2
, DWORD PTR [ref_ptr
+ref_stride
+1]
557 movd mm6
, DWORD PTR [ref_ptr
+ref_stride
+2]
575 STACK_FRAME_DESTROY_X3
; vp8_sad16x16_sse3 -- exported entry point; the only routine here documented
; as returning a value (unsigned int).
; Visible structure:
;   - end_ptr = src_ptr + 16*src_stride, built from two lea instructions that
;     each add src_stride*8;
;   - a loop (.vp8_sad16x16_sse3_loop) that handles two rows per pass, each
;     row read as two 8-byte MMX loads (offsets +0 and +8) from both source
;     and reference, then advances both pointers by two rows;
;   - a jg at the top of the loop to .vp8_sad16x16_early_exit and a jne at the
;     bottom back to the loop head.
; NOTE(review): the comparisons feeding jg/jne are not visible in this
; excerpt; presumably the accumulated value is compared with max_err and
; src_ptr with end_ptr -- confirm against the full file.
577 ;unsigned int vp8_sad16x16_sse3(
578 ; unsigned char *src_ptr,
580 ; unsigned char *ref_ptr,
583 ;%define lddqu movdqu
584 global sym
(vp8_sad16x16_sse3
)
585 sym
(vp8_sad16x16_sse3
):
587 STACK_FRAME_CREATE_X3
589 lea end_ptr
, [src_ptr
+src_stride
*8]
591 lea end_ptr
, [end_ptr
+src_stride
*8]
594 .
vp8_sad16x16_sse3_loop:
598 jg .vp8_sad16x16_early_exit
600 movq mm0
, QWORD PTR [src_ptr
]
601 movq mm2
, QWORD PTR [src_ptr
+8]
603 movq mm1
, QWORD PTR [ref_ptr
]
604 movq mm3
, QWORD PTR [ref_ptr
+8]
606 movq mm4
, QWORD PTR [src_ptr
+src_stride
]
607 movq mm5
, QWORD PTR [ref_ptr
+ref_stride
]
612 movq mm1
, QWORD PTR [src_ptr
+src_stride
+8]
613 movq mm3
, QWORD PTR [ref_ptr
+ref_stride
+8]
618 lea src_ptr
, [src_ptr
+src_stride
*2]
619 lea ref_ptr
, [ref_ptr
+ref_stride
*2]
628 jne .vp8_sad16x16_sse3_loop
632 .
vp8_sad16x16_early_exit:
636 STACK_FRAME_DESTROY_X3
; vp8_sad16x16x4d_sse3 -- exported entry point.
; Expands STACK_FRAME_CREATE_X4, then PROCESS_16X2X4 eight times (selector 0
; on the first, 1 on the middle six, 2 on the last) with src_ptr, the four
; reference pointers r0_ptr..r3_ptr, src_stride and ref_stride, then
; STACK_FRAME_DESTROY_X4.
; NOTE(review): the code between the last PROCESS_16X2X4 and the frame
; teardown is not visible in this excerpt.
638 ;void vp8_sad16x16x4d_sse3(
639 ; unsigned char *src_ptr,
641 ; unsigned char *ref_ptr_base,
644 global sym
(vp8_sad16x16x4d_sse3
)
645 sym
(vp8_sad16x16x4d_sse3
):
647 STACK_FRAME_CREATE_X4
649 PROCESS_16X2X4
0, src_ptr
, r0_ptr
, r1_ptr
, r2_ptr
, r3_ptr
, src_stride
, ref_stride
650 PROCESS_16X2X4
1, src_ptr
, r0_ptr
, r1_ptr
, r2_ptr
, r3_ptr
, src_stride
, ref_stride
651 PROCESS_16X2X4
1, src_ptr
, r0_ptr
, r1_ptr
, r2_ptr
, r3_ptr
, src_stride
, ref_stride
652 PROCESS_16X2X4
1, src_ptr
, r0_ptr
, r1_ptr
, r2_ptr
, r3_ptr
, src_stride
, ref_stride
653 PROCESS_16X2X4
1, src_ptr
, r0_ptr
, r1_ptr
, r2_ptr
, r3_ptr
, src_stride
, ref_stride
654 PROCESS_16X2X4
1, src_ptr
, r0_ptr
, r1_ptr
, r2_ptr
, r3_ptr
, src_stride
, ref_stride
655 PROCESS_16X2X4
1, src_ptr
, r0_ptr
, r1_ptr
, r2_ptr
, r3_ptr
, src_stride
, ref_stride
656 PROCESS_16X2X4
2, src_ptr
, r0_ptr
, r1_ptr
, r2_ptr
, r3_ptr
, src_stride
, ref_stride
687 STACK_FRAME_DESTROY_X4
; vp8_sad16x8x4d_sse3 -- exported entry point.
; Expands STACK_FRAME_CREATE_X4, then PROCESS_16X2X4 four times (selector 0,
; 1, 1, 2) with src_ptr, the four reference pointers r0_ptr..r3_ptr,
; src_stride and ref_stride, then STACK_FRAME_DESTROY_X4.
; NOTE(review): the code between the last PROCESS_16X2X4 and the frame
; teardown is not visible in this excerpt.
689 ;void vp8_sad16x8x4d_sse3(
690 ; unsigned char *src_ptr,
692 ; unsigned char *ref_ptr_base,
695 global sym
(vp8_sad16x8x4d_sse3
)
696 sym
(vp8_sad16x8x4d_sse3
):
698 STACK_FRAME_CREATE_X4
700 PROCESS_16X2X4
0, src_ptr
, r0_ptr
, r1_ptr
, r2_ptr
, r3_ptr
, src_stride
, ref_stride
701 PROCESS_16X2X4
1, src_ptr
, r0_ptr
, r1_ptr
, r2_ptr
, r3_ptr
, src_stride
, ref_stride
702 PROCESS_16X2X4
1, src_ptr
, r0_ptr
, r1_ptr
, r2_ptr
, r3_ptr
, src_stride
, ref_stride
703 PROCESS_16X2X4
2, src_ptr
, r0_ptr
, r1_ptr
, r2_ptr
, r3_ptr
, src_stride
, ref_stride
734 STACK_FRAME_DESTROY_X4
; vp8_sad8x16x4d_sse3 -- exported entry point.
; Expands STACK_FRAME_CREATE_X4, then PROCESS_8X2X4 eight times (selector 0
; on the first, 1 on the middle six, 2 on the last) with src_ptr, the four
; reference pointers r0_ptr..r3_ptr, src_stride and ref_stride, then
; STACK_FRAME_DESTROY_X4.
; NOTE(review): "void int" in the prototype comment below is not a valid C
; return type; the 16-wide x4d routines in this file are documented as
; returning void -- confirm and correct the comment.
; NOTE(review): the code between the last PROCESS_8X2X4 and the frame
; teardown is not visible in this excerpt.
736 ;void int vp8_sad8x16x4d_sse3(
737 ; unsigned char *src_ptr,
739 ; unsigned char *ref_ptr,
742 global sym
(vp8_sad8x16x4d_sse3
)
743 sym
(vp8_sad8x16x4d_sse3
):
745 STACK_FRAME_CREATE_X4
747 PROCESS_8X2X4
0, src_ptr
, r0_ptr
, r1_ptr
, r2_ptr
, r3_ptr
, src_stride
, ref_stride
748 PROCESS_8X2X4
1, src_ptr
, r0_ptr
, r1_ptr
, r2_ptr
, r3_ptr
, src_stride
, ref_stride
749 PROCESS_8X2X4
1, src_ptr
, r0_ptr
, r1_ptr
, r2_ptr
, r3_ptr
, src_stride
, ref_stride
750 PROCESS_8X2X4
1, src_ptr
, r0_ptr
, r1_ptr
, r2_ptr
, r3_ptr
, src_stride
, ref_stride
751 PROCESS_8X2X4
1, src_ptr
, r0_ptr
, r1_ptr
, r2_ptr
, r3_ptr
, src_stride
, ref_stride
752 PROCESS_8X2X4
1, src_ptr
, r0_ptr
, r1_ptr
, r2_ptr
, r3_ptr
, src_stride
, ref_stride
753 PROCESS_8X2X4
1, src_ptr
, r0_ptr
, r1_ptr
, r2_ptr
, r3_ptr
, src_stride
, ref_stride
754 PROCESS_8X2X4
2, src_ptr
, r0_ptr
, r1_ptr
, r2_ptr
, r3_ptr
, src_stride
, ref_stride
767 STACK_FRAME_DESTROY_X4
; vp8_sad8x8x4d_sse3 -- exported entry point.
; Expands STACK_FRAME_CREATE_X4, then PROCESS_8X2X4 four times (selector 0,
; 1, 1, 2) with src_ptr, the four reference pointers r0_ptr..r3_ptr,
; src_stride and ref_stride, then STACK_FRAME_DESTROY_X4.
; NOTE(review): "void int" in the prototype comment below is not a valid C
; return type; the 16-wide x4d routines in this file are documented as
; returning void -- confirm and correct the comment.
; NOTE(review): the code between the last PROCESS_8X2X4 and the frame
; teardown is not visible in this excerpt.
769 ;void int vp8_sad8x8x4d_sse3(
770 ; unsigned char *src_ptr,
772 ; unsigned char *ref_ptr,
775 global sym
(vp8_sad8x8x4d_sse3
)
776 sym
(vp8_sad8x8x4d_sse3
):
778 STACK_FRAME_CREATE_X4
780 PROCESS_8X2X4
0, src_ptr
, r0_ptr
, r1_ptr
, r2_ptr
, r3_ptr
, src_stride
, ref_stride
781 PROCESS_8X2X4
1, src_ptr
, r0_ptr
, r1_ptr
, r2_ptr
, r3_ptr
, src_stride
, ref_stride
782 PROCESS_8X2X4
1, src_ptr
, r0_ptr
, r1_ptr
, r2_ptr
, r3_ptr
, src_stride
, ref_stride
783 PROCESS_8X2X4
2, src_ptr
, r0_ptr
, r1_ptr
, r2_ptr
, r3_ptr
, src_stride
, ref_stride
796 STACK_FRAME_DESTROY_X4
; vp8_sad4x4x4d_sse3 -- exported entry point.
; Hand-unrolled 4-byte-wide variant: no PROCESS_* macro is used.
; Visible structure:
;   1. load 4 bytes from rows 0 and 1 of the source and of each of the four
;      references r0_ptr..r3_ptr;
;   2. advance src_ptr and all four reference pointers by two rows
;      (lea ..., [ptr+stride*2]);
;   3. load the next two rows of the source and the references.
; ref_stride is redefined to rax partway through the second half.
; NOTE(review): the conditional around that %define and the instruction that
; moves the stride into rax are not visible in this excerpt -- confirm.
; NOTE(review): "void int" in the prototype comment below is not a valid C
; return type; the 16-wide x4d routines in this file are documented as
; returning void -- confirm and correct the comment.
; NOTE(review): the instructions that combine the loaded values and store the
; results are not visible in this excerpt.
798 ;void int vp8_sad4x4x4d_sse3(
799 ; unsigned char *src_ptr,
801 ; unsigned char *ref_ptr,
804 global sym
(vp8_sad4x4x4d_sse3
)
805 sym
(vp8_sad4x4x4d_sse3
):
807 STACK_FRAME_CREATE_X4
809 movd mm0
, DWORD PTR [src_ptr
]
810 movd mm1
, DWORD PTR [r0_ptr
]
812 movd mm2
, DWORD PTR [src_ptr
+src_stride
]
813 movd mm3
, DWORD PTR [r0_ptr
+ref_stride
]
818 movd mm4
, DWORD PTR [r1_ptr
]
819 movd mm5
, DWORD PTR [r2_ptr
]
821 movd mm6
, DWORD PTR [r3_ptr
]
822 movd mm2
, DWORD PTR [r1_ptr
+ref_stride
]
824 movd mm3
, DWORD PTR [r2_ptr
+ref_stride
]
825 movd mm7
, DWORD PTR [r3_ptr
+ref_stride
]
840 lea src_ptr
, [src_ptr
+src_stride
*2]
841 lea r0_ptr
, [r0_ptr
+ref_stride
*2]
843 lea r1_ptr
, [r1_ptr
+ref_stride
*2]
844 lea r2_ptr
, [r2_ptr
+ref_stride
*2]
846 lea r3_ptr
, [r3_ptr
+ref_stride
*2]
848 movd mm0
, DWORD PTR [src_ptr
]
849 movd mm2
, DWORD PTR [r0_ptr
]
851 movd mm3
, DWORD PTR [src_ptr
+src_stride
]
852 movd mm7
, DWORD PTR [r0_ptr
+ref_stride
]
857 movd mm3
, DWORD PTR [r1_ptr
]
858 movd mm7
, DWORD PTR [r2_ptr
]
865 %define ref_stride rax
872 movd mm2
, DWORD PTR [r1_ptr
+ref_stride
]
873 movd mm1
, DWORD PTR [r2_ptr
+ref_stride
]
881 movd mm2
, DWORD PTR [r3_ptr
]
882 movd mm1
, DWORD PTR [r3_ptr
+ref_stride
]
897 STACK_FRAME_DESTROY_X4