2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
11 %include "vpx_ports/x86_abi_support.asm"
; STACK_FRAME_CREATE_X3 -- takes no macro parameters.
; Binds the symbolic operand names (src_stride, ref_stride, result_ptr,
; max_err) used by the functions that invoke this macro and, on the
; arg()-based path, copies the C arguments into registers.
; NOTE(review): the build condition that selects each group of defines
; below is not visible here; the group descriptions are inferred -- confirm.
13 %macro STACK_FRAME_CREATE_X3
0
; arg()-based path: strides are held in rax/rdx; result_ptr and max_err
; both name the same argument slot, arg(4).
16 %define src_stride rax
18 %define ref_stride rdx
21 %define result_ptr arg
(4)
22 %define max_err arg
(4)
; Load arguments: arg(0) -> rsi and arg(2) -> rdi; the two dword strides
; (arg(1), arg(3)) are sign-extended to 64 bits by movsxd.
29 mov rsi
, arg
(0) ; src_ptr
30 mov rdi
, arg
(2) ; ref_ptr
32 movsxd rax
, dword ptr arg
(1) ; src_stride
33 movsxd rdx
, dword ptr arg
(3) ; ref_stride
; x64 output format: src_stride is rdx (the second-argument register of the
; Microsoft x64 convention); result_ptr and max_err are both read from the
; same stack location, [rsp+40+4*8].
; NOTE(review): the 40+4*8 displacement is hard-coded and has to match
; whatever the prologue pushes/reserves -- confirm.
35 %ifidn __OUTPUT_FORMAT__
,x64
37 %define src_stride rdx
42 %define result_ptr
[rsp
+40+4*8]
43 %define max_err
[rsp
+40+4*8]
; Presumably the remaining 64-bit output formats: strides come from rsi and
; rcx (second- and fourth-argument registers of the System V AMD64
; convention) -- confirm.
47 %define src_stride rsi
49 %define ref_stride rcx
; STACK_FRAME_DESTROY_X3 -- takes no macro parameters; it is the final
; macro invoked by every function that opens with STACK_FRAME_CREATE_X3.
59 %macro STACK_FRAME_DESTROY_X3
0
; Branch taken only when assembling for the x64 output format.
75 %ifidn __OUTPUT_FORMAT__
,x64
; STACK_FRAME_CREATE_X4 -- takes no macro parameters.
; Binds src_stride, ref_stride and result_ptr for the x4d functions and
; fetches the four reference pointers through LOAD_X4_ADDRESSES.
; NOTE(review): the build condition that selects each group of defines
; below is not visible here; the group descriptions are inferred -- confirm.
82 %macro STACK_FRAME_CREATE_X4
0
; arg()-based path: ref_stride is rbp and result_ptr is arg(4).
85 %define src_stride rax
90 %define ref_stride rbp
91 %define result_ptr arg
(4)
; rdi temporarily holds ref_ptr_base (arg(2)).
99 mov rdi
, arg
(2) ; ref_ptr_base
; rdi is both the table base and the last destination. LOAD_X4_ADDRESSES
; writes its fifth operand last, so the base is still valid for all reads.
101 LOAD_X4_ADDRESSES rdi
, rcx
, rdx
, rax
, rdi
103 mov rsi
, arg
(0) ; src_ptr
; NOTE(review): the source stride is sign-extended into rbx although
; src_stride names rax (which the LOAD_X4_ADDRESSES call above filled with
; a pointer); presumably a later instruction exchanges them -- confirm.
105 movsxd rbx
, dword ptr arg
(1) ; src_stride
106 movsxd rbp
, dword ptr arg
(3) ; ref_stride
; x64 output format: strides in rdx/r9, result_ptr read from the stack at
; [rsp+48+4*8], pointer table based at r8.
; NOTE(review): the 48+4*8 displacement is hard-coded and has to match
; whatever the prologue pushes/reserves -- confirm.
110 %ifidn __OUTPUT_FORMAT__
,x64
112 %define src_stride rdx
117 %define ref_stride r9
118 %define result_ptr
[rsp
+48+4*8]
122 LOAD_X4_ADDRESSES r8
, r0_ptr
, r1_ptr
, r2_ptr
, r3_ptr
; Presumably the remaining 64-bit output formats: strides in rsi/rcx,
; result_ptr in r8, pointer table based at rdx -- confirm.
125 %define src_stride rsi
130 %define ref_stride rcx
131 %define result_ptr r8
133 LOAD_X4_ADDRESSES rdx
, r0_ptr
, r1_ptr
, r2_ptr
, r3_ptr
; STACK_FRAME_DESTROY_X4 -- takes no macro parameters; it is the final
; macro invoked by every function that opens with STACK_FRAME_CREATE_X4.
139 %macro STACK_FRAME_DESTROY_X4
0
; Branch taken only when assembling for the x64 output format.
155 %ifidn __OUTPUT_FORMAT__
,x64
; PROCESS_16X2X3 -- five macro parameters. As passed by the callers in this
; file: %1 = stage selector (0, 1 or 2), %2 = source pointer,
; %3 = reference pointer, %4 = source stride, %5 = reference stride.
; Every load group fetches 16 source bytes plus three 16-byte reference
; windows at byte offsets 0, +1 and +2.
; movdqa requires a 16-byte-aligned memory operand, so [%2] and [%2+%4]
; must be aligned; the reference windows use lddqu, which accepts
; unaligned addresses.
; NOTE(review): how %1 chooses between the first two load groups is not
; visible here -- confirm.
163 %macro PROCESS_16X2X3
5
; Row at %2/%3 with the reference windows placed in xmm5-xmm7.
165 movdqa xmm0
, XMMWORD
PTR [%2]
166 lddqu xmm5
, XMMWORD
PTR [%3]
167 lddqu xmm6
, XMMWORD
PTR [%3+1]
168 lddqu xmm7
, XMMWORD
PTR [%3+2]
; Same addresses, with the reference windows placed in xmm1-xmm3 instead.
174 movdqa xmm0
, XMMWORD
PTR [%2]
175 lddqu xmm1
, XMMWORD
PTR [%3]
176 lddqu xmm2
, XMMWORD
PTR [%3+1]
177 lddqu xmm3
, XMMWORD
PTR [%3+2]
; Next row: source at %2+%4, reference windows at %3+%5 (+0, +1, +2).
187 movdqa xmm0
, XMMWORD
PTR [%2+%4]
188 lddqu xmm1
, XMMWORD
PTR [%3+%5]
189 lddqu xmm2
, XMMWORD
PTR [%3+%5+1]
190 lddqu xmm3
, XMMWORD
PTR [%3+%5+2]
; PROCESS_8X2X3 -- five macro parameters. As passed by the callers in this
; file: %1 = stage selector (0, 1 or 2), %2 = source pointer,
; %3 = reference pointer, %4 = source stride, %5 = reference stride.
; 8-byte-wide counterpart of PROCESS_16X2X3: every load group uses movq
; into MMX registers to fetch 8 source bytes plus three 8-byte reference
; windows at byte offsets 0, +1 and +2.
; NOTE(review): how %1 chooses between the first two load groups is not
; visible here -- confirm.
206 %macro PROCESS_8X2X3
5
; Row at %2/%3 with the reference windows placed in mm5-mm7.
208 movq mm0
, QWORD PTR [%2]
209 movq mm5
, QWORD PTR [%3]
210 movq mm6
, QWORD PTR [%3+1]
211 movq mm7
, QWORD PTR [%3+2]
; Same addresses, with the reference windows placed in mm1-mm3 instead.
217 movq mm0
, QWORD PTR [%2]
218 movq mm1
, QWORD PTR [%3]
219 movq mm2
, QWORD PTR [%3+1]
220 movq mm3
, QWORD PTR [%3+2]
; Next row: source at %2+%4, reference windows at %3+%5 (+0, +1, +2).
230 movq mm0
, QWORD PTR [%2+%4]
231 movq mm1
, QWORD PTR [%3+%5]
232 movq mm2
, QWORD PTR [%3+%5+1]
233 movq mm3
, QWORD PTR [%3+%5+2]
; LOAD_X4_ADDRESSES -- five macro parameters: %1 = base address of a table
; of four register-sized entries, %2..%5 = destinations for entries 0..3.
; Entries are REG_SZ_BYTES apart. %5 is written last, so a caller may pass
; the same register for %1 and %5 (the base is dead once entry 3 is read).
249 %macro LOAD_X4_ADDRESSES
5
250 mov %2, [%1+REG_SZ_BYTES
*0]
251 mov %3, [%1+REG_SZ_BYTES
*1]
253 mov %4, [%1+REG_SZ_BYTES
*2]
254 mov %5, [%1+REG_SZ_BYTES
*3]
; PROCESS_16X2X4 -- eight macro parameters. As passed by the callers in
; this file: %1 = stage selector (0, 1 or 2), %2 = source pointer,
; %3..%6 = four independent reference pointers, %7 = source stride,
; %8 = reference stride (shared by all four references).
; movdqa requires a 16-byte-aligned memory operand, so [%2] and [%2+%7]
; must be aligned; reference rows use lddqu, which accepts unaligned
; addresses.
; NOTE(review): how %1 chooses between the first two load groups is not
; visible here -- confirm.
257 %macro PROCESS_16X2X4
8
; Row at %2 with the four reference rows placed in xmm4-xmm7.
259 movdqa xmm0
, XMMWORD
PTR [%2]
260 lddqu xmm4
, XMMWORD
PTR [%3]
261 lddqu xmm5
, XMMWORD
PTR [%4]
262 lddqu xmm6
, XMMWORD
PTR [%5]
263 lddqu xmm7
, XMMWORD
PTR [%6]
; Same addresses, with the first three reference rows in xmm1-xmm3.
270 movdqa xmm0
, XMMWORD
PTR [%2]
271 lddqu xmm1
, XMMWORD
PTR [%3]
272 lddqu xmm2
, XMMWORD
PTR [%4]
273 lddqu xmm3
, XMMWORD
PTR [%5]
; The fourth reference row reuses xmm1.
280 lddqu xmm1
, XMMWORD
PTR [%6]
; Next row: source at %2+%7, references at %3+%8 .. %5+%8.
287 movdqa xmm0
, XMMWORD
PTR [%2+%7]
288 lddqu xmm1
, XMMWORD
PTR [%3+%8]
289 lddqu xmm2
, XMMWORD
PTR [%4+%8]
290 lddqu xmm3
, XMMWORD
PTR [%5+%8]
; The fourth reference row of the next row again reuses xmm1.
297 lddqu xmm1
, XMMWORD
PTR [%6+%8]
; PROCESS_8X2X4 -- eight macro parameters. As passed by the callers in this
; file: %1 = stage selector (0, 1 or 2), %2 = source pointer,
; %3..%6 = four independent reference pointers, %7 = source stride,
; %8 = reference stride (shared by all four references).
; 8-byte-wide counterpart of PROCESS_16X2X4 using movq and MMX registers.
; NOTE(review): how %1 chooses between the first two load groups is not
; visible here -- confirm.
315 %macro PROCESS_8X2X4
8
; Row at %2 with the four reference rows placed in mm4-mm7.
317 movq mm0
, QWORD PTR [%2]
318 movq mm4
, QWORD PTR [%3]
319 movq mm5
, QWORD PTR [%4]
320 movq mm6
, QWORD PTR [%5]
321 movq mm7
, QWORD PTR [%6]
; Same addresses, with the first three reference rows in mm1-mm3.
328 movq mm0
, QWORD PTR [%2]
329 movq mm1
, QWORD PTR [%3]
330 movq mm2
, QWORD PTR [%4]
331 movq mm3
, QWORD PTR [%5]
; The fourth reference row reuses mm1.
338 movq mm1
, QWORD PTR [%6]
; Next row: source at %2+%7, references at %3+%8 .. %5+%8.
345 movq mm0
, QWORD PTR [%2+%7]
346 movq mm1
, QWORD PTR [%3+%8]
347 movq mm2
, QWORD PTR [%4+%8]
348 movq mm3
, QWORD PTR [%5+%8]
; The fourth reference row of the next row again reuses mm1.
355 movq mm1
, QWORD PTR [%6+%8]
; vp8_sad16x16x3_sse3 -- exported through sym(). The body is eight
; PROCESS_16X2X3 invocations bracketed by STACK_FRAME_CREATE_X3 and
; STACK_FRAME_DESTROY_X3. The first invocation passes selector 0, the
; middle six pass 1 and the last passes 2; all use the same
; src_ptr/ref_ptr/src_stride/ref_stride operands.
; NOTE(review): "void int" in the prototype comment below is not a valid C
; return type -- confirm the real signature and correct the comment.
373 ;void int vp8_sad16x16x3_sse3(
374 ; unsigned char *src_ptr,
376 ; unsigned char *ref_ptr,
379 global sym
(vp8_sad16x16x3_sse3
)
380 sym
(vp8_sad16x16x3_sse3
):
382 STACK_FRAME_CREATE_X3
; Opening stage (selector 0).
384 PROCESS_16X2X3
0, src_ptr
, ref_ptr
, src_stride
, ref_stride
; Middle stages (selector 1).
385 PROCESS_16X2X3
1, src_ptr
, ref_ptr
, src_stride
, ref_stride
386 PROCESS_16X2X3
1, src_ptr
, ref_ptr
, src_stride
, ref_stride
387 PROCESS_16X2X3
1, src_ptr
, ref_ptr
, src_stride
, ref_stride
388 PROCESS_16X2X3
1, src_ptr
, ref_ptr
, src_stride
, ref_stride
389 PROCESS_16X2X3
1, src_ptr
, ref_ptr
, src_stride
, ref_stride
390 PROCESS_16X2X3
1, src_ptr
, ref_ptr
, src_stride
, ref_stride
; Closing stage (selector 2).
391 PROCESS_16X2X3
2, src_ptr
, ref_ptr
, src_stride
, ref_stride
413 STACK_FRAME_DESTROY_X3
; vp8_sad16x8x3_sse3 -- exported through sym(). The body is four
; PROCESS_16X2X3 invocations (selectors 0, 1, 1, 2) bracketed by
; STACK_FRAME_CREATE_X3 and STACK_FRAME_DESTROY_X3.
; NOTE(review): "void int" in the prototype comment below is not a valid C
; return type -- confirm the real signature and correct the comment.
415 ;void int vp8_sad16x8x3_sse3(
416 ; unsigned char *src_ptr,
418 ; unsigned char *ref_ptr,
421 global sym
(vp8_sad16x8x3_sse3
)
422 sym
(vp8_sad16x8x3_sse3
):
424 STACK_FRAME_CREATE_X3
; Opening stage (selector 0).
426 PROCESS_16X2X3
0, src_ptr
, ref_ptr
, src_stride
, ref_stride
; Middle stages (selector 1).
427 PROCESS_16X2X3
1, src_ptr
, ref_ptr
, src_stride
, ref_stride
428 PROCESS_16X2X3
1, src_ptr
, ref_ptr
, src_stride
, ref_stride
; Closing stage (selector 2).
429 PROCESS_16X2X3
2, src_ptr
, ref_ptr
, src_stride
, ref_stride
451 STACK_FRAME_DESTROY_X3
; vp8_sad8x16x3_sse3 -- exported through sym(). The body is eight
; PROCESS_8X2X3 invocations (selector 0, then six times 1, then 2)
; bracketed by STACK_FRAME_CREATE_X3 and STACK_FRAME_DESTROY_X3.
; NOTE(review): "void int" in the prototype comment below is not a valid C
; return type -- confirm the real signature and correct the comment.
453 ;void int vp8_sad8x16x3_sse3(
454 ; unsigned char *src_ptr,
456 ; unsigned char *ref_ptr,
459 global sym
(vp8_sad8x16x3_sse3
)
460 sym
(vp8_sad8x16x3_sse3
):
462 STACK_FRAME_CREATE_X3
; Opening stage (selector 0).
464 PROCESS_8X2X3
0, src_ptr
, ref_ptr
, src_stride
, ref_stride
; Middle stages (selector 1).
465 PROCESS_8X2X3
1, src_ptr
, ref_ptr
, src_stride
, ref_stride
466 PROCESS_8X2X3
1, src_ptr
, ref_ptr
, src_stride
, ref_stride
467 PROCESS_8X2X3
1, src_ptr
, ref_ptr
, src_stride
, ref_stride
468 PROCESS_8X2X3
1, src_ptr
, ref_ptr
, src_stride
, ref_stride
469 PROCESS_8X2X3
1, src_ptr
, ref_ptr
, src_stride
, ref_stride
470 PROCESS_8X2X3
1, src_ptr
, ref_ptr
, src_stride
, ref_stride
; Closing stage (selector 2).
471 PROCESS_8X2X3
2, src_ptr
, ref_ptr
, src_stride
, ref_stride
480 STACK_FRAME_DESTROY_X3
; vp8_sad8x8x3_sse3 -- exported through sym(). The body is four
; PROCESS_8X2X3 invocations (selectors 0, 1, 1, 2) bracketed by
; STACK_FRAME_CREATE_X3 and STACK_FRAME_DESTROY_X3.
; NOTE(review): "void int" in the prototype comment below is not a valid C
; return type -- confirm the real signature and correct the comment.
482 ;void int vp8_sad8x8x3_sse3(
483 ; unsigned char *src_ptr,
485 ; unsigned char *ref_ptr,
488 global sym
(vp8_sad8x8x3_sse3
)
489 sym
(vp8_sad8x8x3_sse3
):
491 STACK_FRAME_CREATE_X3
; Opening stage (selector 0).
493 PROCESS_8X2X3
0, src_ptr
, ref_ptr
, src_stride
, ref_stride
; Middle stages (selector 1).
494 PROCESS_8X2X3
1, src_ptr
, ref_ptr
, src_stride
, ref_stride
495 PROCESS_8X2X3
1, src_ptr
, ref_ptr
, src_stride
, ref_stride
; Closing stage (selector 2).
496 PROCESS_8X2X3
2, src_ptr
, ref_ptr
, src_stride
, ref_stride
505 STACK_FRAME_DESTROY_X3
; vp8_sad4x4x3_sse3 -- exported through sym(). Hand-written (no PROCESS_*
; macro): loads 4-byte rows with movd into MMX registers. For each pair of
; rows it reads the source row plus the reference row at byte offsets 0,
; +1 and +2, then steps both pointers down two rows and repeats once.
; NOTE(review): "void int" in the prototype comment below is not a valid C
; return type -- confirm the real signature and correct the comment.
507 ;void int vp8_sad4x4x3_sse3(
508 ; unsigned char *src_ptr,
510 ; unsigned char *ref_ptr,
513 global sym
(vp8_sad4x4x3_sse3
)
514 sym
(vp8_sad4x4x3_sse3
):
516 STACK_FRAME_CREATE_X3
; Rows 0 and 1: source and reference at offset 0.
518 movd mm0
, DWORD PTR [src_ptr
]
519 movd mm1
, DWORD PTR [ref_ptr
]
521 movd mm2
, DWORD PTR [src_ptr
+src_stride
]
522 movd mm3
, DWORD PTR [ref_ptr
+ref_stride
]
; Rows 0 and 1: reference at offsets +1 and +2.
527 movd mm4
, DWORD PTR [ref_ptr
+1]
528 movd mm5
, DWORD PTR [ref_ptr
+2]
530 movd mm2
, DWORD PTR [ref_ptr
+ref_stride
+1]
531 movd mm3
, DWORD PTR [ref_ptr
+ref_stride
+2]
; Advance both pointers by two rows.
541 lea src_ptr
, [src_ptr
+src_stride
*2]
542 lea ref_ptr
, [ref_ptr
+ref_stride
*2]
; Rows 2 and 3: source and reference at offset 0.
544 movd mm0
, DWORD PTR [src_ptr
]
545 movd mm2
, DWORD PTR [ref_ptr
]
547 movd mm3
, DWORD PTR [src_ptr
+src_stride
]
548 movd mm6
, DWORD PTR [ref_ptr
+ref_stride
]
; Rows 2 and 3: reference at offsets +1 and +2.
553 movd mm3
, DWORD PTR [ref_ptr
+1]
554 movd mm7
, DWORD PTR [ref_ptr
+2]
560 movd mm2
, DWORD PTR [ref_ptr
+ref_stride
+1]
561 movd mm6
, DWORD PTR [ref_ptr
+ref_stride
+2]
579 STACK_FRAME_DESTROY_X3
; vp8_sad16x16_sse3 -- exported through sym(); shares STACK_FRAME_CREATE_X3
; and STACK_FRAME_DESTROY_X3 with the x3 functions. Each pass of the loop
; loads four source rows with movdqa (which requires 16-byte-aligned
; addresses) and four reference rows with movdqu (unaligned allowed),
; advancing both pointers by two rows after each pair.
; NOTE(review): the loop-termination compare that feeds the jne is not
; visible here -- confirm what bounds the loop.
581 ;unsigned int vp8_sad16x16_sse3(
582 ; unsigned char *src_ptr,
584 ; unsigned char *ref_ptr,
587 ;%define lddqu movdqu
588 global sym
(vp8_sad16x16_sse3
)
589 sym
(vp8_sad16x16_sse3
):
591 STACK_FRAME_CREATE_X3
; Loop head (function-local label).
596 .
vp8_sad16x16_sse3_loop:
; First pair of rows.
597 movdqa xmm0
, XMMWORD
PTR [src_ptr
]
598 movdqu xmm1
, XMMWORD
PTR [ref_ptr
]
599 movdqa xmm2
, XMMWORD
PTR [src_ptr
+src_stride
]
600 movdqu xmm3
, XMMWORD
PTR [ref_ptr
+ref_stride
]
; Advance both pointers by two rows.
602 lea src_ptr
, [src_ptr
+src_stride
*2]
603 lea ref_ptr
, [ref_ptr
+ref_stride
*2]
; Second pair of rows; the last reference row reuses xmm1.
605 movdqa xmm4
, XMMWORD
PTR [src_ptr
]
606 movdqu xmm5
, XMMWORD
PTR [ref_ptr
]
607 movdqa xmm6
, XMMWORD
PTR [src_ptr
+src_stride
]
611 movdqu xmm1
, XMMWORD
PTR [ref_ptr
+ref_stride
]
; Advance both pointers by two more rows before the loop test.
617 lea src_ptr
, [src_ptr
+src_stride
*2]
618 lea ref_ptr
, [ref_ptr
+ref_stride
*2]
626 jne .vp8_sad16x16_sse3_loop
633 STACK_FRAME_DESTROY_X3
; vp8_sad16x16x4d_sse3 -- exported through sym(). The body is eight
; PROCESS_16X2X4 invocations (selector 0, then six times 1, then 2)
; bracketed by STACK_FRAME_CREATE_X4 and STACK_FRAME_DESTROY_X4. Every
; invocation passes src_ptr, the four reference pointers r0_ptr..r3_ptr
; (loaded from ref_ptr_base by STACK_FRAME_CREATE_X4) and both strides.
635 ;void vp8_sad16x16x4d_sse3(
636 ; unsigned char *src_ptr,
638 ; unsigned char *ref_ptr_base,
641 global sym
(vp8_sad16x16x4d_sse3
)
642 sym
(vp8_sad16x16x4d_sse3
):
644 STACK_FRAME_CREATE_X4
; Opening stage (selector 0).
646 PROCESS_16X2X4
0, src_ptr
, r0_ptr
, r1_ptr
, r2_ptr
, r3_ptr
, src_stride
, ref_stride
; Middle stages (selector 1).
647 PROCESS_16X2X4
1, src_ptr
, r0_ptr
, r1_ptr
, r2_ptr
, r3_ptr
, src_stride
, ref_stride
648 PROCESS_16X2X4
1, src_ptr
, r0_ptr
, r1_ptr
, r2_ptr
, r3_ptr
, src_stride
, ref_stride
649 PROCESS_16X2X4
1, src_ptr
, r0_ptr
, r1_ptr
, r2_ptr
, r3_ptr
, src_stride
, ref_stride
650 PROCESS_16X2X4
1, src_ptr
, r0_ptr
, r1_ptr
, r2_ptr
, r3_ptr
, src_stride
, ref_stride
651 PROCESS_16X2X4
1, src_ptr
, r0_ptr
, r1_ptr
, r2_ptr
, r3_ptr
, src_stride
, ref_stride
652 PROCESS_16X2X4
1, src_ptr
, r0_ptr
, r1_ptr
, r2_ptr
, r3_ptr
, src_stride
, ref_stride
; Closing stage (selector 2).
653 PROCESS_16X2X4
2, src_ptr
, r0_ptr
, r1_ptr
, r2_ptr
, r3_ptr
, src_stride
, ref_stride
684 STACK_FRAME_DESTROY_X4
; vp8_sad16x8x4d_sse3 -- exported through sym(). The body is four
; PROCESS_16X2X4 invocations (selectors 0, 1, 1, 2) bracketed by
; STACK_FRAME_CREATE_X4 and STACK_FRAME_DESTROY_X4, each passing src_ptr,
; r0_ptr..r3_ptr and both strides.
686 ;void vp8_sad16x8x4d_sse3(
687 ; unsigned char *src_ptr,
689 ; unsigned char *ref_ptr_base,
692 global sym
(vp8_sad16x8x4d_sse3
)
693 sym
(vp8_sad16x8x4d_sse3
):
695 STACK_FRAME_CREATE_X4
; Opening stage (selector 0).
697 PROCESS_16X2X4
0, src_ptr
, r0_ptr
, r1_ptr
, r2_ptr
, r3_ptr
, src_stride
, ref_stride
; Middle stages (selector 1).
698 PROCESS_16X2X4
1, src_ptr
, r0_ptr
, r1_ptr
, r2_ptr
, r3_ptr
, src_stride
, ref_stride
699 PROCESS_16X2X4
1, src_ptr
, r0_ptr
, r1_ptr
, r2_ptr
, r3_ptr
, src_stride
, ref_stride
; Closing stage (selector 2).
700 PROCESS_16X2X4
2, src_ptr
, r0_ptr
, r1_ptr
, r2_ptr
, r3_ptr
, src_stride
, ref_stride
731 STACK_FRAME_DESTROY_X4
; vp8_sad8x16x4d_sse3 -- exported through sym(). The body is eight
; PROCESS_8X2X4 invocations (selector 0, then six times 1, then 2)
; bracketed by STACK_FRAME_CREATE_X4 and STACK_FRAME_DESTROY_X4, each
; passing src_ptr, r0_ptr..r3_ptr and both strides.
; NOTE(review): "void int" in the prototype comment below is not a valid C
; return type, and the pointer argument is named ref_ptr here but
; ref_ptr_base on vp8_sad16x16x4d_sse3, which uses the same
; STACK_FRAME_CREATE_X4 -- confirm and align the comment.
733 ;void int vp8_sad8x16x4d_sse3(
734 ; unsigned char *src_ptr,
736 ; unsigned char *ref_ptr,
739 global sym
(vp8_sad8x16x4d_sse3
)
740 sym
(vp8_sad8x16x4d_sse3
):
742 STACK_FRAME_CREATE_X4
; Opening stage (selector 0).
744 PROCESS_8X2X4
0, src_ptr
, r0_ptr
, r1_ptr
, r2_ptr
, r3_ptr
, src_stride
, ref_stride
; Middle stages (selector 1).
745 PROCESS_8X2X4
1, src_ptr
, r0_ptr
, r1_ptr
, r2_ptr
, r3_ptr
, src_stride
, ref_stride
746 PROCESS_8X2X4
1, src_ptr
, r0_ptr
, r1_ptr
, r2_ptr
, r3_ptr
, src_stride
, ref_stride
747 PROCESS_8X2X4
1, src_ptr
, r0_ptr
, r1_ptr
, r2_ptr
, r3_ptr
, src_stride
, ref_stride
748 PROCESS_8X2X4
1, src_ptr
, r0_ptr
, r1_ptr
, r2_ptr
, r3_ptr
, src_stride
, ref_stride
749 PROCESS_8X2X4
1, src_ptr
, r0_ptr
, r1_ptr
, r2_ptr
, r3_ptr
, src_stride
, ref_stride
750 PROCESS_8X2X4
1, src_ptr
, r0_ptr
, r1_ptr
, r2_ptr
, r3_ptr
, src_stride
, ref_stride
; Closing stage (selector 2).
751 PROCESS_8X2X4
2, src_ptr
, r0_ptr
, r1_ptr
, r2_ptr
, r3_ptr
, src_stride
, ref_stride
764 STACK_FRAME_DESTROY_X4
; vp8_sad8x8x4d_sse3 -- exported through sym(). The body is four
; PROCESS_8X2X4 invocations (selectors 0, 1, 1, 2) bracketed by
; STACK_FRAME_CREATE_X4 and STACK_FRAME_DESTROY_X4, each passing src_ptr,
; r0_ptr..r3_ptr and both strides.
; NOTE(review): "void int" in the prototype comment below is not a valid C
; return type, and the pointer argument is named ref_ptr here but
; ref_ptr_base on vp8_sad16x16x4d_sse3, which uses the same
; STACK_FRAME_CREATE_X4 -- confirm and align the comment.
766 ;void int vp8_sad8x8x4d_sse3(
767 ; unsigned char *src_ptr,
769 ; unsigned char *ref_ptr,
772 global sym
(vp8_sad8x8x4d_sse3
)
773 sym
(vp8_sad8x8x4d_sse3
):
775 STACK_FRAME_CREATE_X4
; Opening stage (selector 0).
777 PROCESS_8X2X4
0, src_ptr
, r0_ptr
, r1_ptr
, r2_ptr
, r3_ptr
, src_stride
, ref_stride
; Middle stages (selector 1).
778 PROCESS_8X2X4
1, src_ptr
, r0_ptr
, r1_ptr
, r2_ptr
, r3_ptr
, src_stride
, ref_stride
779 PROCESS_8X2X4
1, src_ptr
, r0_ptr
, r1_ptr
, r2_ptr
, r3_ptr
, src_stride
, ref_stride
; Closing stage (selector 2).
780 PROCESS_8X2X4
2, src_ptr
, r0_ptr
, r1_ptr
, r2_ptr
, r3_ptr
, src_stride
, ref_stride
793 STACK_FRAME_DESTROY_X4
; vp8_sad4x4x4d_sse3 -- exported through sym(). Hand-written (no PROCESS_*
; macro): loads 4-byte rows with movd into MMX registers. For each pair of
; rows it reads the source row plus the matching row of each of the four
; references r0_ptr..r3_ptr, then steps all five pointers down two rows.
; NOTE(review): "void int" in the prototype comment below is not a valid C
; return type, and the pointer argument is named ref_ptr here but
; ref_ptr_base on vp8_sad16x16x4d_sse3, which uses the same
; STACK_FRAME_CREATE_X4 -- confirm and align the comment.
795 ;void int vp8_sad4x4x4d_sse3(
796 ; unsigned char *src_ptr,
798 ; unsigned char *ref_ptr,
801 global sym
(vp8_sad4x4x4d_sse3
)
802 sym
(vp8_sad4x4x4d_sse3
):
804 STACK_FRAME_CREATE_X4
; Rows 0 and 1: source and reference 0.
806 movd mm0
, DWORD PTR [src_ptr
]
807 movd mm1
, DWORD PTR [r0_ptr
]
809 movd mm2
, DWORD PTR [src_ptr
+src_stride
]
810 movd mm3
, DWORD PTR [r0_ptr
+ref_stride
]
; Row 0 of references 1, 2 and 3, then row 1 of the same references.
815 movd mm4
, DWORD PTR [r1_ptr
]
816 movd mm5
, DWORD PTR [r2_ptr
]
818 movd mm6
, DWORD PTR [r3_ptr
]
819 movd mm2
, DWORD PTR [r1_ptr
+ref_stride
]
821 movd mm3
, DWORD PTR [r2_ptr
+ref_stride
]
822 movd mm7
, DWORD PTR [r3_ptr
+ref_stride
]
; Advance the source pointer and all four reference pointers by two rows.
837 lea src_ptr
, [src_ptr
+src_stride
*2]
838 lea r0_ptr
, [r0_ptr
+ref_stride
*2]
840 lea r1_ptr
, [r1_ptr
+ref_stride
*2]
841 lea r2_ptr
, [r2_ptr
+ref_stride
*2]
843 lea r3_ptr
, [r3_ptr
+ref_stride
*2]
; Rows 2 and 3: source and reference 0.
845 movd mm0
, DWORD PTR [src_ptr
]
846 movd mm2
, DWORD PTR [r0_ptr
]
848 movd mm3
, DWORD PTR [src_ptr
+src_stride
]
849 movd mm7
, DWORD PTR [r0_ptr
+ref_stride
]
; Row 2 of references 1 and 2.
854 movd mm3
, DWORD PTR [r1_ptr
]
855 movd mm7
, DWORD PTR [r2_ptr
]
; From here on ref_stride names rax instead of its earlier binding.
; NOTE(review): the condition guarding this redefinition and the
; instruction that places the stride in rax are not visible here -- confirm.
862 %define ref_stride rax
; Row 3 of references 1 and 2 (addressed through the re-bound ref_stride).
869 movd mm2
, DWORD PTR [r1_ptr
+ref_stride
]
870 movd mm1
, DWORD PTR [r2_ptr
+ref_stride
]
; Rows 2 and 3 of reference 3.
878 movd mm2
, DWORD PTR [r3_ptr
]
879 movd mm1
, DWORD PTR [r3_ptr
+ref_stride
]
894 STACK_FRAME_DESTROY_X4