arm: remove duplicate functions
[libvpx.git] / vp8 / encoder / x86 / sad_sse3.asm
blob1b7293c20f3cf57fe3c8964f068647f39c098f04
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
12 %include "vpx_ports/x86_abi_support.asm"
;-----------------------------------------------------------------------
; PROCESS_16X2X3 is_first
;
; Accumulates the sum of absolute differences (SAD) of two 16-byte source
; rows against three reference candidates that start at byte offsets
; +0, +1 and +2 from rdi.
;
;   %1 != 0 : first call for a block - xmm5/xmm6/xmm7 are initialised
;             from the first row instead of being added to.
;   %1 == 0 : later call - both rows are added onto xmm5/xmm6/xmm7.
;
; In : rsi = source row (loaded with movdqa, i.e. must be 16-byte aligned)
;      rdi = reference row (loaded with lddqu, no alignment requirement)
;      rax = source stride, rdx = reference stride
; Out: xmm5/xmm6/xmm7 = running sums for offsets +0/+1/+2; psadbw leaves
;      one partial sum in each 64-bit half, so callers still have to fold
;      the two halves together.
;      rsi and rdi are advanced by two rows.
; Clobbers: xmm0-xmm3
;-----------------------------------------------------------------------
%macro PROCESS_16X2X3 1
        ; ---- row 0 ----
        movdqa          xmm0,       XMMWORD PTR [rsi]
%if %1
        ; first pair of rows: load straight into the accumulators
        lddqu           xmm5,       XMMWORD PTR [rdi]
        psadbw          xmm5,       xmm0

        lddqu           xmm6,       XMMWORD PTR [rdi+1]
        psadbw          xmm6,       xmm0

        lddqu           xmm7,       XMMWORD PTR [rdi+2]
        psadbw          xmm7,       xmm0
%else
        lddqu           xmm1,       XMMWORD PTR [rdi]
        psadbw          xmm1,       xmm0
        paddw           xmm5,       xmm1

        lddqu           xmm2,       XMMWORD PTR [rdi+1]
        psadbw          xmm2,       xmm0
        paddw           xmm6,       xmm2

        lddqu           xmm3,       XMMWORD PTR [rdi+2]
        psadbw          xmm3,       xmm0
        paddw           xmm7,       xmm3
%endif

        ; ---- row 1 ----
        movdqa          xmm0,       XMMWORD PTR [rsi+rax]

        lddqu           xmm1,       XMMWORD PTR [rdi+rdx]
        psadbw          xmm1,       xmm0
        paddw           xmm5,       xmm1

        lddqu           xmm2,       XMMWORD PTR [rdi+rdx+1]
        psadbw          xmm2,       xmm0
        paddw           xmm6,       xmm2

        lddqu           xmm3,       XMMWORD PTR [rdi+rdx+2]
        psadbw          xmm3,       xmm0
        paddw           xmm7,       xmm3

        ; step both pointers down two rows (lea leaves the flags alone)
        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]
%endmacro
;-----------------------------------------------------------------------
; PROCESS_8X2X3 is_first
;
; MMX counterpart of the 16-wide x3 macro: accumulates the SAD of two
; 8-byte source rows against three reference candidates that start at
; byte offsets +0, +1 and +2 from rdi.
;
;   %1 != 0 : first call for a block - mm5/mm6/mm7 are initialised from
;             the first row instead of being added to.
;   %1 == 0 : later call - both rows are added onto mm5/mm6/mm7.
;
; In : rsi = source row, rdi = reference row
;      rax = source stride, rdx = reference stride
; Out: mm5/mm6/mm7 = running sums for offsets +0/+1/+2
;      rsi and rdi are advanced by two rows.
; Clobbers: mm0-mm3
;-----------------------------------------------------------------------
%macro PROCESS_8X2X3 1
        ; ---- row 0 ----
        movq            mm0,        QWORD PTR [rsi]
%if %1
        ; first pair of rows: load straight into the accumulators
        movq            mm5,        QWORD PTR [rdi]
        psadbw          mm5,        mm0

        movq            mm6,        QWORD PTR [rdi+1]
        psadbw          mm6,        mm0

        movq            mm7,        QWORD PTR [rdi+2]
        psadbw          mm7,        mm0
%else
        movq            mm1,        QWORD PTR [rdi]
        psadbw          mm1,        mm0
        paddw           mm5,        mm1

        movq            mm2,        QWORD PTR [rdi+1]
        psadbw          mm2,        mm0
        paddw           mm6,        mm2

        movq            mm3,        QWORD PTR [rdi+2]
        psadbw          mm3,        mm0
        paddw           mm7,        mm3
%endif

        ; ---- row 1 ----
        movq            mm0,        QWORD PTR [rsi+rax]

        movq            mm1,        QWORD PTR [rdi+rdx]
        psadbw          mm1,        mm0
        paddw           mm5,        mm1

        movq            mm2,        QWORD PTR [rdi+rdx+1]
        psadbw          mm2,        mm0
        paddw           mm6,        mm2

        movq            mm3,        QWORD PTR [rdi+rdx+2]
        psadbw          mm3,        mm0
        paddw           mm7,        mm3

        ; step both pointers down two rows (lea leaves the flags alone)
        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]
%endmacro
;-----------------------------------------------------------------------
; LOAD_X4_ADDRESSES table, dst0, dst1, dst2, dst3
;
; Loads four consecutive register-sized values (stride REG_SZ_BYTES)
; from the table addressed by %1 into %2..%5.
;
; The callers in this file pass the same register for %1 and %5, so the
; load into %5 has to stay last: it overwrites the table pointer.
;-----------------------------------------------------------------------
%macro LOAD_X4_ADDRESSES 5
        mov             %2,         [%1+REG_SZ_BYTES*0]
        mov             %3,         [%1+REG_SZ_BYTES*1]

        mov             %4,         [%1+REG_SZ_BYTES*2]
        mov             %5,         [%1+REG_SZ_BYTES*3]     ; may clobber %1 - keep last
%endmacro
;-----------------------------------------------------------------------
; PROCESS_16X2X4 is_first
;
; Accumulates the SAD of two 16-byte source rows against four
; independent reference blocks addressed by rcx, rdx, rbx and rdi.
;
;   %1 != 0 : first call for a block - xmm4..xmm7 are initialised from
;             the first row instead of being added to.
;   %1 == 0 : later call - both rows are added onto xmm4..xmm7.
;
; In : rsi = source row (loaded with movdqa, i.e. must be 16-byte aligned)
;      rax = source stride
;      rcx/rdx/rbx/rdi = current row of reference 0/1/2/3 (lddqu loads)
;      rbp = reference stride (shared by all four references)
; Out: xmm4/xmm5/xmm6/xmm7 = running sums for reference 0/1/2/3; psadbw
;      leaves one partial sum in each 64-bit half.
;      rsi, rcx, rdx, rbx and rdi are advanced by two rows.
; Clobbers: xmm0-xmm3
;-----------------------------------------------------------------------
%macro PROCESS_16X2X4 1
        ; ---- row 0 ----
        movdqa          xmm0,       XMMWORD PTR [rsi]
%if %1
        ; first pair of rows: load straight into the accumulators
        lddqu           xmm4,       XMMWORD PTR [rcx]
        psadbw          xmm4,       xmm0

        lddqu           xmm5,       XMMWORD PTR [rdx]
        psadbw          xmm5,       xmm0

        lddqu           xmm6,       XMMWORD PTR [rbx]
        psadbw          xmm6,       xmm0

        lddqu           xmm7,       XMMWORD PTR [rdi]
        psadbw          xmm7,       xmm0
%else
        lddqu           xmm1,       XMMWORD PTR [rcx]
        psadbw          xmm1,       xmm0
        paddw           xmm4,       xmm1

        lddqu           xmm2,       XMMWORD PTR [rdx]
        psadbw          xmm2,       xmm0
        paddw           xmm5,       xmm2

        lddqu           xmm3,       XMMWORD PTR [rbx]
        psadbw          xmm3,       xmm0
        paddw           xmm6,       xmm3

        ; xmm1 is free again once it has been folded into xmm4
        lddqu           xmm1,       XMMWORD PTR [rdi]
        psadbw          xmm1,       xmm0
        paddw           xmm7,       xmm1
%endif

        ; ---- row 1 ----
        movdqa          xmm0,       XMMWORD PTR [rsi+rax]

        lddqu           xmm1,       XMMWORD PTR [rcx+rbp]
        psadbw          xmm1,       xmm0
        paddw           xmm4,       xmm1

        lddqu           xmm2,       XMMWORD PTR [rdx+rbp]
        psadbw          xmm2,       xmm0
        paddw           xmm5,       xmm2

        lddqu           xmm3,       XMMWORD PTR [rbx+rbp]
        psadbw          xmm3,       xmm0
        paddw           xmm6,       xmm3

        lddqu           xmm1,       XMMWORD PTR [rdi+rbp]
        psadbw          xmm1,       xmm0
        paddw           xmm7,       xmm1

        ; step every pointer down two rows (lea leaves the flags alone)
        lea             rsi,        [rsi+rax*2]
        lea             rcx,        [rcx+rbp*2]
        lea             rdx,        [rdx+rbp*2]
        lea             rbx,        [rbx+rbp*2]
        lea             rdi,        [rdi+rbp*2]
%endmacro
;-----------------------------------------------------------------------
; PROCESS_8X2X4 is_first
;
; MMX counterpart of the 16-wide x4 macro: accumulates the SAD of two
; 8-byte source rows against four independent reference blocks addressed
; by rcx, rdx, rbx and rdi.
;
;   %1 != 0 : first call for a block - mm4..mm7 are initialised from the
;             first row instead of being added to.
;   %1 == 0 : later call - both rows are added onto mm4..mm7.
;
; In : rsi = source row, rax = source stride
;      rcx/rdx/rbx/rdi = current row of reference 0/1/2/3
;      rbp = reference stride (shared by all four references)
; Out: mm4/mm5/mm6/mm7 = running sums for reference 0/1/2/3
;      rsi, rcx, rdx, rbx and rdi are advanced by two rows.
; Clobbers: mm0-mm3
;-----------------------------------------------------------------------
%macro PROCESS_8X2X4 1
        ; ---- row 0 ----
        movq            mm0,        QWORD PTR [rsi]
%if %1
        ; first pair of rows: load straight into the accumulators
        movq            mm4,        QWORD PTR [rcx]
        psadbw          mm4,        mm0

        movq            mm5,        QWORD PTR [rdx]
        psadbw          mm5,        mm0

        movq            mm6,        QWORD PTR [rbx]
        psadbw          mm6,        mm0

        movq            mm7,        QWORD PTR [rdi]
        psadbw          mm7,        mm0
%else
        movq            mm1,        QWORD PTR [rcx]
        psadbw          mm1,        mm0
        paddw           mm4,        mm1

        movq            mm2,        QWORD PTR [rdx]
        psadbw          mm2,        mm0
        paddw           mm5,        mm2

        movq            mm3,        QWORD PTR [rbx]
        psadbw          mm3,        mm0
        paddw           mm6,        mm3

        ; mm1 is free again once it has been folded into mm4
        movq            mm1,        QWORD PTR [rdi]
        psadbw          mm1,        mm0
        paddw           mm7,        mm1
%endif

        ; ---- row 1 ----
        movq            mm0,        QWORD PTR [rsi+rax]

        movq            mm1,        QWORD PTR [rcx+rbp]
        psadbw          mm1,        mm0
        paddw           mm4,        mm1

        movq            mm2,        QWORD PTR [rdx+rbp]
        psadbw          mm2,        mm0
        paddw           mm5,        mm2

        movq            mm3,        QWORD PTR [rbx+rbp]
        psadbw          mm3,        mm0
        paddw           mm6,        mm3

        movq            mm1,        QWORD PTR [rdi+rbp]
        psadbw          mm1,        mm0
        paddw           mm7,        mm1

        ; step every pointer down two rows (lea leaves the flags alone)
        lea             rsi,        [rsi+rax*2]
        lea             rcx,        [rcx+rbp*2]
        lea             rdx,        [rdx+rbp*2]
        lea             rbx,        [rbx+rbp*2]
        lea             rdi,        [rdi+rbp*2]
%endmacro
;void vp8_sad16x16x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Computes the 16x16 SAD of the source block against three reference
; blocks starting at ref_ptr+0, ref_ptr+1 and ref_ptr+2 and stores the
; three 32-bit sums in results[0], results[1] and results[2].
; Source rows are read with movdqa, so every source row must be
; 16-byte aligned.
; Clobbers rax, rdx and xmm0-xmm3, xmm5-xmm7.
global sym(vp8_sad16x16x3_sse3)
sym(vp8_sad16x16x3_sse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        ; 8 invocations x 2 rows each = 16 rows
        PROCESS_16X2X3 1
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0

        mov             rdi,        arg(4) ;Results

        ; each accumulator holds one partial sum per 64-bit half:
        ; fold high half onto low half, then store the low dword
        movq            xmm0,       xmm5
        psrldq          xmm5,       8

        paddw           xmm0,       xmm5
        movd            [rdi],      xmm0

        movq            xmm0,       xmm6
        psrldq          xmm6,       8

        paddw           xmm0,       xmm6
        movd            [rdi+4],    xmm0

        movq            xmm0,       xmm7
        psrldq          xmm7,       8

        paddw           xmm0,       xmm7
        movd            [rdi+8],    xmm0

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret                 ; without this the code falls through into the next function
;void vp8_sad16x8x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Computes the 16x8 SAD (16 bytes wide, 8 rows) of the source block
; against three reference blocks starting at ref_ptr+0, ref_ptr+1 and
; ref_ptr+2 and stores the three 32-bit sums in results[0..2].
; Source rows are read with movdqa, so every source row must be
; 16-byte aligned.
; Clobbers rax, rdx and xmm0-xmm3, xmm5-xmm7.
global sym(vp8_sad16x8x3_sse3)
sym(vp8_sad16x8x3_sse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        ; 4 invocations x 2 rows each = 8 rows
        PROCESS_16X2X3 1
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0

        mov             rdi,        arg(4) ;Results

        ; each accumulator holds one partial sum per 64-bit half:
        ; fold high half onto low half, then store the low dword
        movq            xmm0,       xmm5
        psrldq          xmm5,       8

        paddw           xmm0,       xmm5
        movd            [rdi],      xmm0

        movq            xmm0,       xmm6
        psrldq          xmm6,       8

        paddw           xmm0,       xmm6
        movd            [rdi+4],    xmm0

        movq            xmm0,       xmm7
        psrldq          xmm7,       8

        paddw           xmm0,       xmm7
        movd            [rdi+8],    xmm0

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret                 ; without this the code falls through into the next function
;void vp8_sad8x16x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Computes the 8x16 SAD (8 bytes wide, 16 rows) of the source block
; against three reference blocks starting at ref_ptr+0, ref_ptr+1 and
; ref_ptr+2 and stores the three 32-bit sums in results[0..2].
; Uses MMX registers and does not execute emms itself.
; Clobbers rax, rdx and mm0-mm3, mm5-mm7.
global sym(vp8_sad8x16x3_sse3)
sym(vp8_sad8x16x3_sse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        ; 8 invocations x 2 rows each = 16 rows
        PROCESS_8X2X3 1
        PROCESS_8X2X3 0
        PROCESS_8X2X3 0
        PROCESS_8X2X3 0
        PROCESS_8X2X3 0
        PROCESS_8X2X3 0
        PROCESS_8X2X3 0
        PROCESS_8X2X3 0

        mov             rdi,        arg(4) ;Results

        ; low dword of each accumulator is the finished sum
        movd            [rdi],      mm5
        movd            [rdi+4],    mm6
        movd            [rdi+8],    mm7

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret                 ; without this the code falls through into the next function
;void vp8_sad8x8x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Computes the 8x8 SAD of the source block against three reference
; blocks starting at ref_ptr+0, ref_ptr+1 and ref_ptr+2 and stores the
; three 32-bit sums in results[0..2].
; Uses MMX registers and does not execute emms itself.
; Clobbers rax, rdx and mm0-mm3, mm5-mm7.
global sym(vp8_sad8x8x3_sse3)
sym(vp8_sad8x8x3_sse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        ; 4 invocations x 2 rows each = 8 rows
        PROCESS_8X2X3 1
        PROCESS_8X2X3 0
        PROCESS_8X2X3 0
        PROCESS_8X2X3 0

        mov             rdi,        arg(4) ;Results

        ; low dword of each accumulator is the finished sum
        movd            [rdi],      mm5
        movd            [rdi+4],    mm6
        movd            [rdi+8],    mm7

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret                 ; without this the code falls through into the next function
;void vp8_sad4x4x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Computes the 4x4 SAD of the source block against three reference
; blocks starting at ref_ptr+0, ref_ptr+1 and ref_ptr+2 and stores the
; three 32-bit sums in results[0..2].
;
; Two 4-byte rows are byte-interleaved (punpcklbw) into one 8-byte MMX
; register so that a single psadbw covers two rows; source and reference
; are interleaved the same way, so the pairing of bytes is preserved.
; Uses MMX registers and does not execute emms itself.
; Clobbers rax, rdx and mm0-mm7.
global sym(vp8_sad4x4x3_sse3)
sym(vp8_sad4x4x3_sse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        ; ---- rows 0 and 1 ----
        movd            mm0,        DWORD PTR [rsi]
        movd            mm1,        DWORD PTR [rdi]

        movd            mm2,        DWORD PTR [rsi+rax]
        movd            mm3,        DWORD PTR [rdi+rdx]

        punpcklbw       mm0,        mm2             ; mm0 = source rows 0/1
        punpcklbw       mm1,        mm3             ; mm1 = ref+0 rows 0/1

        movd            mm4,        DWORD PTR [rdi+1]
        movd            mm5,        DWORD PTR [rdi+2]

        movd            mm2,        DWORD PTR [rdi+rdx+1]
        movd            mm3,        DWORD PTR [rdi+rdx+2]

        psadbw          mm1,        mm0             ; mm1 = SAD(ref+0), rows 0/1

        punpcklbw       mm4,        mm2             ; mm4 = ref+1 rows 0/1
        punpcklbw       mm5,        mm3             ; mm5 = ref+2 rows 0/1

        psadbw          mm4,        mm0             ; mm4 = SAD(ref+1), rows 0/1
        psadbw          mm5,        mm0             ; mm5 = SAD(ref+2), rows 0/1

        ; ---- rows 2 and 3 ----
        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        movd            mm0,        DWORD PTR [rsi]
        movd            mm2,        DWORD PTR [rdi]

        movd            mm3,        DWORD PTR [rsi+rax]
        movd            mm6,        DWORD PTR [rdi+rdx]

        punpcklbw       mm0,        mm3             ; mm0 = source rows 2/3
        punpcklbw       mm2,        mm6             ; mm2 = ref+0 rows 2/3

        movd            mm3,        DWORD PTR [rdi+1]
        movd            mm7,        DWORD PTR [rdi+2]

        psadbw          mm2,        mm0

        paddw           mm1,        mm2             ; mm1 = total SAD(ref+0)

        movd            mm2,        DWORD PTR [rdi+rdx+1]
        movd            mm6,        DWORD PTR [rdi+rdx+2]

        punpcklbw       mm3,        mm2             ; mm3 = ref+1 rows 2/3
        punpcklbw       mm7,        mm6             ; mm7 = ref+2 rows 2/3

        psadbw          mm3,        mm0
        psadbw          mm7,        mm0

        paddw           mm3,        mm4             ; mm3 = total SAD(ref+1)
        paddw           mm7,        mm5             ; mm7 = total SAD(ref+2)

        mov             rdi,        arg(4) ;Results
        movd            [rdi],      mm1

        movd            [rdi+4],    mm3
        movd            [rdi+8],    mm7

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret                 ; without this the code falls through into the next function
;unsigned int vp8_sad16x16_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  max_err)
;
; Returns (in rax) the 16x16 SAD between the source and reference
; blocks. The running sum is checked against max_err before every pair
; of rows; as soon as it is greater, the loop is abandoned and the
; partial sum accumulated so far is returned instead.
; Uses MMX registers and does not execute emms itself.
; Clobbers rcx, rdx and mm0-mm5, mm7.
;%define lddqu movdqu
global sym(vp8_sad16x16_sse3)
sym(vp8_sad16x16_sse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        movsxd          rbx,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        ; rcx = src_ptr + 16 * src_stride : loop end marker
        lea             rcx,        [rsi+rbx*8]

        lea             rcx,        [rcx+rbx*8]
        pxor            mm7,        mm7             ; mm7 = running SAD

vp8_sad16x16_sse3_loop:

        movq            rax,        mm7
        ; max_err is a 32-bit int: compare only the low dword. A 64-bit
        ; compare would also read the upper half of the argument slot,
        ; which is not guaranteed to hold a defined value on x86-64.
        ; The sum never exceeds 16*16*255, so eax holds all of it.
        cmp             eax,        dword ptr arg(4)
        jg              vp8_sad16x16_early_exit

        ; row 0: left and right 8-byte halves
        movq            mm0,        QWORD PTR [rsi]
        movq            mm2,        QWORD PTR [rsi+8]

        movq            mm1,        QWORD PTR [rdi]
        movq            mm3,        QWORD PTR [rdi+8]

        ; row 1: left half
        movq            mm4,        QWORD PTR [rsi+rbx]
        movq            mm5,        QWORD PTR [rdi+rdx]

        psadbw          mm0,        mm1
        psadbw          mm2,        mm3

        ; row 1: right half (mm1/mm3 are free again)
        movq            mm1,        QWORD PTR [rsi+rbx+8]
        movq            mm3,        QWORD PTR [rdi+rdx+8]

        psadbw          mm4,        mm5
        psadbw          mm1,        mm3

        lea             rsi,        [rsi+rbx*2]
        lea             rdi,        [rdi+rdx*2]

        paddw           mm0,        mm2
        paddw           mm4,        mm1

        paddw           mm7,        mm0
        paddw           mm7,        mm4

        cmp             rsi,        rcx
        jne             vp8_sad16x16_sse3_loop

        movq            rax,        mm7             ; full 16-row result

vp8_sad16x16_early_exit:

    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    UNSHADOW_ARGS
    pop         rbp
    ret                 ; without this the code falls through into the next function
;void vp8_sad16x16x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr_base,
;    int  ref_stride,
;    int  *results)
;
; Computes the 16x16 SAD of the source block against four independent
; reference blocks. ref_ptr_base is read as a table of four pointers
; (see LOAD_X4_ADDRESSES); the four 32-bit sums are stored in
; results[0..3] in table order.
; Source rows are read with movdqa, so every source row must be
; 16-byte aligned.
; Clobbers rax, rcx, rdx and xmm0-xmm7.
global sym(vp8_sad16x16x4d_sse3)
sym(vp8_sad16x16x4d_sse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

        ; rbp is borrowed below to hold ref_stride, so keep a copy of the
        ; frame pointer on the stack until the results pointer is needed
        push            rbp
        mov             rdi,        arg(2) ; ref_ptr_base

        ; rcx/rdx/rax/rdi = reference pointers 0/1/2/3
        LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi

        mov             rsi,        arg(0) ;src_ptr

        movsxd          rbx,        dword ptr arg(1) ;src_stride
        movsxd          rbp,        dword ptr arg(3) ;ref_stride

        ; the row macro wants rax = src_stride and rbx = reference 2
        xchg            rbx,        rax

        ; 8 invocations x 2 rows each = 16 rows
        PROCESS_16X2X4 1
        PROCESS_16X2X4 0
        PROCESS_16X2X4 0
        PROCESS_16X2X4 0
        PROCESS_16X2X4 0
        PROCESS_16X2X4 0
        PROCESS_16X2X4 0
        PROCESS_16X2X4 0

        pop             rbp                         ; frame pointer back before using arg()
        mov             rdi,        arg(4) ;Results

        ; each accumulator holds one partial sum per 64-bit half:
        ; fold high half onto low half, then store the low dword
        movq            xmm0,       xmm4
        psrldq          xmm4,       8

        paddw           xmm0,       xmm4
        movd            [rdi],      xmm0

        movq            xmm0,       xmm5
        psrldq          xmm5,       8

        paddw           xmm0,       xmm5
        movd            [rdi+4],    xmm0

        movq            xmm0,       xmm6
        psrldq          xmm6,       8

        paddw           xmm0,       xmm6
        movd            [rdi+8],    xmm0

        movq            xmm0,       xmm7
        psrldq          xmm7,       8

        paddw           xmm0,       xmm7
        movd            [rdi+12],   xmm0

    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret                 ; without this the code falls through into the next function
;void vp8_sad16x8x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr_base,
;    int  ref_stride,
;    int  *results)
;
; Computes the 16x8 SAD (16 bytes wide, 8 rows) of the source block
; against four independent reference blocks. ref_ptr_base is read as a
; table of four pointers (see LOAD_X4_ADDRESSES); the four 32-bit sums
; are stored in results[0..3] in table order.
; Source rows are read with movdqa, so every source row must be
; 16-byte aligned.
; Clobbers rax, rcx, rdx and xmm0-xmm7.
global sym(vp8_sad16x8x4d_sse3)
sym(vp8_sad16x8x4d_sse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

        ; rbp is borrowed below to hold ref_stride, so keep a copy of the
        ; frame pointer on the stack until the results pointer is needed
        push            rbp
        mov             rdi,        arg(2) ; ref_ptr_base

        ; rcx/rdx/rax/rdi = reference pointers 0/1/2/3
        LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi

        mov             rsi,        arg(0) ;src_ptr

        movsxd          rbx,        dword ptr arg(1) ;src_stride
        movsxd          rbp,        dword ptr arg(3) ;ref_stride

        ; the row macro wants rax = src_stride and rbx = reference 2
        xchg            rbx,        rax

        ; 4 invocations x 2 rows each = 8 rows
        PROCESS_16X2X4 1
        PROCESS_16X2X4 0
        PROCESS_16X2X4 0
        PROCESS_16X2X4 0

        pop             rbp                         ; frame pointer back before using arg()
        mov             rdi,        arg(4) ;Results

        ; each accumulator holds one partial sum per 64-bit half:
        ; fold high half onto low half, then store the low dword
        movq            xmm0,       xmm4
        psrldq          xmm4,       8

        paddw           xmm0,       xmm4
        movd            [rdi],      xmm0

        movq            xmm0,       xmm5
        psrldq          xmm5,       8

        paddw           xmm0,       xmm5
        movd            [rdi+4],    xmm0

        movq            xmm0,       xmm6
        psrldq          xmm6,       8

        paddw           xmm0,       xmm6
        movd            [rdi+8],    xmm0

        movq            xmm0,       xmm7
        psrldq          xmm7,       8

        paddw           xmm0,       xmm7
        movd            [rdi+12],   xmm0

    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret                 ; without this the code falls through into the next function
;void vp8_sad8x16x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr_base,
;    int  ref_stride,
;    int  *results)
;
; Computes the 8x16 SAD (8 bytes wide, 16 rows) of the source block
; against four independent reference blocks. The third argument is read
; as a table of four pointers (see LOAD_X4_ADDRESSES); the four 32-bit
; sums are stored in results[0..3] in table order.
; Uses MMX registers and does not execute emms itself.
; Clobbers rax, rcx, rdx and mm0-mm7.
global sym(vp8_sad8x16x4d_sse3)
sym(vp8_sad8x16x4d_sse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

        ; rbp is borrowed below to hold ref_stride, so keep a copy of the
        ; frame pointer on the stack until the results pointer is needed
        push            rbp
        mov             rdi,        arg(2) ; ref_ptr_base

        ; rcx/rdx/rax/rdi = reference pointers 0/1/2/3
        LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi

        mov             rsi,        arg(0) ;src_ptr

        movsxd          rbx,        dword ptr arg(1) ;src_stride
        movsxd          rbp,        dword ptr arg(3) ;ref_stride

        ; the row macro wants rax = src_stride and rbx = reference 2
        xchg            rbx,        rax

        ; 8 invocations x 2 rows each = 16 rows
        PROCESS_8X2X4 1
        PROCESS_8X2X4 0
        PROCESS_8X2X4 0
        PROCESS_8X2X4 0
        PROCESS_8X2X4 0
        PROCESS_8X2X4 0
        PROCESS_8X2X4 0
        PROCESS_8X2X4 0

        pop             rbp                         ; frame pointer back before using arg()
        mov             rdi,        arg(4) ;Results

        ; low dword of each accumulator is the finished sum
        movd            [rdi],      mm4
        movd            [rdi+4],    mm5
        movd            [rdi+8],    mm6
        movd            [rdi+12],   mm7

    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret                 ; without this the code falls through into the next function
;void vp8_sad8x8x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr_base,
;    int  ref_stride,
;    int  *results)
;
; Computes the 8x8 SAD of the source block against four independent
; reference blocks. The third argument is read as a table of four
; pointers (see LOAD_X4_ADDRESSES); the four 32-bit sums are stored in
; results[0..3] in table order.
; Uses MMX registers and does not execute emms itself.
; Clobbers rax, rcx, rdx and mm0-mm7.
global sym(vp8_sad8x8x4d_sse3)
sym(vp8_sad8x8x4d_sse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

        ; rbp is borrowed below to hold ref_stride, so keep a copy of the
        ; frame pointer on the stack until the results pointer is needed
        push            rbp
        mov             rdi,        arg(2) ; ref_ptr_base

        ; rcx/rdx/rax/rdi = reference pointers 0/1/2/3
        LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi

        mov             rsi,        arg(0) ;src_ptr

        movsxd          rbx,        dword ptr arg(1) ;src_stride
        movsxd          rbp,        dword ptr arg(3) ;ref_stride

        ; the row macro wants rax = src_stride and rbx = reference 2
        xchg            rbx,        rax

        ; 4 invocations x 2 rows each = 8 rows
        PROCESS_8X2X4 1
        PROCESS_8X2X4 0
        PROCESS_8X2X4 0
        PROCESS_8X2X4 0

        pop             rbp                         ; frame pointer back before using arg()
        mov             rdi,        arg(4) ;Results

        ; low dword of each accumulator is the finished sum
        movd            [rdi],      mm4
        movd            [rdi+4],    mm5
        movd            [rdi+8],    mm6
        movd            [rdi+12],   mm7

    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret                 ; without this the code falls through into the next function
;void vp8_sad4x4x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr_base,
;    int  ref_stride,
;    int  *results)
;
; Computes the 4x4 SAD of the source block against four independent
; reference blocks. The third argument is read as a table of four
; pointers (see LOAD_X4_ADDRESSES); the four 32-bit sums are stored in
; results[0..3] in table order.
;
; Two 4-byte rows are byte-interleaved (punpcklbw) into one 8-byte MMX
; register so that a single psadbw covers two rows; source and reference
; are interleaved the same way, so the pairing of bytes is preserved.
; Uses MMX registers and does not execute emms itself.
; Clobbers rax, rcx, rdx and mm0-mm7.
global sym(vp8_sad4x4x4d_sse3)
sym(vp8_sad4x4x4d_sse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

        ; rbp is borrowed below to hold ref_stride, so keep a copy of the
        ; frame pointer on the stack until the results pointer is needed
        push            rbp
        mov             rdi,        arg(2) ; ref_ptr_base

        ; rcx/rdx/rax/rdi = reference pointers 0/1/2/3
        LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi

        mov             rsi,        arg(0) ;src_ptr

        movsxd          rbx,        dword ptr arg(1) ;src_stride
        movsxd          rbp,        dword ptr arg(3) ;ref_stride

        ; from here on rax = src_stride and rbx = reference 2
        xchg            rbx,        rax

        ; ---- rows 0 and 1 ----
        movd            mm0,        DWORD PTR [rsi]
        movd            mm1,        DWORD PTR [rcx]

        movd            mm2,        DWORD PTR [rsi+rax]
        movd            mm3,        DWORD PTR [rcx+rbp]

        punpcklbw       mm0,        mm2             ; mm0 = source rows 0/1
        punpcklbw       mm1,        mm3             ; mm1 = ref 0 rows 0/1

        movd            mm4,        DWORD PTR [rdx]
        movd            mm5,        DWORD PTR [rbx]

        movd            mm6,        DWORD PTR [rdi]
        movd            mm2,        DWORD PTR [rdx+rbp]

        movd            mm3,        DWORD PTR [rbx+rbp]
        movd            mm7,        DWORD PTR [rdi+rbp]

        psadbw          mm1,        mm0             ; mm1 = SAD(ref 0), rows 0/1

        punpcklbw       mm4,        mm2             ; mm4 = ref 1 rows 0/1
        punpcklbw       mm5,        mm3             ; mm5 = ref 2 rows 0/1

        punpcklbw       mm6,        mm7             ; mm6 = ref 3 rows 0/1
        psadbw          mm4,        mm0             ; mm4 = SAD(ref 1), rows 0/1

        psadbw          mm5,        mm0             ; mm5 = SAD(ref 2), rows 0/1
        psadbw          mm6,        mm0             ; mm6 = SAD(ref 3), rows 0/1

        ; ---- rows 2 and 3 ----
        lea             rsi,        [rsi+rax*2]
        lea             rcx,        [rcx+rbp*2]

        lea             rdx,        [rdx+rbp*2]
        lea             rbx,        [rbx+rbp*2]

        lea             rdi,        [rdi+rbp*2]

        movd            mm0,        DWORD PTR [rsi]
        movd            mm2,        DWORD PTR [rcx]

        movd            mm3,        DWORD PTR [rsi+rax]
        movd            mm7,        DWORD PTR [rcx+rbp]

        punpcklbw       mm0,        mm3             ; mm0 = source rows 2/3
        punpcklbw       mm2,        mm7             ; mm2 = ref 0 rows 2/3

        movd            mm3,        DWORD PTR [rdx]
        movd            mm7,        DWORD PTR [rbx]

        psadbw          mm2,        mm0
        ; src_stride is no longer needed: move ref_stride into rax so
        ; that rbp can go back to being the frame pointer for arg()
        mov             rax,        rbp

        pop             rbp
        mov             rsi,        arg(4) ;Results

        paddw           mm1,        mm2             ; mm1 = total SAD(ref 0)
        movd            [rsi],      mm1

        movd            mm2,        DWORD PTR [rdx+rax]
        movd            mm1,        DWORD PTR [rbx+rax]

        punpcklbw       mm3,        mm2             ; mm3 = ref 1 rows 2/3
        punpcklbw       mm7,        mm1             ; mm7 = ref 2 rows 2/3

        psadbw          mm3,        mm0
        psadbw          mm7,        mm0

        movd            mm2,        DWORD PTR [rdi]
        movd            mm1,        DWORD PTR [rdi+rax]

        paddw           mm3,        mm4             ; mm3 = total SAD(ref 1)
        paddw           mm7,        mm5             ; mm7 = total SAD(ref 2)

        movd            [rsi+4],    mm3
        punpcklbw       mm2,        mm1             ; mm2 = ref 3 rows 2/3

        movd            [rsi+8],    mm7
        psadbw          mm2,        mm0

        paddw           mm2,        mm6             ; mm2 = total SAD(ref 3)
        movd            [rsi+12],   mm2

    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret                 ; without this execution runs off the end of the function