Add save/restore xmm registers in x86 assembly code
[libvpx.git] / vp8 / encoder / x86 / sad_sse3.asm
blob666879267be3df7ed9d7e82329909e14484c690c
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
11 %include "vpx_ports/x86_abi_support.asm"
;-----------------------------------------------------------------------
; STACK_FRAME_CREATE_X3
;
; Prologue paired with STACK_FRAME_DESTROY_X3.  It binds the symbolic
; operand names src_ptr, src_stride, ref_ptr, ref_stride, end_ptr,
; ret_var, result_ptr and max_err to registers (or a stack slot) for
; the ABI being assembled.  result_ptr and max_err are two names for
; the same fifth-argument location.
;-----------------------------------------------------------------------
%macro STACK_FRAME_CREATE_X3 0
%if ABI_IS_32BIT
    ; ---- 32-bit build: arguments are fetched through arg(n) ----
    %define     src_ptr         rsi
    %define     ref_ptr         rdi
    %define     src_stride      rax
    %define     ref_stride      rdx
    %define     end_ptr         rcx
    %define     ret_var         rbx
    %define     result_ptr      arg(4)
    %define     max_err         arg(4)

    push        rbp
    mov         rbp, rsp

    ; popped again, in reverse order, by STACK_FRAME_DESTROY_X3
    push        rsi
    push        rdi
    push        rbx

    mov         rsi, arg(0)                 ; src_ptr
    movsxd      rax, dword ptr arg(1)       ; src_stride
    mov         rdi, arg(2)                 ; ref_ptr
    movsxd      rdx, dword ptr arg(3)       ; ref_stride
%elifidn __OUTPUT_FORMAT__,x64
    ; ---- x64 output format: first four operands stay in registers ----
    %define     src_ptr         rcx
    %define     src_stride      rdx
    %define     ref_ptr         r8
    %define     ref_stride      r9
    %define     end_ptr         r10
    %define     ret_var         r11
    ; NOTE(review): the extra 4*8 bytes presumably account for stack
    ; space taken by SAVE_XMM -- confirm in vpx_ports/x86_abi_support.asm
    %define     result_ptr      [rsp+40+4*8]
    %define     max_err         [rsp+40+4*8]

    SAVE_XMM
%else
    ; ---- any other 64-bit output format: all five operands in registers ----
    %define     src_ptr         rdi
    %define     src_stride      rsi
    %define     ref_ptr         rdx
    %define     ref_stride      rcx
    %define     end_ptr         r9
    %define     ret_var         r10
    %define     result_ptr      r8
    %define     max_err         r8
%endif
%endmacro
;-----------------------------------------------------------------------
; STACK_FRAME_DESTROY_X3
;
; Epilogue paired with STACK_FRAME_CREATE_X3.  Blanks out the symbolic
; operand names, undoes whatever the prologue pushed or saved for the
; current ABI, and returns to the caller.
;-----------------------------------------------------------------------
%macro STACK_FRAME_DESTROY_X3 0
    ; the operand aliases become empty so they cannot leak into later code
    %define     src_ptr
    %define     src_stride
    %define     ref_ptr
    %define     ref_stride
    %define     end_ptr
    %define     ret_var
    %define     result_ptr
    %define     max_err

%if ABI_IS_32BIT
    ; reverse of the pushes done in STACK_FRAME_CREATE_X3
    pop         rbx
    pop         rdi
    pop         rsi
    pop         rbp
%elifidn __OUTPUT_FORMAT__,x64
    RESTORE_XMM
%endif

    ret
%endmacro
;-----------------------------------------------------------------------
; STACK_FRAME_CREATE_X4
;
; Prologue paired with STACK_FRAME_DESTROY_X4.  Binds src_ptr,
; src_stride, r0_ptr..r3_ptr, ref_stride and result_ptr for the ABI
; being assembled, and loads the four reference pointers out of the
; pointer table passed as the third argument (via LOAD_X4_ADDRESSES).
;-----------------------------------------------------------------------
%macro STACK_FRAME_CREATE_X4 0
%if ABI_IS_32BIT
    ; ---- 32-bit build ----
    %define     src_ptr         rsi
    %define     src_stride      rax
    %define     r0_ptr          rcx
    %define     r1_ptr          rdx
    %define     r2_ptr          rbx
    %define     r3_ptr          rdi
    %define     ref_stride      rbp
    %define     result_ptr      arg(4)

    push        rbp
    mov         rbp, rsp
    push        rsi
    push        rdi
    push        rbx

    ; rbp is reused as ref_stride below, so keep a second copy of the
    ; frame pointer on the stack; the callers pop it before they read
    ; result_ptr
    push        rbp

    mov         rdi, arg(2)                 ; ref_ptr_base

    ; rcx, rdx, rax receive table entries 0..2; rdi is overwritten
    ; last with entry 3
    LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi

    mov         rsi, arg(0)                 ; src_ptr

    movsxd      rbx, dword ptr arg(1)       ; src_stride
    movsxd      rbp, dword ptr arg(3)       ; ref_stride

    ; swap so that rax = src_stride and rbx = table entry 2 (r2_ptr)
    xchg        rbx, rax
%elifidn __OUTPUT_FORMAT__,x64
    ; ---- x64 output format ----
    %define     src_ptr         rcx
    %define     src_stride      rdx
    %define     r0_ptr          rsi
    %define     r1_ptr          r10
    %define     r2_ptr          r11
    %define     r3_ptr          r8
    %define     ref_stride      r9
    ; NOTE(review): offset presumably covers SAVE_XMM's stack space plus
    ; the rsi push below -- confirm in vpx_ports/x86_abi_support.asm
    %define     result_ptr      [rsp+48+4*8]

    SAVE_XMM
    push        rsi                         ; popped by STACK_FRAME_DESTROY_X4

    ; r8 holds the table address and is overwritten last with entry 3
    LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr
%else
    ; ---- any other 64-bit output format ----
    %define     src_ptr         rdi
    %define     src_stride      rsi
    %define     r0_ptr          r9
    %define     r1_ptr          r10
    %define     r2_ptr          r11
    %define     r3_ptr          rdx
    %define     ref_stride      rcx
    %define     result_ptr      r8

    ; rdx holds the table address and is overwritten last with entry 3
    LOAD_X4_ADDRESSES rdx, r0_ptr, r1_ptr, r2_ptr, r3_ptr
%endif
%endmacro
;-----------------------------------------------------------------------
; STACK_FRAME_DESTROY_X4
;
; Epilogue paired with STACK_FRAME_CREATE_X4.  Blanks out the symbolic
; operand names, undoes what the prologue pushed or saved for the
; current ABI, and returns to the caller.
;
; In 32-bit builds the extra rbp pushed by STACK_FRAME_CREATE_X4 is NOT
; popped here; each routine pops it itself before invoking this macro.
;-----------------------------------------------------------------------
%macro STACK_FRAME_DESTROY_X4 0
    ; the operand aliases become empty so they cannot leak into later code
    %define     src_ptr
    %define     src_stride
    %define     r0_ptr
    %define     r1_ptr
    %define     r2_ptr
    %define     r3_ptr
    %define     ref_stride
    %define     result_ptr

%if ABI_IS_32BIT
    pop         rbx
    pop         rdi
    pop         rsi
    pop         rbp
%else
  %ifidn __OUTPUT_FORMAT__,x64
    pop         rsi
    RESTORE_XMM
  %endif
%endif

    ; Every routine ends with this macro and has no ret of its own, so
    ; the return must be emitted here (as STACK_FRAME_DESTROY_X3 does);
    ; otherwise execution would fall through into the next routine.
    ret
%endmacro
;-----------------------------------------------------------------------
; PROCESS_16X2X3 mode, src, ref, src_stride, ref_stride
;
; Processes two rows of 16 bytes.  Each source row is compared with
; psadbw against the reference row at byte offsets +0, +1 and +2; the
; three running totals are kept in xmm5, xmm6 and xmm7.
;
;   mode 0     : first row initialises the totals; pointers advance
;   mode 1     : both rows are accumulated; pointers advance
;   otherwise  : both rows are accumulated; pointers are left alone
;
; "Advance" means src += 2*src_stride and ref += 2*ref_stride.
; Source rows are read with movdqa, reference rows with lddqu.
; Scratch registers: xmm0-xmm3.
;-----------------------------------------------------------------------
%macro PROCESS_16X2X3 5
    ; ---- first row ----
    movdqa      xmm0, XMMWORD PTR [%2]
%if %1==0
    lddqu       xmm5, XMMWORD PTR [%3]
    lddqu       xmm6, XMMWORD PTR [%3+1]
    lddqu       xmm7, XMMWORD PTR [%3+2]

    psadbw      xmm5, xmm0
    psadbw      xmm6, xmm0
    psadbw      xmm7, xmm0
%else
    lddqu       xmm1, XMMWORD PTR [%3]
    lddqu       xmm2, XMMWORD PTR [%3+1]
    lddqu       xmm3, XMMWORD PTR [%3+2]

    psadbw      xmm1, xmm0
    psadbw      xmm2, xmm0
    psadbw      xmm3, xmm0

    paddw       xmm5, xmm1
    paddw       xmm6, xmm2
    paddw       xmm7, xmm3
%endif

    ; ---- second row ----
    movdqa      xmm0, XMMWORD PTR [%2+%4]
    lddqu       xmm1, XMMWORD PTR [%3+%5]
    lddqu       xmm2, XMMWORD PTR [%3+%5+1]
    lddqu       xmm3, XMMWORD PTR [%3+%5+2]

%if (%1==0) || (%1==1)
    ; step both pointers two rows down while the loads are in flight
    lea         %2, [%2+%4*2]
    lea         %3, [%3+%5*2]
%endif

    psadbw      xmm1, xmm0
    psadbw      xmm2, xmm0
    psadbw      xmm3, xmm0

    paddw       xmm5, xmm1
    paddw       xmm6, xmm2
    paddw       xmm7, xmm3
%endmacro
;-----------------------------------------------------------------------
; PROCESS_8X2X3 mode, src, ref, src_stride, ref_stride
;
; 8-byte-wide counterpart of PROCESS_16X2X3 using MMX registers.
; Processes two rows of 8 bytes; each source row is compared with
; psadbw against the reference row at byte offsets +0, +1 and +2, and
; the three running totals are kept in mm5, mm6 and mm7.
;
;   mode 0     : first row initialises the totals; pointers advance
;   mode 1     : both rows are accumulated; pointers advance
;   otherwise  : both rows are accumulated; pointers are left alone
;
; "Advance" means src += 2*src_stride and ref += 2*ref_stride.
; Scratch registers: mm0-mm3.
;-----------------------------------------------------------------------
%macro PROCESS_8X2X3 5
    ; ---- first row ----
    movq        mm0, QWORD PTR [%2]
%if %1==0
    movq        mm5, QWORD PTR [%3]
    movq        mm6, QWORD PTR [%3+1]
    movq        mm7, QWORD PTR [%3+2]

    psadbw      mm5, mm0
    psadbw      mm6, mm0
    psadbw      mm7, mm0
%else
    movq        mm1, QWORD PTR [%3]
    movq        mm2, QWORD PTR [%3+1]
    movq        mm3, QWORD PTR [%3+2]

    psadbw      mm1, mm0
    psadbw      mm2, mm0
    psadbw      mm3, mm0

    paddw       mm5, mm1
    paddw       mm6, mm2
    paddw       mm7, mm3
%endif

    ; ---- second row ----
    movq        mm0, QWORD PTR [%2+%4]
    movq        mm1, QWORD PTR [%3+%5]
    movq        mm2, QWORD PTR [%3+%5+1]
    movq        mm3, QWORD PTR [%3+%5+2]

%if (%1==0) || (%1==1)
    ; step both pointers two rows down
    lea         %2, [%2+%4*2]
    lea         %3, [%3+%5*2]
%endif

    psadbw      mm1, mm0
    psadbw      mm2, mm0
    psadbw      mm3, mm0

    paddw       mm5, mm1
    paddw       mm6, mm2
    paddw       mm7, mm3
%endmacro
;-----------------------------------------------------------------------
; LOAD_X4_ADDRESSES table, dst0, dst1, dst2, dst3
;
; Loads four consecutive register-sized entries, starting at the
; address held in "table", into dst0..dst3 (in that order).
; dst3 is written last, so it may be the same register as "table";
; dst0..dst2 must not be, or later entries are read from a bad address.
;-----------------------------------------------------------------------
%macro LOAD_X4_ADDRESSES 5
    mov         %2, [%1 + REG_SZ_BYTES*0]   ; entry 0
    mov         %3, [%1 + REG_SZ_BYTES*1]   ; entry 1

    mov         %4, [%1 + REG_SZ_BYTES*2]   ; entry 2
    mov         %5, [%1 + REG_SZ_BYTES*3]   ; entry 3 (may replace the base)
%endmacro
;-----------------------------------------------------------------------
; PROCESS_16X2X4 mode, src, ref0, ref1, ref2, ref3, src_stride, ref_stride
;
; Processes two rows of 16 bytes.  Each source row is compared with
; psadbw against the matching row of four separate reference blocks;
; the running totals for ref0..ref3 are kept in xmm4..xmm7.
;
;   mode 0     : first row initialises the totals; pointers advance
;   mode 1     : both rows are accumulated; pointers advance
;   otherwise  : both rows are accumulated; pointers are left alone
;
; "Advance" means src += 2*src_stride and every ref += 2*ref_stride.
; Source rows are read with movdqa, reference rows with lddqu.
; Scratch registers: xmm0-xmm3.
;-----------------------------------------------------------------------
%macro PROCESS_16X2X4 8
    ; ---- first row ----
    movdqa      xmm0, XMMWORD PTR [%2]
%if %1==0
    lddqu       xmm4, XMMWORD PTR [%3]
    lddqu       xmm5, XMMWORD PTR [%4]
    lddqu       xmm6, XMMWORD PTR [%5]
    lddqu       xmm7, XMMWORD PTR [%6]

    psadbw      xmm4, xmm0
    psadbw      xmm5, xmm0
    psadbw      xmm6, xmm0
    psadbw      xmm7, xmm0
%else
    lddqu       xmm1, XMMWORD PTR [%3]
    lddqu       xmm2, XMMWORD PTR [%4]
    lddqu       xmm3, XMMWORD PTR [%5]

    psadbw      xmm1, xmm0
    psadbw      xmm2, xmm0
    psadbw      xmm3, xmm0

    paddw       xmm4, xmm1
    lddqu       xmm1, XMMWORD PTR [%6]      ; xmm1 is free again: reuse for ref3
    paddw       xmm5, xmm2
    paddw       xmm6, xmm3

    psadbw      xmm1, xmm0
    paddw       xmm7, xmm1
%endif

    ; ---- second row ----
    movdqa      xmm0, XMMWORD PTR [%2+%7]
    lddqu       xmm1, XMMWORD PTR [%3+%8]
    lddqu       xmm2, XMMWORD PTR [%4+%8]
    lddqu       xmm3, XMMWORD PTR [%5+%8]

    psadbw      xmm1, xmm0
    psadbw      xmm2, xmm0
    psadbw      xmm3, xmm0

    paddw       xmm4, xmm1
    lddqu       xmm1, XMMWORD PTR [%6+%8]   ; ref3, loaded before the pointers move
    paddw       xmm5, xmm2
    paddw       xmm6, xmm3

%if (%1==0) || (%1==1)
    ; step all five pointers two rows down
    lea         %2, [%2+%7*2]
    lea         %3, [%3+%8*2]

    lea         %4, [%4+%8*2]
    lea         %5, [%5+%8*2]

    lea         %6, [%6+%8*2]
%endif

    psadbw      xmm1, xmm0
    paddw       xmm7, xmm1
%endmacro
;-----------------------------------------------------------------------
; PROCESS_8X2X4 mode, src, ref0, ref1, ref2, ref3, src_stride, ref_stride
;
; 8-byte-wide counterpart of PROCESS_16X2X4 using MMX registers.
; Processes two rows of 8 bytes; each source row is compared with
; psadbw against the matching row of four separate reference blocks,
; and the running totals for ref0..ref3 are kept in mm4..mm7.
;
;   mode 0     : first row initialises the totals; pointers advance
;   mode 1     : both rows are accumulated; pointers advance
;   otherwise  : both rows are accumulated; pointers are left alone
;
; "Advance" means src += 2*src_stride and every ref += 2*ref_stride.
; Scratch registers: mm0-mm3.
;-----------------------------------------------------------------------
%macro PROCESS_8X2X4 8
    ; ---- first row ----
    movq        mm0, QWORD PTR [%2]
%if %1==0
    movq        mm4, QWORD PTR [%3]
    movq        mm5, QWORD PTR [%4]
    movq        mm6, QWORD PTR [%5]
    movq        mm7, QWORD PTR [%6]

    psadbw      mm4, mm0
    psadbw      mm5, mm0
    psadbw      mm6, mm0
    psadbw      mm7, mm0
%else
    movq        mm1, QWORD PTR [%3]
    movq        mm2, QWORD PTR [%4]
    movq        mm3, QWORD PTR [%5]

    psadbw      mm1, mm0
    psadbw      mm2, mm0
    psadbw      mm3, mm0

    paddw       mm4, mm1
    movq        mm1, QWORD PTR [%6]         ; mm1 is free again: reuse for ref3
    paddw       mm5, mm2
    paddw       mm6, mm3

    psadbw      mm1, mm0
    paddw       mm7, mm1
%endif

    ; ---- second row ----
    movq        mm0, QWORD PTR [%2+%7]
    movq        mm1, QWORD PTR [%3+%8]
    movq        mm2, QWORD PTR [%4+%8]
    movq        mm3, QWORD PTR [%5+%8]

    psadbw      mm1, mm0
    psadbw      mm2, mm0
    psadbw      mm3, mm0

    paddw       mm4, mm1
    movq        mm1, QWORD PTR [%6+%8]      ; ref3, loaded before the pointers move
    paddw       mm5, mm2
    paddw       mm6, mm3

%if (%1==0) || (%1==1)
    ; step all five pointers two rows down
    lea         %2, [%2+%7*2]
    lea         %3, [%3+%8*2]

    lea         %4, [%4+%8*2]
    lea         %5, [%5+%8*2]

    lea         %6, [%6+%8*2]
%endif

    psadbw      mm1, mm0
    paddw       mm7, mm1
%endmacro
;-----------------------------------------------------------------------
; void vp8_sad16x16x3_sse3(
;     unsigned char *src_ptr,
;     int            src_stride,
;     unsigned char *ref_ptr,
;     int            ref_stride,
;     int           *results)
;
; Sum of absolute differences of a 16x16 source block against the
; reference block at ref_ptr, ref_ptr+1 and ref_ptr+2.  The three sums
; are stored to results[0], results[1] and results[2].
; Source rows are loaded with movdqa.
;-----------------------------------------------------------------------
global sym(vp8_sad16x16x3_sse3)
sym(vp8_sad16x16x3_sse3):

    STACK_FRAME_CREATE_X3

    ; 8 invocations x 2 rows = 16 rows; totals end up in xmm5/xmm6/xmm7
    PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
%rep 6
    PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
%endrep
    PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

    mov         rcx, result_ptr

    ; each total is split across the two 64-bit halves of its register:
    ; add the high half onto the low half, then store 32 bits

    movq        xmm0, xmm5
    psrldq      xmm5, 8
    paddw       xmm0, xmm5
    movd        [rcx], xmm0                 ; results[0]  (offset +0)

    movq        xmm0, xmm6
    psrldq      xmm6, 8
    paddw       xmm0, xmm6
    movd        [rcx+4], xmm0               ; results[1]  (offset +1)

    movq        xmm0, xmm7
    psrldq      xmm7, 8
    paddw       xmm0, xmm7
    movd        [rcx+8], xmm0               ; results[2]  (offset +2)

    STACK_FRAME_DESTROY_X3
;-----------------------------------------------------------------------
; void vp8_sad16x8x3_sse3(
;     unsigned char *src_ptr,
;     int            src_stride,
;     unsigned char *ref_ptr,
;     int            ref_stride,
;     int           *results)
;
; Sum of absolute differences of a 16-wide, 8-row source block against
; the reference block at ref_ptr, ref_ptr+1 and ref_ptr+2.  The three
; sums are stored to results[0], results[1] and results[2].
; Source rows are loaded with movdqa.
;-----------------------------------------------------------------------
global sym(vp8_sad16x8x3_sse3)
sym(vp8_sad16x8x3_sse3):

    STACK_FRAME_CREATE_X3

    ; 4 invocations x 2 rows = 8 rows; totals end up in xmm5/xmm6/xmm7
    PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
%rep 2
    PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
%endrep
    PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

    mov         rcx, result_ptr

    ; fold the high 64-bit half of each total onto the low half, then
    ; store 32 bits

    movq        xmm0, xmm5
    psrldq      xmm5, 8
    paddw       xmm0, xmm5
    movd        [rcx], xmm0                 ; results[0]  (offset +0)

    movq        xmm0, xmm6
    psrldq      xmm6, 8
    paddw       xmm0, xmm6
    movd        [rcx+4], xmm0               ; results[1]  (offset +1)

    movq        xmm0, xmm7
    psrldq      xmm7, 8
    paddw       xmm0, xmm7
    movd        [rcx+8], xmm0               ; results[2]  (offset +2)

    STACK_FRAME_DESTROY_X3
;-----------------------------------------------------------------------
; void vp8_sad8x16x3_sse3(
;     unsigned char *src_ptr,
;     int            src_stride,
;     unsigned char *ref_ptr,
;     int            ref_stride,
;     int           *results)
;
; Sum of absolute differences of an 8-wide, 16-row source block against
; the reference block at ref_ptr, ref_ptr+1 and ref_ptr+2.  The three
; sums are stored to results[0], results[1] and results[2].
; Uses MMX registers mm0-mm3 and mm5-mm7; no emms is issued here.
;-----------------------------------------------------------------------
global sym(vp8_sad8x16x3_sse3)
sym(vp8_sad8x16x3_sse3):

    STACK_FRAME_CREATE_X3

    ; 8 invocations x 2 rows = 16 rows; totals end up in mm5/mm6/mm7
    PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
%rep 6
    PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
%endrep
    PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

    mov         rcx, result_ptr

    ; pack the first two totals into one qword: low dword = mm5, high = mm6
    punpckldq   mm5, mm6

    movq        [rcx], mm5                  ; results[0], results[1]
    movd        [rcx+8], mm7                ; results[2]

    STACK_FRAME_DESTROY_X3
;-----------------------------------------------------------------------
; void vp8_sad8x8x3_sse3(
;     unsigned char *src_ptr,
;     int            src_stride,
;     unsigned char *ref_ptr,
;     int            ref_stride,
;     int           *results)
;
; Sum of absolute differences of an 8x8 source block against the
; reference block at ref_ptr, ref_ptr+1 and ref_ptr+2.  The three sums
; are stored to results[0], results[1] and results[2].
; Uses MMX registers mm0-mm3 and mm5-mm7; no emms is issued here.
;-----------------------------------------------------------------------
global sym(vp8_sad8x8x3_sse3)
sym(vp8_sad8x8x3_sse3):

    STACK_FRAME_CREATE_X3

    ; 4 invocations x 2 rows = 8 rows; totals end up in mm5/mm6/mm7
    PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
%rep 2
    PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
%endrep
    PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

    mov         rcx, result_ptr

    ; pack the first two totals into one qword: low dword = mm5, high = mm6
    punpckldq   mm5, mm6

    movq        [rcx], mm5                  ; results[0], results[1]
    movd        [rcx+8], mm7                ; results[2]

    STACK_FRAME_DESTROY_X3
;-----------------------------------------------------------------------
; void vp8_sad4x4x3_sse3(
;     unsigned char *src_ptr,
;     int            src_stride,
;     unsigned char *ref_ptr,
;     int            ref_stride,
;     int           *results)
;
; Sum of absolute differences of a 4x4 source block against the
; reference block at ref_ptr, ref_ptr+1 and ref_ptr+2.  The three sums
; are stored to results[0], results[1] and results[2].
;
; Two 4-byte rows are interleaved (punpcklbw) into one 8-byte MMX
; register so that a single psadbw covers both rows.
; Uses mm0-mm7; no emms is issued here.
;-----------------------------------------------------------------------
global sym(vp8_sad4x4x3_sse3)
sym(vp8_sad4x4x3_sse3):

    STACK_FRAME_CREATE_X3

    ; ---- rows 0 and 1 ----
    movd        mm0, DWORD PTR [src_ptr]
    movd        mm1, DWORD PTR [ref_ptr]

    movd        mm2, DWORD PTR [src_ptr+src_stride]
    movd        mm3, DWORD PTR [ref_ptr+ref_stride]

    punpcklbw   mm0, mm2                    ; mm0 = source rows 0/1
    punpcklbw   mm1, mm3                    ; mm1 = ref rows 0/1, offset +0

    movd        mm4, DWORD PTR [ref_ptr+1]
    movd        mm5, DWORD PTR [ref_ptr+2]

    movd        mm2, DWORD PTR [ref_ptr+ref_stride+1]
    movd        mm3, DWORD PTR [ref_ptr+ref_stride+2]

    psadbw      mm1, mm0                    ; mm1 = partial SAD, offset +0

    punpcklbw   mm4, mm2                    ; ref rows 0/1, offset +1
    punpcklbw   mm5, mm3                    ; ref rows 0/1, offset +2

    psadbw      mm4, mm0                    ; mm4 = partial SAD, offset +1
    psadbw      mm5, mm0                    ; mm5 = partial SAD, offset +2

    ; ---- rows 2 and 3 ----
    lea         src_ptr, [src_ptr+src_stride*2]
    lea         ref_ptr, [ref_ptr+ref_stride*2]

    movd        mm0, DWORD PTR [src_ptr]
    movd        mm2, DWORD PTR [ref_ptr]

    movd        mm3, DWORD PTR [src_ptr+src_stride]
    movd        mm6, DWORD PTR [ref_ptr+ref_stride]

    punpcklbw   mm0, mm3                    ; mm0 = source rows 2/3
    punpcklbw   mm2, mm6                    ; mm2 = ref rows 2/3, offset +0

    movd        mm3, DWORD PTR [ref_ptr+1]
    movd        mm7, DWORD PTR [ref_ptr+2]

    psadbw      mm2, mm0

    paddw       mm1, mm2                    ; mm1 = full SAD, offset +0

    movd        mm2, DWORD PTR [ref_ptr+ref_stride+1]
    movd        mm6, DWORD PTR [ref_ptr+ref_stride+2]

    punpcklbw   mm3, mm2                    ; ref rows 2/3, offset +1
    punpcklbw   mm7, mm6                    ; ref rows 2/3, offset +2

    psadbw      mm3, mm0
    psadbw      mm7, mm0

    paddw       mm3, mm4                    ; mm3 = full SAD, offset +1
    paddw       mm7, mm5                    ; mm7 = full SAD, offset +2

    ; ---- store ----
    mov         rcx, result_ptr

    punpckldq   mm1, mm3                    ; low dword = offset +0, high = offset +1

    movq        [rcx], mm1                  ; results[0], results[1]
    movd        [rcx+8], mm7                ; results[2]

    STACK_FRAME_DESTROY_X3
;-----------------------------------------------------------------------
; unsigned int vp8_sad16x16_sse3(
;     unsigned char *src_ptr,
;     int            src_stride,
;     unsigned char *ref_ptr,
;     int            ref_stride,
;     int            max_err)
;
; Sum of absolute differences of a 16x16 source block against the
; reference block at ref_ptr.  The sum is left in rax.
; max_err is bound by the prologue but never read in this routine.
; Source rows are loaded with movdqa, reference rows with movdqu.
;-----------------------------------------------------------------------
;%define lddqu movdqu
global sym(vp8_sad16x16_sse3)
sym(vp8_sad16x16_sse3):

    STACK_FRAME_CREATE_X3

    mov         end_ptr, 4                  ; 4 passes x 4 rows = 16 rows
    pxor        xmm7, xmm7                  ; xmm7 = running total

.sad16x16_four_rows:
    ; rows n and n+1
    movdqa      xmm0, XMMWORD PTR [src_ptr]
    movdqu      xmm1, XMMWORD PTR [ref_ptr]
    movdqa      xmm2, XMMWORD PTR [src_ptr+src_stride]
    movdqu      xmm3, XMMWORD PTR [ref_ptr+ref_stride]

    lea         src_ptr, [src_ptr+src_stride*2]
    lea         ref_ptr, [ref_ptr+ref_stride*2]

    ; rows n+2 and n+3
    movdqa      xmm4, XMMWORD PTR [src_ptr]
    movdqu      xmm5, XMMWORD PTR [ref_ptr]
    movdqa      xmm6, XMMWORD PTR [src_ptr+src_stride]

    psadbw      xmm0, xmm1

    movdqu      xmm1, XMMWORD PTR [ref_ptr+ref_stride]  ; xmm1 reused once consumed

    psadbw      xmm2, xmm3
    psadbw      xmm4, xmm5
    psadbw      xmm6, xmm1

    lea         src_ptr, [src_ptr+src_stride*2]
    lea         ref_ptr, [ref_ptr+ref_stride*2]

    paddw       xmm7, xmm0
    paddw       xmm7, xmm2
    paddw       xmm7, xmm4
    paddw       xmm7, xmm6

    sub         end_ptr, 1
    jne         .sad16x16_four_rows

    ; fold the high 64-bit half of the total onto the low half
    movq        xmm0, xmm7
    psrldq      xmm7, 8
    paddw       xmm0, xmm7
    movq        rax, xmm0                   ; result

    STACK_FRAME_DESTROY_X3
;-----------------------------------------------------------------------
; void vp8_sad16x16x4d_sse3(
;     unsigned char *src_ptr,
;     int            src_stride,
;     unsigned char *ref_ptr_base,
;     int            ref_stride,
;     int           *results)
;
; Sum of absolute differences of a 16x16 source block against four
; separate reference blocks.  ref_ptr_base is read as a table of four
; pointers (see STACK_FRAME_CREATE_X4); the four sums are stored to
; results[0..3] in table order.
; Source rows are loaded with movdqa.
;-----------------------------------------------------------------------
global sym(vp8_sad16x16x4d_sse3)
sym(vp8_sad16x16x4d_sse3):

    STACK_FRAME_CREATE_X4

    ; 8 invocations x 2 rows = 16 rows; totals end up in xmm4..xmm7
    PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
%rep 6
    PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
%endrep
    PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

%if ABI_IS_32BIT
    ; rbp was serving as ref_stride; bring back the frame pointer so
    ; result_ptr (an arg() reference) can be read
    pop         rbp
%endif
    mov         rcx, result_ptr

    ; fold the high 64-bit half of each total onto the low half, then
    ; store 32 bits

    movq        xmm0, xmm4
    psrldq      xmm4, 8
    paddw       xmm0, xmm4
    movd        [rcx], xmm0                 ; results[0]

    movq        xmm0, xmm5
    psrldq      xmm5, 8
    paddw       xmm0, xmm5
    movd        [rcx+4], xmm0               ; results[1]

    movq        xmm0, xmm6
    psrldq      xmm6, 8
    paddw       xmm0, xmm6
    movd        [rcx+8], xmm0               ; results[2]

    movq        xmm0, xmm7
    psrldq      xmm7, 8
    paddw       xmm0, xmm7
    movd        [rcx+12], xmm0              ; results[3]

    STACK_FRAME_DESTROY_X4
;-----------------------------------------------------------------------
; void vp8_sad16x8x4d_sse3(
;     unsigned char *src_ptr,
;     int            src_stride,
;     unsigned char *ref_ptr_base,
;     int            ref_stride,
;     int           *results)
;
; Sum of absolute differences of a 16-wide, 8-row source block against
; four separate reference blocks.  ref_ptr_base is read as a table of
; four pointers (see STACK_FRAME_CREATE_X4); the four sums are stored
; to results[0..3] in table order.
; Source rows are loaded with movdqa.
;-----------------------------------------------------------------------
global sym(vp8_sad16x8x4d_sse3)
sym(vp8_sad16x8x4d_sse3):

    STACK_FRAME_CREATE_X4

    ; 4 invocations x 2 rows = 8 rows; totals end up in xmm4..xmm7
    PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
%rep 2
    PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
%endrep
    PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

%if ABI_IS_32BIT
    ; rbp was serving as ref_stride; bring back the frame pointer so
    ; result_ptr (an arg() reference) can be read
    pop         rbp
%endif
    mov         rcx, result_ptr

    ; fold the high 64-bit half of each total onto the low half, then
    ; store 32 bits

    movq        xmm0, xmm4
    psrldq      xmm4, 8
    paddw       xmm0, xmm4
    movd        [rcx], xmm0                 ; results[0]

    movq        xmm0, xmm5
    psrldq      xmm5, 8
    paddw       xmm0, xmm5
    movd        [rcx+4], xmm0               ; results[1]

    movq        xmm0, xmm6
    psrldq      xmm6, 8
    paddw       xmm0, xmm6
    movd        [rcx+8], xmm0               ; results[2]

    movq        xmm0, xmm7
    psrldq      xmm7, 8
    paddw       xmm0, xmm7
    movd        [rcx+12], xmm0              ; results[3]

    STACK_FRAME_DESTROY_X4
;-----------------------------------------------------------------------
; void vp8_sad8x16x4d_sse3(
;     unsigned char *src_ptr,
;     int            src_stride,
;     unsigned char *ref_ptr_base,
;     int            ref_stride,
;     int           *results)
;
; Sum of absolute differences of an 8-wide, 16-row source block against
; four separate reference blocks.  The third argument is read as a
; table of four pointers (see STACK_FRAME_CREATE_X4); the four sums are
; stored to results[0..3] in table order.
; Uses MMX registers mm0-mm7; no emms is issued here.
;-----------------------------------------------------------------------
global sym(vp8_sad8x16x4d_sse3)
sym(vp8_sad8x16x4d_sse3):

    STACK_FRAME_CREATE_X4

    ; 8 invocations x 2 rows = 16 rows; totals end up in mm4..mm7
    PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
%rep 6
    PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
%endrep
    PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

%if ABI_IS_32BIT
    ; rbp was serving as ref_stride; bring back the frame pointer so
    ; result_ptr (an arg() reference) can be read
    pop         rbp
%endif
    mov         rcx, result_ptr

    ; pack pairs of totals into qwords
    punpckldq   mm4, mm5                    ; low dword = ref0, high = ref1
    punpckldq   mm6, mm7                    ; low dword = ref2, high = ref3

    movq        [rcx], mm4                  ; results[0], results[1]
    movq        [rcx+8], mm6                ; results[2], results[3]

    STACK_FRAME_DESTROY_X4
;-----------------------------------------------------------------------
; void vp8_sad8x8x4d_sse3(
;     unsigned char *src_ptr,
;     int            src_stride,
;     unsigned char *ref_ptr_base,
;     int            ref_stride,
;     int           *results)
;
; Sum of absolute differences of an 8x8 source block against four
; separate reference blocks.  The third argument is read as a table of
; four pointers (see STACK_FRAME_CREATE_X4); the four sums are stored
; to results[0..3] in table order.
; Uses MMX registers mm0-mm7; no emms is issued here.
;-----------------------------------------------------------------------
global sym(vp8_sad8x8x4d_sse3)
sym(vp8_sad8x8x4d_sse3):

    STACK_FRAME_CREATE_X4

    ; 4 invocations x 2 rows = 8 rows; totals end up in mm4..mm7
    PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
%rep 2
    PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
%endrep
    PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

%if ABI_IS_32BIT
    ; rbp was serving as ref_stride; bring back the frame pointer so
    ; result_ptr (an arg() reference) can be read
    pop         rbp
%endif
    mov         rcx, result_ptr

    ; pack pairs of totals into qwords
    punpckldq   mm4, mm5                    ; low dword = ref0, high = ref1
    punpckldq   mm6, mm7                    ; low dword = ref2, high = ref3

    movq        [rcx], mm4                  ; results[0], results[1]
    movq        [rcx+8], mm6                ; results[2], results[3]

    STACK_FRAME_DESTROY_X4
;-----------------------------------------------------------------------
; void vp8_sad4x4x4d_sse3(
;     unsigned char *src_ptr,
;     int            src_stride,
;     unsigned char *ref_ptr_base,
;     int            ref_stride,
;     int           *results)
;
; Sum of absolute differences of a 4x4 source block against four
; separate reference blocks.  The third argument is read as a table of
; four pointers (see STACK_FRAME_CREATE_X4); the four sums are stored
; to results[0..3] in table order.
;
; Two 4-byte rows are interleaved (punpcklbw) into one 8-byte MMX
; register so that a single psadbw covers both rows.
; Uses mm0-mm7; no emms is issued here.  rsi is overwritten with the
; results pointer part-way through.
;-----------------------------------------------------------------------
global sym(vp8_sad4x4x4d_sse3)
sym(vp8_sad4x4x4d_sse3):

    STACK_FRAME_CREATE_X4

    ; ---- rows 0 and 1 ----
    movd        mm0, DWORD PTR [src_ptr]
    movd        mm1, DWORD PTR [r0_ptr]

    movd        mm2, DWORD PTR [src_ptr+src_stride]
    movd        mm3, DWORD PTR [r0_ptr+ref_stride]

    punpcklbw   mm0, mm2                    ; mm0 = source rows 0/1
    punpcklbw   mm1, mm3                    ; mm1 = ref0 rows 0/1

    movd        mm4, DWORD PTR [r1_ptr]
    movd        mm5, DWORD PTR [r2_ptr]

    movd        mm6, DWORD PTR [r3_ptr]
    movd        mm2, DWORD PTR [r1_ptr+ref_stride]

    movd        mm3, DWORD PTR [r2_ptr+ref_stride]
    movd        mm7, DWORD PTR [r3_ptr+ref_stride]

    psadbw      mm1, mm0                    ; mm1 = partial SAD, ref0

    punpcklbw   mm4, mm2                    ; ref1 rows 0/1
    punpcklbw   mm5, mm3                    ; ref2 rows 0/1

    punpcklbw   mm6, mm7                    ; ref3 rows 0/1
    psadbw      mm4, mm0                    ; mm4 = partial SAD, ref1

    psadbw      mm5, mm0                    ; mm5 = partial SAD, ref2
    psadbw      mm6, mm0                    ; mm6 = partial SAD, ref3

    ; ---- rows 2 and 3 ----
    lea         src_ptr, [src_ptr+src_stride*2]
    lea         r0_ptr, [r0_ptr+ref_stride*2]

    lea         r1_ptr, [r1_ptr+ref_stride*2]
    lea         r2_ptr, [r2_ptr+ref_stride*2]

    lea         r3_ptr, [r3_ptr+ref_stride*2]

    movd        mm0, DWORD PTR [src_ptr]
    movd        mm2, DWORD PTR [r0_ptr]

    movd        mm3, DWORD PTR [src_ptr+src_stride]
    movd        mm7, DWORD PTR [r0_ptr+ref_stride]

    punpcklbw   mm0, mm3                    ; mm0 = source rows 2/3
    punpcklbw   mm2, mm7                    ; mm2 = ref0 rows 2/3

    movd        mm3, DWORD PTR [r1_ptr]
    movd        mm7, DWORD PTR [r2_ptr]

    psadbw      mm2, mm0
%if ABI_IS_32BIT
    ; src_stride (rax) has been read for the last time, so move
    ; ref_stride into rax and give rbp back to the frame pointer;
    ; result_ptr is an arg() reference and is read next
    mov         rax, rbp

    pop         rbp
    %define     ref_stride      rax
%endif
    mov         rsi, result_ptr

    paddw       mm1, mm2                    ; full SAD, ref0
    movd        [rsi], mm1                  ; results[0]

    movd        mm2, DWORD PTR [r1_ptr+ref_stride]
    movd        mm1, DWORD PTR [r2_ptr+ref_stride]

    punpcklbw   mm3, mm2                    ; ref1 rows 2/3
    punpcklbw   mm7, mm1                    ; ref2 rows 2/3

    psadbw      mm3, mm0
    psadbw      mm7, mm0

    movd        mm2, DWORD PTR [r3_ptr]
    movd        mm1, DWORD PTR [r3_ptr+ref_stride]

    paddw       mm3, mm4                    ; full SAD, ref1
    paddw       mm7, mm5                    ; full SAD, ref2

    movd        [rsi+4], mm3                ; results[1]
    punpcklbw   mm2, mm1                    ; ref3 rows 2/3

    movd        [rsi+8], mm7                ; results[2]
    psadbw      mm2, mm0

    paddw       mm2, mm6                    ; full SAD, ref3
    movd        [rsi+12], mm2               ; results[3]

    STACK_FRAME_DESTROY_X4