Fix relative include paths
[libvpx.git] / vp8 / encoder / x86 / sad_sse3.asm
blob57541751640246ac6d112ad83afce995319e0188
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
11 %include "vpx_ports/x86_abi_support.asm"
; STACK_FRAME_CREATE_X3
;   Binds the operand names used by the x3 kernels and vp8_sad16x16_sse3
;   (src_ptr, src_stride, ref_ptr, ref_stride, end_ptr, ret_var, result_ptr,
;   max_err) to registers or stack slots for the ABI selected at assembly
;   time. result_ptr and max_err both name the fifth argument.
;   Only the 32-bit variant emits instructions: it builds an rbp frame,
;   saves rsi/rdi/rbx and loads the first four arguments into registers.
%macro STACK_FRAME_CREATE_X3 0
%if ABI_IS_32BIT
  ; 32-bit: every argument is fetched through arg()
  %define     src_ptr       rsi
  %define     src_stride    rax
  %define     ref_ptr       rdi
  %define     ref_stride    rdx
  %define     end_ptr       rcx
  %define     ret_var       rbx
  %define     result_ptr    arg(4)
  %define     max_err       arg(4)

  push        rbp
  mov         rbp,          rsp
  push        rsi
  push        rdi
  push        rbx

  ; the four loads are independent of each other; issued in argument order
  mov         rsi,          arg(0)              ; src_ptr
  movsxd      rax,          dword ptr arg(1)    ; src_stride
  mov         rdi,          arg(2)              ; ref_ptr
  movsxd      rdx,          dword ptr arg(3)    ; ref_stride
%elifidn __OUTPUT_FORMAT__,x64
  ; x64 output format: first four arguments already sit in rcx/rdx/r8/r9,
  ; the fifth is read from the stack. No instructions are emitted.
  %define     src_ptr       rcx
  %define     src_stride    rdx
  %define     ref_ptr       r8
  %define     ref_stride    r9
  %define     end_ptr       r10
  %define     ret_var       r11
  %define     result_ptr    [rsp+8+4*8]
  %define     max_err       [rsp+8+4*8]
%else
  ; any other 64-bit output format: arguments in rdi/rsi/rdx/rcx/r8.
  ; No instructions are emitted.
  %define     src_ptr       rdi
  %define     src_stride    rsi
  %define     ref_ptr       rdx
  %define     ref_stride    rcx
  %define     end_ptr       r9
  %define     ret_var       r10
  %define     result_ptr    r8
  %define     max_err       r8
%endif
%endmacro
; STACK_FRAME_DESTROY_X3
;   Epilogue matching STACK_FRAME_CREATE_X3: blanks the operand aliases,
;   restores the registers pushed by the 32-bit prologue (reverse push
;   order) and returns. The 64-bit prologues push nothing, so there the
;   macro expands to a bare ret.
%macro STACK_FRAME_DESTROY_X3 0
  ; redefine every alias as empty so it cannot leak into later code
  %define     src_ptr
  %define     src_stride
  %define     ref_ptr
  %define     ref_stride
  %define     end_ptr
  %define     ret_var
  %define     result_ptr
  %define     max_err

%if ABI_IS_32BIT
  pop         rbx
  pop         rdi
  pop         rsi
  pop         rbp
%endif
  ret
%endmacro
; STACK_FRAME_CREATE_X4
;   Prologue for the x4d kernels. Binds src_ptr, src_stride, r0_ptr..r3_ptr,
;   ref_stride and result_ptr for the ABI selected at assembly time, and
;   loads the four reference pointers from the pointer table passed as the
;   third argument (via LOAD_X4_ADDRESSES).
;   32-bit: rbp is reused to hold ref_stride, so the frame pointer is pushed
;   a second time; the caller of this macro must pop it before using
;   result_ptr (arg(4)).
%macro STACK_FRAME_CREATE_X4 0
%if ABI_IS_32BIT
  %define     src_ptr       rsi
  %define     src_stride    rax
  %define     r0_ptr        rcx
  %define     r1_ptr        rdx
  %define     r2_ptr        rbx
  %define     r3_ptr        rdi
  %define     ref_stride    rbp
  %define     result_ptr    arg(4)

  push        rbp
  mov         rbp,          rsp
  push        rsi
  push        rdi
  push        rbx

  push        rbp                               ; keep the frame pointer: rbp becomes ref_stride below

  mov         rsi,          arg(0)              ; src_ptr
  mov         rdi,          arg(2)              ; ref_ptr_base

  ; rcx = ref[0], rdx = ref[1], rax = ref[2] (temporarily), rdi = ref[3]
  LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi

  movsxd      rbx,          dword ptr arg(1)    ; src_stride (temporarily in rbx)
  movsxd      rbp,          dword ptr arg(3)    ; ref_stride; last arg() access before rbp is overwritten

  xchg        rbx,          rax                 ; rbx = ref[2], rax = src_stride
%elifidn __OUTPUT_FORMAT__,x64
  %define     src_ptr       rcx
  %define     src_stride    rdx
  %define     r0_ptr        rsi
  %define     r1_ptr        r10
  %define     r2_ptr        r11
  %define     r3_ptr        r8
  %define     ref_stride    r9
  %define     result_ptr    [rsp+16+4*8]

  push        rsi                               ; rsi is used as r0_ptr; restored by STACK_FRAME_DESTROY_X4

  ; r3_ptr (r8) is also the table base, so it is loaded last by the macro
  LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr
%else
  %define     src_ptr       rdi
  %define     src_stride    rsi
  %define     r0_ptr        r9
  %define     r1_ptr        r10
  %define     r2_ptr        r11
  %define     r3_ptr        rdx
  %define     ref_stride    rcx
  %define     result_ptr    r8

  ; r3_ptr (rdx) is also the table base, so it is loaded last by the macro
  LOAD_X4_ADDRESSES rdx, r0_ptr, r1_ptr, r2_ptr, r3_ptr
%endif
%endmacro
; STACK_FRAME_DESTROY_X4
;   Epilogue matching STACK_FRAME_CREATE_X4: blanks the operand aliases,
;   restores the registers saved by the prologue and returns to the caller.
;   32-bit: expects the extra rbp pushed by the prologue to have been popped
;   already (the x4d functions do this before reading result_ptr).
;
;   Fix: the macro previously had no ret. Every x4d function ends with this
;   macro and has no ret of its own, so execution fell through into whatever
;   code followed. The ret mirrors STACK_FRAME_DESTROY_X3.
%macro STACK_FRAME_DESTROY_X4 0
  ; redefine every alias as empty so it cannot leak into later code
  %define     src_ptr
  %define     src_stride
  %define     r0_ptr
  %define     r1_ptr
  %define     r2_ptr
  %define     r3_ptr
  %define     ref_stride
  %define     result_ptr

%if ABI_IS_32BIT
  pop         rbx
  pop         rdi
  pop         rsi
  pop         rbp
%else
  %ifidn __OUTPUT_FORMAT__,x64
    pop         rsi                             ; pushed by the x64 prologue (used as r0_ptr)
  %endif
%endif
  ret
%endmacro
; PROCESS_16X2X3 mode, src, ref, src_stride, ref_stride
;   Handles two 16-byte rows. Each source row is compared (psadbw) against
;   the reference row at byte offsets +0, +1 and +2; the per-offset sums are
;   kept in xmm5, xmm6 and xmm7 respectively.
;     mode 0 : first row pair - initialises xmm5/xmm6/xmm7
;     mode 1 : accumulates into xmm5/xmm6/xmm7
;     mode 2 : as mode 1, but src/ref are NOT advanced (used for the last pair)
;   Source rows are read with movdqa (aligned load), reference rows with lddqu.
;   Clobbers xmm0-xmm3; advances %2 and %3 by two rows in modes 0 and 1.
;   NOTE(review): xmm6/xmm7 are callee-saved under the Microsoft x64
;   convention and nothing visible in this file saves them - confirm.
%macro PROCESS_16X2X3 5
        movdqa          xmm0,       XMMWORD PTR [%2]        ; source row 0
%if %1==0
        lddqu           xmm5,       XMMWORD PTR [%3]
        psadbw          xmm5,       xmm0

        lddqu           xmm6,       XMMWORD PTR [%3+1]
        psadbw          xmm6,       xmm0

        lddqu           xmm7,       XMMWORD PTR [%3+2]
        psadbw          xmm7,       xmm0
%else
        lddqu           xmm1,       XMMWORD PTR [%3]
        psadbw          xmm1,       xmm0
        paddw           xmm5,       xmm1

        lddqu           xmm2,       XMMWORD PTR [%3+1]
        psadbw          xmm2,       xmm0
        paddw           xmm6,       xmm2

        lddqu           xmm3,       XMMWORD PTR [%3+2]
        psadbw          xmm3,       xmm0
        paddw           xmm7,       xmm3
%endif
        movdqa          xmm0,       XMMWORD PTR [%2+%4]     ; source row 1

        lddqu           xmm1,       XMMWORD PTR [%3+%5]
        psadbw          xmm1,       xmm0
        paddw           xmm5,       xmm1

        lddqu           xmm2,       XMMWORD PTR [%3+%5+1]
        psadbw          xmm2,       xmm0
        paddw           xmm6,       xmm2

        lddqu           xmm3,       XMMWORD PTR [%3+%5+2]
        psadbw          xmm3,       xmm0
        paddw           xmm7,       xmm3

%if %1==0 || %1==1
        ; step both pointers to the next row pair
        lea             %2,         [%2+%4*2]
        lea             %3,         [%3+%5*2]
%endif
%endmacro
; PROCESS_8X2X3 mode, src, ref, src_stride, ref_stride
;   8-byte-wide (MMX register) counterpart of PROCESS_16X2X3: handles two
;   8-byte rows, comparing each source row against the reference row at byte
;   offsets +0, +1 and +2. Sums are kept in mm5, mm6 and mm7.
;     mode 0 : first row pair - initialises mm5/mm6/mm7
;     mode 1 : accumulates into mm5/mm6/mm7
;     mode 2 : as mode 1, but src/ref are NOT advanced (used for the last pair)
;   Clobbers mm0-mm3; advances %2 and %3 by two rows in modes 0 and 1.
%macro PROCESS_8X2X3 5
        movq            mm0,        QWORD PTR [%2]          ; source row 0
%if %1==0
        movq            mm5,        QWORD PTR [%3]
        psadbw          mm5,        mm0

        movq            mm6,        QWORD PTR [%3+1]
        psadbw          mm6,        mm0

        movq            mm7,        QWORD PTR [%3+2]
        psadbw          mm7,        mm0
%else
        movq            mm1,        QWORD PTR [%3]
        psadbw          mm1,        mm0
        paddw           mm5,        mm1

        movq            mm2,        QWORD PTR [%3+1]
        psadbw          mm2,        mm0
        paddw           mm6,        mm2

        movq            mm3,        QWORD PTR [%3+2]
        psadbw          mm3,        mm0
        paddw           mm7,        mm3
%endif
        movq            mm0,        QWORD PTR [%2+%4]       ; source row 1

        movq            mm1,        QWORD PTR [%3+%5]
        psadbw          mm1,        mm0
        paddw           mm5,        mm1

        movq            mm2,        QWORD PTR [%3+%5+1]
        psadbw          mm2,        mm0
        paddw           mm6,        mm2

        movq            mm3,        QWORD PTR [%3+%5+2]
        psadbw          mm3,        mm0
        paddw           mm7,        mm3

%if %1==0 || %1==1
        ; step both pointers to the next row pair
        lea             %2,         [%2+%4*2]
        lea             %3,         [%3+%5*2]
%endif
%endmacro
; LOAD_X4_ADDRESSES table, dst0, dst1, dst2, dst3
;   Loads four consecutive pointer-sized entries (REG_SZ_BYTES apart) from
;   the table at %1 into %2..%5.
;   The load order is significant: callers pass the table register itself as
;   %5, so that destination must be written last.
%macro LOAD_X4_ADDRESSES 5
        mov             %2,         [%1]                    ; entry 0
        mov             %3,         [%1+REG_SZ_BYTES*1]     ; entry 1
        mov             %4,         [%1+REG_SZ_BYTES*2]     ; entry 2
        mov             %5,         [%1+REG_SZ_BYTES*3]     ; entry 3 (may overwrite %1)
%endmacro
; PROCESS_16X2X4 mode, src, ref0, ref1, ref2, ref3, src_stride, ref_stride
;   Handles two 16-byte rows. Each source row is compared (psadbw) against
;   the corresponding row of four separate reference blocks; the sums for
;   ref0..ref3 are kept in xmm4, xmm5, xmm6 and xmm7 respectively.
;     mode 0 : first row pair - initialises xmm4..xmm7
;     mode 1 : accumulates into xmm4..xmm7
;     mode 2 : as mode 1, but no pointer is advanced (used for the last pair)
;   Source rows are read with movdqa (aligned load), reference rows with lddqu.
;   Clobbers xmm0-xmm3; advances %2..%6 by two rows in modes 0 and 1.
;   NOTE(review): xmm6/xmm7 are callee-saved under the Microsoft x64
;   convention and nothing visible in this file saves them - confirm.
%macro PROCESS_16X2X4 8
        movdqa          xmm0,       XMMWORD PTR [%2]        ; source row 0
%if %1==0
        lddqu           xmm4,       XMMWORD PTR [%3]
        psadbw          xmm4,       xmm0

        lddqu           xmm5,       XMMWORD PTR [%4]
        psadbw          xmm5,       xmm0

        lddqu           xmm6,       XMMWORD PTR [%5]
        psadbw          xmm6,       xmm0

        lddqu           xmm7,       XMMWORD PTR [%6]
        psadbw          xmm7,       xmm0
%else
        lddqu           xmm1,       XMMWORD PTR [%3]
        psadbw          xmm1,       xmm0
        paddw           xmm4,       xmm1

        lddqu           xmm2,       XMMWORD PTR [%4]
        psadbw          xmm2,       xmm0
        paddw           xmm5,       xmm2

        lddqu           xmm3,       XMMWORD PTR [%5]
        psadbw          xmm3,       xmm0
        paddw           xmm6,       xmm3

        lddqu           xmm1,       XMMWORD PTR [%6]        ; xmm1 reused for the fourth candidate
        psadbw          xmm1,       xmm0
        paddw           xmm7,       xmm1
%endif
        movdqa          xmm0,       XMMWORD PTR [%2+%7]     ; source row 1

        lddqu           xmm1,       XMMWORD PTR [%3+%8]
        psadbw          xmm1,       xmm0
        paddw           xmm4,       xmm1

        lddqu           xmm2,       XMMWORD PTR [%4+%8]
        psadbw          xmm2,       xmm0
        paddw           xmm5,       xmm2

        lddqu           xmm3,       XMMWORD PTR [%5+%8]
        psadbw          xmm3,       xmm0
        paddw           xmm6,       xmm3

        lddqu           xmm1,       XMMWORD PTR [%6+%8]     ; xmm1 reused for the fourth candidate
        psadbw          xmm1,       xmm0
        paddw           xmm7,       xmm1

%if %1==0 || %1==1
        ; step the source and all four reference pointers to the next row pair
        lea             %2,         [%2+%7*2]
        lea             %3,         [%3+%8*2]
        lea             %4,         [%4+%8*2]
        lea             %5,         [%5+%8*2]
        lea             %6,         [%6+%8*2]
%endif
%endmacro
; PROCESS_8X2X4 mode, src, ref0, ref1, ref2, ref3, src_stride, ref_stride
;   8-byte-wide (MMX register) counterpart of PROCESS_16X2X4: handles two
;   8-byte rows against four separate reference blocks. The sums for
;   ref0..ref3 are kept in mm4, mm5, mm6 and mm7 respectively.
;     mode 0 : first row pair - initialises mm4..mm7
;     mode 1 : accumulates into mm4..mm7
;     mode 2 : as mode 1, but no pointer is advanced (used for the last pair)
;   Clobbers mm0-mm3; advances %2..%6 by two rows in modes 0 and 1.
%macro PROCESS_8X2X4 8
        movq            mm0,        QWORD PTR [%2]          ; source row 0
%if %1==0
        movq            mm4,        QWORD PTR [%3]
        psadbw          mm4,        mm0

        movq            mm5,        QWORD PTR [%4]
        psadbw          mm5,        mm0

        movq            mm6,        QWORD PTR [%5]
        psadbw          mm6,        mm0

        movq            mm7,        QWORD PTR [%6]
        psadbw          mm7,        mm0
%else
        movq            mm1,        QWORD PTR [%3]
        psadbw          mm1,        mm0
        paddw           mm4,        mm1

        movq            mm2,        QWORD PTR [%4]
        psadbw          mm2,        mm0
        paddw           mm5,        mm2

        movq            mm3,        QWORD PTR [%5]
        psadbw          mm3,        mm0
        paddw           mm6,        mm3

        movq            mm1,        QWORD PTR [%6]          ; mm1 reused for the fourth candidate
        psadbw          mm1,        mm0
        paddw           mm7,        mm1
%endif
        movq            mm0,        QWORD PTR [%2+%7]       ; source row 1

        movq            mm1,        QWORD PTR [%3+%8]
        psadbw          mm1,        mm0
        paddw           mm4,        mm1

        movq            mm2,        QWORD PTR [%4+%8]
        psadbw          mm2,        mm0
        paddw           mm5,        mm2

        movq            mm3,        QWORD PTR [%5+%8]
        psadbw          mm3,        mm0
        paddw           mm6,        mm3

        movq            mm1,        QWORD PTR [%6+%8]       ; mm1 reused for the fourth candidate
        psadbw          mm1,        mm0
        paddw           mm7,        mm1

%if %1==0 || %1==1
        ; step the source and all four reference pointers to the next row pair
        lea             %2,         [%2+%7*2]
        lea             %3,         [%3+%8*2]
        lea             %4,         [%4+%8*2]
        lea             %5,         [%5+%8*2]
        lea             %6,         [%6+%8*2]
%endif
%endmacro
;void vp8_sad16x16x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Sum of absolute differences of a 16x16 source block against the reference
; block at ref_ptr+0, ref_ptr+1 and ref_ptr+2. The three sums are stored to
; results[0], results[1] and results[2].
global sym(vp8_sad16x16x3_sse3)
sym(vp8_sad16x16x3_sse3):

    STACK_FRAME_CREATE_X3

        ; 8 row pairs = 16 rows: first pair initialises the accumulators,
        ; the last pair leaves the pointers where they are
        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
%rep 6
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
%endrep
        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        mov             rcx,        result_ptr

        ; psadbw leaves one partial sum per 64-bit half; fold the upper half
        ; onto the lower one and store the low dword

        ; results[0]  (offset +0, xmm5)
        movq            xmm0,       xmm5
        psrldq          xmm5,       8
        paddw           xmm0,       xmm5
        movd            [rcx],      xmm0

        ; results[1]  (offset +1, xmm6)
        movq            xmm0,       xmm6
        psrldq          xmm6,       8
        paddw           xmm0,       xmm6
        movd            [rcx+4],    xmm0

        ; results[2]  (offset +2, xmm7)
        movq            xmm0,       xmm7
        psrldq          xmm7,       8
        paddw           xmm0,       xmm7
        movd            [rcx+8],    xmm0

    STACK_FRAME_DESTROY_X3
;void vp8_sad16x8x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Sum of absolute differences of a 16-wide, 8-row source block against the
; reference block at ref_ptr+0, ref_ptr+1 and ref_ptr+2. The three sums are
; stored to results[0], results[1] and results[2].
global sym(vp8_sad16x8x3_sse3)
sym(vp8_sad16x8x3_sse3):

    STACK_FRAME_CREATE_X3

        ; 4 row pairs = 8 rows: first pair initialises the accumulators,
        ; the last pair leaves the pointers where they are
        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
%rep 2
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
%endrep
        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        mov             rcx,        result_ptr

        ; psadbw leaves one partial sum per 64-bit half; fold the upper half
        ; onto the lower one and store the low dword

        ; results[0]  (offset +0, xmm5)
        movq            xmm0,       xmm5
        psrldq          xmm5,       8
        paddw           xmm0,       xmm5
        movd            [rcx],      xmm0

        ; results[1]  (offset +1, xmm6)
        movq            xmm0,       xmm6
        psrldq          xmm6,       8
        paddw           xmm0,       xmm6
        movd            [rcx+4],    xmm0

        ; results[2]  (offset +2, xmm7)
        movq            xmm0,       xmm7
        psrldq          xmm7,       8
        paddw           xmm0,       xmm7
        movd            [rcx+8],    xmm0

    STACK_FRAME_DESTROY_X3
;void vp8_sad8x16x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Sum of absolute differences of an 8-wide, 16-row source block against the
; reference block at ref_ptr+0, ref_ptr+1 and ref_ptr+2. The three sums are
; stored to results[0], results[1] and results[2]. Uses MMX registers.
global sym(vp8_sad8x16x3_sse3)
sym(vp8_sad8x16x3_sse3):

    STACK_FRAME_CREATE_X3

        ; 8 row pairs = 16 rows: first pair initialises the accumulators,
        ; the last pair leaves the pointers where they are
        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
%rep 6
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
%endrep
        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        mov             rcx,        result_ptr

        ; pack the +0 and +1 sums into one qword so both go out in one store
        punpckldq       mm5,        mm6

        movq            [rcx],      mm5                     ; results[0], results[1]
        movd            [rcx+8],    mm7                     ; results[2]

    STACK_FRAME_DESTROY_X3
;void vp8_sad8x8x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Sum of absolute differences of an 8x8 source block against the reference
; block at ref_ptr+0, ref_ptr+1 and ref_ptr+2. The three sums are stored to
; results[0], results[1] and results[2]. Uses MMX registers.
global sym(vp8_sad8x8x3_sse3)
sym(vp8_sad8x8x3_sse3):

    STACK_FRAME_CREATE_X3

        ; 4 row pairs = 8 rows: first pair initialises the accumulators,
        ; the last pair leaves the pointers where they are
        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
%rep 2
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
%endrep
        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        mov             rcx,        result_ptr

        ; pack the +0 and +1 sums into one qword so both go out in one store
        punpckldq       mm5,        mm6

        movq            [rcx],      mm5                     ; results[0], results[1]
        movd            [rcx+8],    mm7                     ; results[2]

    STACK_FRAME_DESTROY_X3
;void vp8_sad4x4x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Sum of absolute differences of a 4x4 source block against the reference
; block at ref_ptr+0, ref_ptr+1 and ref_ptr+2. The three sums are stored to
; results[0], results[1] and results[2].
; Two 4-byte rows are byte-interleaved (punpcklbw) into one 8-byte MMX
; register so that a single psadbw covers a row pair; source and reference
; are interleaved the same way, so the byte pairing is preserved.
global sym(vp8_sad4x4x3_sse3)
sym(vp8_sad4x4x3_sse3):

    STACK_FRAME_CREATE_X3

        ; ---- rows 0 and 1 ----
        movd            mm0,        DWORD PTR [src_ptr]
        movd            mm2,        DWORD PTR [src_ptr+src_stride]
        punpcklbw       mm0,        mm2                     ; mm0 = source rows 0/1

        movd            mm1,        DWORD PTR [ref_ptr]
        movd            mm3,        DWORD PTR [ref_ptr+ref_stride]
        punpcklbw       mm1,        mm3                     ; reference +0

        movd            mm4,        DWORD PTR [ref_ptr+1]
        movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
        punpcklbw       mm4,        mm2                     ; reference +1

        movd            mm5,        DWORD PTR [ref_ptr+2]
        movd            mm3,        DWORD PTR [ref_ptr+ref_stride+2]
        punpcklbw       mm5,        mm3                     ; reference +2

        psadbw          mm1,        mm0                     ; partial sum, offset +0
        psadbw          mm4,        mm0                     ; partial sum, offset +1
        psadbw          mm5,        mm0                     ; partial sum, offset +2

        lea             src_ptr,    [src_ptr+src_stride*2]
        lea             ref_ptr,    [ref_ptr+ref_stride*2]

        ; ---- rows 2 and 3 ----
        movd            mm0,        DWORD PTR [src_ptr]
        movd            mm3,        DWORD PTR [src_ptr+src_stride]
        punpcklbw       mm0,        mm3                     ; mm0 = source rows 2/3

        movd            mm2,        DWORD PTR [ref_ptr]
        movd            mm6,        DWORD PTR [ref_ptr+ref_stride]
        punpcklbw       mm2,        mm6                     ; reference +0
        psadbw          mm2,        mm0
        paddw           mm1,        mm2                     ; mm1 = total, offset +0

        movd            mm3,        DWORD PTR [ref_ptr+1]
        movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
        punpcklbw       mm3,        mm2                     ; reference +1

        movd            mm7,        DWORD PTR [ref_ptr+2]
        movd            mm6,        DWORD PTR [ref_ptr+ref_stride+2]
        punpcklbw       mm7,        mm6                     ; reference +2

        psadbw          mm3,        mm0
        psadbw          mm7,        mm0

        paddw           mm3,        mm4                     ; mm3 = total, offset +1
        paddw           mm7,        mm5                     ; mm7 = total, offset +2

        ; rcx may alias src_ptr or ref_stride depending on the ABI, so it is
        ; only overwritten once all loads are done
        mov             rcx,        result_ptr

        punpckldq       mm1,        mm3                     ; pack the +0 and +1 totals

        movq            [rcx],      mm1                     ; results[0], results[1]
        movd            [rcx+8],    mm7                     ; results[2]

    STACK_FRAME_DESTROY_X3
;unsigned int vp8_sad16x16_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  max_err)
;
; Sum of absolute differences of one 16x16 block, two rows per iteration,
; using MMX registers. Before each row pair the running sum is compared with
; max_err; once it is greater (signed compare) the loop is left early and the
; partial sum is returned in rax.
; NOTE(review): on the 64-bit paths ret_var and max_err are compared as
; 64-bit operands although max_err is declared int - confirm the upper half
; of the argument is well defined for every caller/ABI.
;%define lddqu movdqu
global sym(vp8_sad16x16_sse3)
sym(vp8_sad16x16_sse3):

    STACK_FRAME_CREATE_X3

        ; end_ptr = src_ptr + 16 * src_stride (built in two steps of 8 rows)
        lea             end_ptr,    [src_ptr+src_stride*8]
        lea             end_ptr,    [end_ptr+src_stride*8]

        pxor            mm7,        mm7                     ; running sum = 0

.vp8_sad16x16_sse3_loop:

        ; early-out test on the sum accumulated so far
        movq            ret_var,    mm7
        cmp             ret_var,    max_err
        jg              .vp8_sad16x16_early_exit

        ; row 0, bytes 0-7
        movq            mm0,        QWORD PTR [src_ptr]
        movq            mm1,        QWORD PTR [ref_ptr]
        psadbw          mm0,        mm1

        ; row 0, bytes 8-15
        movq            mm2,        QWORD PTR [src_ptr+8]
        movq            mm3,        QWORD PTR [ref_ptr+8]
        psadbw          mm2,        mm3

        ; row 1, bytes 0-7
        movq            mm4,        QWORD PTR [src_ptr+src_stride]
        movq            mm5,        QWORD PTR [ref_ptr+ref_stride]
        psadbw          mm4,        mm5

        ; row 1, bytes 8-15
        movq            mm1,        QWORD PTR [src_ptr+src_stride+8]
        movq            mm3,        QWORD PTR [ref_ptr+ref_stride+8]
        psadbw          mm1,        mm3

        ; advance to the next row pair (lea leaves the flags untouched)
        lea             src_ptr,    [src_ptr+src_stride*2]
        lea             ref_ptr,    [ref_ptr+ref_stride*2]

        paddw           mm0,        mm2                     ; row 0 total
        paddw           mm4,        mm1                     ; row 1 total

        paddw           mm7,        mm0
        paddw           mm7,        mm4

        cmp             src_ptr,    end_ptr
        jne             .vp8_sad16x16_sse3_loop

        movq            ret_var,    mm7                     ; full 16-row sum

.vp8_sad16x16_early_exit:

        mov             rax,        ret_var                 ; return value

    STACK_FRAME_DESTROY_X3
;void vp8_sad16x16x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr_base,
;    int  ref_stride,
;    int  *results)
;
; Sum of absolute differences of a 16x16 source block against four reference
; blocks whose addresses are read from the table at ref_ptr_base. The four
; sums are stored to results[0..3].
global sym(vp8_sad16x16x4d_sse3)
sym(vp8_sad16x16x4d_sse3):

    STACK_FRAME_CREATE_X4

        ; 8 row pairs = 16 rows: first pair initialises the accumulators,
        ; the last pair leaves the pointers where they are
        PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
%rep 6
        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
%endrep
        PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

%if ABI_IS_32BIT
        ; rbp held ref_stride; get the frame pointer back (pushed a second
        ; time by STACK_FRAME_CREATE_X4) before result_ptr is read
        pop             rbp
%endif
        mov             rcx,        result_ptr

        ; psadbw leaves one partial sum per 64-bit half; fold the upper half
        ; onto the lower one and store the low dword

        ; results[0]  (xmm4)
        movq            xmm0,       xmm4
        psrldq          xmm4,       8
        paddw           xmm0,       xmm4
        movd            [rcx],      xmm0

        ; results[1]  (xmm5)
        movq            xmm0,       xmm5
        psrldq          xmm5,       8
        paddw           xmm0,       xmm5
        movd            [rcx+4],    xmm0

        ; results[2]  (xmm6)
        movq            xmm0,       xmm6
        psrldq          xmm6,       8
        paddw           xmm0,       xmm6
        movd            [rcx+8],    xmm0

        ; results[3]  (xmm7)
        movq            xmm0,       xmm7
        psrldq          xmm7,       8
        paddw           xmm0,       xmm7
        movd            [rcx+12],   xmm0

    STACK_FRAME_DESTROY_X4
;void vp8_sad16x8x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr_base,
;    int  ref_stride,
;    int  *results)
;
; Sum of absolute differences of a 16-wide, 8-row source block against four
; reference blocks whose addresses are read from the table at ref_ptr_base.
; The four sums are stored to results[0..3].
global sym(vp8_sad16x8x4d_sse3)
sym(vp8_sad16x8x4d_sse3):

    STACK_FRAME_CREATE_X4

        ; 4 row pairs = 8 rows: first pair initialises the accumulators,
        ; the last pair leaves the pointers where they are
        PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
%rep 2
        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
%endrep
        PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

%if ABI_IS_32BIT
        ; rbp held ref_stride; get the frame pointer back (pushed a second
        ; time by STACK_FRAME_CREATE_X4) before result_ptr is read
        pop             rbp
%endif
        mov             rcx,        result_ptr

        ; psadbw leaves one partial sum per 64-bit half; fold the upper half
        ; onto the lower one and store the low dword

        ; results[0]  (xmm4)
        movq            xmm0,       xmm4
        psrldq          xmm4,       8
        paddw           xmm0,       xmm4
        movd            [rcx],      xmm0

        ; results[1]  (xmm5)
        movq            xmm0,       xmm5
        psrldq          xmm5,       8
        paddw           xmm0,       xmm5
        movd            [rcx+4],    xmm0

        ; results[2]  (xmm6)
        movq            xmm0,       xmm6
        psrldq          xmm6,       8
        paddw           xmm0,       xmm6
        movd            [rcx+8],    xmm0

        ; results[3]  (xmm7)
        movq            xmm0,       xmm7
        psrldq          xmm7,       8
        paddw           xmm0,       xmm7
        movd            [rcx+12],   xmm0

    STACK_FRAME_DESTROY_X4
;void vp8_sad8x16x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Sum of absolute differences of an 8-wide, 16-row source block against four
; reference blocks. The third argument is treated as a table of four
; reference pointers (it is loaded through STACK_FRAME_CREATE_X4). The four
; sums are stored to results[0..3]. Uses MMX registers.
global sym(vp8_sad8x16x4d_sse3)
sym(vp8_sad8x16x4d_sse3):

    STACK_FRAME_CREATE_X4

        ; 8 row pairs = 16 rows: first pair initialises the accumulators,
        ; the last pair leaves the pointers where they are
        PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
%rep 6
        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
%endrep
        PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

%if ABI_IS_32BIT
        ; rbp held ref_stride; get the frame pointer back (pushed a second
        ; time by STACK_FRAME_CREATE_X4) before result_ptr is read
        pop             rbp
%endif
        mov             rcx,        result_ptr

        ; pack the four 32-bit sums pairwise so they go out in two stores
        punpckldq       mm4,        mm5
        punpckldq       mm6,        mm7

        movq            [rcx],      mm4                     ; results[0], results[1]
        movq            [rcx+8],    mm6                     ; results[2], results[3]

    STACK_FRAME_DESTROY_X4
;void vp8_sad8x8x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Sum of absolute differences of an 8x8 source block against four reference
; blocks. The third argument is treated as a table of four reference
; pointers (it is loaded through STACK_FRAME_CREATE_X4). The four sums are
; stored to results[0..3]. Uses MMX registers.
global sym(vp8_sad8x8x4d_sse3)
sym(vp8_sad8x8x4d_sse3):

    STACK_FRAME_CREATE_X4

        ; 4 row pairs = 8 rows: first pair initialises the accumulators,
        ; the last pair leaves the pointers where they are
        PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
%rep 2
        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
%endrep
        PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

%if ABI_IS_32BIT
        ; rbp held ref_stride; get the frame pointer back (pushed a second
        ; time by STACK_FRAME_CREATE_X4) before result_ptr is read
        pop             rbp
%endif
        mov             rcx,        result_ptr

        ; pack the four 32-bit sums pairwise so they go out in two stores
        punpckldq       mm4,        mm5
        punpckldq       mm6,        mm7

        movq            [rcx],      mm4                     ; results[0], results[1]
        movq            [rcx+8],    mm6                     ; results[2], results[3]

    STACK_FRAME_DESTROY_X4
;void vp8_sad4x4x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Sum of absolute differences of a 4x4 source block against four reference
; blocks. The third argument is treated as a table of four reference
; pointers (it is loaded through STACK_FRAME_CREATE_X4). The four sums are
; stored to results[0..3].
; Two 4-byte rows are byte-interleaved (punpcklbw) into one 8-byte MMX
; register so that a single psadbw covers a row pair.
global sym(vp8_sad4x4x4d_sse3)
sym(vp8_sad4x4x4d_sse3):

    STACK_FRAME_CREATE_X4

        ; ---- rows 0 and 1 ----
        movd            mm0,        DWORD PTR [src_ptr]
        movd            mm2,        DWORD PTR [src_ptr+src_stride]
        punpcklbw       mm0,        mm2                     ; mm0 = source rows 0/1

        movd            mm1,        DWORD PTR [r0_ptr]
        movd            mm3,        DWORD PTR [r0_ptr+ref_stride]
        punpcklbw       mm1,        mm3
        psadbw          mm1,        mm0                     ; partial sum, ref0

        movd            mm4,        DWORD PTR [r1_ptr]
        movd            mm2,        DWORD PTR [r1_ptr+ref_stride]
        punpcklbw       mm4,        mm2
        psadbw          mm4,        mm0                     ; partial sum, ref1

        movd            mm5,        DWORD PTR [r2_ptr]
        movd            mm3,        DWORD PTR [r2_ptr+ref_stride]
        punpcklbw       mm5,        mm3
        psadbw          mm5,        mm0                     ; partial sum, ref2

        movd            mm6,        DWORD PTR [r3_ptr]
        movd            mm7,        DWORD PTR [r3_ptr+ref_stride]
        punpcklbw       mm6,        mm7
        psadbw          mm6,        mm0                     ; partial sum, ref3

        ; step every pointer down two rows
        lea             src_ptr,    [src_ptr+src_stride*2]
        lea             r0_ptr,     [r0_ptr+ref_stride*2]
        lea             r1_ptr,     [r1_ptr+ref_stride*2]
        lea             r2_ptr,     [r2_ptr+ref_stride*2]
        lea             r3_ptr,     [r3_ptr+ref_stride*2]

        ; ---- rows 2 and 3 ----
        movd            mm0,        DWORD PTR [src_ptr]
        movd            mm3,        DWORD PTR [src_ptr+src_stride]
        punpcklbw       mm0,        mm3                     ; mm0 = source rows 2/3

        movd            mm2,        DWORD PTR [r0_ptr]
        movd            mm7,        DWORD PTR [r0_ptr+ref_stride]
        punpcklbw       mm2,        mm7

        movd            mm3,        DWORD PTR [r1_ptr]      ; ref1 row 2 (row 3 is added below)
        movd            mm7,        DWORD PTR [r2_ptr]      ; ref2 row 2 (row 3 is added below)

        psadbw          mm2,        mm0

        ; From here on src_ptr, src_stride and r0_ptr are dead, which frees
        ; rax and rsi on every ABI variant.
%if ABI_IS_32BIT
        ; move ref_stride out of rbp so the frame pointer can be restored
        ; and result_ptr (arg(4)) read
        mov             rax,        rbp

        pop             rbp
%define     ref_stride    rax
%endif
        mov             rsi,        result_ptr

        paddw           mm1,        mm2
        movd            [rsi],      mm1                     ; results[0]

        ; the loads and stores below are kept in their original relative order
        movd            mm2,        DWORD PTR [r1_ptr+ref_stride]
        movd            mm1,        DWORD PTR [r2_ptr+ref_stride]

        punpcklbw       mm3,        mm2
        punpcklbw       mm7,        mm1

        psadbw          mm3,        mm0
        psadbw          mm7,        mm0

        movd            mm2,        DWORD PTR [r3_ptr]
        movd            mm1,        DWORD PTR [r3_ptr+ref_stride]

        paddw           mm3,        mm4
        paddw           mm7,        mm5

        movd            [rsi+4],    mm3                     ; results[1]
        punpcklbw       mm2,        mm1

        movd            [rsi+8],    mm7                     ; results[2]
        psadbw          mm2,        mm0

        paddw           mm2,        mm6
        movd            [rsi+12],   mm2                     ; results[3]

    STACK_FRAME_DESTROY_X4