Removed unused vp8_recon_intra4x4mb function
[libvpx.git] / vp8 / encoder / x86 / variance_impl_mmx.asm
blob67a9b4d3efa2e624acb41754045e601b6b10da3d
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
12 %include "vpx_ports/x86_abi_support.asm"
;-----------------------------------------------------------------------
;unsigned int vp8_get_mb_ss_mmx( short *src_ptr )
;
; Sum of squares of the 256 signed 16-bit values starting at src_ptr
; (16 iterations, 32 bytes = 16 words consumed per iteration).
;
; In:    arg(0) = src_ptr
; Out:   eax    = sum of squares
; Uses:  rax (pointer, then result), rcx (loop counter / scratch),
;        mm0-mm3 (row data), mm4 (two dword partial sums),
;        8 bytes of stack scratch for the final horizontal add.
; NOTE(review): SHADOW_ARGS_TO_STACK is invoked with 7 although only
; arg(0) is read, and GET_GOT is set up although no GLOBAL() reference
; is made here - both look unnecessary; confirm against the macro
; definitions before trimming.
;-----------------------------------------------------------------------
global sym(vp8_get_mb_ss_mmx)
sym(vp8_get_mb_ss_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 8
    ; end prolog

    mov         rax, arg(0)             ;src_ptr
    mov         rcx, 16                 ; iterations remaining
    pxor        mm4, mm4                ; mm4 = running sum (2 x dword)

.next_row:
    movq        mm0, [rax]              ; 16 words per iteration
    movq        mm1, [rax+8]
    movq        mm2, [rax+16]
    movq        mm3, [rax+24]

    pmaddwd     mm0, mm0                ; square words, add adjacent pairs
    pmaddwd     mm1, mm1
    pmaddwd     mm2, mm2
    pmaddwd     mm3, mm3

    paddd       mm4, mm0
    paddd       mm4, mm1
    paddd       mm4, mm2
    paddd       mm4, mm3

    add         rax, 32
    dec         rcx
    ; dec does not write CF, so the loop must test ZF only; the former
    ; "ja" also depended on the carry left behind by "add rax, 32".
    jnz         .next_row

    movq        QWORD PTR [rsp], mm4

    ;return sum[0]+sum[1];
    movsxd      rax, dword ptr [rsp]
    movsxd      rcx, dword ptr [rsp+4]
    add         rax, rcx

    ; begin epilog
    add         rsp, 8
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
;unsigned int vp8_get8x8var_mmx
;(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *SSE,
;    int *Sum
;)
;
; For an 8x8 block, computes the sum of (src - ref) and the sum of
; (src - ref)^2, stores them to *Sum and *SSE, and returns 0.
;
; Register roles:
;   rax = src row pointer        rcx = source_stride
;   rbx = ref row pointer        rdx = recon_stride
;   mm5 = 4 x int16 running sums of differences
;   mm6 = zero (used to widen bytes to words)
;   mm7 = 2 x int32 running sums of squared differences
; Stack scratch: [rsp] = mm7 spill, [rsp+8] = mm5 spill.
;-----------------------------------------------------------------------
global sym(vp8_get8x8var_mmx)
sym(vp8_get8x8var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    push        rbx
    sub         rsp, 16
    ; end prolog

    pxor        mm5, mm5                ; clear difference accumulator
    pxor        mm6, mm6                ; zero for byte->word unpack
    pxor        mm7, mm7                ; clear squared-difference accumulator

    mov         rax, arg(0)             ;[src_ptr]  Load base addresses
    mov         rbx, arg(2)             ;[ref_ptr]
    movsxd      rcx, dword ptr arg(1)   ;[source_stride]
    movsxd      rdx, dword ptr arg(3)   ;[recon_stride]

    ; One expansion per row; each row is loaded only when it is
    ; processed, so nothing past the eighth row is read.
%rep 8
    movq        mm0, [rax]              ; eight source bytes
    movq        mm1, [rbx]              ; eight reference bytes
    movq        mm2, mm0                ; copies for the high halves
    movq        mm3, mm1

    punpcklbw   mm0, mm6                ; low four pixels -> words
    punpcklbw   mm1, mm6
    punpckhbw   mm2, mm6                ; high four pixels -> words
    punpckhbw   mm3, mm6
    psubsw      mm0, mm1                ; A-B (low order)
    psubsw      mm2, mm3                ; A-B (high order)

    paddw       mm5, mm0                ; accumulate differences
    paddw       mm5, mm2

    pmaddwd     mm0, mm0                ; square, add adjacent pairs
    pmaddwd     mm2, mm2
    add         rbx, rdx                ; next reference row
    add         rax, rcx                ; next source row
    paddd       mm7, mm0                ; accumulate squares
    paddd       mm7, mm2
%endrep

    ; Now accumulate the final results.
    movq        QWORD PTR [rsp+8], mm5  ; spill the four word sums
    movq        QWORD PTR [rsp], mm7    ; spill the two dword sums
    movsx       rdx, WORD PTR [rsp+8]
    movsx       rcx, WORD PTR [rsp+10]
    movsx       rbx, WORD PTR [rsp+12]
    movsx       rax, WORD PTR [rsp+14]
    add         rdx, rcx
    add         rbx, rax
    add         rdx, rbx                ;XSum
    movsxd      rax, DWORD PTR [rsp]
    movsxd      rcx, DWORD PTR [rsp+4]
    add         rax, rcx                ;XXSum
    mov         rsi, arg(4)             ;SSE
    mov         rdi, arg(5)             ;Sum
    mov         dword ptr [rsi], eax
    mov         dword ptr [rdi], edx
    xor         rax, rax                ; return 0

    ; begin epilog
    add         rsp, 16
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret                                 ; without this, execution falls into the next function
;-----------------------------------------------------------------------
;unsigned int
;vp8_get4x4var_mmx
;(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *SSE,
;    int *Sum
;)
;
; For a 4x4 block, computes the sum of (src - ref) and the sum of
; (src - ref)^2, stores them to *Sum and *SSE, and returns 0.
;
; Register roles:
;   rax = src row pointer        rcx = source_stride
;   rbx = ref row pointer        rdx = recon_stride
;   mm5 = 4 x int16 running sums of differences
;   mm6 = zero (used to widen bytes to words)
;   mm7 = 2 x int32 running sums of squared differences
; Stack scratch: [rsp] = mm7 spill, [rsp+8] = mm5 spill.
;-----------------------------------------------------------------------
global sym(vp8_get4x4var_mmx)
sym(vp8_get4x4var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    push        rbx
    sub         rsp, 16
    ; end prolog

    pxor        mm5, mm5                ; clear difference accumulator
    pxor        mm6, mm6                ; zero for byte->word unpack
    pxor        mm7, mm7                ; clear squared-difference accumulator

    mov         rax, arg(0)             ;[src_ptr]  Load base addresses
    mov         rbx, arg(2)             ;[ref_ptr]
    movsxd      rcx, dword ptr arg(1)   ;[source_stride]
    movsxd      rdx, dword ptr arg(3)   ;[recon_stride]

    ; Only the low four bytes of each row are used (punpcklbw), so load
    ; exactly four bytes with movd; the previous movq loads read four
    ; bytes beyond each 4-pixel row.
%rep 4
    movd        mm0, [rax]              ; four source bytes
    movd        mm1, [rbx]              ; four reference bytes
    punpcklbw   mm0, mm6                ; widen to words
    punpcklbw   mm1, mm6
    psubsw      mm0, mm1                ; A-B
    paddw       mm5, mm0                ; accumulate differences
    pmaddwd     mm0, mm0                ; square, add adjacent pairs
    add         rbx, rdx                ; next reference row
    add         rax, rcx                ; next source row
    paddd       mm7, mm0                ; accumulate squares
%endrep

    ; Now accumulate the final results.
    movq        QWORD PTR [rsp+8], mm5  ; spill the four word sums
    movq        QWORD PTR [rsp], mm7    ; spill the two dword sums
    movsx       rdx, WORD PTR [rsp+8]
    movsx       rcx, WORD PTR [rsp+10]
    movsx       rbx, WORD PTR [rsp+12]
    movsx       rax, WORD PTR [rsp+14]
    add         rdx, rcx
    add         rbx, rax
    add         rdx, rbx                ;XSum
    movsxd      rax, DWORD PTR [rsp]
    movsxd      rcx, DWORD PTR [rsp+4]
    add         rax, rcx                ;XXSum
    mov         rsi, arg(4)             ;SSE
    mov         rdi, arg(5)             ;Sum
    mov         dword ptr [rsi], eax
    mov         dword ptr [rdi], edx
    xor         rax, rax                ; return 0

    ; begin epilog
    add         rsp, 16
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret                                 ; without this, execution falls into the next function
;-----------------------------------------------------------------------
;unsigned int
;vp8_get4x4sse_cs_mmx
;(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride
;)
;
; Returns the sum of squared differences between a 4x4 source block
; and a 4x4 reference block.
;
; Register roles:
;   rax = src row pointer (result on exit)   rcx = source_stride
;   rbx = ref row pointer                    rdx = recon_stride
;   mm6 = zero (used to widen bytes to words)
;   mm7 = 2 x int32 running sums of squared differences
;-----------------------------------------------------------------------
global sym(vp8_get4x4sse_cs_mmx)
sym(vp8_get4x4sse_cs_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    pxor        mm6, mm6                ; zero for byte->word unpack
    pxor        mm7, mm7                ; clear squared-difference accumulator

    mov         rax, arg(0)             ;[src_ptr]  Load base addresses
    mov         rbx, arg(2)             ;[ref_ptr]
    movsxd      rcx, dword ptr arg(1)   ;[source_stride]
    movsxd      rdx, dword ptr arg(3)   ;[recon_stride]

%rep 4
    movd        mm0, [rax]              ; four source bytes
    movd        mm1, [rbx]              ; four reference bytes
    punpcklbw   mm0, mm6                ; widen to words
    punpcklbw   mm1, mm6
    psubsw      mm0, mm1                ; A-B
    pmaddwd     mm0, mm0                ; square, add adjacent pairs
    add         rbx, rdx                ; next reference row
    add         rax, rcx                ; next source row
    paddd       mm7, mm0                ; accumulate squares
%endrep

    ; Fold the two dword partial sums; the total ends up in the low
    ; dword of mm0.
    movq        mm0, mm7
    psrlq       mm7, 32

    paddd       mm0, mm7
    movq        rax, mm0                ; low 32 bits = SSE

    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret                                 ; without this, execution falls into the next function
; Right shift applied after each bilinear filter pass; mmx_bi_rd (64)
; is added before the shift.
%define mmx_filter_shift            7

;-----------------------------------------------------------------------
;void vp8_filter_block2d_bil4x4_var_mmx
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned short *HFilter,
;    unsigned short *VFilter,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; Applies a two-tap horizontal then two-tap vertical filter to five
; rows of ref_ptr (4 pixels wide), subtracts the four matching src_ptr
; rows, and stores the sum of differences to *sum and the sum of
; squared differences to *sumsquared.
;
; HFilter / VFilter are each read as two groups of four 16-bit
; coefficients, at byte offsets 0 and 8.
;
; Register roles:
;   rsi = ref row pointer        rdi = src row pointer
;   rax = HFilter                rdx = VFilter
;   rcx = rows remaining (4)
;   mm0 = zero, mm5 = previous horizontally filtered row
;   mm6 = 4 x int16 difference sums, mm7 = 2 x int32 squared sums
;-----------------------------------------------------------------------
global sym(vp8_filter_block2d_bil4x4_var_mmx)
sym(vp8_filter_block2d_bil4x4_var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16
    ; end prolog

    pxor        mm6, mm6                ; difference accumulator
    pxor        mm7, mm7                ; squared-difference accumulator

    mov         rax, arg(4)             ;HFilter
    mov         rdx, arg(5)             ;VFilter

    mov         rsi, arg(0)             ;ref_ptr
    mov         rdi, arg(2)             ;src_ptr

    mov         rcx, 4                  ; output rows
    pxor        mm0, mm0                ; zero for byte->word unpack

%if ABI_IS_32BIT=0
    ; Strides never change, so load them once instead of on every row.
    movsxd      r8, dword ptr arg(1)    ;ref_pixels_per_line
    movsxd      r9, dword ptr arg(3)    ;src_pixels_per_line
%endif

    ; Horizontal pass over the first reference row; kept in mm5 as the
    ; "previous row" input of the vertical pass.
    movd        mm1, [rsi]
    movd        mm3, [rsi+1]

    punpcklbw   mm1, mm0
    pmullw      mm1, [rax]

    punpcklbw   mm3, mm0
    pmullw      mm3, [rax+8]

    paddw       mm1, mm3
    paddw       mm1, [GLOBAL(mmx_bi_rd)]

    psraw       mm1, mmx_filter_shift
    movq        mm5, mm1

%if ABI_IS_32BIT
    add         rsi, dword ptr arg(1)   ;ref_pixels_per_line
%else
    add         rsi, r8
%endif

.row_loop:
    ; Horizontal pass over the current reference row.
    movd        mm1, [rsi]
    movd        mm3, [rsi+1]

    punpcklbw   mm1, mm0
    pmullw      mm1, [rax]

    punpcklbw   mm3, mm0
    pmullw      mm3, [rax+8]

    paddw       mm1, mm3
    paddw       mm1, [GLOBAL(mmx_bi_rd)]

    psraw       mm1, mmx_filter_shift
    movq        mm3, mm5                ; previous filtered row

    movq        mm5, mm1                ; current row becomes "previous"

    ; Vertical pass: previous * VFilter[0..3] + current * VFilter[4..7].
    pmullw      mm3, [rdx]
    pmullw      mm1, [rdx+8]
    paddw       mm1, mm3

    paddw       mm1, [GLOBAL(mmx_bi_rd)]
    psraw       mm1, mmx_filter_shift

    movd        mm3, [rdi]              ; four source pixels
    punpcklbw   mm3, mm0

    psubw       mm1, mm3                ; filtered - source
    paddw       mm6, mm1                ; accumulate differences

    pmaddwd     mm1, mm1                ; square, add adjacent pairs
    paddd       mm7, mm1                ; accumulate squares

%if ABI_IS_32BIT
    add         rsi, dword ptr arg(1)   ;ref_pixels_per_line
    add         rdi, dword ptr arg(3)   ;src_pixels_per_line
%else
    add         rsi, r8
    add         rdi, r9
%endif
    sub         rcx, 1
    jnz         .row_loop

    ; Horizontal add of the four word sums in mm6: place each word in
    ; the upper half of a dword, add, then arithmetic-shift back down
    ; so the result is sign-extended.
    pxor        mm3, mm3
    pxor        mm2, mm2

    punpcklwd   mm2, mm6
    punpckhwd   mm3, mm6

    paddd       mm2, mm3
    movq        mm6, mm2

    psrlq       mm6, 32
    paddd       mm2, mm6

    psrad       mm2, 16                 ; low dword = sum of differences

    ; Fold the two dword squared sums.
    movq        mm4, mm7
    psrlq       mm4, 32
    paddd       mm4, mm7                ; low dword = sum of squares

    mov         rdi, arg(6)             ;sum
    mov         rsi, arg(7)             ;sumsquared

    movd        dword ptr [rdi], mm2
    movd        dword ptr [rsi], mm4

    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret                                 ; without this, execution falls into the next function
;-----------------------------------------------------------------------
;void vp8_filter_block2d_bil_var_mmx
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    unsigned short *HFilter,
;    unsigned short *VFilter,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; Applies a two-tap horizontal then two-tap vertical filter to
; Height+1 rows of ref_ptr (8 pixels wide), subtracts the Height
; matching src_ptr rows, and stores the sum of differences to *sum and
; the sum of squared differences to *sumsquared.
;
; HFilter / VFilter are each read as two groups of four 16-bit
; coefficients, at byte offsets 0 and 8.
; The row loop is bottom-tested, so Height must be at least 1.
;
; Register roles:
;   rsi = ref row pointer        rdi = src row pointer
;   rax = HFilter                rdx = VFilter
;   rcx = rows remaining
;   mm0 = zero, mm5 = previous horizontally filtered row (packed bytes)
;   mm6 = 4 x int16 difference sums, mm7 = 2 x int32 squared sums
;-----------------------------------------------------------------------
global sym(vp8_filter_block2d_bil_var_mmx)
sym(vp8_filter_block2d_bil_var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16
    ; end prolog

    pxor        mm6, mm6                ; difference accumulator
    pxor        mm7, mm7                ; squared-difference accumulator
    mov         rax, arg(5)             ;HFilter

    mov         rdx, arg(6)             ;VFilter
    mov         rsi, arg(0)             ;ref_ptr

    mov         rdi, arg(2)             ;src_ptr
    movsxd      rcx, dword ptr arg(4)   ;Height

    pxor        mm0, mm0                ; zero for byte->word unpack

%if ABI_IS_32BIT=0
    ; Strides never change, so load them once instead of on every row.
    movsxd      r8, dword ptr arg(1)    ;ref_pixels_per_line
    movsxd      r9, dword ptr arg(3)    ;src_pixels_per_line
%endif

    ; Horizontal pass over the first reference row.
    movq        mm1, [rsi]
    movq        mm3, [rsi+1]

    movq        mm2, mm1
    movq        mm4, mm3

    punpcklbw   mm1, mm0
    punpckhbw   mm2, mm0

    pmullw      mm1, [rax]
    pmullw      mm2, [rax]

    punpcklbw   mm3, mm0
    punpckhbw   mm4, mm0

    pmullw      mm3, [rax+8]
    pmullw      mm4, [rax+8]

    paddw       mm1, mm3
    paddw       mm2, mm4

    paddw       mm1, [GLOBAL(mmx_bi_rd)]
    psraw       mm1, mmx_filter_shift

    paddw       mm2, [GLOBAL(mmx_bi_rd)]
    psraw       mm2, mmx_filter_shift

    movq        mm5, mm1
    packuswb    mm5, mm2                ; keep first filtered row as 8 bytes

%if ABI_IS_32BIT
    add         rsi, dword ptr arg(1)   ;ref_pixels_per_line
%else
    add         rsi, r8
%endif

.row_loop:
    ; Horizontal pass over the current reference row.
    movq        mm1, [rsi]
    movq        mm3, [rsi+1]

    movq        mm2, mm1
    movq        mm4, mm3

    punpcklbw   mm1, mm0
    punpckhbw   mm2, mm0

    pmullw      mm1, [rax]
    pmullw      mm2, [rax]

    punpcklbw   mm3, mm0
    punpckhbw   mm4, mm0

    pmullw      mm3, [rax+8]
    pmullw      mm4, [rax+8]

    paddw       mm1, mm3
    paddw       mm2, mm4

    paddw       mm1, [GLOBAL(mmx_bi_rd)]
    psraw       mm1, mmx_filter_shift

    paddw       mm2, [GLOBAL(mmx_bi_rd)]
    psraw       mm2, mmx_filter_shift

    ; Unpack the previous filtered row and remember the current one.
    movq        mm3, mm5
    movq        mm4, mm5

    punpcklbw   mm3, mm0
    punpckhbw   mm4, mm0

    movq        mm5, mm1
    packuswb    mm5, mm2

    ; Vertical pass: previous * VFilter[0..3] + current * VFilter[4..7].
    pmullw      mm3, [rdx]
    pmullw      mm4, [rdx]

    pmullw      mm1, [rdx+8]
    pmullw      mm2, [rdx+8]

    paddw       mm1, mm3
    paddw       mm2, mm4

    paddw       mm1, [GLOBAL(mmx_bi_rd)]
    paddw       mm2, [GLOBAL(mmx_bi_rd)]

    psraw       mm1, mmx_filter_shift
    psraw       mm2, mmx_filter_shift

    movq        mm3, [rdi]              ; eight source pixels
    movq        mm4, mm3

    punpcklbw   mm3, mm0
    punpckhbw   mm4, mm0

    psubw       mm1, mm3                ; filtered - source
    psubw       mm2, mm4

    paddw       mm6, mm1                ; accumulate differences
    pmaddwd     mm1, mm1                ; square, add adjacent pairs

    paddw       mm6, mm2
    pmaddwd     mm2, mm2

    paddd       mm7, mm1                ; accumulate squares
    paddd       mm7, mm2

%if ABI_IS_32BIT
    add         rsi, dword ptr arg(1)   ;ref_pixels_per_line
    add         rdi, dword ptr arg(3)   ;src_pixels_per_line
%else
    add         rsi, r8
    add         rdi, r9
%endif
    sub         rcx, 1
    jnz         .row_loop

    ; Horizontal add of the four word sums in mm6: place each word in
    ; the upper half of a dword, add, then arithmetic-shift back down
    ; so the result is sign-extended.
    pxor        mm3, mm3
    pxor        mm2, mm2

    punpcklwd   mm2, mm6
    punpckhwd   mm3, mm6

    paddd       mm2, mm3
    movq        mm6, mm2

    psrlq       mm6, 32
    paddd       mm2, mm6

    psrad       mm2, 16                 ; low dword = sum of differences

    ; Fold the two dword squared sums.
    movq        mm4, mm7
    psrlq       mm4, 32
    paddd       mm4, mm7                ; low dword = sum of squares

    mov         rdi, arg(7)             ;sum
    mov         rsi, arg(8)             ;sumsquared

    movd        dword ptr [rdi], mm2
    movd        dword ptr [rsi], mm4

    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret                                 ; without this, execution falls into the next function
;-----------------------------------------------------------------------
;unsigned int vp8_get16x16pred_error_mmx
;(
;    unsigned char *src_ptr,
;    int src_stride,
;    unsigned char *ref_ptr,
;    int ref_stride
;)
;
; For a 16x16 block computes Sum = sum of (src - ref) and
; SSE = sum of (src - ref)^2, and returns SSE - ((Sum * Sum) >> 8).
;
; Register roles:
;   rsi = src row pointer        rax = src_stride (result on exit)
;   rdi = ref row pointer        rdx = ref_stride
;   rcx = rows remaining (16)
;   mm0 = zero, mm6 = 2 x int32 squared sums, mm7 = 4 x int16 diff sums
; Stack scratch: [rsp] = Sum, [rsp+4] = SSE.
;-----------------------------------------------------------------------
global sym(vp8_get16x16pred_error_mmx)
sym(vp8_get16x16pred_error_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16
    ; end prolog

    mov         rsi, arg(0)             ;DWORD PTR [src_ptr]
    mov         rdi, arg(2)             ;DWORD PTR [ref_ptr]

    movsxd      rax, DWORD PTR arg(1)   ;[src_stride]
    movsxd      rdx, DWORD PTR arg(3)   ;[ref_stride]

    pxor        mm0, mm0                ; clear mm0 for unpack
    pxor        mm7, mm7                ; clear mm7 for accumulating diffs

    pxor        mm6, mm6                ; clear mm6 for accumulating sse
    mov         rcx, 16                 ; rows remaining

.row_loop:
    ; Left eight pixels of the row.
    movq        mm1, [rsi]
    movq        mm2, [rdi]

    movq        mm3, mm1
    movq        mm4, mm2

    punpcklbw   mm1, mm0
    punpckhbw   mm3, mm0

    punpcklbw   mm2, mm0
    punpckhbw   mm4, mm0

    psubw       mm1, mm2                ; src - ref (low four)
    psubw       mm3, mm4                ; src - ref (high four)

    paddw       mm7, mm1
    pmaddwd     mm1, mm1

    paddw       mm7, mm3
    pmaddwd     mm3, mm3

    paddd       mm6, mm1
    paddd       mm6, mm3

    ; Right eight pixels of the row.
    movq        mm1, [rsi+8]
    movq        mm2, [rdi+8]

    movq        mm3, mm1
    movq        mm4, mm2

    punpcklbw   mm1, mm0
    punpckhbw   mm3, mm0

    punpcklbw   mm2, mm0
    punpckhbw   mm4, mm0

    psubw       mm1, mm2
    psubw       mm3, mm4

    paddw       mm7, mm1
    pmaddwd     mm1, mm1

    paddw       mm7, mm3
    pmaddwd     mm3, mm3

    paddd       mm6, mm1
    paddd       mm6, mm3

    add         rsi, rax                ; next source row
    add         rdi, rdx                ; next reference row

    sub         rcx, 1
    jnz         .row_loop

    ; Widen the four word difference sums to sign-extended dwords and
    ; add them; fold the two dword squared sums.
    movq        mm1, mm6                ; mm1 = squared sums
    pxor        mm6, mm6

    pxor        mm5, mm5
    punpcklwd   mm6, mm7                ; words into upper dword halves

    punpckhwd   mm5, mm7
    psrad       mm5, 16                 ; shift down with sign extension

    psrad       mm6, 16
    paddd       mm6, mm5

    movq        mm2, mm1
    psrlq       mm1, 32

    paddd       mm2, mm1                ; low dword = SSE
    movq        mm7, mm6

    psrlq       mm6, 32
    paddd       mm6, mm7                ; low dword = Sum

    movd        DWORD PTR [rsp], mm6    ;Sum
    movd        DWORD PTR [rsp+4], mm2  ;SSE

    ; return (SSE-((Sum*Sum)>>8));
    movsxd      rdx, dword ptr [rsp]
    imul        rdx, rdx
    sar         rdx, 8
    movsxd      rax, dword ptr [rsp + 4]
    sub         rax, rdx

    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret                                 ; without this, execution runs on into the data that follows
SECTION_RODATA
;short mmx_bi_rd[4] = { 64, 64, 64, 64};
; Four 16-bit words of 64, added to each word lane before the
; arithmetic right shift by mmx_filter_shift in the bilinear filters.
align 16
mmx_bi_rd:
    dw 64, 64, 64, 64