2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
12 %include "vpx_ports/x86_abi_support.asm"
; vp8_get_mb_ss_mmx
; Prototype (from the original comment): unsigned int vp8_get_mb_ss_mmx(short *src_ptr)
; Visible steps: load src_ptr into rax, spill the mm4 accumulator to the
; stack, then read it back as two signed dwords (sum[0] and sum[1]).
; NOTE(review): the accumulation loop, the final add and the epilogue are
; not visible in this listing (embedded line numbers jump from 26 to 48 and
; stop at 52) - confirm against the full file.
; NOTE(review): the prototype shows one argument, yet SHADOW_ARGS_TO_STACK is
; given 7 - confirm this count is intentional.
14 ;unsigned int vp8_get_mb_ss_mmx( short *src_ptr )
15 global sym
(vp8_get_mb_ss_mmx
)
16 sym
(vp8_get_mb_ss_mmx
):
19 SHADOW_ARGS_TO_STACK
7
26 mov rax
, arg
(0) ; rax = src_ptr
; Spill the 64-bit accumulator so its two dword halves can be read as scalars.
48 movq
QWORD PTR [rsp
], mm4
50 ;return sum[0]+sum[1];
; Low dword of the spilled accumulator (sum[0]), sign-extended.
51 movsxd rax
, dword ptr [rsp
]
; High dword of the spilled accumulator (sum[1]), sign-extended.
52 movsxd rcx
, dword ptr [rsp
+4]
; vp8_get8x8var_mmx
; Walks eight rows of src_ptr and ref_ptr, eight bytes per row, fully unrolled.
; Register roles visible in this listing:
;   rax = src_ptr cursor, advanced by rcx (source_stride) after each row
;   rbx = ref_ptr cursor, advanced by rdx (recon_stride) after each row
;   mm5 = word-wise running sum of the differences (paddw)
;   mm6 = zero, used as the unpack partner for byte -> word widening
;   mm7 = dword-wise running sum of squared differences (pmaddwd + paddd)
; After the last row mm5 and mm7 are spilled to [rsp+8] and [rsp] and read back
; with scalar loads; two dwords are stored through rsi and rdi and rax is
; cleared (original comment: return 0).
; NOTE(review): no unpack of mm1/mm3 (the ref bytes) is visible before the
; psubsw instructions; the embedded line numbers skip there (102, 104, 106),
; so those instructions are presumably among the lines missing from this
; listing - confirm against the full file.
; NOTE(review): the loads of rsi/rdi, the reduction adds between the scalar
; loads, and the prologue/epilogue are likewise not visible here.
66 ;unsigned int vp8_get8x8var_mmx
68 ; unsigned char *src_ptr,
70 ; unsigned char *ref_ptr,
75 global sym
(vp8_get8x8var_mmx
)
76 sym
(vp8_get8x8var_mmx
):
79 SHADOW_ARGS_TO_STACK
6
87 pxor mm5
, mm5
; Zero mm5 (accumulator for the sum of differences)
88 pxor mm6
, mm6
; Zero mm6 (constant zero used by the unpack instructions)
89 pxor mm7
, mm7
; Zero mm7 (accumulator for the sum of squared differences)
91 mov rax
, arg
(0) ;[src_ptr] ; Load base addresses
92 mov rbx
, arg
(2) ;[ref_ptr]
93 movsxd rcx
, dword ptr arg
(1) ;[source_stride], sign-extended to 64 bits
94 movsxd rdx
, dword ptr arg
(3) ;[recon_stride], sign-extended to 64 bits
; ---- row 1 of 8 ----
97 movq mm0
, [rax
] ; Copy eight src bytes to mm0
98 movq mm1
, [rbx
] ; Copy eight ref bytes to mm1
99 movq mm2
, mm0
; Take a copy of the src bytes for the high half
100 movq mm3
, mm1
; Take a copy of the ref bytes for the high half
102 punpcklbw mm0
, mm6
; unpack low four bytes to words (higher precision)
104 punpckhbw mm2
, mm6
; unpack high four bytes to words (higher precision)
106 psubsw mm0
, mm1
; A-B (low order) to MM0
107 psubsw mm2
, mm3
; A-B (high order) to MM2
109 paddw mm5
, mm0
; accumulate differences in mm5
110 paddw mm5
, mm2
; accumulate differences in mm5
112 pmaddwd mm0
, mm0
; square each word and add adjacent pairs into dwords
113 pmaddwd mm2
, mm2
; square each word and add adjacent pairs into dwords
114 add rbx
,rdx
; Inc pointer into ref data
115 add rax
,rcx
; Inc pointer into the new data
116 movq mm1
, [rbx
] ; Pre-load the next row's eight ref bytes into mm1
117 paddd mm7
, mm0
; accumulate in mm7
118 paddd mm7
, mm2
; accumulate in mm7
; ---- row 2 of 8 ----
122 movq mm0
, [rax
] ; Copy eight bytes to mm0
123 movq mm2
, mm0
; Take copies
124 movq mm3
, mm1
; Take copies
126 punpcklbw mm0
, mm6
; unpack to higher precision
128 punpckhbw mm2
, mm6
; unpack to higher precision
130 psubsw mm0
, mm1
; A-B (low order) to MM0
131 psubsw mm2
, mm3
; A-B (high order) to MM2
133 paddw mm5
, mm0
; accumulate differences in mm5
134 paddw mm5
, mm2
; accumulate differences in mm5
136 pmaddwd mm0
, mm0
; square and accumulate
137 pmaddwd mm2
, mm2
; square and accumulate
138 add rbx
,rdx
; Inc pointer into ref data
139 add rax
,rcx
; Inc pointer into the new data
140 movq mm1
, [rbx
] ; Copy eight bytes to mm1
141 paddd mm7
, mm0
; accumulate in mm7
142 paddd mm7
, mm2
; accumulate in mm7
; ---- row 3 of 8 ----
145 movq mm0
, [rax
] ; Copy eight bytes to mm0
146 movq mm2
, mm0
; Take copies
147 movq mm3
, mm1
; Take copies
149 punpcklbw mm0
, mm6
; unpack to higher precision
151 punpckhbw mm2
, mm6
; unpack to higher precision
153 psubsw mm0
, mm1
; A-B (low order) to MM0
154 psubsw mm2
, mm3
; A-B (high order) to MM2
156 paddw mm5
, mm0
; accumulate differences in mm5
157 paddw mm5
, mm2
; accumulate differences in mm5
159 pmaddwd mm0
, mm0
; square and accumulate
160 pmaddwd mm2
, mm2
; square and accumulate
161 add rbx
,rdx
; Inc pointer into ref data
162 add rax
,rcx
; Inc pointer into the new data
163 movq mm1
, [rbx
] ; Copy eight bytes to mm1
164 paddd mm7
, mm0
; accumulate in mm7
165 paddd mm7
, mm2
; accumulate in mm7
; ---- row 4 of 8 ----
168 movq mm0
, [rax
] ; Copy eight bytes to mm0
169 movq mm2
, mm0
; Take copies
170 movq mm3
, mm1
; Take copies
172 punpcklbw mm0
, mm6
; unpack to higher precision
174 punpckhbw mm2
, mm6
; unpack to higher precision
176 psubsw mm0
, mm1
; A-B (low order) to MM0
177 psubsw mm2
, mm3
; A-B (high order) to MM2
179 paddw mm5
, mm0
; accumulate differences in mm5
180 paddw mm5
, mm2
; accumulate differences in mm5
182 pmaddwd mm0
, mm0
; square and accumulate
183 pmaddwd mm2
, mm2
; square and accumulate
184 add rbx
,rdx
; Inc pointer into ref data
185 add rax
,rcx
; Inc pointer into the new data
186 movq mm1
, [rbx
] ; Copy eight bytes to mm1
187 paddd mm7
, mm0
; accumulate in mm7
188 paddd mm7
, mm2
; accumulate in mm7
; ---- row 5 of 8 ----
191 movq mm0
, [rax
] ; Copy eight bytes to mm0
192 movq mm2
, mm0
; Take copies
193 movq mm3
, mm1
; Take copies
195 punpcklbw mm0
, mm6
; unpack to higher precision
197 punpckhbw mm2
, mm6
; unpack to higher precision
199 psubsw mm0
, mm1
; A-B (low order) to MM0
200 psubsw mm2
, mm3
; A-B (high order) to MM2
202 paddw mm5
, mm0
; accumulate differences in mm5
203 paddw mm5
, mm2
; accumulate differences in mm5
205 pmaddwd mm0
, mm0
; square and accumulate
206 pmaddwd mm2
, mm2
; square and accumulate
207 add rbx
,rdx
; Inc pointer into ref data
208 add rax
,rcx
; Inc pointer into the new data
209 movq mm1
, [rbx
] ; Copy eight bytes to mm1
210 ; movq mm4, [rbx + rdx]
211 paddd mm7
, mm0
; accumulate in mm7
212 paddd mm7
, mm2
; accumulate in mm7
; ---- row 6 of 8 ----
215 movq mm0
, [rax
] ; Copy eight bytes to mm0
216 movq mm2
, mm0
; Take copies
217 movq mm3
, mm1
; Take copies
219 punpcklbw mm0
, mm6
; unpack to higher precision
221 punpckhbw mm2
, mm6
; unpack to higher precision
223 psubsw mm0
, mm1
; A-B (low order) to MM0
224 psubsw mm2
, mm3
; A-B (high order) to MM2
226 paddw mm5
, mm0
; accumulate differences in mm5
227 paddw mm5
, mm2
; accumulate differences in mm5
229 pmaddwd mm0
, mm0
; square and accumulate
230 pmaddwd mm2
, mm2
; square and accumulate
231 add rbx
,rdx
; Inc pointer into ref data
232 add rax
,rcx
; Inc pointer into the new data
233 movq mm1
, [rbx
] ; Copy eight bytes to mm1
234 paddd mm7
, mm0
; accumulate in mm7
235 paddd mm7
, mm2
; accumulate in mm7
; ---- row 7 of 8 ----
238 movq mm0
, [rax
] ; Copy eight bytes to mm0
239 movq mm2
, mm0
; Take copies
240 movq mm3
, mm1
; Take copies
242 punpcklbw mm0
, mm6
; unpack to higher precision
244 punpckhbw mm2
, mm6
; unpack to higher precision
246 psubsw mm0
, mm1
; A-B (low order) to MM0
247 psubsw mm2
, mm3
; A-B (high order) to MM2
249 paddw mm5
, mm0
; accumulate differences in mm5
250 paddw mm5
, mm2
; accumulate differences in mm5
252 pmaddwd mm0
, mm0
; square and accumulate
253 pmaddwd mm2
, mm2
; square and accumulate
254 add rbx
,rdx
; Inc pointer into ref data
255 add rax
,rcx
; Inc pointer into the new data
256 movq mm1
, [rbx
] ; Copy eight bytes to mm1
257 paddd mm7
, mm0
; accumulate in mm7
258 paddd mm7
, mm2
; accumulate in mm7
; ---- row 8 of 8 (no ref pre-load follows this row) ----
261 movq mm0
, [rax
] ; Copy eight bytes to mm0
262 movq mm2
, mm0
; Take copies
263 movq mm3
, mm1
; Take copies
265 punpcklbw mm0
, mm6
; unpack to higher precision
267 punpckhbw mm2
, mm6
; unpack to higher precision
269 psubsw mm0
, mm1
; A-B (low order) to MM0
270 psubsw mm2
, mm3
; A-B (high order) to MM2
272 paddw mm5
, mm0
; accumulate differences in mm5
273 paddw mm5
, mm2
; accumulate differences in mm5
275 pmaddwd mm0
, mm0
; square and accumulate
276 pmaddwd mm2
, mm2
; square and accumulate
277 add rbx
,rdx
; Inc pointer into ref data
278 add rax
,rcx
; Inc pointer into the new data
279 paddd mm7
, mm0
; accumulate in mm7
280 paddd mm7
, mm2
; accumulate in mm7
282 ; Now accumulate the final results.
283 movq
QWORD PTR [rsp
+8], mm5
; spill the four word-sized difference sums to [rsp+8 .. rsp+15]
284 movq
QWORD PTR [rsp
], mm7
; spill the two dword-sized squared sums to [rsp .. rsp+7]
; Read the four 16-bit partial sums back, sign-extended.
285 movsx rdx
, WORD PTR [rsp
+8]
286 movsx rcx
, WORD PTR [rsp
+10]
287 movsx rbx
, WORD PTR [rsp
+12]
288 movsx rax
, WORD PTR [rsp
+14]
; Read the two 32-bit partial squared sums back, sign-extended.
; NOTE(review): rax and rcx are overwritten here; the adds that fold the four
; word sums beforehand are presumably in the missing lines - confirm.
292 movsxd rax
, DWORD PTR [rsp
]
293 movsxd rcx
, DWORD PTR [rsp
+4]
; Store the two results through rsi and rdi.
; NOTE(review): presumably the SSE and Sum output pointers; their loads are not
; visible in this listing - confirm.
297 mov dword ptr [rsi
], eax
298 mov dword ptr [rdi
], edx
299 xor rax
, rax
; return 0
; vp8_get4x4var_mmx
; Four-row variant of the difference / squared-difference accumulation.
; Register roles visible in this listing:
;   rax = src_ptr cursor (+rcx = source_stride per row)
;   rbx = ref_ptr cursor (+rdx = recon_stride per row)
;   mm5 = word-wise sum of differences, mm7 = dword-wise sum of squares,
;   mm6 = zero for unpacking
; Each row loads eight bytes with movq but only the low four are widened
; (punpcklbw only), so four pixels per row take part in the sums.
; NOTE(review): no unpack of mm1 (the ref bytes) is visible before psubsw; the
; embedded line numbers skip there (347 -> 349), so it is presumably among the
; lines missing from this listing - confirm against the full file.
; NOTE(review): the loads of rsi/rdi, the reduction adds and the
; prologue/epilogue are not visible here either.
316 ; unsigned char *src_ptr,
318 ; unsigned char *ref_ptr,
323 global sym
(vp8_get4x4var_mmx
)
324 sym
(vp8_get4x4var_mmx
):
327 SHADOW_ARGS_TO_STACK
6
335 pxor mm5
, mm5
; Zero mm5 (accumulator for the sum of differences)
336 pxor mm6
, mm6
; Zero mm6 (constant zero used by the unpack instructions)
337 pxor mm7
, mm7
; Zero mm7 (accumulator for the sum of squared differences)
339 mov rax
, arg
(0) ;[src_ptr] ; Load base addresses
340 mov rbx
, arg
(2) ;[ref_ptr]
341 movsxd rcx
, dword ptr arg
(1) ;[source_stride], sign-extended to 64 bits
342 movsxd rdx
, dword ptr arg
(3) ;[recon_stride], sign-extended to 64 bits
; ---- row 1 of 4 ----
345 movq mm0
, [rax
] ; Copy eight bytes to mm0 (only the low four are used)
346 movq mm1
, [rbx
] ; Copy eight bytes to mm1
347 punpcklbw mm0
, mm6
; unpack low four bytes to words (higher precision)
349 psubsw mm0
, mm1
; A-B (low order) to MM0
350 paddw mm5
, mm0
; accumulate differences in mm5
351 pmaddwd mm0
, mm0
; square each word and add adjacent pairs into dwords
352 add rbx
,rdx
; Inc pointer into ref data
353 add rax
,rcx
; Inc pointer into the new data
354 movq mm1
, [rbx
] ; Pre-load the next row's ref bytes into mm1
355 paddd mm7
, mm0
; accumulate in mm7
; ---- row 2 of 4 ----
359 movq mm0
, [rax
] ; Copy eight bytes to mm0
360 punpcklbw mm0
, mm6
; unpack to higher precision
362 psubsw mm0
, mm1
; A-B (low order) to MM0
363 paddw mm5
, mm0
; accumulate differences in mm5
365 pmaddwd mm0
, mm0
; square and accumulate
366 add rbx
,rdx
; Inc pointer into ref data
367 add rax
,rcx
; Inc pointer into the new data
368 movq mm1
, [rbx
] ; Copy eight bytes to mm1
369 paddd mm7
, mm0
; accumulate in mm7
; ---- row 3 of 4 ----
372 movq mm0
, [rax
] ; Copy eight bytes to mm0
373 punpcklbw mm0
, mm6
; unpack to higher precision
375 psubsw mm0
, mm1
; A-B (low order) to MM0
376 paddw mm5
, mm0
; accumulate differences in mm5
378 pmaddwd mm0
, mm0
; square and accumulate
379 add rbx
,rdx
; Inc pointer into ref data
380 add rax
,rcx
; Inc pointer into the new data
381 movq mm1
, [rbx
] ; Copy eight bytes to mm1
382 paddd mm7
, mm0
; accumulate in mm7
; ---- row 4 of 4 (pointers are not advanced after this row) ----
385 movq mm0
, [rax
] ; Copy eight bytes to mm0
387 punpcklbw mm0
, mm6
; unpack to higher precision
389 psubsw mm0
, mm1
; A-B (low order) to MM0
391 paddw mm5
, mm0
; accumulate differences in mm5
393 pmaddwd mm0
, mm0
; square and accumulate
394 paddd mm7
, mm0
; accumulate in mm7
397 ; Now accumulate the final results.
398 movq
QWORD PTR [rsp
+8], mm5
; spill the four word-sized difference sums to [rsp+8 .. rsp+15]
399 movq
QWORD PTR [rsp
], mm7
; spill the two dword-sized squared sums to [rsp .. rsp+7]
; Read the four 16-bit partial sums back, sign-extended.
400 movsx rdx
, WORD PTR [rsp
+8]
401 movsx rcx
, WORD PTR [rsp
+10]
402 movsx rbx
, WORD PTR [rsp
+12]
403 movsx rax
, WORD PTR [rsp
+14]
; Read the two 32-bit partial squared sums back, sign-extended.
; NOTE(review): rax and rcx are overwritten here; the adds that fold the word
; sums beforehand are presumably in the missing lines - confirm.
407 movsxd rax
, DWORD PTR [rsp
]
408 movsxd rcx
, DWORD PTR [rsp
+4]
; Store the two results through rsi and rdi (their loads are not visible here).
412 mov dword ptr [rsi
], eax
413 mov dword ptr [rdi
], edx
414 xor rax
, rax
; return 0
; vp8_get4x4sse_cs_mmx
; Four-row accumulation of squared differences only: there is no sum
; accumulator in the visible code.
; Register roles visible in this listing:
;   rax = src_ptr cursor (+rcx = source_stride per row)
;   rbx = ref_ptr cursor (+rdx = recon_stride per row)
;   mm6 = zero for unpacking, mm7 = dword-wise sum of squared differences
; Each row is loaded with movd, i.e. four bytes per row.
; NOTE(review): no unpack of mm1 (the ref bytes) is visible before psubsw; the
; embedded line numbers skip there (457 -> 459), so it is presumably among the
; lines missing from this listing - confirm against the full file.
; NOTE(review): the final reduction of mm7 and the return sequence are not
; visible in this listing.
429 ;vp8_get4x4sse_cs_mmx
431 ; unsigned char *src_ptr,
433 ; unsigned char *ref_ptr,
436 global sym
(vp8_get4x4sse_cs_mmx
)
437 sym
(vp8_get4x4sse_cs_mmx
):
440 SHADOW_ARGS_TO_STACK
4
447 pxor mm6
, mm6
; Zero mm6 (constant zero used by the unpack instructions)
448 pxor mm7
, mm7
; Zero mm7 (accumulator for the sum of squared differences)
450 mov rax
, arg
(0) ;[src_ptr] ; Load base addresses
451 mov rbx
, arg
(2) ;[ref_ptr]
452 movsxd rcx
, dword ptr arg
(1) ;[source_stride], sign-extended to 64 bits
453 movsxd rdx
, dword ptr arg
(3) ;[recon_stride], sign-extended to 64 bits
; ---- row 1 of 4 ----
455 movd mm0
, [rax
] ; Copy four src bytes to mm0
456 movd mm1
, [rbx
] ; Copy four ref bytes to mm1
457 punpcklbw mm0
, mm6
; unpack to higher precision
459 psubsw mm0
, mm1
; A-B (low order) to MM0
460 pmaddwd mm0
, mm0
; square each word and add adjacent pairs into dwords
461 add rbx
,rdx
; Inc pointer into ref data
462 add rax
,rcx
; Inc pointer into the new data
463 movd mm1
, [rbx
] ; Pre-load the next row's four ref bytes into mm1
464 paddd mm7
, mm0
; accumulate in mm7
; ---- row 2 of 4 ----
467 movd mm0
, [rax
] ; Copy four bytes to mm0
468 punpcklbw mm0
, mm6
; unpack to higher precision
470 psubsw mm0
, mm1
; A-B (low order) to MM0
471 pmaddwd mm0
, mm0
; square and accumulate
472 add rbx
,rdx
; Inc pointer into ref data
473 add rax
,rcx
; Inc pointer into the new data
474 movd mm1
, [rbx
] ; Copy four bytes to mm1
475 paddd mm7
, mm0
; accumulate in mm7
; ---- row 3 of 4 ----
478 movd mm0
, [rax
] ; Copy four bytes to mm0
480 punpcklbw mm0
, mm6
; unpack to higher precision
481 psubsw mm0
, mm1
; A-B (low order) to MM0
483 pmaddwd mm0
, mm0
; square and accumulate
484 add rbx
,rdx
; Inc pointer into ref data
485 add rax
,rcx
; Inc pointer into the new data
486 movd mm1
, [rbx
] ; Copy four bytes to mm1
487 paddd mm7
, mm0
; accumulate in mm7
; ---- row 4 of 4 (pointers are not advanced after this row) ----
490 movd mm0
, [rax
] ; Copy four bytes to mm0
491 punpcklbw mm0
, mm6
; unpack to higher precision
493 psubsw mm0
, mm1
; A-B (low order) to MM0
494 pmaddwd mm0
, mm0
; square and accumulate
495 paddd mm7
, mm0
; accumulate in mm7
; Shift count used by the psraw instructions that follow each
; "paddw ..., [GLOBAL(mmx_bi_rd)]" in the bilinear filter routines below.
; With mmx_bi_rd listed as 64 (= 1 << (7 - 1)) at the end of the file, the
; add-then-shift pair appears to implement a rounded divide by 128.
512 %define mmx_filter_shift
7
; vp8_filter_block2d_bil4x4_var_mmx
; Arguments as read by the visible code:
;   arg(0) ref_ptr -> rsi          arg(1) ref_pixels_per_line
;   arg(2) src_ptr -> rdi          arg(3) src_pixels_per_line
;   arg(4) HFilter -> rax          arg(5) VFilter -> rdx
;   arg(7) sumsquared -> rsi (at the end)
; Visible structure: a first horizontal filter pass before the loop, then a
; loop that applies the horizontal taps at [rax+8], the vertical taps at
; [rdx+8], adds mmx_bi_rd and shifts right by mmx_filter_shift after each
; pass, and finally stores mm2 and mm4 as dwords through rdi and rsi.
; NOTE(review): most of the routine (pixel loads, first-tap multiplies,
; difference accumulation, loop counter update, reload of rdi before the final
; store, prologue/epilogue) is not visible in this listing - the embedded line
; numbers have large gaps. Confirm every comment here against the full file.
; NOTE(review): "add rsi, dword ptr arg(1)" and "movsxd r8, dword ptr arg(1)"
; appear back to back; they look like the 32-bit and 64-bit alternatives of one
; pointer advance, with the selecting %if/%else lines missing - confirm.
514 ;void vp8_filter_block2d_bil4x4_var_mmx
516 ; unsigned char *ref_ptr,
517 ; int ref_pixels_per_line,
518 ; unsigned char *src_ptr,
519 ; int src_pixels_per_line,
520 ; unsigned short *HFilter,
521 ; unsigned short *VFilter,
523 ; unsigned int *sumsquared
525 global sym
(vp8_filter_block2d_bil4x4_var_mmx
)
526 sym
(vp8_filter_block2d_bil4x4_var_mmx
):
529 SHADOW_ARGS_TO_STACK
8
540 mov rax
, arg
(4) ;HFilter ; rax = horizontal filter taps
541 mov rdx
, arg
(5) ;VFilter ; rdx = vertical filter taps
543 mov rsi
, arg
(0) ;ref_ptr ; rsi = reference cursor
544 mov rdi
, arg
(2) ;src_ptr ; rdi = source cursor
; First horizontal pass (partial view).
556 pmullw mm3
, [rax
+8] ; multiply by the horizontal taps stored at HFilter + 8 bytes
559 paddw mm1
, [GLOBAL(mmx_bi_rd
)] ; add the mmx_bi_rd constant
561 psraw mm1
, mmx_filter_shift
; arithmetic shift right by mmx_filter_shift
565 add rsi
, dword ptr arg
(1) ;ref_pixels_per_line ; advance the ref cursor by one line
567 movsxd r8
, dword ptr arg
(1) ;ref_pixels_per_line ; r8 = sign-extended stride
571 filter_block2d_bil4x4_var_mmx_loop:
; Horizontal pass for the current line (partial view).
580 pmullw mm3
, [rax
+8] ; multiply by the horizontal taps stored at HFilter + 8 bytes
583 paddw mm1
, [GLOBAL(mmx_bi_rd
)] ; add the mmx_bi_rd constant
585 psraw mm1
, mmx_filter_shift
; arithmetic shift right by mmx_filter_shift
; Vertical pass (partial view).
591 pmullw mm1
, [rdx
+8] ; multiply by the vertical taps stored at VFilter + 8 bytes
595 paddw mm1
, [GLOBAL(mmx_bi_rd
)] ; add the mmx_bi_rd constant
596 psraw mm1
, mmx_filter_shift
; arithmetic shift right by mmx_filter_shift
; Advance both cursors by one line.
608 add rsi
, dword ptr arg
(1) ;ref_pixels_per_line ;
609 add rdi
, dword ptr arg
(3) ;src_pixels_per_line ;
611 movsxd r8
, dword ptr arg
(1) ;ref_pixels_per_line
612 movsxd r9
, dword ptr arg
(3) ;src_pixels_per_line
; NOTE(review): the instruction that sets the flags tested by this jnz is not
; visible in this listing.
617 jnz filter_block2d_bil4x4_var_mmx_loop
; loop back while the zero flag is clear
639 mov rsi
, arg
(7) ;sumsquared
641 movd
dword ptr [rdi
], mm2
; store the low dword of mm2 through rdi
642 movd
dword ptr [rsi
], mm4
; store the low dword of mm4 through rsi (sumsquared)
; vp8_filter_block2d_bil_var_mmx
; Arguments as read by the visible code:
;   arg(0) ref_ptr -> rsi          arg(1) ref_pixels_per_line
;   arg(2) src_ptr -> rdi          arg(3) src_pixels_per_line
;   arg(4) Height  -> rcx          arg(5) HFilter -> rax
;   arg(6) VFilter -> rdx          arg(8) sumsquared -> rsi (at the end)
; Visible structure: the same shape as the 4x4 routine, but every filter step
; is applied to two registers (mm1/mm2, mm3/mm4), i.e. two register halves
; per line.
; NOTE(review): pixel loads, first-tap multiplies, difference accumulation,
; the loop counter update, the reload of rdi before the final store and the
; prologue/epilogue are not visible in this listing - confirm against the
; full file.
; NOTE(review): "add rsi, dword ptr arg(1)" and "movsxd r8, dword ptr arg(1)"
; appear back to back; they look like the 32-bit and 64-bit alternatives of one
; pointer advance, with the selecting %if/%else lines missing - confirm.
658 ;void vp8_filter_block2d_bil_var_mmx
660 ; unsigned char *ref_ptr,
661 ; int ref_pixels_per_line,
662 ; unsigned char *src_ptr,
663 ; int src_pixels_per_line,
664 ; unsigned int Height,
665 ; unsigned short *HFilter,
666 ; unsigned short *VFilter,
668 ; unsigned int *sumsquared
670 global sym
(vp8_filter_block2d_bil_var_mmx
)
671 sym
(vp8_filter_block2d_bil_var_mmx
):
674 SHADOW_ARGS_TO_STACK
9
683 mov rax
, arg
(5) ;HFilter ; rax = horizontal filter taps
685 mov rdx
, arg
(6) ;VFilter ; rdx = vertical filter taps
686 mov rsi
, arg
(0) ;ref_ptr ; rsi = reference cursor
688 mov rdi
, arg
(2) ;src_ptr ; rdi = source cursor
689 movsxd rcx
, dword ptr arg
(4) ;Height ; rcx = Height, sign-extended
; First horizontal pass (partial view).
707 pmullw mm3
, [rax
+8] ; multiply by the horizontal taps stored at HFilter + 8 bytes
709 pmullw mm4
, [rax
+8] ; same taps, second register of the pair
713 paddw mm1
, [GLOBAL(mmx_bi_rd
)] ; add the mmx_bi_rd constant
715 psraw mm1
, mmx_filter_shift
; arithmetic shift right by mmx_filter_shift
716 paddw mm2
, [GLOBAL(mmx_bi_rd
)] ; add the mmx_bi_rd constant
718 psraw mm2
, mmx_filter_shift
; arithmetic shift right by mmx_filter_shift
723 add rsi
, dword ptr arg
(1) ;ref_pixels_per_line
725 movsxd r8
, dword ptr arg
(1) ;ref_pixels_per_line
729 filter_block2d_bil_var_mmx_loop:
; Horizontal pass for the current line (partial view).
746 pmullw mm3
, [rax
+8] ; multiply by the horizontal taps stored at HFilter + 8 bytes
747 pmullw mm4
, [rax
+8] ; same taps, second register of the pair
752 paddw mm1
, [GLOBAL(mmx_bi_rd
)] ; add the mmx_bi_rd constant
753 psraw mm1
, mmx_filter_shift
; arithmetic shift right by mmx_filter_shift
755 paddw mm2
, [GLOBAL(mmx_bi_rd
)] ; add the mmx_bi_rd constant
756 psraw mm2
, mmx_filter_shift
; arithmetic shift right by mmx_filter_shift
; Vertical pass (partial view).
770 pmullw mm1
, [rdx
+8] ; multiply by the vertical taps stored at VFilter + 8 bytes
771 pmullw mm2
, [rdx
+8] ; same taps, second register of the pair
776 paddw mm1
, [GLOBAL(mmx_bi_rd
)] ; add the mmx_bi_rd constant
777 paddw mm2
, [GLOBAL(mmx_bi_rd
)] ; add the mmx_bi_rd constant
779 psraw mm1
, mmx_filter_shift
; arithmetic shift right by mmx_filter_shift
780 psraw mm2
, mmx_filter_shift
; arithmetic shift right by mmx_filter_shift
; Advance both cursors by one line.
801 add rsi
, dword ptr arg
(1) ;ref_pixels_per_line ;
802 add rdi
, dword ptr arg
(3) ;src_pixels_per_line ;
804 movsxd r8
, dword ptr arg
(1) ;ref_pixels_per_line ;
805 movsxd r9
, dword ptr arg
(3) ;src_pixels_per_line ;
; NOTE(review): the instruction that sets the flags tested by this jnz is not
; visible in this listing.
810 jnz filter_block2d_bil_var_mmx_loop
; loop back while the zero flag is clear
832 mov rsi
, arg
(8) ;sumsquared
834 movd
dword ptr [rdi
], mm2
; store the low dword of mm2 through rdi
835 movd
dword ptr [rsi
], mm4
; store the low dword of mm4 through rsi (sumsquared)
; vp8_get16x16pred_error_mmx
; Arguments as read by the visible code:
;   arg(0) src_ptr -> rsi, arg(1) src_stride -> rax,
;   arg(2) ref_ptr -> rdi, arg(3) ref_stride -> rdx
; Visible structure: three MMX registers are zeroed, then two dwords labelled
; Sum and SSE are written to [rsp] and [rsp+4] and read back sign-extended.
; The original comment gives the result as SSE - ((Sum*Sum) >> 8).
; NOTE(review): the whole accumulation loop, the final arithmetic and the
; epilogue are not visible in this listing (embedded line numbers jump
; 873 -> 955 and stop at 962) - confirm against the full file.
; NOTE(review): the setup comments name mm7 as the diff accumulator and mm6 as
; the SSE accumulator, yet the spill writes mm6 as Sum and mm2 as SSE; the
; missing lines presumably move values between registers - confirm.
846 ;unsigned int vp8_get16x16pred_error_mmx
848 ; unsigned char *src_ptr,
850 ; unsigned char *ref_ptr,
853 global sym
(vp8_get16x16pred_error_mmx
)
854 sym
(vp8_get16x16pred_error_mmx
):
857 SHADOW_ARGS_TO_STACK
4
864 mov rsi
, arg
(0) ;DWORD PTR [src_ptr]
865 mov rdi
, arg
(2) ;DWORD PTR [ref_ptr]
867 movsxd rax
, DWORD PTR arg
(1) ;[src_stride], sign-extended to 64 bits
868 movsxd rdx
, DWORD PTR arg
(3) ;[ref_stride], sign-extended to 64 bits
870 pxor mm0
, mm0
; clear mm0 for unpack
871 pxor mm7
, mm7
; clear mm7 for accumulating diffs
873 pxor mm6
, mm6
; clear mm6 for accumulating sse
955 movd
DWORD PTR [rsp
], mm6
;Sum
956 movd
DWORD PTR [rsp
+4], mm2
;SSE
958 ; return (SSE-((Sum*Sum)>>8));
; rdx = Sum, sign-extended from the dword at [rsp].
959 movsxd rdx
, dword ptr [rsp
]
; rax = SSE, sign-extended from the dword at [rsp+4].
962 movsxd rax
, dword ptr [rsp
+ 4]
978 ;short mmx_bi_rd[4] = { 64, 64, 64, 64};