; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
12 %include "vpx_ports/x86_abi_support.asm"
14 ;unsigned int vp8_sad16x16_wmt(
15 ; unsigned char *src_ptr,
17 ; unsigned char *ref_ptr,
19 global sym
(vp8_sad16x16_wmt
)
20 sym
(vp8_sad16x16_wmt
):
23 SHADOW_ARGS_TO_STACK
4
29 mov rsi
, arg
(0) ;src_ptr
30 mov rdi
, arg
(2) ;ref_ptr
32 movsxd rax
, dword ptr arg
(1) ;src_stride
33 movsxd rdx
, dword ptr arg
(3) ;ref_stride
42 movq xmm0
, QWORD PTR [rsi
]
43 movq xmm2
, QWORD PTR [rsi
+8]
45 movq xmm1
, QWORD PTR [rdi
]
46 movq xmm3
, QWORD PTR [rdi
+8]
48 movq xmm4
, QWORD PTR [rsi
+rax
]
49 movq xmm5
, QWORD PTR [rdi
+rdx
]
56 movq xmm2
, QWORD PTR [rsi
+rax
+8]
58 movq xmm3
, QWORD PTR [rdi
+rdx
+8]
71 jne .x16x16sad_wmt_loop
87 ;unsigned int vp8_sad8x16_wmt(
88 ; unsigned char *src_ptr,
90 ; unsigned char *ref_ptr,
93 global sym
(vp8_sad8x16_wmt
)
97 SHADOW_ARGS_TO_STACK
5
103 mov rsi
, arg
(0) ;src_ptr
104 mov rdi
, arg
(2) ;ref_ptr
106 movsxd rbx
, dword ptr arg
(1) ;src_stride
107 movsxd rdx
, dword ptr arg
(3) ;ref_stride
118 jg .x8x16sad_wmt_early_exit
120 movq mm0
, QWORD PTR [rsi
]
121 movq mm1
, QWORD PTR [rdi
]
123 movq mm2
, QWORD PTR [rsi
+rbx
]
124 movq mm3
, QWORD PTR [rdi
+rdx
]
136 jne .x8x16sad_wmt_loop
140 .
x8x16sad_wmt_early_exit:
151 ;unsigned int vp8_sad8x8_wmt(
152 ; unsigned char *src_ptr,
154 ; unsigned char *ref_ptr,
156 global sym
(vp8_sad8x8_wmt
)
160 SHADOW_ARGS_TO_STACK
5
166 mov rsi
, arg
(0) ;src_ptr
167 mov rdi
, arg
(2) ;ref_ptr
169 movsxd rbx
, dword ptr arg
(1) ;src_stride
170 movsxd rdx
, dword ptr arg
(3) ;ref_stride
179 jg .x8x8sad_wmt_early_exit
181 movq mm0
, QWORD PTR [rsi
]
182 movq mm1
, QWORD PTR [rdi
]
191 jne .x8x8sad_wmt_loop
194 .
x8x8sad_wmt_early_exit:
204 ;unsigned int vp8_sad4x4_wmt(
205 ; unsigned char *src_ptr,
207 ; unsigned char *ref_ptr,
209 global sym
(vp8_sad4x4_wmt
)
213 SHADOW_ARGS_TO_STACK
4
218 mov rsi
, arg
(0) ;src_ptr
219 mov rdi
, arg
(2) ;ref_ptr
221 movsxd rax
, dword ptr arg
(1) ;src_stride
222 movsxd rdx
, dword ptr arg
(3) ;ref_stride
224 movd mm0
, DWORD PTR [rsi
]
225 movd mm1
, DWORD PTR [rdi
]
227 movd mm2
, DWORD PTR [rsi
+rax
]
228 movd mm3
, DWORD PTR [rdi
+rdx
]
237 movd mm4
, DWORD PTR [rsi
]
239 movd mm5
, DWORD PTR [rdi
]
240 movd mm6
, DWORD PTR [rsi
+rax
]
242 movd mm7
, DWORD PTR [rdi
+rdx
]
259 ;unsigned int vp8_sad16x8_wmt(
260 ; unsigned char *src_ptr,
262 ; unsigned char *ref_ptr,
264 global sym
(vp8_sad16x8_wmt
)
265 sym
(vp8_sad16x8_wmt
):
268 SHADOW_ARGS_TO_STACK
5
275 mov rsi
, arg
(0) ;src_ptr
276 mov rdi
, arg
(2) ;ref_ptr
278 movsxd rbx
, dword ptr arg
(1) ;src_stride
279 movsxd rdx
, dword ptr arg
(3) ;ref_stride
288 jg .x16x8sad_wmt_early_exit
290 movq mm0
, QWORD PTR [rsi
]
291 movq mm2
, QWORD PTR [rsi
+8]
293 movq mm1
, QWORD PTR [rdi
]
294 movq mm3
, QWORD PTR [rdi
+8]
296 movq mm4
, QWORD PTR [rsi
+rbx
]
297 movq mm5
, QWORD PTR [rdi
+rdx
]
302 movq mm1
, QWORD PTR [rsi
+rbx
+8]
303 movq mm3
, QWORD PTR [rdi
+rdx
+8]
318 jne .x16x8sad_wmt_loop
322 .
x16x8sad_wmt_early_exit:
332 ;void vp8_copy32xn_sse2(
333 ; unsigned char *src_ptr,
335 ; unsigned char *dst_ptr,
338 global sym
(vp8_copy32xn_sse2
)
339 sym
(vp8_copy32xn_sse2
):
342 SHADOW_ARGS_TO_STACK
5
348 mov rsi
, arg
(0) ;src_ptr
349 mov rdi
, arg
(2) ;dst_ptr
351 movsxd rax
, dword ptr arg
(1) ;src_stride
352 movsxd rdx
, dword ptr arg
(3) ;dst_stride
353 movsxd rcx
, dword ptr arg
(4) ;height
355 .
block_copy_sse2_loopx4:
356 movdqu xmm0
, XMMWORD
PTR [rsi
]
357 movdqu xmm1
, XMMWORD
PTR [rsi
+ 16]
358 movdqu xmm2
, XMMWORD
PTR [rsi
+ rax
]
359 movdqu xmm3
, XMMWORD
PTR [rsi
+ rax
+ 16]
363 movdqu xmm4
, XMMWORD
PTR [rsi
]
364 movdqu xmm5
, XMMWORD
PTR [rsi
+ 16]
365 movdqu xmm6
, XMMWORD
PTR [rsi
+ rax
]
366 movdqu xmm7
, XMMWORD
PTR [rsi
+ rax
+ 16]
370 movdqa XMMWORD
PTR [rdi
], xmm0
371 movdqa XMMWORD
PTR [rdi
+ 16], xmm1
372 movdqa XMMWORD
PTR [rdi
+ rdx
], xmm2
373 movdqa XMMWORD
PTR [rdi
+ rdx
+ 16], xmm3
377 movdqa XMMWORD
PTR [rdi
], xmm4
378 movdqa XMMWORD
PTR [rdi
+ 16], xmm5
379 movdqa XMMWORD
PTR [rdi
+ rdx
], xmm6
380 movdqa XMMWORD
PTR [rdi
+ rdx
+ 16], xmm7
386 jge .block_copy_sse2_loopx4
391 .
block_copy_sse2_loop:
392 movdqu xmm0
, XMMWORD
PTR [rsi
]
393 movdqu xmm1
, XMMWORD
PTR [rsi
+ 16]
396 movdqa XMMWORD
PTR [rdi
], xmm0
397 movdqa XMMWORD
PTR [rdi
+ 16], xmm1
401 jne .block_copy_sse2_loop