Initial WebM release
[libvpx.git] / vp8 / encoder / x86 / sad_sse2.asm
blob53240bbf1bad59e97835e9dedad1e40287caf001
2 ; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license and patent
5 ; grant that can be found in the LICENSE file in the root of the source
6 ; tree. All contributing project authors may be found in the AUTHORS
7 ; file in the root of the source tree.
11 %include "vpx_ports/x86_abi_support.asm"
13 %idefine QWORD
;unsigned int vp8_sad16x16_wmt(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride)
;
; Sum of absolute differences between the 16x16 byte block at src_ptr and
; the 16x16 byte block at ref_ptr. Each pointer advances by its own stride
; from one row to the next. The total is returned in rax.
;
; Register roles:
;   rsi = current src row          rax = src_stride (until the final result)
;   rdi = current ref row          rdx = ref_stride
;   rcx = src_ptr + 16*src_stride  (loop end marker)
;   xmm4 = running total, one partial sum per 64-bit half
;
; Only xmm0-xmm4 are touched. xmm6 and above are callee-saved in the
; Microsoft x64 calling convention and nothing in this routine saves them,
; so they must not be used as scratch.
global sym(vp8_sad16x16_wmt)
sym(vp8_sad16x16_wmt):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        ; rcx = src_ptr + 16 * src_stride (two steps of 8 rows each)
        lea             rcx,        [rsi+rax*8]
        lea             rcx,        [rcx+rax*8]

        pxor            xmm4,       xmm4            ; total = 0

x16x16sad_wmt_loop:
        ; ---- first row of the pair ----
        ; Each 16-byte row is fetched as two 8-byte halves and interleaved
        ; into one register. src and ref are interleaved identically, so the
        ; byte pairing seen by psadbw is unchanged.
        movq            xmm0,       QWORD PTR [rsi]
        movq            xmm2,       QWORD PTR [rsi+8]

        movq            xmm1,       QWORD PTR [rdi]
        movq            xmm3,       QWORD PTR [rdi+8]

        punpcklbw       xmm0,       xmm2
        punpcklbw       xmm1,       xmm3

        psadbw          xmm0,       xmm1
        paddw           xmm4,       xmm0

        ; ---- second row of the pair ----
        movq            xmm0,       QWORD PTR [rsi+rax]
        movq            xmm2,       QWORD PTR [rsi+rax+8]

        movq            xmm1,       QWORD PTR [rdi+rdx]
        movq            xmm3,       QWORD PTR [rdi+rdx+8]

        ; advance both pointers by two rows
        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        punpcklbw       xmm0,       xmm2
        punpcklbw       xmm1,       xmm3

        psadbw          xmm0,       xmm1
        paddw           xmm4,       xmm0

        cmp             rsi,        rcx
        jne             x16x16sad_wmt_loop

        ; Fold the two 64-bit partial sums into one and return it.
        movq            xmm0,       xmm4
        psrldq          xmm4,       8

        paddw           xmm0,       xmm4
        movd            rax,        xmm0

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
;unsigned int vp8_sad8x16_wmt(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  max_err)
;
; Sum of absolute differences between an 8-wide, 16-row byte block at
; src_ptr and the matching block at ref_ptr, processed two rows per loop
; iteration. Before each iteration the running total is compared with
; max_err; once it is greater, the routine stops and returns the partial
; total accumulated so far.
;
; Register roles:
;   rsi = current src row          rbx = src_stride
;   rdi = current ref row          rdx = ref_stride
;   rcx = src_ptr + 16*src_stride  (loop end marker)
;   mm7 = running total            rax = copy of the total / return value
;
; This routine uses MMX registers and does not execute emms.
; NOTE(review): callers are presumably responsible for clearing MMX state
; before any x87 code runs - confirm.
global sym(vp8_sad8x16_wmt)
sym(vp8_sad8x16_wmt):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 5
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        movsxd          rbx,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        ; rcx = src_ptr + 16 * src_stride
        lea             rcx,        [rsi+rbx*8]
        lea             rcx,        [rcx+rbx*8]

        pxor            mm7,        mm7             ; total = 0

x8x16sad_wmt_loop:
        ; Early-out test. max_err is a 32-bit int, so compare only the low
        ; 32 bits; the upper half of its argument slot is not guaranteed to
        ; hold anything meaningful.
        movd            rax,        mm7
        cmp             eax,        arg(4) ;max_err
        jg              x8x16sad_wmt_early_exit

        movq            mm0,        QWORD PTR [rsi]
        movq            mm1,        QWORD PTR [rdi]

        movq            mm2,        QWORD PTR [rsi+rbx]
        movq            mm3,        QWORD PTR [rdi+rdx]

        psadbw          mm0,        mm1
        psadbw          mm2,        mm3

        ; advance both pointers by two rows
        lea             rsi,        [rsi+rbx*2]
        lea             rdi,        [rdi+rdx*2]

        paddw           mm7,        mm0
        paddw           mm7,        mm2

        cmp             rsi,        rcx
        jne             x8x16sad_wmt_loop

        movd            rax,        mm7             ; full total

x8x16sad_wmt_early_exit:
        ; rax already holds the value to return on both paths.

    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    UNSHADOW_ARGS
    pop         rbp
    ret
;unsigned int vp8_sad8x8_wmt(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  max_err)
;
; Sum of absolute differences between an 8-wide, 8-row byte block at
; src_ptr and the matching block at ref_ptr, one row per loop iteration.
; Before each row the running total is compared with max_err (the fifth
; argument, read through arg(4)); once it is greater, the routine stops and
; returns the partial total accumulated so far.
;
; Register roles:
;   rsi = current src row          rbx = src_stride
;   rdi = current ref row          rdx = ref_stride
;   rcx = src_ptr + 8*src_stride   (loop end marker)
;   mm7 = running total            rax = copy of the total / return value
;
; This routine uses MMX registers and does not execute emms.
; NOTE(review): callers are presumably responsible for clearing MMX state
; before any x87 code runs - confirm.
global sym(vp8_sad8x8_wmt)
sym(vp8_sad8x8_wmt):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 5
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        movsxd          rbx,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        lea             rcx,        [rsi+rbx*8]     ; end = src_ptr + 8 rows
        pxor            mm7,        mm7             ; total = 0

x8x8sad_wmt_loop:
        ; Early-out test. max_err is a 32-bit int, so compare only the low
        ; 32 bits; the upper half of its argument slot is not guaranteed to
        ; hold anything meaningful.
        movd            rax,        mm7
        cmp             eax,        arg(4) ;max_err
        jg              x8x8sad_wmt_early_exit

        movq            mm0,        QWORD PTR [rsi]
        movq            mm1,        QWORD PTR [rdi]

        psadbw          mm0,        mm1

        ; advance both pointers by one row
        lea             rsi,        [rsi+rbx]
        add             rdi,        rdx

        paddw           mm7,        mm0

        cmp             rsi,        rcx
        jne             x8x8sad_wmt_loop

        movd            rax,        mm7             ; full total

x8x8sad_wmt_early_exit:
        ; rax already holds the value to return on both paths.

    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    UNSHADOW_ARGS
    pop         rbp
    ret
;unsigned int vp8_sad4x4_wmt(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride)
;
; Sum of absolute differences between a 4-wide, 4-row byte block at src_ptr
; and the matching block at ref_ptr. Straight-line code: rows are loaded
; four bytes at a time, two rows are interleaved into one 8-byte MMX
; register, and each pair of rows costs a single psadbw. src and ref are
; interleaved identically, so the byte pairing is unchanged.
;
; Register roles:
;   rsi = src row pointer          rax = src_stride (until the final result)
;   rdi = ref row pointer          rdx = ref_stride
;
; This routine uses MMX registers and does not execute emms.
; NOTE(review): callers are presumably responsible for clearing MMX state
; before any x87 code runs - confirm.
global sym(vp8_sad4x4_wmt)
sym(vp8_sad4x4_wmt):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        ; ---- rows 0 and 1 ----
        movd            mm0,        dword ptr [rsi]
        movd            mm1,        dword ptr [rdi]

        movd            mm2,        dword ptr [rsi+rax]
        movd            mm3,        dword ptr [rdi+rdx]

        punpcklbw       mm0,        mm2
        punpcklbw       mm1,        mm3

        psadbw          mm0,        mm1

        ; step both pointers down two rows
        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        ; ---- rows 2 and 3 ----
        movd            mm4,        dword ptr [rsi]
        movd            mm5,        dword ptr [rdi]

        movd            mm6,        dword ptr [rsi+rax]
        movd            mm7,        dword ptr [rdi+rdx]

        punpcklbw       mm4,        mm6
        punpcklbw       mm5,        mm7

        psadbw          mm4,        mm5

        ; combine the two half-block sums and return the total
        paddw           mm0,        mm4
        movd            rax,        mm0

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
;unsigned int vp8_sad16x8_wmt(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  max_err)
;
; Sum of absolute differences between a 16-wide, 8-row byte block at
; src_ptr and the matching block at ref_ptr. Each loop iteration handles
; two rows, with every 16-byte row split into two 8-byte MMX operations.
; Before each iteration the running total is compared with max_err (the
; fifth argument, read through arg(4)); once it is greater, the routine
; stops and returns the partial total accumulated so far.
;
; Register roles:
;   rsi = current src row          rbx = src_stride
;   rdi = current ref row          rdx = ref_stride
;   rcx = src_ptr + 8*src_stride   (loop end marker)
;   mm7 = running total            rax = copy of the total / return value
;
; This routine uses MMX registers and does not execute emms.
; NOTE(review): callers are presumably responsible for clearing MMX state
; before any x87 code runs - confirm.
global sym(vp8_sad16x8_wmt)
sym(vp8_sad16x8_wmt):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 5
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        movsxd          rbx,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        lea             rcx,        [rsi+rbx*8]     ; end = src_ptr + 8 rows
        pxor            mm7,        mm7             ; total = 0

x16x8sad_wmt_loop:
        ; Early-out test. max_err is a 32-bit int, so compare only the low
        ; 32 bits; the upper half of its argument slot is not guaranteed to
        ; hold anything meaningful.
        movd            rax,        mm7
        cmp             eax,        arg(4) ;max_err
        jg              x16x8sad_wmt_early_exit

        ; first row: left and right 8-byte halves
        movq            mm0,        QWORD PTR [rsi]
        movq            mm2,        QWORD PTR [rsi+8]

        movq            mm1,        QWORD PTR [rdi]
        movq            mm3,        QWORD PTR [rdi+8]

        ; second row: left half
        movq            mm4,        QWORD PTR [rsi+rbx]
        movq            mm5,        QWORD PTR [rdi+rdx]

        psadbw          mm0,        mm1
        psadbw          mm2,        mm3

        ; second row: right half (mm1/mm3 are free again after the psadbw)
        movq            mm1,        QWORD PTR [rsi+rbx+8]
        movq            mm3,        QWORD PTR [rdi+rdx+8]

        psadbw          mm4,        mm5
        psadbw          mm1,        mm3

        ; advance both pointers by two rows
        lea             rsi,        [rsi+rbx*2]
        lea             rdi,        [rdi+rdx*2]

        paddw           mm0,        mm2             ; first-row sum
        paddw           mm4,        mm1             ; second-row sum

        paddw           mm7,        mm0
        paddw           mm7,        mm4

        cmp             rsi,        rcx
        jne             x16x8sad_wmt_loop

        movd            rax,        mm7             ; full total

x16x8sad_wmt_early_exit:
        ; rax already holds the value to return on both paths.

    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    UNSHADOW_ARGS
    pop         rbp
    ret