; move new neon subpixel function
; [libvpx.git] / vp8 / encoder / x86 / encodeopt.asm
; blob c0f06bbbb638f868d799df669b5d55e4371d3289
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
12 %include "vpx_ports/x86_abi_support.asm"
;int vp8_block_error_xmm(short *coeff_ptr, short *dcoef_ptr)
;
; Returns, in rax, the sum of the squared differences between the 16
; 16-bit values at coeff_ptr and the 16 16-bit values at dcoef_ptr.
;
; Both buffers are accessed with alignment-checking 128-bit operations
; (movdqa / psubw with a memory operand), so they must be 16-byte aligned.
;
; Only xmm0-xmm3 are written. xmm6 and above are deliberately avoided
; because they are callee-saved under the Microsoft x64 convention and
; this routine does not save or restore any XMM register.
global sym(vp8_block_error_xmm)
sym(vp8_block_error_xmm):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 2
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)          ;coeff_ptr
    mov         rdi,        arg(1)          ;dcoef_ptr

    movdqa      xmm0,       [rsi]           ; coeff[0..7]
    movdqa      xmm1,       [rsi+16]        ; coeff[8..15]

    psubw       xmm0,       [rdi]           ; coeff - dcoef, low half
    psubw       xmm1,       [rdi+16]        ; coeff - dcoef, high half

    pmaddwd     xmm0,       xmm0            ; 4 dwords: d[2i]^2 + d[2i+1]^2
    pmaddwd     xmm1,       xmm1

    paddd       xmm0,       xmm1            ; xmm0 = 4 partial sums {a,b,c,d}

    ; Horizontal add of the four dwords. Interleaving with zero places each
    ; partial sum in the low dword of its own qword before adding.
    pxor        xmm2,       xmm2
    movdqa      xmm3,       xmm0

    punpckldq   xmm0,       xmm2            ; {a,0,b,0}
    punpckhdq   xmm3,       xmm2            ; {c,0,d,0}

    paddd       xmm0,       xmm3            ; {a+c,0,b+d,0}
    movdqa      xmm3,       xmm0

    psrldq      xmm0,       8               ; {b+d,0,0,0}
    paddd       xmm0,       xmm3            ; low dword = a+b+c+d

    movq        rax,        xmm0

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
;int vp8_block_error_mmx(short *coeff_ptr, short *dcoef_ptr)
;
; Returns the sum of the squared differences between the 16 16-bit values
; at coeff_ptr and the 16 16-bit values at dcoef_ptr, using MMX registers.
;
; The result is the low 32 bits of rax. After the final fold the upper
; dword of mm0 still holds a partial sum, so the upper half of a 64-bit
; rax is not meaningful.
;
; The earlier revision built a "dc" mask from constant zeros; that mask was
; always all-ones, so the masking step has been dropped without changing
; the result.
;
; NOTE(review): no emms is issued here; presumably the caller is expected
; to clear the MMX state -- confirm against the call sites.
global sym(vp8_block_error_mmx)
sym(vp8_block_error_mmx):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 2
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)          ;coeff_ptr
    mov         rdi,        arg(1)          ;dcoef_ptr

    ; values 0..7
    movq        mm1,        [rsi]
    movq        mm5,        [rsi+8]

    psubw       mm1,        [rdi]
    psubw       mm5,        [rdi+8]

    pmaddwd     mm1,        mm1             ; 2 dwords of pairwise d^2 sums
    pmaddwd     mm5,        mm5

    paddd       mm1,        mm5

    ; values 8..15
    movq        mm3,        [rsi+16]
    movq        mm5,        [rsi+24]

    psubw       mm3,        [rdi+16]
    psubw       mm5,        [rdi+24]

    pmaddwd     mm3,        mm3
    pmaddwd     mm5,        mm5

    paddd       mm3,        mm5
    paddd       mm1,        mm3             ; mm1 = {lo, hi} partial sums

    ; fold the two dwords together
    movq        mm0,        mm1
    psrlq       mm1,        32              ; {hi, 0}
    paddd       mm0,        mm1             ; low dword = lo + hi

    movq        rax,        mm0

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret                                     ; was missing: fell through into the next routine
;int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
;
; Accumulates the sum of squared differences over 16 consecutive groups of
; 16 16-bit values (32 bytes per group) read from coeff_ptr and dcoef_ptr.
;
; dc selects which of the first two values of every group take part:
; dc is loaded zero-extended into mm1 and each 16-bit lane is compared with
; zero, so a lane of the mask is 0xFFFF where the matching 16 bits of dc
; are zero and 0 otherwise. Lanes 2 and 3 are therefore always 0xFFFF.
; The mask is applied to the first four differences of each group.
;
; The result is the low 32 bits of rax; the upper dword of mm0 still holds
; a partial sum after the final fold.
;
; NOTE(review): no emms is issued here; presumably the caller is expected
; to clear the MMX state -- confirm against the call sites.
global sym(vp8_mbblock_error_mmx_impl)
sym(vp8_mbblock_error_mmx_impl):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 3
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)          ;coeff_ptr
    mov         rdi,        arg(1)          ;dcoef_ptr

    pxor        mm7,        mm7             ; all-zero reference for the compare
    pxor        mm2,        mm2             ; running sum (2 dwords)

    movd        mm1,        dword ptr arg(2) ;dc
    pcmpeqw     mm1,        mm7             ; per-lane mask, see header

    mov         rcx,        16              ; number of 16-value groups

mberror_loop_mmx:
    ; values 0..7 of the group; the mask covers values 0..3 only
    movq        mm3,        [rsi]
    movq        mm5,        [rsi+8]

    psubw       mm3,        [rdi]
    psubw       mm5,        [rdi+8]

    pand        mm3,        mm1
    pmaddwd     mm3,        mm3
    pmaddwd     mm5,        mm5

    paddd       mm2,        mm5
    paddd       mm2,        mm3

    ; values 8..15 of the group
    movq        mm3,        [rsi+16]
    movq        mm5,        [rsi+24]

    psubw       mm3,        [rdi+16]
    psubw       mm5,        [rdi+24]

    pmaddwd     mm3,        mm3
    pmaddwd     mm5,        mm5

    paddd       mm2,        mm5
    paddd       mm2,        mm3

    add         rsi,        32
    add         rdi,        32

    sub         rcx,        1
    jnz         mberror_loop_mmx

    ; fold the two dword partial sums
    movq        mm0,        mm2
    psrlq       mm2,        32
    paddd       mm0,        mm2

    movq        rax,        mm0

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret                                     ; was missing: fell through into the next routine
;int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
;
; Accumulates the sum of squared differences over 16 consecutive groups of
; 16 16-bit values (32 bytes per group) read from coeff_ptr and dcoef_ptr,
; and returns the total in rax.
;
; dc selects which of the first two values of every group take part:
; dc is loaded zero-extended into xmm1 and each 16-bit lane is compared
; with zero, so a lane of the mask is 0xFFFF where the matching 16 bits of
; dc are zero and 0 otherwise. Lanes 2..7 are therefore always 0xFFFF.
; The mask is applied to the first eight differences of each group.
;
; Both buffers are accessed with alignment-checking 128-bit operations and
; must be 16-byte aligned.
;
; Only xmm0-xmm4 are written. xmm6 and above are deliberately avoided
; because they are callee-saved under the Microsoft x64 convention and
; this routine does not save or restore any XMM register.
global sym(vp8_mbblock_error_xmm_impl)
sym(vp8_mbblock_error_xmm_impl):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 3
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)          ;coeff_ptr
    mov         rdi,        arg(1)          ;dcoef_ptr

    pxor        xmm0,       xmm0            ; stays zero for the whole routine
    pxor        xmm2,       xmm2            ; running sum (4 dwords)

    movd        xmm1,       dword ptr arg(2) ;dc
    pcmpeqw     xmm1,       xmm0            ; per-lane mask, see header

    mov         rcx,        16              ; number of 16-value groups

mberror_loop:
    movdqa      xmm3,       [rsi]           ; values 0..7 (masked)
    movdqa      xmm4,       [rsi+16]        ; values 8..15

    psubw       xmm3,       [rdi]
    psubw       xmm4,       [rdi+16]

    pand        xmm3,       xmm1
    pmaddwd     xmm3,       xmm3
    pmaddwd     xmm4,       xmm4

    paddd       xmm2,       xmm4
    paddd       xmm2,       xmm3

    add         rsi,        32
    add         rdi,        32

    sub         rcx,        1
    jnz         mberror_loop

    ; Horizontal add of the four dword partial sums {a,b,c,d}.
    movdqa      xmm3,       xmm2
    punpckldq   xmm2,       xmm0            ; {a,0,b,0}
    punpckhdq   xmm3,       xmm0            ; {c,0,d,0}
    paddd       xmm2,       xmm3            ; {a+c,0,b+d,0}

    movdqa      xmm3,       xmm2
    psrldq      xmm2,       8               ; {b+d,0,0,0}
    paddd       xmm2,       xmm3            ; low dword = a+b+c+d

    movq        rax,        xmm2

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret                                     ; was missing: fell through into the next routine
;int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
;
; Returns the sum of squared differences between the 128 16-bit values at
; s_ptr and the 128 16-bit values at d_ptr (16 iterations of 16 bytes from
; each buffer), using MMX registers.
;
; The result is the low 32 bits of rax; the upper dword of mm0 still holds
; a partial sum after the final fold.
;
; NOTE(review): no emms is issued here; presumably the caller is expected
; to clear the MMX state -- confirm against the call sites.
global sym(vp8_mbuverror_mmx_impl)
sym(vp8_mbuverror_mmx_impl):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 2
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)          ;s_ptr
    mov         rdi,        arg(1)          ;d_ptr

    pxor        mm7,        mm7             ; running sum (2 dwords)
    mov         rcx,        16              ; 16 iterations x 8 values

mbuverror_loop_mmx:
    movq        mm1,        [rsi]
    movq        mm3,        [rsi+8]

    psubw       mm1,        [rdi]
    psubw       mm3,        [rdi+8]

    pmaddwd     mm1,        mm1             ; pairwise d^2 sums
    pmaddwd     mm3,        mm3

    paddd       mm7,        mm1
    paddd       mm7,        mm3

    add         rsi,        16
    add         rdi,        16

    dec         rcx
    jnz         mbuverror_loop_mmx

    ; fold the two dword partial sums
    movq        mm0,        mm7
    psrlq       mm7,        32
    paddd       mm0,        mm7

    movq        rax,        mm0

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret                                     ; was missing: fell through into the next routine
;int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
;
; Returns, in rax, the sum of squared differences between the 128 16-bit
; values at s_ptr and the 128 16-bit values at d_ptr (16 iterations of
; 16 bytes from each buffer).
;
; Both buffers are accessed with alignment-checking 128-bit operations and
; must be 16-byte aligned.
;
; Only xmm0-xmm2 are written. xmm6 and above are deliberately avoided
; because they are callee-saved under the Microsoft x64 convention and
; this routine does not save or restore any XMM register.
global sym(vp8_mbuverror_xmm_impl)
sym(vp8_mbuverror_xmm_impl):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 2
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)          ;s_ptr
    mov         rdi,        arg(1)          ;d_ptr

    pxor        xmm0,       xmm0            ; running sum (4 dwords)
    mov         rcx,        16              ; 16 iterations x 8 values

mbuverror_loop:
    movdqa      xmm1,       [rsi]
    psubw       xmm1,       [rdi]
    pmaddwd     xmm1,       xmm1            ; pairwise d^2 sums
    paddd       xmm0,       xmm1

    add         rsi,        16
    add         rdi,        16

    dec         rcx
    jnz         mbuverror_loop

    ; Horizontal add of the four dword partial sums {a,b,c,d}.
    pxor        xmm2,       xmm2
    movdqa      xmm1,       xmm0

    punpckldq   xmm0,       xmm2            ; {a,0,b,0}
    punpckhdq   xmm1,       xmm2            ; {c,0,d,0}
    paddd       xmm0,       xmm1            ; {a+c,0,b+d,0}

    movdqa      xmm1,       xmm0
    psrldq      xmm0,       8               ; {b+d,0,0,0}
    paddd       xmm0,       xmm1            ; low dword = a+b+c+d

    movq        rax,        xmm0

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret                                     ; no ret was visible after the epilogue