Add save/restore xmm registers in x86 assembly code
[libvpx.git] / vp8 / encoder / x86 / encodeopt.asm
blob e142a75738d195e8b0cd63d8a78b15c76854b1f4
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"

;int vp8_block_error_xmm(short *coeff_ptr, short *dcoef_ptr)
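;
; For reference, a rough C equivalent of what this routine computes: the sum
; of squared differences over one block of 16 short coefficients. This is
; only an illustrative sketch, not code in the build, and the name
; block_error_ref is made up here:
;
;   int block_error_ref(short *coeff_ptr, short *dcoef_ptr)
;   {
;       int i, error = 0;
;
;       for (i = 0; i < 16; i++)
;       {
;           int diff = coeff_ptr[i] - dcoef_ptr[i];
;           error += diff * diff;
;       }
;
;       return error;
;   }
;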
global sym(vp8_block_error_xmm)
sym(vp8_block_error_xmm):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    push        rsi
    push        rdi
    ; end prologue

    mov         rsi,        arg(0) ;coeff_ptr
    mov         rdi,        arg(1) ;dcoef_ptr

    movdqa      xmm0,       [rsi]
    movdqa      xmm1,       [rdi]

    movdqa      xmm2,       [rsi+16]
    movdqa      xmm3,       [rdi+16]

    psubw       xmm0,       xmm1 ; diff = coeff - dqcoeff, coeffs 0-7
    psubw       xmm2,       xmm3 ; coeffs 8-15

    pmaddwd     xmm0,       xmm0 ; square, summing adjacent pairs into dwords
    pmaddwd     xmm2,       xmm2

    paddd       xmm0,       xmm2

    ; horizontal add of the four dword partial sums
    pxor        xmm5,       xmm5
    movdqa      xmm1,       xmm0

    punpckldq   xmm0,       xmm5
    punpckhdq   xmm1,       xmm5

    paddd       xmm0,       xmm1
    movdqa      xmm1,       xmm0

    psrldq      xmm0,       8
    paddd       xmm0,       xmm1

    movq        rax,        xmm0 ; return the sum of squared differences

    pop rdi
    pop rsi
    ; begin epilog
    UNSHADOW_ARGS
    pop rbp
    ret

;int vp8_block_error_mmx(short *coeff_ptr, short *dcoef_ptr)
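;
; MMX variant of the same 16-coefficient sum of squared differences as
; vp8_block_error_xmm above; see the C sketch there. The mm1/mm2 sequence
; below is a masking path with dc forced to 0 (note the comment on the pxor),
; so the mask is all ones and every coefficient is included.
;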
global sym(vp8_block_error_mmx)
sym(vp8_block_error_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0) ;coeff_ptr
    pxor        mm7,        mm7

    mov         rdi,        arg(1) ;dcoef_ptr
    movq        mm3,        [rsi]

    movq        mm4,        [rdi]
    movq        mm5,        [rsi+8]

    movq        mm6,        [rdi+8]
    pxor        mm1,        mm1 ; was: movd mm1, dc (dc is forced to 0)

    movq        mm2,        mm7
    psubw       mm5,        mm6

    por         mm1,        mm2
    pmaddwd     mm5,        mm5

    pcmpeqw     mm1,        mm7 ; mm1 == 0, so the mask is all ones
    psubw       mm3,        mm4

    pand        mm1,        mm3
    pmaddwd     mm1,        mm1

    paddd       mm1,        mm5
    movq        mm3,        [rsi+16]

    movq        mm4,        [rdi+16]
    movq        mm5,        [rsi+24]

    movq        mm6,        [rdi+24]
    psubw       mm5,        mm6

    pmaddwd     mm5,        mm5
    psubw       mm3,        mm4

    pmaddwd     mm3,        mm3
    paddd       mm3,        mm5

    paddd       mm1,        mm3
    movq        mm0,        mm1

    psrlq       mm1,        32 ; fold the high dword onto the low dword
    paddd       mm0,        mm1

    movq        rax,        mm0 ; return the sum of squared differences

    pop rdi
    pop rsi
    ; begin epilog
    UNSHADOW_ARGS
    pop rbp
    ret

;int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
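;
; For reference, a rough C equivalent (illustrative sketch only, not code in
; the build): the sum of squared differences over 16 blocks of 16
; coefficients each, skipping each block's DC coefficient when dc != 0.
;
;   int mbblock_error_ref(short *coeff_ptr, short *dcoef_ptr, int dc)
;   {
;       int i, j, error = 0;
;
;       for (i = 0; i < 16; i++)                /* 16 blocks */
;       {
;           for (j = (dc ? 1 : 0); j < 16; j++) /* 16 coeffs per block */
;           {
;               int diff = coeff_ptr[i * 16 + j] - dcoef_ptr[i * 16 + j];
;               error += diff * diff;
;           }
;       }
;
;       return error;
;   }
;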
global sym(vp8_mbblock_error_mmx_impl)
sym(vp8_mbblock_error_mmx_impl):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0) ;coeff_ptr
    pxor        mm7,        mm7

    mov         rdi,        arg(1) ;dcoef_ptr
    pxor        mm2,        mm2

    movd        mm1,        dword ptr arg(2) ;dc
    por         mm1,        mm2

    pcmpeqw     mm1,        mm7 ; mask: low word is 0 when dc != 0
    mov         rcx,        16

mberror_loop_mmx:
    movq        mm3,        [rsi]
    movq        mm4,        [rdi]

    movq        mm5,        [rsi+8]
    movq        mm6,        [rdi+8]

    psubw       mm5,        mm6
    pmaddwd     mm5,        mm5

    psubw       mm3,        mm4
    pand        mm3,        mm1 ; drop this block's DC diff when dc != 0

    pmaddwd     mm3,        mm3
    paddd       mm2,        mm5

    paddd       mm2,        mm3
    movq        mm3,        [rsi+16]

    movq        mm4,        [rdi+16]
    movq        mm5,        [rsi+24]

    movq        mm6,        [rdi+24]
    psubw       mm5,        mm6

    pmaddwd     mm5,        mm5
    psubw       mm3,        mm4

    pmaddwd     mm3,        mm3
    paddd       mm2,        mm5

    paddd       mm2,        mm3
    add         rsi,        32

    add         rdi,        32
    sub         rcx,        1

    jnz         mberror_loop_mmx

    movq        mm0,        mm2
    psrlq       mm2,        32 ; fold the high dword onto the low dword

    paddd       mm0,        mm2
    movq        rax,        mm0 ; return the sum of squared differences

    pop rdi
    pop rsi
    ; begin epilog
    UNSHADOW_ARGS
    pop rbp
    ret

;int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
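;
; SSE2 variant of vp8_mbblock_error_mmx_impl above; see the C sketch there.
; The DC masking works the same way, with the mask kept in xmm5. The loop
; uses xmm6, which is callee-saved in the Win64 ABI, so the prologue and
; epilogue add SAVE_XMM / RESTORE_XMM around it.
;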
global sym(vp8_mbblock_error_xmm_impl)
sym(vp8_mbblock_error_xmm_impl):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    SAVE_XMM ; 6
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0) ;coeff_ptr
    pxor        xmm6,       xmm6

    mov         rdi,        arg(1) ;dcoef_ptr
    pxor        xmm4,       xmm4

    movd        xmm5,       dword ptr arg(2) ;dc
    por         xmm5,       xmm4

    pcmpeqw     xmm5,       xmm6 ; mask: low word is 0 when dc != 0
    mov         rcx,        16

mberror_loop:
    movdqa      xmm0,       [rsi]
    movdqa      xmm1,       [rdi]

    movdqa      xmm2,       [rsi+16]
    movdqa      xmm3,       [rdi+16]

    psubw       xmm2,       xmm3
    pmaddwd     xmm2,       xmm2

    psubw       xmm0,       xmm1
    pand        xmm0,       xmm5 ; drop this block's DC diff when dc != 0

    pmaddwd     xmm0,       xmm0
    add         rsi,        32

    add         rdi,        32
    sub         rcx,        1

    paddd       xmm4,       xmm2
    paddd       xmm4,       xmm0

    jnz         mberror_loop

    ; horizontal add of the four dword partial sums
    movdqa      xmm0,       xmm4
    punpckldq   xmm0,       xmm6

    punpckhdq   xmm4,       xmm6
    paddd       xmm0,       xmm4

    movdqa      xmm1,       xmm0
    psrldq      xmm0,       8

    paddd       xmm0,       xmm1
    movq        rax,        xmm0 ; return the sum of squared differences

    pop rdi
    pop rsi
    ; begin epilog
    RESTORE_XMM
    UNSHADOW_ARGS
    pop rbp
    ret

;int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
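;
; For reference, a rough C equivalent (illustrative sketch only, not code in
; the build): the sum of squared differences over the 128 U and V plane
; coefficients of a macroblock, 8 shorts per pass of the 16-pass loop below.
;
;   int mbuverror_ref(short *s_ptr, short *d_ptr)
;   {
;       int i, error = 0;
;
;       for (i = 0; i < 128; i++)
;       {
;           int diff = s_ptr[i] - d_ptr[i];
;           error += diff * diff;
;       }
;
;       return error;
;   }
;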
global sym(vp8_mbuverror_mmx_impl)
sym(vp8_mbuverror_mmx_impl):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0) ;s_ptr
    mov         rdi,        arg(1) ;d_ptr

    mov         rcx,        16
    pxor        mm7,        mm7

mbuverror_loop_mmx:

    movq        mm1,        [rsi]
    movq        mm2,        [rdi]

    psubw       mm1,        mm2
    pmaddwd     mm1,        mm1

    movq        mm3,        [rsi+8]
    movq        mm4,        [rdi+8]

    psubw       mm3,        mm4
    pmaddwd     mm3,        mm3

    paddd       mm7,        mm1
    paddd       mm7,        mm3

    add         rsi,        16
    add         rdi,        16

    dec         rcx
    jnz         mbuverror_loop_mmx

    movq        mm0,        mm7
    psrlq       mm7,        32 ; fold the high dword onto the low dword

    paddd       mm0,        mm7
    movq        rax,        mm0 ; return the sum of squared differences

    pop rdi
    pop rsi
    ; begin epilog
    UNSHADOW_ARGS
    pop rbp
    ret

;int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
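;
; SSE2 variant of vp8_mbuverror_mmx_impl above; see the C sketch there. Each
; of the 16 loop passes handles one 16-byte load of 8 coefficients, and the
; dword partial sums are folded into rax at the end.
;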
global sym(vp8_mbuverror_xmm_impl)
sym(vp8_mbuverror_xmm_impl):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0) ;s_ptr
    mov         rdi,        arg(1) ;d_ptr

    mov         rcx,        16
    pxor        xmm3,       xmm3

mbuverror_loop:

    movdqa      xmm1,       [rsi]
    movdqa      xmm2,       [rdi]

    psubw       xmm1,       xmm2
    pmaddwd     xmm1,       xmm1

    paddd       xmm3,       xmm1

    add         rsi,        16
    add         rdi,        16

    dec         rcx
    jnz         mbuverror_loop

    ; horizontal add of the four dword partial sums
    pxor        xmm0,       xmm0
    movdqa      xmm1,       xmm3

    movdqa      xmm2,       xmm1
    punpckldq   xmm1,       xmm0

    punpckhdq   xmm2,       xmm0
    paddd       xmm1,       xmm2

    movdqa      xmm2,       xmm1
    psrldq      xmm1,       8

    paddd       xmm1,       xmm2
    movq        rax,        xmm1 ; return the sum of squared differences

    pop rdi
    pop rsi
    ; begin epilog
    UNSHADOW_ARGS
    pop rbp
    ret