Merge "Skip computation of distortion in vp8_pick_inter_mode if active_map is used"
[libvpx.git] / vp8 / common / x86 / recon_mmx.asm
blobe7211fccb50e6fa5aaf8cbcb5707f3b131c60a45
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
12 %include "vpx_ports/x86_abi_support.asm"
;-----------------------------------------------------------------------------
; void vp8_recon_b_mmx(unsigned char *s, short *q, unsigned char *d, int stride)
;
; Produces four rows of four bytes each. For every row, four bytes are read
; from s and widened to 16-bit words, four words from q are added with signed
; saturation, and the sums are packed back to bytes with unsigned saturation
; before being stored to d.
;
; Row spacing, as used by the code below:
;   s : 16 bytes between rows
;   q : 32 bytes between rows
;   d : `stride` bytes between rows (stride is sign-extended from int)
;
; Registers: rsi = s, rdx = q, rdi = current output row, rax = stride,
;            mm7 = zero, mm0 = working row.
; rsi/rdi are saved and restored; no emms is executed here.
;-----------------------------------------------------------------------------
global sym(vp8_recon_b_mmx)
sym(vp8_recon_b_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

    pxor        mm7, mm7                    ; zero source for unpack/pack
    movsxd      rax, dword ptr arg(3)       ; rax = stride
    mov         rdx, arg(1)                 ; rdx = q
    mov         rdi, arg(2)                 ; rdi = d (row 0)
    mov         rsi, arg(0)                 ; rsi = s

    ; row 0
    movd        mm0, [rsi]
    punpcklbw   mm0, mm7                    ; bytes -> words
    paddsw      mm0, [rdx]
    packuswb    mm0, mm7                    ; words -> bytes, clamped to 0..255
    movd        [rdi], mm0

    ; row 1
    add         rdi, rax                    ; rdi = d + stride
    movd        mm0, [rsi+16]
    punpcklbw   mm0, mm7
    paddsw      mm0, [rdx+32]
    packuswb    mm0, mm7
    movd        [rdi], mm0

    ; row 2
    add         rdi, rax                    ; rdi = d + 2*stride
    movd        mm0, [rsi+32]
    punpcklbw   mm0, mm7
    paddsw      mm0, [rdx+64]
    packuswb    mm0, mm7
    movd        [rdi], mm0

    ; row 3
    add         rdi, rax                    ; rdi = d + 3*stride
    movd        mm0, [rsi+48]
    punpcklbw   mm0, mm7
    paddsw      mm0, [rdx+96]
    packuswb    mm0, mm7
    movd        [rdi], mm0

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------------
; void vp8_copy_mem8x8_mmx(
;     unsigned char *src,
;     int src_stride,
;     unsigned char *dst,
;     int dst_stride
;     )
;
; Copies 8 rows of 8 bytes from src to dst. Source rows are src_stride bytes
; apart, destination rows dst_stride bytes apart (both sign-extended from int).
;
; Registers: rsi = source cursor, rax = src_stride,
;            rdi = destination cursor, rcx = dst_stride,
;            mm0-mm5 = row data in flight.
; rsi/rdi are saved and restored; no emms is executed here.
;-----------------------------------------------------------------------------
global sym(vp8_copy_mem8x8_mmx)
sym(vp8_copy_mem8x8_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ; rsi = src
    movq        mm0, [rsi]                  ; row 0

    movsxd      rax, dword ptr arg(1)       ; rax = src_stride
    mov         rdi, arg(2)                 ; rdi = dst

    movq        mm1, [rsi+rax]              ; row 1
    movq        mm2, [rsi+rax*2]            ; row 2

    movsxd      rcx, dword ptr arg(3)       ; rcx = dst_stride
    lea         rsi, [rsi+rax*2]

    movq        [rdi], mm0                  ; store row 0
    add         rsi, rax                    ; rsi = src + 3*src_stride

    movq        [rdi+rcx], mm1              ; store row 1
    movq        [rdi+rcx*2], mm2            ; store row 2

    lea         rdi, [rdi+rcx*2]
    movq        mm3, [rsi]                  ; row 3

    add         rdi, rcx                    ; rdi = dst + 3*dst_stride
    movq        mm4, [rsi+rax]              ; row 4

    movq        mm5, [rsi+rax*2]            ; row 5
    movq        [rdi], mm3                  ; store row 3

    lea         rsi, [rsi+rax*2]            ; rsi = src + 5*src_stride
    movq        [rdi+rcx], mm4              ; store row 4

    movq        [rdi+rcx*2], mm5            ; store row 5
    lea         rdi, [rdi+rcx*2]            ; rdi = dst + 5*dst_stride

    movq        mm0, [rsi+rax]              ; row 6
    movq        mm1, [rsi+rax*2]            ; row 7

    movq        [rdi+rcx], mm0              ; store row 6
    movq        [rdi+rcx*2], mm1            ; store row 7

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret                                     ; return to caller; without this,
                                            ; execution runs into the next routine
;-----------------------------------------------------------------------------
; void vp8_copy_mem8x4_mmx(
;     unsigned char *src,
;     int src_stride,
;     unsigned char *dst,
;     int dst_stride
;     )
;
; Copies 4 rows of 8 bytes from src to dst. Source rows are src_stride bytes
; apart, destination rows dst_stride bytes apart (both sign-extended from int).
;
; Registers: rsi = source cursor, rax = src_stride,
;            rdi = destination cursor, rcx = dst_stride,
;            mm0-mm3 = row data in flight.
; rsi/rdi are saved and restored; no emms is executed here.
;-----------------------------------------------------------------------------
global sym(vp8_copy_mem8x4_mmx)
sym(vp8_copy_mem8x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ; rsi = src
    movq        mm0, [rsi]                  ; row 0

    movsxd      rax, dword ptr arg(1)       ; rax = src_stride
    mov         rdi, arg(2)                 ; rdi = dst

    movq        mm1, [rsi+rax]              ; row 1
    movq        mm2, [rsi+rax*2]            ; row 2

    movsxd      rcx, dword ptr arg(3)       ; rcx = dst_stride
    lea         rsi, [rsi+rax*2]            ; rsi = src + 2*src_stride

    movq        [rdi], mm0                  ; store row 0
    movq        [rdi+rcx], mm1              ; store row 1

    movq        [rdi+rcx*2], mm2            ; store row 2
    lea         rdi, [rdi+rcx*2]            ; rdi = dst + 2*dst_stride

    movq        mm3, [rsi+rax]              ; row 3
    movq        [rdi+rcx], mm3              ; store row 3

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret                                     ; return to caller; without this,
                                            ; execution runs into the next routine
;-----------------------------------------------------------------------------
; void vp8_copy_mem16x16_mmx(
;     unsigned char *src,
;     int src_stride,
;     unsigned char *dst,
;     int dst_stride
;     )
;
; Copies 16 rows of 16 bytes from src to dst. Source rows are src_stride bytes
; apart, destination rows dst_stride bytes apart (both sign-extended from int).
; Each row is moved as two 8-byte halves.
;
; The copy is fully unrolled at assembly time: five groups of three rows
; (generated with %rep) followed by one final row, 5*3 + 1 = 16.
;
; Registers: rsi = source cursor, rax = src_stride,
;            rdi = destination cursor, rcx = dst_stride,
;            mm0/mm1/mm2 = low halves, mm3/mm4/mm5 = high halves of three rows.
; rsi/rdi are saved and restored; no emms is executed here.
;-----------------------------------------------------------------------------
global sym(vp8_copy_mem16x16_mmx)
sym(vp8_copy_mem16x16_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ; rsi = src
    movsxd      rax, dword ptr arg(1)       ; rax = src_stride

    mov         rdi, arg(2)                 ; rdi = dst
    movsxd      rcx, dword ptr arg(3)       ; rcx = dst_stride

%rep 5
    ; load three rows (low and high 8 bytes of each)
    movq        mm0, [rsi]
    movq        mm3, [rsi+8]

    movq        mm1, [rsi+rax]
    movq        mm4, [rsi+rax+8]

    movq        mm2, [rsi+rax*2]
    movq        mm5, [rsi+rax*2+8]

    lea         rsi, [rsi+rax*2]
    add         rsi, rax                    ; source cursor += 3*src_stride

    ; store the same three rows
    movq        [rdi], mm0
    movq        [rdi+8], mm3

    movq        [rdi+rcx], mm1
    movq        [rdi+rcx+8], mm4

    movq        [rdi+rcx*2], mm2
    movq        [rdi+rcx*2+8], mm5

    lea         rdi, [rdi+rcx*2]
    add         rdi, rcx                    ; destination cursor += 3*dst_stride
%endrep

    ; sixteenth (last) row
    movq        mm0, [rsi]
    movq        mm3, [rsi+8]

    movq        [rdi], mm0
    movq        [rdi+8], mm3

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret                                     ; return to caller