Define RDCOST only once
[libvpx.git] / vp8 / encoder / x86 / subtract_mmx.asm
blob a47e1f0d6ef7ac2c9aca5f0f458ea94179a957dc
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
12 %include "vpx_ports/x86_abi_support.asm"
;-----------------------------------------------------------------------
; SUBTRACT_B_ROW src_addr, pred_addr, diff_addr
;
; Loads 4 bytes from [src_addr] and 4 bytes from [pred_addr], widens
; both to four 16-bit words (mm7 must be zero) and stores
; source - predictor as 8 bytes at [diff_addr].
; Clobbers mm0 and mm1.
;-----------------------------------------------------------------------
%macro SUBTRACT_B_ROW 3
    movd        mm0, [%1]
    movd        mm1, [%2]
    punpcklbw   mm0, mm7
    punpcklbw   mm1, mm7
    psubw       mm0, mm1
    movq        [%3], mm0
%endmacro

;-----------------------------------------------------------------------
; void vp8_subtract_b_mmx_impl(unsigned char *z, int src_stride,
;                              short *diff, unsigned char *Predictor,
;                              int pitch);
;
; Produces a 4x4 block of 16-bit differences (z - Predictor).
; Source rows are src_stride bytes apart, predictor rows are pitch
; bytes apart and diff rows are pitch words (2*pitch bytes) apart.
;
; Register roles after setup:
;   rsi = z, rax = Predictor, rdi = diff,
;   rdx = src_stride, rcx = pitch, mm7 = 0
;-----------------------------------------------------------------------
global sym(vp8_subtract_b_mmx_impl)
sym(vp8_subtract_b_mmx_impl):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    mov         rdi, arg(2)                 ; diff
    mov         rax, arg(3)                 ; Predictor
    mov         rsi, arg(0)                 ; z
    movsxd      rdx, dword ptr arg(1)       ; src_stride
    movsxd      rcx, dword ptr arg(4)       ; pitch
    pxor        mm7, mm7                    ; zero, used to widen bytes to words

    ; rows 0, 1 and 2 are reachable with scaled-index addressing
    SUBTRACT_B_ROW rsi,       rax,       rdi
    SUBTRACT_B_ROW rsi+rdx,   rax+rcx,   rdi+rcx*2
    SUBTRACT_B_ROW rsi+rdx*2, rax+rcx*2, rdi+rcx*4

    ; row 3: there is no *3 scale, so move the source base down two rows
    ; and turn rcx into 3*pitch
    lea         rsi, [rsi+rdx*2]            ; rsi = z + 2*src_stride
    lea         rcx, [rcx+rcx*2]            ; rcx = 3*pitch

    SUBTRACT_B_ROW rsi+rdx,   rax+rcx,   rdi+rcx*2

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
; void vp8_subtract_mby_mmx(short *diff, unsigned char *src,
;                           unsigned char *pred, int stride)
;
; Subtracts a 16x16 block of predictor bytes from source bytes and
; stores the result as 16-bit words (src - pred).  The loop runs 16
; times, one row of 16 pixels per iteration: source rows are stride
; bytes apart, predictor rows are 16 bytes apart and diff rows are
; 32 bytes (16 words) apart.
;
; Register roles after setup:
;   rsi = src, rax = pred, rdi = diff, rdx = stride,
;   rcx = rows remaining, mm0 = 0; mm1-mm4 are scratch
;-----------------------------------------------------------------------
global sym(vp8_subtract_mby_mmx)
sym(vp8_subtract_mby_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(1)                 ; src
    mov         rdi, arg(0)                 ; diff
    mov         rax, arg(2)                 ; pred
    movsxd      rdx, dword ptr arg(3)       ; stride

    mov         rcx, 16                     ; row counter
    pxor        mm0, mm0                    ; zero, used to widen bytes to words

.submby_loop:
    ; pixels 0-7 of the current row
    movq        mm1, [rsi]
    movq        mm3, [rax]
    movq        mm2, mm1
    movq        mm4, mm3
    punpcklbw   mm1, mm0                    ; src  pixels 0-3 as words
    punpcklbw   mm3, mm0                    ; pred pixels 0-3 as words
    punpckhbw   mm2, mm0                    ; src  pixels 4-7 as words
    punpckhbw   mm4, mm0                    ; pred pixels 4-7 as words
    psubw       mm1, mm3
    psubw       mm2, mm4
    movq        [rdi], mm1
    movq        [rdi+8], mm2

    ; pixels 8-15 of the current row
    movq        mm1, [rsi+8]
    movq        mm3, [rax+8]
    movq        mm2, mm1
    movq        mm4, mm3
    punpcklbw   mm1, mm0
    punpcklbw   mm3, mm0
    punpckhbw   mm2, mm0
    punpckhbw   mm4, mm0
    psubw       mm1, mm3
    psubw       mm2, mm4
    movq        [rdi+16], mm1
    movq        [rdi+24], mm2

    add         rdi, 32                     ; next diff row (16 words)
    add         rax, 16                     ; next predictor row
    lea         rsi, [rsi+rdx]              ; next source row

    sub         rcx, 1
    jnz         .submby_loop

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret                                     ; without this, execution ran on into the next function
;-----------------------------------------------------------------------
; SUBTRACT_MBUV_ROW src_addr, pred_addr, diff_addr
;
; Loads 8 bytes from [src_addr] and 8 bytes from [pred_addr], widens
; both to 16-bit words (mm7 must be zero) and stores source - predictor
; as 16 bytes starting at [diff_addr].
; Clobbers mm0, mm1, mm3 and mm4.
;-----------------------------------------------------------------------
%macro SUBTRACT_MBUV_ROW 3
    movq        mm0, [%1]
    movq        mm1, [%2]
    movq        mm3, mm0
    movq        mm4, mm1
    punpcklbw   mm0, mm7                    ; src  pixels 0-3 as words
    punpcklbw   mm1, mm7                    ; pred pixels 0-3 as words
    punpckhbw   mm3, mm7                    ; src  pixels 4-7 as words
    punpckhbw   mm4, mm7                    ; pred pixels 4-7 as words
    psubw       mm0, mm1
    psubw       mm3, mm4
    movq        [%3], mm0
    movq        [%3+8], mm3
%endmacro

;-----------------------------------------------------------------------
; SUBTRACT_MBUV_4ROWS
;
; Processes four consecutive 8-pixel rows.  On entry rsi = source row,
; rax = predictor row, rdi = diff row, rdx = source stride.
; On exit rsi has advanced by two source rows; rax and rdi are unchanged.
;-----------------------------------------------------------------------
%macro SUBTRACT_MBUV_4ROWS 0
    SUBTRACT_MBUV_ROW rsi,       rax,    rdi
    SUBTRACT_MBUV_ROW rsi+rdx,   rax+8,  rdi+16
    SUBTRACT_MBUV_ROW rsi+rdx*2, rax+16, rdi+32
    lea         rsi, [rsi+rdx*2]            ; no *3 scale: step the base two rows
    SUBTRACT_MBUV_ROW rsi+rdx,   rax+24, rdi+48
%endmacro

;-----------------------------------------------------------------------
; SUBTRACT_MBUV_PLANE
;
; Processes eight consecutive 8-pixel rows as two groups of four.
; Same entry conditions as SUBTRACT_MBUV_4ROWS.
;-----------------------------------------------------------------------
%macro SUBTRACT_MBUV_PLANE 0
    SUBTRACT_MBUV_4ROWS

    add         rdi, 64                     ; 4 diff rows of 8 words
    add         rax, 32                     ; 4 predictor rows of 8 bytes
    lea         rsi, [rsi+rdx*2]            ; rsi now points at row 4

    SUBTRACT_MBUV_4ROWS
%endmacro

;-----------------------------------------------------------------------
; void vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc,
;                            unsigned char *vsrc, unsigned char *pred,
;                            int stride)
;
; Computes 16-bit differences (source - predictor) for two 8x8 blocks:
;   usrc against pred + 256, written to diff + 256 (shorts)
;   vsrc against pred + 320, written to diff + 320 (shorts)
; Source rows are stride bytes apart; predictor rows are 8 bytes apart
; and diff rows are 8 words (16 bytes) apart.
;-----------------------------------------------------------------------
global sym(vp8_subtract_mbuv_mmx)
sym(vp8_subtract_mbuv_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    ; ---- usrc block: udiff = diff + 256, upred = pred + 256 ----
    mov         rdi, arg(0)                 ; diff
    mov         rax, arg(3)                 ; pred
    mov         rsi, arg(1)                 ; z = usrc
    add         rdi, 256*2                  ; diff = diff + 256 (shorts)
    add         rax, 256                    ; Predictor = pred + 256
    movsxd      rdx, dword ptr arg(4)       ; stride
    pxor        mm7, mm7                    ; zero, used to widen bytes to words

    SUBTRACT_MBUV_PLANE

    ; ---- vsrc block: vdiff = diff + 320, vpred = pred + 320 ----
    mov         rdi, arg(0)                 ; diff
    mov         rax, arg(3)                 ; pred
    mov         rsi, arg(2)                 ; z = vsrc
    add         rdi, 320*2                  ; diff = diff + 320 (shorts)
    add         rax, 320                    ; Predictor = pred + 320
    movsxd      rdx, dword ptr arg(4)       ; stride
    pxor        mm7, mm7

    SUBTRACT_MBUV_PLANE

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ; NOTE(review): no `ret` is visible after this point in the reviewed
    ; listing -- it may simply be cut off here.  Confirm that a `ret`
    ; follows; without one, execution runs off the end of the function.