Define RDCOST only once
[libvpx.git] / vp8 / decoder / x86 / dequantize_mmx.asm
blob0d6133a462e95783e08d0e0df9e45a7ac61b24df
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
12 %include "vpx_ports/x86_abi_support.asm"
;-----------------------------------------------------------------------
; void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q)
;
; Element-wise dequantisation of 16 signed 16-bit coefficients:
;     dq[i] = low 16 bits of (sq[i] * q[i]),   i = 0..15
;
;   arg(0) sq : source coefficients   (32 bytes read)
;   arg(1) dq : destination           (32 bytes written)
;   arg(2) q  : per-coefficient scale (32 bytes read)
;
; Registers: rax, rsi, rdi and mm1 are used; rsi/rdi are saved and
; restored around the body.
; NOTE(review): no emms is executed here - presumably the caller is
; responsible for clearing MMX state; confirm against the call sites.
;-----------------------------------------------------------------------
global sym(vp8_dequantize_b_impl_mmx)
sym(vp8_dequantize_b_impl_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ; rsi = sq
    mov         rdi, arg(1)                 ; rdi = dq
    mov         rax, arg(2)                 ; rax = q

    ; Four unrolled passes; each handles one quadword, i.e. four
    ; 16-bit coefficients, at byte offsets 0, 8, 16 and 24.
%assign DQB_OFFSET 0
%rep 4
    movq        mm1, [rsi+DQB_OFFSET]       ; mm1  = sq[4k .. 4k+3]
    pmullw      mm1, [rax+DQB_OFFSET]       ; mm1 *= q[4k .. 4k+3] (low words kept)
    movq        [rdi+DQB_OFFSET], mm1       ; dq[4k .. 4k+3] = mm1
%assign DQB_OFFSET DQB_OFFSET+8
%endrep
%undef DQB_OFFSET

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
; void vp8_dequant_idct_add_mmx(short *input, short *dq,
;                               unsigned char *pred, unsigned char *dest,
;                               int pitch, int stride)
;
; Dequantises a 4x4 block of 16-bit coefficients, runs a two-pass 4x4
; inverse transform on it, adds the result to a 4x4 block of predictor
; bytes and stores the clamped sum.
;
;   arg(0) input  : 16 coefficients (32 bytes); read, then overwritten
;                   with zeros by this routine
;   arg(1) dq     : 16 dequantisation factors (32 bytes, read only)
;   arg(2) pred   : predictor rows, 4 bytes read from each of 4 rows
;   arg(3) dest   : output rows, 4 bytes written to each of 4 rows
;   arg(4) pitch  : byte distance between predictor rows (sign-extended)
;   arg(5) stride : byte distance between output rows (sign-extended)
;
; Registers: rax, rdx, rsi, rdi and mm0-mm7 are used; rsi/rdi are saved
; and restored. rbx is handed to GET_GOT / RESTORE_GOT.
; NOTE(review): no emms is executed here - presumably the caller is
; responsible for clearing MMX state; confirm against the call sites.
;-----------------------------------------------------------------------
global sym(vp8_dequant_idct_add_mmx)
sym(vp8_dequant_idct_add_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rax, arg(0)                 ; input
    mov         rdx, arg(1)                 ; dq

    ; Dequantise: mm0..mm3 = rows 0..3 of input * dq (low 16 bits).
    movq        mm0, [rax]
    pmullw      mm0, [rdx]

    movq        mm1, [rax+8]
    pmullw      mm1, [rdx+8]

    movq        mm2, [rax+16]
    pmullw      mm2, [rdx+16]

    movq        mm3, [rax+24]
    pmullw      mm3, [rdx+24]

    mov         rdx, arg(3)                 ; dest
    mov         rsi, arg(2)                 ; pred
    pxor        mm7, mm7

    ; Clear the coefficient buffer now that it has been consumed.
    movq        [rax], mm7
    movq        [rax+8], mm7

    movq        [rax+16], mm7
    movq        [rax+24], mm7

    movsxd      rax, dword ptr arg(4)       ; pitch
    movsxd      rdi, dword ptr arg(5)       ; stride

    ; ---- first 1-D pass -------------------------------------------
    ; pmulhw keeps the high word of a signed 16x16 product. The
    ; "paddw reg, <original>" after each pmulhw adds the input back:
    ; x_s1sqr2 is negative as a signed word, and x_c1sqr2less1 stores
    ; (factor - 1), so in both cases the sum is input * factor.
    psubw       mm0, mm2                    ; b1 = 0-2
    paddw       mm2, mm2

    movq        mm5, mm1
    paddw       mm2, mm0                    ; a1 = 0+2

    pmulhw      mm5, [GLOBAL(x_s1sqr2)]
    paddw       mm5, mm1                    ; ip1 * sin(pi/8) * sqrt(2)

    movq        mm7, mm3
    pmulhw      mm7, [GLOBAL(x_c1sqr2less1)]

    paddw       mm7, mm3                    ; ip3 * cos(pi/8) * sqrt(2)
    psubw       mm7, mm5                    ; c1

    movq        mm5, mm1
    movq        mm4, mm3

    pmulhw      mm5, [GLOBAL(x_c1sqr2less1)]
    paddw       mm5, mm1

    pmulhw      mm3, [GLOBAL(x_s1sqr2)]
    paddw       mm3, mm4

    paddw       mm3, mm5                    ; d1
    movq        mm6, mm2                    ; a1

    movq        mm4, mm0                    ; b1
    paddw       mm2, mm3                    ; 0

    paddw       mm4, mm7                    ; 1
    psubw       mm0, mm7                    ; 2

    psubw       mm6, mm3                    ; 3

    ; ---- transpose 4x4 words --------------------------------------
    movq        mm1, mm2                    ; 03 02 01 00
    movq        mm3, mm4                    ; 23 22 21 20

    punpcklwd   mm1, mm0                    ; 11 01 10 00
    punpckhwd   mm2, mm0                    ; 13 03 12 02

    punpcklwd   mm3, mm6                    ; 31 21 30 20
    punpckhwd   mm4, mm6                    ; 33 23 32 22

    movq        mm0, mm1                    ; 11 01 10 00
    movq        mm5, mm2                    ; 13 03 12 02

    punpckldq   mm0, mm3                    ; 30 20 10 00
    punpckhdq   mm1, mm3                    ; 31 21 11 01

    punpckldq   mm2, mm4                    ; 32 22 12 02
    punpckhdq   mm5, mm4                    ; 33 23 13 03

    movq        mm3, mm5                    ; 33 23 13 03

    ; ---- second 1-D pass, with rounding ---------------------------
    psubw       mm0, mm2                    ; b1 = 0-2
    paddw       mm2, mm2

    movq        mm5, mm1
    paddw       mm2, mm0                    ; a1 = 0+2

    pmulhw      mm5, [GLOBAL(x_s1sqr2)]
    paddw       mm5, mm1                    ; ip1 * sin(pi/8) * sqrt(2)

    movq        mm7, mm3
    pmulhw      mm7, [GLOBAL(x_c1sqr2less1)]

    paddw       mm7, mm3                    ; ip3 * cos(pi/8) * sqrt(2)
    psubw       mm7, mm5                    ; c1

    movq        mm5, mm1
    movq        mm4, mm3

    pmulhw      mm5, [GLOBAL(x_c1sqr2less1)]
    paddw       mm5, mm1

    pmulhw      mm3, [GLOBAL(x_s1sqr2)]
    paddw       mm3, mm4

    paddw       mm3, mm5                    ; d1

    ; Add the rounding bias (4) to a1 and b1 once, so all four
    ; outputs below carry it into the ">> 3".
    paddw       mm0, [GLOBAL(fours)]

    paddw       mm2, [GLOBAL(fours)]
    movq        mm6, mm2                    ; a1

    movq        mm4, mm0                    ; b1
    paddw       mm2, mm3                    ; 0

    paddw       mm4, mm7                    ; 1
    psubw       mm0, mm7                    ; 2

    psubw       mm6, mm3                    ; 3
    psraw       mm2, 3

    psraw       mm0, 3
    psraw       mm4, 3

    psraw       mm6, 3

    ; ---- transpose back so each register holds one output row -----
    movq        mm1, mm2                    ; 03 02 01 00
    movq        mm3, mm4                    ; 23 22 21 20

    punpcklwd   mm1, mm0                    ; 11 01 10 00
    punpckhwd   mm2, mm0                    ; 13 03 12 02

    punpcklwd   mm3, mm6                    ; 31 21 30 20
    punpckhwd   mm4, mm6                    ; 33 23 32 22

    movq        mm0, mm1                    ; 11 01 10 00
    movq        mm5, mm2                    ; 13 03 12 02

    punpckldq   mm0, mm3                    ; 30 20 10 00
    punpckhdq   mm1, mm3                    ; 31 21 11 01

    punpckldq   mm2, mm4                    ; 32 22 12 02
    punpckhdq   mm5, mm4                    ; 33 23 13 03

    ; ---- add predictor, clamp to bytes, store ---------------------
    ; Per row: widen 4 predictor bytes to words, add with signed
    ; saturation, pack with unsigned saturation, store 4 bytes.
    pxor        mm7, mm7

    movd        mm4, [rsi]                  ; row 0
    punpcklbw   mm4, mm7
    paddsw      mm0, mm4
    packuswb    mm0, mm7
    movd        [rdx], mm0

    movd        mm4, [rsi+rax]              ; row 1
    punpcklbw   mm4, mm7
    paddsw      mm1, mm4
    packuswb    mm1, mm7
    movd        [rdx+rdi], mm1

    movd        mm4, [rsi+2*rax]            ; row 2
    punpcklbw   mm4, mm7
    paddsw      mm2, mm4
    packuswb    mm2, mm7
    movd        [rdx+rdi*2], mm2

    ; Step both pointers one row so "+2*pitch/stride" reaches row 3.
    add         rdx, rdi
    add         rsi, rax

    movd        mm4, [rsi+2*rax]            ; row 3
    punpcklbw   mm4, mm7
    paddsw      mm5, mm4
    packuswb    mm5, mm7
    movd        [rdx+rdi*2], mm5

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
; void vp8_dequant_dc_idct_add_mmx(short *input, short *dq,
;                                  unsigned char *pred, unsigned char *dest,
;                                  int pitch, int stride, int Dc)
;
; Dequantises a 4x4 block of 16-bit coefficients, replaces coefficient 0
; with the low 16 bits of Dc, runs a two-pass 4x4 inverse transform,
; adds the result to a 4x4 block of predictor bytes and stores the
; clamped sum.
;
;   arg(0) input  : 16 coefficients (32 bytes); read, then overwritten
;                   with zeros by this routine
;   arg(1) dq     : 16 dequantisation factors (32 bytes, read only)
;   arg(2) pred   : predictor rows, 4 bytes read from each of 4 rows
;   arg(3) dest   : output rows, 4 bytes written to each of 4 rows
;   arg(4) pitch  : byte distance between predictor rows (sign-extended)
;   arg(5) stride : byte distance between output rows (sign-extended)
;   arg(6) Dc     : only the low 16 bits are used, as coefficient 0
;
; Registers: rax, rcx, rdx, rsi, rdi and mm0-mm7 are used; rsi/rdi are
; saved and restored. rbx is handed to GET_GOT / RESTORE_GOT.
; NOTE(review): no emms is executed here - presumably the caller is
; responsible for clearing MMX state; confirm against the call sites.
;-----------------------------------------------------------------------
global sym(vp8_dequant_dc_idct_add_mmx)
sym(vp8_dequant_dc_idct_add_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rax, arg(0)                 ; input
    mov         rdx, arg(1)                 ; dq

    ; Dequantise: mm0..mm3 = rows 0..3 of input * dq (low 16 bits).
    movq        mm0, [rax]
    pmullw      mm0, [rdx]

    movq        mm1, [rax+8]
    pmullw      mm1, [rdx+8]

    movq        mm2, [rax+16]
    pmullw      mm2, [rdx+16]

    movq        mm3, [rax+24]
    pmullw      mm3, [rdx+24]

    mov         rdx, arg(3)                 ; dest
    mov         rsi, arg(2)                 ; pred
    pxor        mm7, mm7

    ; Clear the coefficient buffer now that it has been consumed.
    movq        [rax], mm7
    movq        [rax+8], mm7

    movq        [rax+16], mm7
    movq        [rax+24], mm7

    ; move lower word of Dc to lower word of mm0:
    ; shifting right then left by 16 clears the low word, and the
    ; zero-extended Dc word is then OR-ed into that slot.
    psrlq       mm0, 16
    movzx       rcx, word ptr arg(6)        ; Dc
    psllq       mm0, 16
    movq        mm7, rcx
    por         mm0, mm7

    movsxd      rax, dword ptr arg(4)       ; pitch
    movsxd      rdi, dword ptr arg(5)       ; stride

    ; ---- first 1-D pass -------------------------------------------
    ; pmulhw keeps the high word of a signed 16x16 product. The
    ; "paddw reg, <original>" after each pmulhw adds the input back:
    ; x_s1sqr2 is negative as a signed word, and x_c1sqr2less1 stores
    ; (factor - 1), so in both cases the sum is input * factor.
    psubw       mm0, mm2                    ; b1 = 0-2
    paddw       mm2, mm2

    movq        mm5, mm1
    paddw       mm2, mm0                    ; a1 = 0+2

    pmulhw      mm5, [GLOBAL(x_s1sqr2)]
    paddw       mm5, mm1                    ; ip1 * sin(pi/8) * sqrt(2)

    movq        mm7, mm3
    pmulhw      mm7, [GLOBAL(x_c1sqr2less1)]

    paddw       mm7, mm3                    ; ip3 * cos(pi/8) * sqrt(2)
    psubw       mm7, mm5                    ; c1

    movq        mm5, mm1
    movq        mm4, mm3

    pmulhw      mm5, [GLOBAL(x_c1sqr2less1)]
    paddw       mm5, mm1

    pmulhw      mm3, [GLOBAL(x_s1sqr2)]
    paddw       mm3, mm4

    paddw       mm3, mm5                    ; d1
    movq        mm6, mm2                    ; a1

    movq        mm4, mm0                    ; b1
    paddw       mm2, mm3                    ; 0

    paddw       mm4, mm7                    ; 1
    psubw       mm0, mm7                    ; 2

    psubw       mm6, mm3                    ; 3

    ; ---- transpose 4x4 words --------------------------------------
    movq        mm1, mm2                    ; 03 02 01 00
    movq        mm3, mm4                    ; 23 22 21 20

    punpcklwd   mm1, mm0                    ; 11 01 10 00
    punpckhwd   mm2, mm0                    ; 13 03 12 02

    punpcklwd   mm3, mm6                    ; 31 21 30 20
    punpckhwd   mm4, mm6                    ; 33 23 32 22

    movq        mm0, mm1                    ; 11 01 10 00
    movq        mm5, mm2                    ; 13 03 12 02

    punpckldq   mm0, mm3                    ; 30 20 10 00
    punpckhdq   mm1, mm3                    ; 31 21 11 01

    punpckldq   mm2, mm4                    ; 32 22 12 02
    punpckhdq   mm5, mm4                    ; 33 23 13 03

    movq        mm3, mm5                    ; 33 23 13 03

    ; ---- second 1-D pass, with rounding ---------------------------
    psubw       mm0, mm2                    ; b1 = 0-2
    paddw       mm2, mm2

    movq        mm5, mm1
    paddw       mm2, mm0                    ; a1 = 0+2

    pmulhw      mm5, [GLOBAL(x_s1sqr2)]
    paddw       mm5, mm1                    ; ip1 * sin(pi/8) * sqrt(2)

    movq        mm7, mm3
    pmulhw      mm7, [GLOBAL(x_c1sqr2less1)]

    paddw       mm7, mm3                    ; ip3 * cos(pi/8) * sqrt(2)
    psubw       mm7, mm5                    ; c1

    movq        mm5, mm1
    movq        mm4, mm3

    pmulhw      mm5, [GLOBAL(x_c1sqr2less1)]
    paddw       mm5, mm1

    pmulhw      mm3, [GLOBAL(x_s1sqr2)]
    paddw       mm3, mm4

    paddw       mm3, mm5                    ; d1

    ; Add the rounding bias (4) to a1 and b1 once, so all four
    ; outputs below carry it into the ">> 3".
    paddw       mm0, [GLOBAL(fours)]

    paddw       mm2, [GLOBAL(fours)]
    movq        mm6, mm2                    ; a1

    movq        mm4, mm0                    ; b1
    paddw       mm2, mm3                    ; 0

    paddw       mm4, mm7                    ; 1
    psubw       mm0, mm7                    ; 2

    psubw       mm6, mm3                    ; 3
    psraw       mm2, 3

    psraw       mm0, 3
    psraw       mm4, 3

    psraw       mm6, 3

    ; ---- transpose back so each register holds one output row -----
    movq        mm1, mm2                    ; 03 02 01 00
    movq        mm3, mm4                    ; 23 22 21 20

    punpcklwd   mm1, mm0                    ; 11 01 10 00
    punpckhwd   mm2, mm0                    ; 13 03 12 02

    punpcklwd   mm3, mm6                    ; 31 21 30 20
    punpckhwd   mm4, mm6                    ; 33 23 32 22

    movq        mm0, mm1                    ; 11 01 10 00
    movq        mm5, mm2                    ; 13 03 12 02

    punpckldq   mm0, mm3                    ; 30 20 10 00
    punpckhdq   mm1, mm3                    ; 31 21 11 01

    punpckldq   mm2, mm4                    ; 32 22 12 02
    punpckhdq   mm5, mm4                    ; 33 23 13 03

    ; ---- add predictor, clamp to bytes, store ---------------------
    ; Per row: widen 4 predictor bytes to words, add with signed
    ; saturation, pack with unsigned saturation, store 4 bytes.
    pxor        mm7, mm7

    movd        mm4, [rsi]                  ; row 0
    punpcklbw   mm4, mm7
    paddsw      mm0, mm4
    packuswb    mm0, mm7
    movd        [rdx], mm0

    movd        mm4, [rsi+rax]              ; row 1
    punpcklbw   mm4, mm7
    paddsw      mm1, mm4
    packuswb    mm1, mm7
    movd        [rdx+rdi], mm1

    movd        mm4, [rsi+2*rax]            ; row 2
    punpcklbw   mm4, mm7
    paddsw      mm2, mm4
    packuswb    mm2, mm7
    movd        [rdx+rdi*2], mm2

    ; Step both pointers one row so "+2*pitch/stride" reaches row 3.
    add         rdx, rdi
    add         rsi, rax

    movd        mm4, [rsi+2*rax]            ; row 3
    punpcklbw   mm4, mm7
    paddsw      mm5, mm4
    packuswb    mm5, mm7
    movd        [rdx+rdi*2], mm5

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
; Read-only constants for the inverse-transform routines. Each label
; holds four identical 16-bit words (one quadword), aligned to 16 bytes.
SECTION_RODATA

; 0x8A8C = 35468, approximately sin(pi/8) * sqrt(2) * 65536.
; As a signed word this is negative (35468 - 65536), so pmulhw with it
; yields roughly x*0.5412 - x.
align 16
x_s1sqr2:
    dw 0x8A8C, 0x8A8C, 0x8A8C, 0x8A8C

; 0x4E7B = 20091, approximately (cos(pi/8) * sqrt(2) - 1) * 65536.
align 16
x_c1sqr2less1:
    dw 0x4E7B, 0x4E7B, 0x4E7B, 0x4E7B

; The value 4 in every word lane.
align 16
fours:
    dw 0x0004, 0x0004, 0x0004, 0x0004