2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
12 %include "vpx_ports/x86_abi_support.asm"
15 ;void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q)
; Multiplies packed 16-bit words in mm1 by the words stored at [rax+0],
; [rax+8], [rax+16] and [rax+24]: four 8-byte groups, 16 words in total.
; NOTE(review): presumably rax holds q and mm1 is reloaded from sq and written
; to dq around each multiply -- confirm against the surrounding loads/stores.
16 global sym
(vp8_dequantize_b_impl_mmx
)
17 sym
(vp8_dequantize_b_impl_mmx
):
; The macro argument matches the number of C parameters in the signature above.
20 SHADOW_ARGS_TO_STACK
3
30 pmullw mm1
, [rax
+0] ; mm1 *= words 0-3 at [rax+0]; pmullw keeps the low 16 bits of each product
34 pmullw mm1
, [rax
+8] ; mm1 *= words 4-7 at [rax+8]
38 pmullw mm1
, [rax
+16] ; mm1 *= words 8-11 at [rax+16]
42 pmullw mm1
, [rax
+24] ; mm1 *= words 12-15 at [rax+24]
53 ;void vp8_dequant_idct_add_mmx(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride)
; Runs the same butterfly sequence twice, each followed by a 4x4 word
; transpose done with punpck* instructions; the second pass also adds the
; 'fours' constant. The final movd instructions move 32 bits at a time,
; indexed by pitch (loads) and stride (stores).
54 global sym
(vp8_dequant_idct_add_mmx
)
55 sym
(vp8_dequant_idct_add_mmx
):
; The macro argument matches the number of C parameters in the signature above.
58 SHADOW_ARGS_TO_STACK
6
64 mov rax
, arg
(0) ;input
; rax is reused: it held input above and now receives the sign-extended pitch.
92 movsxd rax
, dword ptr arg
(4) ;pitch
93 movsxd rdi
, dword ptr arg
(5) ;stride
; ---- first pass ----
95 psubw mm0
, mm2
; b1= 0-2
99 paddw mm2
, mm0
; a1 =0+2
; pmulhw keeps the high 16 bits of each signed 16x16 product; the paddw after
; it adds another register on top.
; NOTE(review): presumably that register holds the unscaled input, so each
; pair yields input * (1 + constant/65536) -- confirm against the constants.
101 pmulhw mm5
, [GLOBAL(x_s1sqr2
)];
102 paddw mm5
, mm1
; ip1 * sin(pi/8) * sqrt(2)
105 pmulhw mm7
, [GLOBAL(x_c1sqr2less1
)];
107 paddw mm7
, mm3
; ip3 * cos(pi/8) * sqrt(2)
113 pmulhw mm5
, [GLOBAL(x_c1sqr2less1
)]
116 pmulhw mm3
, [GLOBAL(x_s1sqr2
)]
; Transpose the 4x4 block of words: word unpacks interleave pairs of rows,
; then dword unpacks gather the columns. The trailing comments list element
; indices (row, column) with the highest lane first.
130 movq mm1
, mm2
; 03 02 01 00
131 movq mm3
, mm4
; 23 22 21 20
133 punpcklwd mm1
, mm0
; 11 01 10 00
134 punpckhwd mm2
, mm0
; 13 03 12 02
136 punpcklwd mm3
, mm6
; 31 21 30 20
137 punpckhwd mm4
, mm6
; 33 23 32 22
139 movq mm0
, mm1
; 11 01 10 00
140 movq mm5
, mm2
; 13 03 12 02
142 punpckldq mm0
, mm3
; 30 20 10 00
143 punpckhdq mm1
, mm3
; 31 21 11 01
145 punpckldq mm2
, mm4
; 32 22 12 02
146 punpckhdq mm5
, mm4
; 33 23 13 03
148 movq mm3
, mm5
; 33 23 13 03
; ---- second pass: the same butterfly sequence on the transposed data ----
150 psubw mm0
, mm2
; b1= 0-2
154 paddw mm2
, mm0
; a1 =0+2
156 pmulhw mm5
, [GLOBAL(x_s1sqr2
)];
157 paddw mm5
, mm1
; ip1 * sin(pi/8) * sqrt(2)
160 pmulhw mm7
, [GLOBAL(x_c1sqr2less1
)];
162 paddw mm7
, mm3
; ip3 * cos(pi/8) * sqrt(2)
168 pmulhw mm5
, [GLOBAL(x_c1sqr2less1
)]
171 pmulhw mm3
, [GLOBAL(x_s1sqr2
)]
; NOTE(review): 'fours' is presumably a rounding bias applied ahead of a
; right shift by 3 -- confirm against the shift instructions.
175 paddw mm0
, [GLOBAL(fours
)]
177 paddw mm2
, [GLOBAL(fours
)]
; Second transpose, same unpack pattern as the first.
194 movq mm1
, mm2
; 03 02 01 00
195 movq mm3
, mm4
; 23 22 21 20
197 punpcklwd mm1
, mm0
; 11 01 10 00
198 punpckhwd mm2
, mm0
; 13 03 12 02
200 punpcklwd mm3
, mm6
; 31 21 30 20
201 punpckhwd mm4
, mm6
; 33 23 32 22
203 movq mm0
, mm1
; 11 01 10 00
204 movq mm5
, mm2
; 13 03 12 02
206 punpckldq mm0
, mm3
; 30 20 10 00
207 punpckhdq mm1
, mm3
; 31 21 11 01
209 punpckldq mm2
, mm4
; 32 22 12 02
210 punpckhdq mm5
, mm4
; 33 23 13 03
; movd moves 32 bits. rax = pitch and rdi = stride (sign-extended above), so
; these touch the row two pitches into [rsi] and two strides into [rdx].
; NOTE(review): rsi and rdx are presumably pred and dest -- confirm.
226 movd mm4
, [rsi
+2*rax
]
230 movd
[rdx
+rdi
*2], mm2
235 movd mm4
, [rsi
+2*rax
]
239 movd
[rdx
+rdi
*2], mm5
250 ;void vp8_dequant_dc_idct_add_mmx(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int Dc)
; Same two-pass butterfly/transpose structure as vp8_dequant_idct_add_mmx,
; with an extra Dc argument whose low 16 bits are read from the stack.
; Loads are indexed from pred (rsi) by pitch and stores go to dest (rdx)
; by stride, 32 bits at a time.
251 global sym
(vp8_dequant_dc_idct_add_mmx
)
252 sym
(vp8_dequant_dc_idct_add_mmx
):
; The macro argument matches the number of C parameters in the signature above.
255 SHADOW_ARGS_TO_STACK
7
261 mov rax
, arg
(0) ;input
276 mov rdx
, arg
(3) ;dest
277 mov rsi
, arg
(2) ;pred
287 ; move lower word of Dc to lower word of mm0
; movzx zero-extends the low 16 bits of Dc into rcx.
289 movzx rcx
, word ptr arg
(6) ;Dc
; rax is reused: it held input above and now receives the sign-extended pitch.
294 movsxd rax
, dword ptr arg
(4) ;pitch
295 movsxd rdi
, dword ptr arg
(5) ;stride
; ---- first pass ----
297 psubw mm0
, mm2
; b1= 0-2
301 paddw mm2
, mm0
; a1 =0+2
; pmulhw keeps the high 16 bits of each signed 16x16 product; the paddw after
; it adds another register on top.
; NOTE(review): presumably that register holds the unscaled input, so each
; pair yields input * (1 + constant/65536) -- confirm against the constants.
303 pmulhw mm5
, [GLOBAL(x_s1sqr2
)];
304 paddw mm5
, mm1
; ip1 * sin(pi/8) * sqrt(2)
307 pmulhw mm7
, [GLOBAL(x_c1sqr2less1
)];
309 paddw mm7
, mm3
; ip3 * cos(pi/8) * sqrt(2)
315 pmulhw mm5
, [GLOBAL(x_c1sqr2less1
)]
318 pmulhw mm3
, [GLOBAL(x_s1sqr2
)]
; Transpose the 4x4 block of words: word unpacks interleave pairs of rows,
; then dword unpacks gather the columns. The trailing comments list element
; indices (row, column) with the highest lane first.
332 movq mm1
, mm2
; 03 02 01 00
333 movq mm3
, mm4
; 23 22 21 20
335 punpcklwd mm1
, mm0
; 11 01 10 00
336 punpckhwd mm2
, mm0
; 13 03 12 02
338 punpcklwd mm3
, mm6
; 31 21 30 20
339 punpckhwd mm4
, mm6
; 33 23 32 22
341 movq mm0
, mm1
; 11 01 10 00
342 movq mm5
, mm2
; 13 03 12 02
344 punpckldq mm0
, mm3
; 30 20 10 00
345 punpckhdq mm1
, mm3
; 31 21 11 01
347 punpckldq mm2
, mm4
; 32 22 12 02
348 punpckhdq mm5
, mm4
; 33 23 13 03
350 movq mm3
, mm5
; 33 23 13 03
; ---- second pass: the same butterfly sequence on the transposed data ----
352 psubw mm0
, mm2
; b1= 0-2
356 paddw mm2
, mm0
; a1 =0+2
358 pmulhw mm5
, [GLOBAL(x_s1sqr2
)];
359 paddw mm5
, mm1
; ip1 * sin(pi/8) * sqrt(2)
362 pmulhw mm7
, [GLOBAL(x_c1sqr2less1
)];
364 paddw mm7
, mm3
; ip3 * cos(pi/8) * sqrt(2)
370 pmulhw mm5
, [GLOBAL(x_c1sqr2less1
)]
373 pmulhw mm3
, [GLOBAL(x_s1sqr2
)]
; NOTE(review): 'fours' is presumably a rounding bias applied ahead of a
; right shift by 3 -- confirm against the shift instructions.
377 paddw mm0
, [GLOBAL(fours
)]
379 paddw mm2
, [GLOBAL(fours
)]
; Second transpose, same unpack pattern as the first.
396 movq mm1
, mm2
; 03 02 01 00
397 movq mm3
, mm4
; 23 22 21 20
399 punpcklwd mm1
, mm0
; 11 01 10 00
400 punpckhwd mm2
, mm0
; 13 03 12 02
402 punpcklwd mm3
, mm6
; 31 21 30 20
403 punpckhwd mm4
, mm6
; 33 23 32 22
405 movq mm0
, mm1
; 11 01 10 00
406 movq mm5
, mm2
; 13 03 12 02
408 punpckldq mm0
, mm3
; 30 20 10 00
409 punpckhdq mm1
, mm3
; 31 21 11 01
411 punpckldq mm2
, mm4
; 32 22 12 02
412 punpckhdq mm5
, mm4
; 33 23 13 03
; movd moves 32 bits. rsi = pred, rdx = dest, rax = pitch, rdi = stride
; (all loaded above), so these read the pred row two pitches on and write
; the dest row two strides on.
428 movd mm4
, [rsi
+2*rax
]
432 movd
[rdx
+rdi
*2], mm2
437 movd mm4
, [rsi
+2*rax
]
441 movd
[rdx
+rdi
*2], mm5