2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
12 %include "vpx_ports/x86_abi_support.asm"
14 ; /****************************************************************************
17 ; * This implementation makes use of 16 bit fixed point versions of two multiply constants:
19 ; * 1. sqrt(2) * cos (pi/8)
20 ; * 2. sqrt(2) * sin (pi/8)
21 ; * Because the first constant is bigger than 1, to maintain the same 16 bit
22 ; * fixed point precision as the second one, we use a trick of
23 ; * x * a = x + x*(a-1)
25 ; * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
27 ; * For the second constant, because the 16 bit version is 35468, which
28 ; * is bigger than 32768, in a signed 16 bit multiply it becomes a negative number:
30 ; * (x * (unsigned)35468 >> 16) = x * (signed)35468 >> 16 + x
32 ; **************************************************************************/
35 ;void short_idct4x4llm_mmx(short *input, short *output, int pitch)
; Two-pass 4x4 inverse transform on 16-bit words held in MMX registers.
; Each pass forms a1/b1 from elements 0 and 2, scales elements 1 and 3 by the
; two fixed-point constants, and is followed by a 4x4 word transpose.
;
; NOTE(review): every statement in this listing starts with its original file
; line number, and the gaps in that numbering show that instructions between
; the visible ones are not present here (no loads into mm0..mm3, no
; prologue/epilogue, no ret are visible). The comments below describe only
; the instructions that can be seen.
36 global sym
(vp8_short_idct4x4llm_mmx
)
37 sym
(vp8_short_idct4x4llm_mmx
):
; Macro from the included x86_abi_support.asm, invoked with 3, which matches
; the number of parameters in the signature above.
40 SHADOW_ARGS_TO_STACK
3
44 mov rax
, arg
(0) ;input
45 mov rdx
, arg
(1) ;output
; rax is reused: from here on it holds pitch (sign-extended from 32 bits),
; so the input pointer is no longer available in rax.
53 movsxd rax
, dword ptr arg
(2) ;pitch
; ---- first pass ----
; mm0 becomes b1 (element 0 minus element 2).
55 psubw mm0
, mm2
; b1= 0-2
; mm2 becomes a1. NOTE(review): for mm2 + mm0 to equal "0+2" an intermediate
; adjustment of mm2 (original line 56..58) must exist; it is not visible here.
59 paddw mm2
, mm0
; a1 =0+2
; pmulhw keeps the high 16 bits of the signed product, i.e. (x * c) >> 16.
; Multiplying by x_s1sqr2 and then adding x back is the
; "x * (signed)c >> 16 + x" correction from the header comment.
; NOTE(review): mm5 is presumably a copy of mm1 at this point - the copy is
; not visible in this listing.
61 pmulhw mm5
, [GLOBAL(x_s1sqr2
)] ;
62 paddw mm5
, mm1
; ip1 * sin(pi/8) * sqrt(2)
; Multiplying by (constant - 1) and adding x back is the
; "x * a = x + x*(a-1)" trick from the header comment.
; NOTE(review): mm7 is presumably a copy of mm3 - the copy is not visible.
65 pmulhw mm7
, [GLOBAL(x_c1sqr2less1
)] ;
67 paddw mm7
, mm3
; ip3 * cos(pi/8) * sqrt(2)
; The same two constants applied with the roles swapped (mm5 by the cos
; constant, mm3 by the sin constant). The matching "add x back" steps are not
; visible in this listing.
73 pmulhw mm5
, [GLOBAL(x_c1sqr2less1
)]
76 pmulhw mm3
, [GLOBAL(x_s1sqr2
)]
; ---- transpose of the 4x4 block of words ----
; The two-digit pairs in the comments are element indices, listed from the
; high word to the low word of the register.
; Keep copies of mm2 and mm4 because the unpacks below overwrite them.
90 movq mm1
, mm2
; 03 02 01 00
91 movq mm3
, mm4
; 23 22 21 20
; Interleave words of rows 0/1 and rows 2/3.
93 punpcklwd mm1
, mm0
; 11 01 10 00
94 punpckhwd mm2
, mm0
; 13 03 12 02
96 punpcklwd mm3
, mm6
; 31 21 30 20
97 punpckhwd mm4
, mm6
; 33 23 32 22
; Keep copies again, then interleave dwords to finish the transpose.
99 movq mm0
, mm1
; 11 01 10 00
100 movq mm5
, mm2
; 13 03 12 02
102 punpckldq mm0
, mm3
; 30 20 10 00
103 punpckhdq mm1
, mm3
; 31 21 11 01
105 punpckldq mm2
, mm4
; 32 22 12 02
106 punpckhdq mm5
, mm4
; 33 23 13 03
; Move the last transposed vector into mm3 so mm0..mm3 hold the four vectors
; in the register layout the first pass used.
108 movq mm3
, mm5
; 33 23 13 03
; ---- second pass: same visible instruction sequence as the first ----
110 psubw mm0
, mm2
; b1= 0-2
114 paddw mm2
, mm0
; a1 =0+2
116 pmulhw mm5
, [GLOBAL(x_s1sqr2
)] ;
117 paddw mm5
, mm1
; ip1 * sin(pi/8) * sqrt(2)
120 pmulhw mm7
, [GLOBAL(x_c1sqr2less1
)] ;
122 paddw mm7
, mm3
; ip3 * cos(pi/8) * sqrt(2)
128 pmulhw mm5
, [GLOBAL(x_c1sqr2less1
)]
131 pmulhw mm3
, [GLOBAL(x_s1sqr2
)]
; Only the second pass adds the `fours` constant to b1 (mm0) and a1 (mm2).
; NOTE(review): presumably a rounding bias ahead of a right shift that is not
; visible in this listing - confirm against the full file.
135 paddw mm0
, [GLOBAL(fours
)]
137 paddw mm2
, [GLOBAL(fours
)]
; ---- second transpose: same unpack sequence as above ----
154 movq mm1
, mm2
; 03 02 01 00
155 movq mm3
, mm4
; 23 22 21 20
157 punpcklwd mm1
, mm0
; 11 01 10 00
158 punpckhwd mm2
, mm0
; 13 03 12 02
160 punpcklwd mm3
, mm6
; 31 21 30 20
161 punpckhwd mm4
, mm6
; 33 23 32 22
163 movq mm0
, mm1
; 11 01 10 00
164 movq mm5
, mm2
; 13 03 12 02
166 punpckldq mm0
, mm3
; 30 20 10 00
167 punpckhdq mm1
, mm3
; 31 21 11 01
169 punpckldq mm2
, mm4
; 32 22 12 02
170 punpckhdq mm5
, mm4
; 33 23 13 03
; ---- stores: 8 bytes (four words) per movq, addressed from rdx (output)
; with rax (pitch) as the index.
; NOTE(review): both visible stores use the same address expression, so rdx
; is presumably advanced between them by an instruction not shown here; the
; stores of mm0 and mm1 are not visible either.
175 movq
[rdx
+rax
*2], mm2
178 movq
[rdx
+rax
*2], mm5
187 ;void short_idct4x4llm_1_mmx(short *input, short *output, int pitch)
; Variant that writes one and the same MMX register (mm0) to the output rows
; after adding the `fours` constant to it.
;
; NOTE(review): this is a partial listing (see the gaps in the embedded line
; numbers). How mm0 is loaded from input, any shift/broadcast applied to it,
; the remaining row stores and the epilogue are not visible here.
188 global sym
(vp8_short_idct4x4llm_1_mmx
)
189 sym
(vp8_short_idct4x4llm_1_mmx
):
; Macro from the included x86_abi_support.asm, invoked with 3, which matches
; the number of parameters in the signature above.
192 SHADOW_ARGS_TO_STACK
3
196 mov rax
, arg
(0) ;input
; Add the `fours` constant to every word of mm0.
; NOTE(review): presumably a rounding bias ahead of a shift that is not
; visible in this listing - confirm.
199 paddw mm0
, [GLOBAL(fours
)]
200 mov rdx
, arg
(1) ;output
; rax is reused: it now holds pitch (sign-extended from 32 bits) instead of
; the input pointer.
203 movsxd rax
, dword ptr arg
(2) ;pitch
; Each movq stores 8 bytes (four words) of mm0 at output + 2*pitch.
; NOTE(review): both visible stores use the same address expression, so rdx
; is presumably advanced between them by an instruction not shown here.
211 movq
[rdx
+rax
*2], mm0
214 movq
[rdx
+rax
*2], mm0
223 ;void vp8_dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr, unsigned char *dst_ptr, int pitch, int stride)
; Takes a single DC value plus a prediction and a destination byte buffer.
; Visible work: load input_dc, add `fours`, read 4-byte groups from the
; prediction buffer, saturate to unsigned bytes and write 4-byte groups to
; the destination buffer.
;
; NOTE(review): this is a partial listing (see the gaps in the embedded line
; numbers). The instructions that combine the DC value with the prediction
; pixels, the pointer increments, the first two row loads/stores and the
; epilogue are not visible here.
224 global sym
(vp8_dc_only_idct_add_mmx
)
225 sym
(vp8_dc_only_idct_add_mmx
):
; Macro from the included x86_abi_support.asm, invoked with 5, which matches
; the number of parameters in the signature above.
228 SHADOW_ARGS_TO_STACK
5
; NOTE(review): rsi and rdi are callee-saved under the Microsoft x64 ABI;
; their save/restore is not visible in this listing - confirm it exists.
234 mov rsi
, arg
(1) ;s -- prediction
235 mov rdi
, arg
(2) ;d -- destination
; stride (rax) indexes the destination and pitch (rdx) indexes the
; prediction, as the address expressions below show. Both are sign-extended
; from 32 bits.
236 movsxd rax
, dword ptr arg
(4) ;stride
237 movsxd rdx
, dword ptr arg
(3) ;pitch
; movd loads 32 bits from the argument slot into the low dword of mm5.
240 movd mm5
, arg
(0) ;input_dc
; NOTE(review): presumably a rounding bias ahead of a shift that is not
; visible in this listing - confirm.
242 paddw mm5
, [GLOBAL(fours
)]
; packuswb converts signed words to unsigned bytes with saturation (0..255).
; NOTE(review): mm0 is presumably zero here so that the upper half of each
; packed result is zero - the instruction that clears it is not visible.
252 packuswb mm1
, mm0
; pack and unpack to saturate
258 packuswb mm2
, mm0
; pack and unpack to saturate
; Load 4 prediction bytes from pred + 2*pitch.
261 movd mm3
, [rsi
+2*rdx
]
264 packuswb mm3
, mm0
; pack and unpack to saturate
; Store 4 result bytes at dst + 2*stride.
265 movd
[rdi
+2*rax
], mm3
; NOTE(review): same address expressions as the previous load/store pair, so
; rsi and rdi are presumably advanced in between by instructions not shown.
269 movd mm4
, [rsi
+2*rdx
]
272 packuswb mm4
, mm0
; pack and unpack to saturate
273 movd
[rdi
+2*rax
], mm4