Merge "keep values in registers during quantization"
[libvpx.git] / vp8 / common / x86 / idctllm_mmx.asm
blob43735bc4b6f41521acdc42b0b75a9a91329c9e3a
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
12 %include "vpx_ports/x86_abi_support.asm"
; /****************************************************************************
; * Notes:
; *
; * This implementation makes use of 16-bit fixed-point versions of two
; * multiply constants:
; *         1.   sqrt(2) * cos (pi/8)
; *         2.   sqrt(2) * sin (pi/8)
; * Because the first constant is bigger than 1, to maintain the same 16-bit
; * fixed-point precision as the second one, we use the trick
; *         x * a = x + x * (a - 1)
; * so
; *         x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) * cos(pi/8) - 1).
; *
; * For the second constant, because its 16-bit version is 35468, which
; * is bigger than 32768, it becomes a negative number in a signed 16-bit
; * multiply, so x has to be added back afterwards:
; *         ((x * (unsigned)35468) >> 16) = ((x * (signed)35468) >> 16) + x
; *
; **************************************************************************/
;-----------------------------------------------------------------------------
; void vp8_short_idct4x4llm_mmx(short *input, short *output, int pitch)
;
; 4x4 inverse transform on 16-bit words using MMX. The same 1-D butterfly is
; applied twice, each time followed by a 4x4 word transpose. The second pass
; adds 4 to the even terms and arithmetic-shifts every result right by 3
; before the rows are stored.
;
;   input  - read as four 8-byte rows at input + 0, 8, 16 and 24
;   output - four 8-byte rows written at output + {0, 1, 2, 3} * pitch
;   pitch  - sign-extended from 32 bits and used directly as a byte offset
;
; Registers written: rax, rdx, mm0-mm7. No emms is executed here.
; NOTE(review): presumably the caller resets the MMX/x87 state - confirm.
; NOTE(review): sym/arg/GLOBAL and the prolog/epilog macros are presumably
;               supplied by vpx_ports/x86_abi_support.asm - confirm.
;
; Notation in the transpose comments: word lanes are listed from high to
; low, and "rc" means row r, column c.
;-----------------------------------------------------------------------------
global sym(vp8_short_idct4x4llm_mmx)
sym(vp8_short_idct4x4llm_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    GET_GOT     rbx
    ; end prolog

    mov         rax, arg(0)                     ; input
    mov         rdx, arg(1)                     ; output

    movq        mm0, [rax     ]                 ; ip0
    movq        mm1, [rax +  8]                 ; ip1
    movq        mm2, [rax + 16]                 ; ip2
    movq        mm3, [rax + 24]                 ; ip3

    ; input pointer is dead from here on; rax now carries the pitch
    movsxd      rax, dword ptr arg(2)           ; pitch

    ;---------------- first 1-D pass ----------------
    psubw       mm0, mm2                        ; b1 = ip0 - ip2
    paddw       mm2, mm2                        ; 2 * ip2

    movq        mm5, mm1
    paddw       mm2, mm0                        ; a1 = b1 + 2*ip2 = ip0 + ip2

    ; pmulhw keeps the high word of the signed product; adding the operand
    ; back completes the multiply for both constants (see the file notes).
    pmulhw      mm5, [GLOBAL(x_s1sqr2)]
    paddw       mm5, mm1                        ; ip1 * sin(pi/8) * sqrt(2)

    movq        mm7, mm3
    pmulhw      mm7, [GLOBAL(x_c1sqr2less1)]

    paddw       mm7, mm3                        ; ip3 * cos(pi/8) * sqrt(2)
    psubw       mm7, mm5                        ; c1 = ip3*cos - ip1*sin

    movq        mm5, mm1
    movq        mm4, mm3

    pmulhw      mm5, [GLOBAL(x_c1sqr2less1)]
    paddw       mm5, mm1                        ; ip1 * cos(pi/8) * sqrt(2)

    pmulhw      mm3, [GLOBAL(x_s1sqr2)]
    paddw       mm3, mm4                        ; ip3 * sin(pi/8) * sqrt(2)

    paddw       mm3, mm5                        ; d1 = ip1*cos + ip3*sin
    movq        mm6, mm2                        ; a1

    movq        mm4, mm0                        ; b1
    paddw       mm2, mm3                        ; a1 + d1  (row 0)

    ; The transpose below consumes the rows in the order mm2, mm0, mm4, mm6,
    ; so b1 - c1 (mm0) is row 1 and b1 + c1 (mm4) is row 2.
    paddw       mm4, mm7                        ; b1 + c1  (row 2)
    psubw       mm0, mm7                        ; b1 - c1  (row 1)

    psubw       mm6, mm3                        ; a1 - d1  (row 3)

    ;---------------- 4x4 word transpose ----------------
    movq        mm1, mm2                        ; 03 02 01 00
    movq        mm3, mm4                        ; 23 22 21 20

    punpcklwd   mm1, mm0                        ; 11 01 10 00
    punpckhwd   mm2, mm0                        ; 13 03 12 02

    punpcklwd   mm3, mm6                        ; 31 21 30 20
    punpckhwd   mm4, mm6                        ; 33 23 32 22

    movq        mm0, mm1                        ; 11 01 10 00
    movq        mm5, mm2                        ; 13 03 12 02

    punpckldq   mm0, mm3                        ; 30 20 10 00
    punpckhdq   mm1, mm3                        ; 31 21 11 01

    punpckldq   mm2, mm4                        ; 32 22 12 02
    punpckhdq   mm5, mm4                        ; 33 23 13 03

    ; put the fourth vector where the butterfly expects ip3
    movq        mm3, mm5                        ; 33 23 13 03

    ;---------------- second 1-D pass ----------------
    psubw       mm0, mm2                        ; b1 = ip0 - ip2
    paddw       mm2, mm2                        ; 2 * ip2

    movq        mm5, mm1
    paddw       mm2, mm0                        ; a1 = ip0 + ip2

    pmulhw      mm5, [GLOBAL(x_s1sqr2)]
    paddw       mm5, mm1                        ; ip1 * sin(pi/8) * sqrt(2)

    movq        mm7, mm3
    pmulhw      mm7, [GLOBAL(x_c1sqr2less1)]

    paddw       mm7, mm3                        ; ip3 * cos(pi/8) * sqrt(2)
    psubw       mm7, mm5                        ; c1 = ip3*cos - ip1*sin

    movq        mm5, mm1
    movq        mm4, mm3

    pmulhw      mm5, [GLOBAL(x_c1sqr2less1)]
    paddw       mm5, mm1                        ; ip1 * cos(pi/8) * sqrt(2)

    pmulhw      mm3, [GLOBAL(x_s1sqr2)]
    paddw       mm3, mm4                        ; ip3 * sin(pi/8) * sqrt(2)

    paddw       mm3, mm5                        ; d1 = ip1*cos + ip3*sin

    ; Adding 4 to a1 and b1 puts the rounding term into all four outputs,
    ; because each output is a1 +/- d1 or b1 +/- c1.
    paddw       mm0, [GLOBAL(fours)]            ; b1 + 4

    paddw       mm2, [GLOBAL(fours)]            ; a1 + 4
    movq        mm6, mm2                        ; a1

    movq        mm4, mm0                        ; b1
    paddw       mm2, mm3                        ; a1 + d1  (row 0)

    paddw       mm4, mm7                        ; b1 + c1  (row 2)
    psubw       mm0, mm7                        ; b1 - c1  (row 1)

    psubw       mm6, mm3                        ; a1 - d1  (row 3)

    psraw       mm2, 3                          ; (x + 4) >> 3 on every row
    psraw       mm0, 3
    psraw       mm4, 3
    psraw       mm6, 3

    ;---------------- 4x4 word transpose ----------------
    movq        mm1, mm2                        ; 03 02 01 00
    movq        mm3, mm4                        ; 23 22 21 20

    punpcklwd   mm1, mm0                        ; 11 01 10 00
    punpckhwd   mm2, mm0                        ; 13 03 12 02

    punpcklwd   mm3, mm6                        ; 31 21 30 20
    punpckhwd   mm4, mm6                        ; 33 23 32 22

    movq        mm0, mm1                        ; 11 01 10 00
    movq        mm5, mm2                        ; 13 03 12 02

    punpckldq   mm0, mm3                        ; 30 20 10 00
    punpckhdq   mm1, mm3                        ; 31 21 11 01

    punpckldq   mm2, mm4                        ; 32 22 12 02
    punpckhdq   mm5, mm4                        ; 33 23 13 03

    ;---------------- store four output rows ----------------
    movq        [rdx],         mm0              ; output + 0 * pitch

    movq        [rdx + rax],   mm1              ; output + 1 * pitch
    movq        [rdx + rax*2], mm2              ; output + 2 * pitch

    add         rdx, rax                        ; rdx = output + pitch
    movq        [rdx + rax*2], mm5              ; output + 3 * pitch

    ; begin epilog
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------------
; void vp8_short_idct4x4llm_1_mmx(short *input, short *output, int pitch)
;
; Single-coefficient variant: takes the first word of input, computes
; (input[0] + 4) >> 3 and writes that value to all 16 words of a 4x4 output
; block.
;
;   input  - 4 bytes are loaded from it; only the low word reaches the output
;   output - four 8-byte rows written at output + {0, 1, 2, 3} * pitch
;   pitch  - sign-extended from 32 bits and used directly as a byte offset
;
; Registers written: rax, rdx, mm0. No emms is executed here.
; NOTE(review): presumably the caller resets the MMX/x87 state - confirm.
;-----------------------------------------------------------------------------
global sym(vp8_short_idct4x4llm_1_mmx)
sym(vp8_short_idct4x4llm_1_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    GET_GOT     rbx
    ; end prolog

    mov         rax, arg(0)                     ; input
    movd        mm0, [rax]                      ; low word = input[0]

    paddw       mm0, [GLOBAL(fours)]            ; + 4 for rounding
    mov         rdx, arg(1)                     ; output

    psraw       mm0, 3                          ; (input[0] + 4) >> 3
    movsxd      rax, dword ptr arg(2)           ; pitch

    ; replicate word 0 into all four lanes; word 1 is discarded by the
    ; second unpack, which keeps only the low dword
    punpcklwd   mm0, mm0                        ; w1 w1 w0 w0
    punpckldq   mm0, mm0                        ; w0 w0 w0 w0

    movq        [rdx],         mm0              ; output + 0 * pitch
    movq        [rdx + rax],   mm0              ; output + 1 * pitch

    movq        [rdx + rax*2], mm0              ; output + 2 * pitch
    add         rdx, rax                        ; rdx = output + pitch

    movq        [rdx + rax*2], mm0              ; output + 3 * pitch

    ; begin epilog
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------------
; void vp8_dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr,
;                               unsigned char *dst_ptr, int pitch, int stride)
;
; Computes dc = (input_dc + 4) >> 3, adds it to a 4x4 block of prediction
; bytes and stores the result, clamped to 0..255, as a 4x4 block of
; destination bytes.
;
;   input_dc - only the low word of the loaded argument reaches the result
;   pred_ptr - four 4-byte rows read at pred_ptr + {0, 1, 2, 3} * pitch
;   dst_ptr  - four 4-byte rows written at dst_ptr + {0, 1, 2, 3} * stride
;   pitch    - byte offset between prediction rows (sign-extended)
;   stride   - byte offset between destination rows (sign-extended)
;
; rsi and rdi are saved and restored; rax, rdx and mm0-mm5 are overwritten.
; No emms is executed here.
; NOTE(review): presumably the caller resets the MMX/x87 state - confirm.
;-----------------------------------------------------------------------------
global sym(vp8_dc_only_idct_add_mmx)
sym(vp8_dc_only_idct_add_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(1)                     ; s -- prediction
    mov         rdi, arg(2)                     ; d -- destination
    movsxd      rax, dword ptr arg(4)           ; stride (destination step)
    movsxd      rdx, dword ptr arg(3)           ; pitch  (prediction step)
    pxor        mm0, mm0                        ; zero, used by unpack and pack

    movd        mm5, arg(0)                     ; input_dc

    paddw       mm5, [GLOBAL(fours)]            ; + 4 for rounding

    psraw       mm5, 3                          ; dc = (input_dc + 4) >> 3

    ; replicate the low word into all four lanes
    punpcklwd   mm5, mm5
    punpckldq   mm5, mm5                        ; dc dc dc dc

    ; Each row: widen 4 bytes to words, add dc with signed saturation, then
    ; pack back to bytes with unsigned saturation and store the low 4 bytes.

    ; row 0
    movd        mm1, [rsi]
    punpcklbw   mm1, mm0                        ; bytes -> words
    paddsw      mm1, mm5                        ; pred + dc
    packuswb    mm1, mm0                        ; saturate to 0..255
    movd        [rdi], mm1

    ; row 1
    movd        mm2, [rsi + rdx]
    punpcklbw   mm2, mm0
    paddsw      mm2, mm5
    packuswb    mm2, mm0                        ; saturate to 0..255
    movd        [rdi + rax], mm2

    ; row 2
    movd        mm3, [rsi + 2*rdx]
    punpcklbw   mm3, mm0
    paddsw      mm3, mm5
    packuswb    mm3, mm0                        ; saturate to 0..255
    movd        [rdi + 2*rax], mm3

    ; row 3: advance both pointers by one row so that base + 2*step
    ; addresses the fourth row
    add         rdi, rax
    add         rsi, rdx
    movd        mm4, [rsi + 2*rdx]
    punpcklbw   mm4, mm0
    paddsw      mm4, mm5
    packuswb    mm4, mm0                        ; saturate to 0..255
    movd        [rdi + 2*rax], mm4

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
; Read-only constants shared by the routines in this file. Each is four
; identical 16-bit words so that it can be used directly as an MMX memory
; operand.
SECTION_RODATA

; 0x8A8C = 35468, i.e. sqrt(2) * sin(pi/8) scaled by 65536. It reads as a
; negative number in a signed 16-bit multiply; the code adds the operand
; back after pmulhw to compensate.
align 16
x_s1sqr2:
    dw 0x8A8C, 0x8A8C, 0x8A8C, 0x8A8C

; 0x4E7B = 20091, i.e. (sqrt(2) * cos(pi/8) - 1) scaled by 65536. The code
; adds the operand back after pmulhw to restore the "+ 1".
align 16
x_c1sqr2less1:
    dw 0x4E7B, 0x4E7B, 0x4E7B, 0x4E7B

; Rounding term added before the arithmetic shift right by 3.
align 16
fours:
    dw 0x0004, 0x0004, 0x0004, 0x0004