Added vp8_fast_quantize_b_sse2
[libvpx.git] / vp8 / encoder / x86 / quantize_mmx.asm
blobf29a54ecd12832d674c62435dd9adefd73d6c48d
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
12 %include "vpx_ports/x86_abi_support.asm"
14 ;int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
15 ; short *qcoeff_ptr,short *dequant_ptr,
16 ; short *scan_mask, short *round_ptr,
17 ; short *quant_ptr, short *dqcoeff_ptr);
18 global sym(vp8_fast_quantize_b_impl_mmx)
19 sym(vp8_fast_quantize_b_impl_mmx):
20 push rbp
21 mov rbp, rsp
22 SHADOW_ARGS_TO_STACK 8
23 push rsi
24 push rdi
25 ; end prolog
28 mov rsi, arg(0) ;coeff_ptr
29 movq mm0, [rsi]
31 mov rax, arg(1) ;zbin_ptr
32 movq mm1, [rax]
34 movq mm3, mm0
35 psraw mm0, 15
37 pxor mm3, mm0
38 psubw mm3, mm0 ; abs
40 movq mm2, mm3
41 pcmpgtw mm1, mm2
43 pandn mm1, mm2
44 movq mm3, mm1
46 mov rdx, arg(6) ;quant_ptr
47 movq mm1, [rdx]
49 mov rcx, arg(5) ;round_ptr
50 movq mm2, [rcx]
52 paddw mm3, mm2
53 pmulhuw mm3, mm1
55 pxor mm3, mm0
56 psubw mm3, mm0 ;gain the sign back
58 mov rdi, arg(2) ;qcoeff_ptr
59 movq mm0, mm3
61 movq [rdi], mm3
63 mov rax, arg(3) ;dequant_ptr
64 movq mm2, [rax]
66 pmullw mm3, mm2
67 mov rax, arg(7) ;dqcoeff_ptr
69 movq [rax], mm3
71 ; next 4 coefficients (8 bytes, byte offset 8)
72 movq mm4, [rsi+8]
74 mov rax, arg(1) ;zbin_ptr
75 movq mm5, [rax+8]
77 movq mm7, mm4
78 psraw mm4, 15
80 pxor mm7, mm4
81 psubw mm7, mm4 ; abs
83 movq mm6, mm7
84 pcmpgtw mm5, mm6
86 pandn mm5, mm6
87 movq mm7, mm5
89 movq mm5, [rdx+8]
90 movq mm6, [rcx+8]
92 paddw mm7, mm6
93 pmulhuw mm7, mm5
95 pxor mm7, mm4
96 psubw mm7, mm4;gain the sign back
98 mov rdi, arg(2) ;qcoeff_ptr
100 movq mm1, mm7
101 movq [rdi+8], mm7
103 mov rax, arg(3) ;dequant_ptr
104 movq mm6, [rax+8]
106 pmullw mm7, mm6
107 mov rax, arg(7) ;dqcoeff_ptr
109 movq [rax+8], mm7
112 ; next 4 coefficients (8 bytes, byte offset 16)
113 movq mm4, [rsi+16]
115 mov rax, arg(1) ;zbin_ptr
116 movq mm5, [rax+16]
118 movq mm7, mm4
119 psraw mm4, 15
121 pxor mm7, mm4
122 psubw mm7, mm4 ; abs
124 movq mm6, mm7
125 pcmpgtw mm5, mm6
127 pandn mm5, mm6
128 movq mm7, mm5
130 movq mm5, [rdx+16]
131 movq mm6, [rcx+16]
133 paddw mm7, mm6
134 pmulhuw mm7, mm5
136 pxor mm7, mm4
137 psubw mm7, mm4;gain the sign back
139 mov rdi, arg(2) ;qcoeff_ptr
141 movq mm1, mm7
142 movq [rdi+16], mm7
144 mov rax, arg(3) ;dequant_ptr
145 movq mm6, [rax+16]
147 pmullw mm7, mm6
148 mov rax, arg(7) ;dqcoeff_ptr
150 movq [rax+16], mm7
153 ; next 4 coefficients (8 bytes, byte offset 24)
154 movq mm4, [rsi+24]
156 mov rax, arg(1) ;zbin_ptr
157 movq mm5, [rax+24]
159 movq mm7, mm4
160 psraw mm4, 15
162 pxor mm7, mm4
163 psubw mm7, mm4 ; abs
165 movq mm6, mm7
166 pcmpgtw mm5, mm6
168 pandn mm5, mm6
169 movq mm7, mm5
171 movq mm5, [rdx+24]
172 movq mm6, [rcx+24]
174 paddw mm7, mm6
175 pmulhuw mm7, mm5
177 pxor mm7, mm4
178 psubw mm7, mm4;gain the sign back
180 mov rdi, arg(2) ;qcoeff_ptr
182 movq mm1, mm7
183 movq [rdi+24], mm7
185 mov rax, arg(3) ;dequant_ptr
186 movq mm6, [rax+24]
188 pmullw mm7, mm6
189 mov rax, arg(7) ;dqcoeff_ptr
191 movq [rax+24], mm7
195 mov rdi, arg(4) ;scan_mask
196 mov rsi, arg(2) ;qcoeff_ptr
198 pxor mm5, mm5
199 pxor mm7, mm7
201 movq mm0, [rsi]
202 movq mm1, [rsi+8]
204 movq mm2, [rdi]
205 movq mm3, [rdi+8];
207 pcmpeqw mm0, mm7
208 pcmpeqw mm1, mm7
210 pcmpeqw mm6, mm6
211 pxor mm0, mm6
213 pxor mm1, mm6
214 psrlw mm0, 15
216 psrlw mm1, 15
217 pmaddwd mm0, mm2
219 pmaddwd mm1, mm3
220 movq mm5, mm0
222 paddd mm5, mm1
224 movq mm0, [rsi+16]
225 movq mm1, [rsi+24]
227 movq mm2, [rdi+16]
228 movq mm3, [rdi+24];
230 pcmpeqw mm0, mm7
231 pcmpeqw mm1, mm7
233 pcmpeqw mm6, mm6
234 pxor mm0, mm6
236 pxor mm1, mm6
237 psrlw mm0, 15
239 psrlw mm1, 15
240 pmaddwd mm0, mm2
242 pmaddwd mm1, mm3
243 paddd mm5, mm0
245 paddd mm5, mm1
246 movq mm0, mm5
248 psrlq mm5, 32
249 paddd mm0, mm5
251 ; eob adjustment begins here
252 movq rcx, mm0
253 and rcx, 0xffff
255 xor rdx, rdx
256 sub rdx, rcx ; rdx=-rcx
258 bsr rax, rcx
259 inc rax
261 sar rdx, 31
262 and rax, rdx
263 ; The sse assembly above replaces the old mmx mixed assembly/C eob computation.
264 ; The old code is kept below for reference:
265 ; movq rcx, mm0
266 ; bsr rax, rcx
268 ; mov eob, rax
269 ; mov eee, rcx
271 ;if(eee==0)
273 ; eob=-1;
275 ;else if(eee<0)
277 ; eob=15;
279 ;d->eob = eob+1;
281 ; begin epilog
282 pop rdi
283 pop rsi
284 UNSHADOW_ARGS
285 pop rbp