Removed unused vp8_recon_intra4x4mb function
[libvpx.git] / vp8 / encoder / x86 / quantize_sse2.asm
blob45e1a2ad3387278728633840ad239460a7185afc
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;  Use of this source code is governed by a BSD-style license and patent
;  grant that can be found in the LICENSE file in the root of the source
;  tree. All contributing project authors may be found in the AUTHORS
;  file in the root of the source tree.
11 %include "vpx_ports/x86_abi_support.asm"
;int vp8_regular_quantize_b_impl_sse2(
;                       short *coeff_ptr,
;                       short *zbin_ptr,
;                       short *qcoeff_ptr,
;                       short *dequant_ptr,
;                       const int *default_zig_zag,
;                       short *round_ptr,
;                       short *quant_ptr,
;                       short *dqcoeff_ptr,
;                       unsigned short zbin_oq_value,
;                       short *zbin_boost_ptr,
;                       short *quant_shift);
;
; Quantizes one block of 16 signed 16-bit coefficients.
;
; Vector phase (all 16 lanes at once):
;     sz                 = z >> 15                      (0 or -1 per lane)
;     x                  = (z ^ sz) - sz                (abs(z))
;     abs_minus_zbin[k]  = x - (zbin_ptr[k] + zbin_oq_value)
;     temp_qcoeff[k]     = (x + round) + (((x + round) * quant) >> 16)
; Scalar phase (unrolled over zig-zag index i = 0..15):
;     rc = default_zig_zag[i]; boost = *zbin_boost_ptr++
;     if (abs_minus_zbin[rc] >= boost) {
;         y = temp_qcoeff[rc] >> quant_shift[rc]
;         if (y) { qcoeff[rc] = y; eob = i; zbin_boost_ptr = arg(9); }
;     }
; Final phase:
;     qcoeff_ptr[k]  = (qcoeff[k] ^ sz) - sz            (sign restored)
;     dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k]   (low 16 bits)
;
; Returns eob + 1, where eob is the last zig-zag index that produced a
; nonzero output (so 0 is returned when every output is zero).
;
; coeff_ptr, zbin_ptr, round_ptr, quant_ptr, dequant_ptr, qcoeff_ptr and
; dqcoeff_ptr are all accessed with movdqa and therefore must be 16-byte
; aligned.
;
; Register roles that span phases:
;     xmm0 / xmm4 : sz for coefficients 0-7 / 8-15; set at the start and
;                   not touched again until the sign is restored at the end.
;     rsi, rdi, rax (scalar phase): zbin_boost cursor, zig-zag table,
;                   quant_shift table.
global sym(vp8_regular_quantize_b_impl_sse2)
sym(vp8_regular_quantize_b_impl_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 11
    SAVE_XMM
    push        rsi
    push        rdi
    push        rbx
    ALIGN_STACK 16, rax

    ; Local frame (offsets from the aligned rsp):
    ;   [  0.. 31] abs_minus_zbin : 16 shorts
    ;   [ 32.. 63] temp_qcoeff    : 16 shorts
    ;   [ 64.. 95] qcoeff         : 16 shorts
    ;   [ 96.. 99] eob_tmp        : 1 dword
    %define abs_minus_zbin  0
    %define temp_qcoeff     32
    %define qcoeff          64
    %define eob_tmp         96
    %define stack_size      112
    sub         rsp, stack_size
    ; end prolog

    mov         rdx, arg(0)                 ; coeff_ptr
    mov         rcx, arg(1)                 ; zbin_ptr
    movd        xmm7, arg(8)                ; zbin_oq_value
    mov         rdi, arg(5)                 ; round_ptr
    mov         rsi, arg(6)                 ; quant_ptr

    ; z
    movdqa      xmm0, OWORD PTR[rdx]
    movdqa      xmm4, OWORD PTR[rdx + 16]

    ; broadcast the low word of xmm7 into all eight word lanes
    pshuflw     xmm7, xmm7, 0
    punpcklwd   xmm7, xmm7                  ; duplicated zbin_oq_value

    movdqa      xmm1, xmm0
    movdqa      xmm5, xmm4

    ; sz
    psraw       xmm0, 15
    psraw       xmm4, 15

    ; (z ^ sz)
    pxor        xmm1, xmm0
    pxor        xmm5, xmm4

    ; x = abs(z)
    psubw       xmm1, xmm0
    psubw       xmm5, xmm4

    movdqa      xmm2, OWORD PTR[rcx]
    movdqa      xmm3, OWORD PTR[rcx + 16]

    ; *zbin_ptr + zbin_oq_value
    paddw       xmm2, xmm7
    paddw       xmm3, xmm7

    ; x - (*zbin_ptr + zbin_oq_value)
    psubw       xmm1, xmm2
    psubw       xmm5, xmm3
    movdqa      OWORD PTR[rsp + abs_minus_zbin], xmm1
    movdqa      OWORD PTR[rsp + abs_minus_zbin + 16], xmm5

    ; add (zbin_ptr + zbin_oq_value) back, recovering x
    paddw       xmm1, xmm2
    paddw       xmm5, xmm3

    movdqa      xmm2, OWORD PTR[rdi]        ; round, coefficients 0-7
    movdqa      xmm6, OWORD PTR[rdi + 16]   ; round, coefficients 8-15

    movdqa      xmm3, OWORD PTR[rsi]        ; quant, coefficients 0-7
    movdqa      xmm7, OWORD PTR[rsi + 16]   ; quant, coefficients 8-15
                                            ; (zbin_oq_value no longer needed)

    ; x + round
    paddw       xmm1, xmm2
    paddw       xmm5, xmm6

    ; y = x * quant_ptr >> 16   (signed high half of the 16x16 product)
    pmulhw      xmm3, xmm1
    pmulhw      xmm7, xmm5

    ; y += x
    paddw       xmm1, xmm3
    paddw       xmm5, xmm7

    movdqa      OWORD PTR[rsp + temp_qcoeff], xmm1
    movdqa      OWORD PTR[rsp + temp_qcoeff + 16], xmm5

    pxor        xmm6, xmm6
    ; zero qcoeff
    movdqa      OWORD PTR[rsp + qcoeff], xmm6
    movdqa      OWORD PTR[rsp + qcoeff + 16], xmm6

    mov         [rsp + eob_tmp], DWORD -1   ; eob
    mov         rsi, arg(9)                 ; zbin_boost_ptr
    mov         rdi, arg(4)                 ; default_zig_zag
    mov         rax, arg(10)                ; quant_shift_ptr

; ZIGZAG_LOOP i, next
;   Emits the body for zig-zag index %1. Both "skip" branches jump to the
;   label of iteration %2, so the zbin_boost cursor (rsi) stays advanced
;   whenever no nonzero value is written.
;   Clobbers: rbx, rcx, rdx, flags. Updates rsi.
%macro ZIGZAG_LOOP 2
rq_zigzag_loop_%1:
    movsxd      rdx, DWORD PTR[rdi + (%1 * 4)] ; rc
    movsx       ebx, WORD PTR [rsi]         ; *zbin_boost_ptr
    lea         rsi, [rsi + 2]              ; zbin_boost_ptr++

    ; x (already reduced by zbin + zbin_oq_value)
    movsx       ecx, WORD PTR[rsp + abs_minus_zbin + rdx *2]

    ; if (x >= zbin)
    sub         ecx, ebx                    ; x - zbin
    jl          rq_zigzag_loop_%2           ; x < zbin

    movsx       ebx, WORD PTR[rsp + temp_qcoeff + rdx *2]

    ; downshift by quant_shift[rdx]
    ; NOTE(review): sar leaves the flags untouched when the count in cl is
    ; 0, so the je below relies on quant_shift[rc] being nonzero - confirm
    ; against the code that builds the quant_shift table.
    movsx       ecx, WORD PTR[rax + rdx*2]  ; quant_shift_ptr[rc]
    sar         ebx, cl                     ; also sets Z bit
    je          rq_zigzag_loop_%2           ; !y
    mov         WORD PTR[rsp + qcoeff + rdx * 2], bx ;qcoeff_ptr[rc] = temp_qcoeff[rc]

    mov         rsi, arg(9)                 ; reset to b->zrun_zbin_boost
    mov         [rsp + eob_tmp], DWORD %1   ; eob = i
%endmacro
ZIGZAG_LOOP 0, 1
ZIGZAG_LOOP 1, 2
ZIGZAG_LOOP 2, 3
ZIGZAG_LOOP 3, 4
ZIGZAG_LOOP 4, 5
ZIGZAG_LOOP 5, 6
ZIGZAG_LOOP 6, 7
ZIGZAG_LOOP 7, 8
ZIGZAG_LOOP 8, 9
ZIGZAG_LOOP 9, 10
ZIGZAG_LOOP 10, 11
ZIGZAG_LOOP 11, 12
ZIGZAG_LOOP 12, 13
ZIGZAG_LOOP 13, 14
ZIGZAG_LOOP 14, 15
ZIGZAG_LOOP 15, end
rq_zigzag_loop_end:

    mov         rbx, arg(2)                 ; qcoeff_ptr
    mov         rcx, arg(3)                 ; dequant_ptr
    mov         rsi, arg(7)                 ; dqcoeff_ptr
    ; eob_tmp is only ever written as a DWORD, so load exactly 4 bytes and
    ; sign-extend (-1 must become 0 after the add below).
    movsxd      rax, DWORD PTR[rsp + eob_tmp] ; eob

    movdqa      xmm2, OWORD PTR[rsp + qcoeff]
    movdqa      xmm3, OWORD PTR[rsp + qcoeff + 16]

    ; y ^ sz
    pxor        xmm2, xmm0
    pxor        xmm3, xmm4
    ; x = (y ^ sz) - sz
    psubw       xmm2, xmm0
    psubw       xmm3, xmm4

    movdqa      xmm0, OWORD PTR[rcx]
    movdqa      xmm1, OWORD PTR[rcx + 16]

    ; dqcoeff = qcoeff * dequant (low 16 bits of each product)
    pmullw      xmm0, xmm2
    pmullw      xmm1, xmm3

    movdqa      OWORD PTR[rbx], xmm2        ; store qcoeff
    movdqa      OWORD PTR[rbx + 16], xmm3   ; store qcoeff
    movdqa      OWORD PTR[rsi], xmm0        ; store dqcoeff
    movdqa      OWORD PTR[rsi + 16], xmm1   ; store dqcoeff

    add         rax, 1                      ; return eob + 1

    ; begin epilog
    add         rsp, stack_size
    pop         rsp                         ; pairs with ALIGN_STACK in the prolog
    pop         rbx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;int vp8_fast_quantize_b_impl_sse2(short *coeff_ptr,
;                           short *qcoeff_ptr,short *dequant_ptr,
;                           short *inv_scan_order, short *round_ptr,
;                           short *quant_ptr, short *dqcoeff_ptr);
;
; Quantizes one block of 16 signed 16-bit coefficients without any zero-bin
; test or zig-zag walk:
;     sz             = z >> 15
;     x              = (z ^ sz) - sz                    (abs(z))
;     y              = ((x + round) * quant) >> 16      (signed high half)
;     qcoeff_ptr[k]  = (y ^ sz) - sz                    (sign restored)
;     dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k]   (low 16 bits)
;
; Returns the low 8 bits of the signed maximum of inv_scan_order[k] taken
; over every k whose quantized value is nonzero (0 when all are zero).
; NOTE(review): presumably inv_scan_order holds 1-based scan positions so
; that this maximum is the end-of-block count - confirm with the caller.
;
; Every pointer argument is accessed with movdqa (directly or as a memory
; operand of an SSE2 instruction) and therefore must be 16-byte aligned.
; Only xmm0-xmm5 are used.
global sym(vp8_fast_quantize_b_impl_sse2)
sym(vp8_fast_quantize_b_impl_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    push        rsi
    push        rdi
    ; end prolog

    mov         rdx, arg(0)                 ;coeff_ptr
    mov         rcx, arg(2)                 ;dequant_ptr
    mov         rdi, arg(4)                 ;round_ptr
    mov         rsi, arg(5)                 ;quant_ptr

    movdqa      xmm0, XMMWORD PTR[rdx]
    movdqa      xmm4, XMMWORD PTR[rdx + 16]

    movdqa      xmm2, XMMWORD PTR[rdi]      ;round lo
    movdqa      xmm3, XMMWORD PTR[rdi + 16] ;round hi

    movdqa      xmm1, xmm0
    movdqa      xmm5, xmm4

    psraw       xmm0, 15                    ;sign of z (aka sz)
    psraw       xmm4, 15                    ;sign of z (aka sz)

    pxor        xmm1, xmm0
    pxor        xmm5, xmm4
    psubw       xmm1, xmm0                  ;x = abs(z)
    psubw       xmm5, xmm4                  ;x = abs(z)

    paddw       xmm1, xmm2                  ;x + round
    paddw       xmm5, xmm3

    pmulhw      xmm1, XMMWORD PTR[rsi]      ;y = (x + round) * quant >> 16
    pmulhw      xmm5, XMMWORD PTR[rsi + 16]

    ; round/quant pointers are dead from here; reuse rdi/rsi for outputs
    mov         rdi, arg(1)                 ;qcoeff_ptr
    mov         rsi, arg(6)                 ;dqcoeff_ptr

    movdqa      xmm2, XMMWORD PTR[rcx]      ;dequant lo
    movdqa      xmm3, XMMWORD PTR[rcx + 16] ;dequant hi

    ; restore the sign: (y ^ sz) - sz
    pxor        xmm1, xmm0
    pxor        xmm5, xmm4
    psubw       xmm1, xmm0
    psubw       xmm5, xmm4

    movdqa      XMMWORD PTR[rdi], xmm1      ;store qcoeff
    movdqa      XMMWORD PTR[rdi + 16], xmm5 ;store qcoeff

    pmullw      xmm2, xmm1                  ;dqcoeff = qcoeff * dequant
    pmullw      xmm3, xmm5

    mov         rdi, arg(3)                 ;inv_scan_order

    ; Start with 16
    ; Build a per-lane mask that is all ones where qcoeff != 0.
    pxor        xmm4, xmm4                  ;clear all bits
    pcmpeqw     xmm1, xmm4                  ;0xffff where qcoeff == 0
    pcmpeqw     xmm5, xmm4

    pcmpeqw     xmm4, xmm4                  ;set all bits
    pxor        xmm1, xmm4                  ;invert: 0xffff where qcoeff != 0
    pxor        xmm5, xmm4

    ; keep inv_scan_order[k] only for the nonzero lanes
    pand        xmm1, XMMWORD PTR[rdi]
    pand        xmm5, XMMWORD PTR[rdi+16]

    ; horizontal signed max: 16 -> 8 candidates
    pmaxsw      xmm1, xmm5

    ; now down to 8
    pshufd      xmm5, xmm1, 00001110b       ;high 64 bits -> low 64 bits

    pmaxsw      xmm1, xmm5

    ; only 4 left
    pshuflw     xmm5, xmm1, 00001110b       ;words 2,3 -> words 0,1

    pmaxsw      xmm1, xmm5

    ; okay, just 2!
    pshuflw     xmm5, xmm1, 00000001b       ;word 1 -> word 0

    pmaxsw      xmm1, xmm5

    ; the maximum is in word 0; the words above it are discarded by the mask
    movd        rax, xmm1
    and         rax, 0xff

    movdqa      XMMWORD PTR[rsi], xmm2      ;store dqcoeff
    movdqa      XMMWORD PTR[rsi + 16], xmm3 ;store dqcoeff

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret