; Merge "Skip computation of distortion in vp8_pick_inter_mode if active_map is used"
; [libvpx.git] / vp8 / encoder / x86 / quantize_sse2.asm
; blob c483933df1497d779124ad2fbf6e913e016b29fc
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license and patent
;  grant that can be found in the LICENSE file in the root of the source
;  tree. All contributing project authors may be found in the AUTHORS
;  file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"
%include "asm_enc_offsets.asm"
15 ; void vp8_regular_quantize_b_sse2 | arg
16 ; (BLOCK *b, | 0
17 ; BLOCKD *d) | 1
19 global sym(vp8_regular_quantize_b_sse2)
20 sym(vp8_regular_quantize_b_sse2):
21 push rbp
22 mov rbp, rsp
23 SAVE_XMM 7
24 GET_GOT rbx
26 %if ABI_IS_32BIT
27 push rdi
28 push rsi
29 %else
30 %ifidn __OUTPUT_FORMAT__,x64
31 push rdi
32 push rsi
33 %endif
34 %endif
36 ALIGN_STACK 16, rax
37 %define zrun_zbin_boost 0 ; 8
38 %define abs_minus_zbin 8 ; 32
39 %define temp_qcoeff 40 ; 32
40 %define qcoeff 72 ; 32
41 %define stack_size 104
42 sub rsp, stack_size
43 ; end prolog
45 %if ABI_IS_32BIT
46 mov rdi, arg(0) ; BLOCK *b
47 mov rsi, arg(1) ; BLOCKD *d
48 %else
49 %ifidn __OUTPUT_FORMAT__,x64
50 mov rdi, rcx ; BLOCK *b
51 mov rsi, rdx ; BLOCKD *d
52 %else
53 ;mov rdi, rdi ; BLOCK *b
54 ;mov rsi, rsi ; BLOCKD *d
55 %endif
56 %endif
58 mov rdx, [rdi + vp8_block_coeff] ; coeff_ptr
59 mov rcx, [rdi + vp8_block_zbin] ; zbin_ptr
60 movd xmm7, [rdi + vp8_block_zbin_extra] ; zbin_oq_value
62 ; z
63 movdqa xmm0, [rdx]
64 movdqa xmm4, [rdx + 16]
65 mov rdx, [rdi + vp8_block_round] ; round_ptr
67 pshuflw xmm7, xmm7, 0
68 punpcklwd xmm7, xmm7 ; duplicated zbin_oq_value
70 movdqa xmm1, xmm0
71 movdqa xmm5, xmm4
73 ; sz
74 psraw xmm0, 15
75 psraw xmm4, 15
77 ; (z ^ sz)
78 pxor xmm1, xmm0
79 pxor xmm5, xmm4
81 ; x = abs(z)
82 psubw xmm1, xmm0
83 psubw xmm5, xmm4
85 movdqa xmm2, [rcx]
86 movdqa xmm3, [rcx + 16]
87 mov rcx, [rdi + vp8_block_quant] ; quant_ptr
89 ; *zbin_ptr + zbin_oq_value
90 paddw xmm2, xmm7
91 paddw xmm3, xmm7
93 ; x - (*zbin_ptr + zbin_oq_value)
94 psubw xmm1, xmm2
95 psubw xmm5, xmm3
96 movdqa [rsp + abs_minus_zbin], xmm1
97 movdqa [rsp + abs_minus_zbin + 16], xmm5
99 ; add (zbin_ptr + zbin_oq_value) back
100 paddw xmm1, xmm2
101 paddw xmm5, xmm3
103 movdqa xmm2, [rdx]
104 movdqa xmm6, [rdx + 16]
106 movdqa xmm3, [rcx]
107 movdqa xmm7, [rcx + 16]
109 ; x + round
110 paddw xmm1, xmm2
111 paddw xmm5, xmm6
113 ; y = x * quant_ptr >> 16
114 pmulhw xmm3, xmm1
115 pmulhw xmm7, xmm5
117 ; y += x
118 paddw xmm1, xmm3
119 paddw xmm5, xmm7
121 movdqa [rsp + temp_qcoeff], xmm1
122 movdqa [rsp + temp_qcoeff + 16], xmm5
124 pxor xmm6, xmm6
125 ; zero qcoeff
126 movdqa [rsp + qcoeff], xmm6
127 movdqa [rsp + qcoeff + 16], xmm6
129 mov rdx, [rdi + vp8_block_zrun_zbin_boost] ; zbin_boost_ptr
130 mov rax, [rdi + vp8_block_quant_shift] ; quant_shift_ptr
131 mov [rsp + zrun_zbin_boost], rdx
133 %macro ZIGZAG_LOOP 1
135 movsx ecx, WORD PTR[rsp + abs_minus_zbin + %1 * 2]
137 ; if (x >= zbin)
138 sub cx, WORD PTR[rdx] ; x - zbin
139 lea rdx, [rdx + 2] ; zbin_boost_ptr++
140 jl .rq_zigzag_loop_%1 ; x < zbin
142 movsx edi, WORD PTR[rsp + temp_qcoeff + %1 * 2]
144 ; downshift by quant_shift[rc]
145 movsx cx, BYTE PTR[rax + %1] ; quant_shift_ptr[rc]
146 sar edi, cl ; also sets Z bit
147 je .rq_zigzag_loop_%1 ; !y
148 mov WORD PTR[rsp + qcoeff + %1 * 2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc]
149 mov rdx, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost
150 .rq_zigzag_loop_%1:
151 %endmacro
152 ; in vp8_default_zig_zag1d order: see vp8/common/entropy.c
153 ZIGZAG_LOOP 0
154 ZIGZAG_LOOP 1
155 ZIGZAG_LOOP 4
156 ZIGZAG_LOOP 8
157 ZIGZAG_LOOP 5
158 ZIGZAG_LOOP 2
159 ZIGZAG_LOOP 3
160 ZIGZAG_LOOP 6
161 ZIGZAG_LOOP 9
162 ZIGZAG_LOOP 12
163 ZIGZAG_LOOP 13
164 ZIGZAG_LOOP 10
165 ZIGZAG_LOOP 7
166 ZIGZAG_LOOP 11
167 ZIGZAG_LOOP 14
168 ZIGZAG_LOOP 15
170 movdqa xmm2, [rsp + qcoeff]
171 movdqa xmm3, [rsp + qcoeff + 16]
173 mov rcx, [rsi + vp8_blockd_dequant] ; dequant_ptr
174 mov rdi, [rsi + vp8_blockd_dqcoeff] ; dqcoeff_ptr
176 ; y ^ sz
177 pxor xmm2, xmm0
178 pxor xmm3, xmm4
179 ; x = (y ^ sz) - sz
180 psubw xmm2, xmm0
181 psubw xmm3, xmm4
183 ; dequant
184 movdqa xmm0, [rcx]
185 movdqa xmm1, [rcx + 16]
187 mov rcx, [rsi + vp8_blockd_qcoeff] ; qcoeff_ptr
189 pmullw xmm0, xmm2
190 pmullw xmm1, xmm3
192 movdqa [rcx], xmm2 ; store qcoeff
193 movdqa [rcx + 16], xmm3
194 movdqa [rdi], xmm0 ; store dqcoeff
195 movdqa [rdi + 16], xmm1
197 ; select the last value (in zig_zag order) for EOB
198 pcmpeqw xmm2, xmm6
199 pcmpeqw xmm3, xmm6
201 pcmpeqw xmm6, xmm6
202 pxor xmm2, xmm6
203 pxor xmm3, xmm6
204 ; mask inv_zig_zag
205 pand xmm2, [GLOBAL(inv_zig_zag)]
206 pand xmm3, [GLOBAL(inv_zig_zag + 16)]
207 ; select the max value
208 pmaxsw xmm2, xmm3
209 pshufd xmm3, xmm2, 00001110b
210 pmaxsw xmm2, xmm3
211 pshuflw xmm3, xmm2, 00001110b
212 pmaxsw xmm2, xmm3
213 pshuflw xmm3, xmm2, 00000001b
214 pmaxsw xmm2, xmm3
215 movd eax, xmm2
216 and eax, 0xff
217 mov [rsi + vp8_blockd_eob], eax
219 ; begin epilog
220 add rsp, stack_size
221 pop rsp
222 %if ABI_IS_32BIT
223 pop rsi
224 pop rdi
225 %else
226 %ifidn __OUTPUT_FORMAT__,x64
227 pop rsi
228 pop rdi
229 %endif
230 %endif
231 RESTORE_GOT
232 RESTORE_XMM
233 pop rbp
;-----------------------------------------------------------------------
; void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)
;
; Fast quantizer: no zbin test, every coefficient is rounded and scaled.
; In:    BLOCK  *b — arg 0 (32-bit stack / Win64 rcx / SysV rdi)
;        BLOCKD *d — arg 1 (32-bit stack / Win64 rdx / SysV rsi)
; Out:   d->qcoeff, d->dqcoeff and d->eob are written.
; Clobb: rax, rcx, rdx, rdi, rsi, xmm0-xmm5, flags
;        (rdi/rsi are pushed/popped where the ABI makes them callee-saved)
;-----------------------------------------------------------------------
global sym(vp8_fast_quantize_b_sse2)
sym(vp8_fast_quantize_b_sse2):
    push        rbp
    mov         rbp, rsp
    GET_GOT     rbx

%if ABI_IS_32BIT
    push        rdi
    push        rsi
%else
  %ifidn __OUTPUT_FORMAT__,x64
    push        rdi                         ; callee-saved on Win64
    push        rsi
  %else
    ; these registers are used for passing arguments
  %endif
%endif

    ; end prolog

%if ABI_IS_32BIT
    mov         rdi, arg(0)                 ; BLOCK *b
    mov         rsi, arg(1)                 ; BLOCKD *d
%else
  %ifidn __OUTPUT_FORMAT__,x64
    mov         rdi, rcx                    ; BLOCK *b
    mov         rsi, rdx                    ; BLOCKD *d
  %else
    ;mov         rdi, rdi                   ; BLOCK *b already in rdi (SysV)
    ;mov         rsi, rsi                   ; BLOCKD *d already in rsi
  %endif
%endif

    mov         rax, [rdi + vp8_block_coeff]
    mov         rcx, [rdi + vp8_block_round]
    mov         rdx, [rdi + vp8_block_quant_fast]

    ; z = coeff
    movdqa      xmm0, [rax]
    movdqa      xmm4, [rax + 16]

    ; dup z so we can save sz
    movdqa      xmm1, xmm0
    movdqa      xmm5, xmm4

    ; sz = z >> 15 (all-ones for negative lanes, zero otherwise)
    psraw       xmm0, 15
    psraw       xmm4, 15

    ; x = abs(z) = (z ^ sz) - sz
    pxor        xmm1, xmm0
    pxor        xmm5, xmm4
    psubw       xmm1, xmm0
    psubw       xmm5, xmm4

    ; x += round
    paddw       xmm1, [rcx]
    paddw       xmm5, [rcx + 16]

    mov         rax, [rsi + vp8_blockd_qcoeff]
    mov         rcx, [rsi + vp8_blockd_dequant]
    mov         rdi, [rsi + vp8_blockd_dqcoeff]

    ; y = x * quant >> 16
    pmulhw      xmm1, [rdx]
    pmulhw      xmm5, [rdx + 16]

    ; x = (y ^ sz) - sz : restore original signs
    pxor        xmm1, xmm0
    pxor        xmm5, xmm4
    psubw       xmm1, xmm0
    psubw       xmm5, xmm4

    ; qcoeff = x
    movdqa      [rax], xmm1
    movdqa      [rax + 16], xmm5

    ; x * dequant
    movdqa      xmm2, xmm1
    movdqa      xmm3, xmm5
    pmullw      xmm2, [rcx]
    pmullw      xmm3, [rcx + 16]

    ; dqcoeff = x * dequant
    movdqa      [rdi], xmm2
    movdqa      [rdi + 16], xmm3

    ; EOB = highest zig-zag index holding a non-zero coeff:
    ; mask non-zero lanes with 1-based inverse zig-zag indices, then
    ; horizontal max
    pxor        xmm4, xmm4                  ; clear all bits
    pcmpeqw     xmm1, xmm4
    pcmpeqw     xmm5, xmm4

    pcmpeqw     xmm4, xmm4                  ; set all bits
    pxor        xmm1, xmm4                  ; invert: non-zero lanes -> all ones
    pxor        xmm5, xmm4

    pand        xmm1, [GLOBAL(inv_zig_zag)]
    pand        xmm5, [GLOBAL(inv_zig_zag + 16)]

    pmaxsw      xmm1, xmm5

    ; now down to 8
    pshufd      xmm5, xmm1, 00001110b

    pmaxsw      xmm1, xmm5

    ; only 4 left
    pshuflw     xmm5, xmm1, 00001110b

    pmaxsw      xmm1, xmm5

    ; okay, just 2!
    pshuflw     xmm5, xmm1, 00000001b

    pmaxsw      xmm1, xmm5

    movd        eax, xmm1
    and         eax, 0xff                   ; EOB fits in a byte (max 16)
    mov         [rsi + vp8_blockd_eob], eax

    ; begin epilog
%if ABI_IS_32BIT
    pop         rsi
    pop         rdi
%else
  %ifidn __OUTPUT_FORMAT__,x64
    pop         rsi
    pop         rdi
  %endif
%endif

    RESTORE_GOT
    pop         rbp
    ret                                     ; restored: dropped in extraction
SECTION_RODATA
align 16
; 1-based inverse zig-zag scan order (position in scan order, per raster
; index). Values are 1..16 so that a zero EOB (no non-zero coeff) is
; distinguishable after the pmaxsw reduction above.
inv_zig_zag:
    dw 0x0001, 0x0002, 0x0006, 0x0007
    dw 0x0003, 0x0005, 0x0008, 0x000d
    dw 0x0004, 0x0009, 0x000c, 0x000e
    dw 0x000a, 0x000b, 0x000f, 0x0010