2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license and patent
5 ; grant that can be found in the LICENSE file in the root of the source
6 ; tree. All contributing project authors may be found in the AUTHORS
7 ; file in the root of the source tree.
11 %include "vpx_ports/x86_abi_support.asm"
14 ;int vp8_regular_quantize_b_impl_sse2(
19 ; const int *default_zig_zag,
23 ; unsigned short zbin_oq_value,
24 ; short *zbin_boost_ptr,
25 ; short *quant_shift);
;-----------------------------------------------------------------------
; vp8_regular_quantize_b_impl_sse2
;
; Arguments, as they are read through arg(n) in this routine:
;   arg(0)  coeff_ptr        arg(1)  zbin_ptr        arg(2)  qcoeff_ptr
;   arg(3)  dequant_ptr      arg(4)  default_zig_zag arg(5)  round_ptr
;   arg(6)  quant_ptr        arg(7)  dqcoeff_ptr     arg(8)  zbin_oq_value
;   arg(9)  zbin_boost_ptr   arg(10) quant_shift
;
; Scratch buffers live on the stack at fixed offsets from rsp
; (abs_minus_zbin, temp_qcoeff, qcoeff, eob_tmp). A vector stage fills
; them 16 bytes at a time; a scalar stage then indexes them one 16-bit
; word at a time in zig-zag order.
;
; Every movdqa below faults on an address that is not 16-byte aligned,
; so all six vector-accessed pointers and rsp must be 16-byte aligned.
;
; NOTE(review): rbx/ebx/bx is written in this routine and is a
; callee-saved register in both the SysV and Microsoft x64 ABIs --
; confirm the prologue/epilogue saves and restores it.
;-----------------------------------------------------------------------
27 global sym
(vp8_regular_quantize_b_impl_sse2
)
28 sym
(vp8_regular_quantize_b_impl_sse2
):
; SHADOW_ARGS_TO_STACK is provided by the included ABI-support file.
; Presumably it makes the 11 arguments addressable through arg(n) on
; every supported ABI -- TODO confirm against the macro definition.
31 SHADOW_ARGS_TO_STACK
11
; Byte offsets of the scratch areas relative to rsp.
37 %define abs_minus_zbin
0
38 %define temp_qcoeff
32
41 %define stack_size
112
; Fetch the input pointers and the zbin_oq_value scalar.
45 mov rdx
, arg
(0) ; coeff_ptr
46 mov rcx
, arg
(1) ; zbin_ptr
47 movd xmm7
, arg
(8) ; zbin_oq_value
48 mov rdi
, arg
(5) ; round_ptr
49 mov rsi
, arg
(6) ; quant_ptr
; xmm0 / xmm4 = the 32 bytes at coeff_ptr (low half / high half).
52 movdqa xmm0
, OWORD
PTR[rdx
]
53 movdqa xmm4
, OWORD
PTR[rdx
+ 16]
; Interleaving xmm7 with itself doubles up each of its low four words,
; which starts spreading the scalar across the word lanes.
56 punpcklwd xmm7
, xmm7
; duplicated zbin_oq_value
; xmm2 / xmm3 = the 32 bytes at zbin_ptr (low half / high half).
73 movdqa xmm2
, OWORD
PTR[rcx
]
74 movdqa xmm3
, OWORD
PTR[rcx
+ 16]
76 ; *zbin_ptr + zbin_oq_value
80 ; x - (*zbin_ptr + zbin_oq_value)
; Spill xmm1 / xmm5 to the abs_minus_zbin scratch area so the scalar
; pass can read individual words from it.
83 movdqa OWORD
PTR[rsp
+ abs_minus_zbin
], xmm1
84 movdqa OWORD
PTR[rsp
+ abs_minus_zbin
+ 16], xmm5
86 ; add (zbin_ptr + zbin_oq_value) back
; xmm2 / xmm6 = the 32 bytes at round_ptr.
90 movdqa xmm2
, OWORD
PTR[rdi
]
91 movdqa xmm6
, OWORD
PTR[rdi
+ 16]
; xmm3 / xmm7 = the 32 bytes at quant_ptr (xmm7 no longer holds
; zbin_oq_value after this point).
93 movdqa xmm3
, OWORD
PTR[rsi
]
94 movdqa xmm7
, OWORD
PTR[rsi
+ 16]
100 ; y = x * quant_ptr >> 16
; Spill xmm1 / xmm5 to the temp_qcoeff scratch area; the scalar pass
; reads its candidate quantized values from here.
108 movdqa OWORD
PTR[rsp
+ temp_qcoeff
], xmm1
109 movdqa OWORD
PTR[rsp
+ temp_qcoeff
+ 16], xmm5
; Both halves of the qcoeff scratch area are initialised from xmm6.
; NOTE(review): presumably xmm6 holds zero here so that every entry
; the scalar pass skips stays 0 -- confirm.
113 movdqa OWORD
PTR[rsp
+ qcoeff
], xmm6
114 movdqa OWORD
PTR[rsp
+ qcoeff
+ 16], xmm6
; eob_tmp starts at -1 and is overwritten with the scan index each time
; the scalar pass stores a coefficient.
116 mov [rsp
+ eob_tmp
], DWORD -1 ; eob
; Pointers used by the scalar zig-zag pass.
117 mov rsi
, arg
(9) ; zbin_boost_ptr
118 mov rdi
, arg
(4) ; default_zig_zag
119 mov rax
, arg
(10) ; quant_shift_ptr
; --- One scalar step. Macro parameter 1 is the scan index i and
; --- macro parameter 2 is the suffix of the label to jump to when
; --- this coefficient is skipped.
; rdx = rc, the 32-bit zig-zag table entry for index i (sign-extended).
123 movsxd rdx
, DWORD PTR[rdi
+ (%1 * 4)] ; rc
; ebx = current boost value; rsi then advances by one 16-bit entry.
124 movsx ebx, WORD PTR [rsi
] ; *zbin_boost_ptr
125 lea rsi
, [rsi
+ 2] ; zbin_boost_ptr++
; ecx = word rc of the abs_minus_zbin scratch area.
128 movsx ecx, WORD PTR[rsp
+ abs_minus_zbin
+ rdx
*2]
; Signed compare: skip this coefficient when the boosted threshold is
; not reached.
131 sub ecx, ebx ; x - zbin
132 jl rq_zigzag_loop_
%2 ; x < zbin
; ebx = word rc of the temp_qcoeff scratch area.
134 movsx ebx, WORD PTR[rsp
+ temp_qcoeff
+ rdx
*2]
136 ; downshift by quant_shift[rdx]
137 movsx ecx, WORD PTR[rax
+ rdx
*2] ; quant_shift_ptr[rc]
; NOTE(review): x86 shifts leave the flags untouched when the masked
; count in cl is 0; in that case the je below would test ZF left over
; from the sub above, not the value in ebx. Confirm quant_shift entries
; are never 0 (mod 32).
138 sar ebx, cl ; also sets Z bit
139 je rq_zigzag_loop_
%2 ; !y
; Nonzero result: store its low 16 bits into the qcoeff scratch area.
140 mov WORD PTR[rsp
+ qcoeff
+ rdx
* 2], bx ;qcoeff_ptr[rc] = temp_qcoeff[rc]
; A coefficient was kept, so restart the boost table from its first
; entry and remember this scan index.
142 mov rsi
, arg
(9) ; reset to b->zrun_zbin_boost
143 mov [rsp
+ eob_tmp
], DWORD %1 ; eob = i
; --- Final stage: copy results out to the caller's buffers.
163 mov rbx
, arg
(2) ; qcoeff_ptr
164 mov rcx
, arg
(3) ; dequant_ptr
165 mov rsi
, arg
(7) ; dqcoeff_ptr
; NOTE(review): eob_tmp is written with DWORD-sized stores but loaded
; here through rax. If rax is a true 64-bit register in this build, the
; upper 32 bits come from the adjacent stack bytes -- confirm only the
; low 32 bits are consumed afterwards.
166 mov rax
, [rsp
+ eob_tmp
] ; eob
; xmm2 / xmm3 = the finished qcoeff scratch area.
168 movdqa xmm2
, OWORD
PTR[rsp
+ qcoeff
]
169 movdqa xmm3
, OWORD
PTR[rsp
+ qcoeff
+ 16]
; xmm0 / xmm1 = the 32 bytes at dequant_ptr.
178 movdqa xmm0
, OWORD
PTR[rcx
]
179 movdqa xmm1
, OWORD
PTR[rcx
+ 16]
; Write the quantized coefficients to qcoeff_ptr.
184 movdqa OWORD
PTR[rbx
], xmm2
185 movdqa OWORD
PTR[rbx
+ 16], xmm3
; Write xmm0 / xmm1 to dqcoeff_ptr.
186 movdqa OWORD
PTR[rsi
], xmm0
; store dqcoeff
187 movdqa OWORD
PTR[rsi
+ 16], xmm1
; store dqcoeff
202 ;int vp8_fast_quantize_b_impl_sse2(short *coeff_ptr,
203 ; short *qcoeff_ptr,short *dequant_ptr,
204 ; short *inv_scan_order, short *round_ptr,
205 ; short *quant_ptr, short *dqcoeff_ptr);
;-----------------------------------------------------------------------
; vp8_fast_quantize_b_impl_sse2
;
; Arguments, as they are read through arg(n) in this routine:
;   arg(0) coeff_ptr    arg(1) qcoeff_ptr      arg(2) dequant_ptr
;   arg(3) inv_scan_order  arg(4) round_ptr    arg(5) quant_ptr
;   arg(6) dqcoeff_ptr
;
; Fully vectorised: each buffer is handled as two 16-byte halves
; ("lo" in xmm0-xmm3 style registers, "hi" in their counterparts).
;
; Every movdqa / pmulhw / pand with a memory operand below faults on an
; address that is not 16-byte aligned, so all seven pointers must be
; 16-byte aligned.
;-----------------------------------------------------------------------
206 global sym
(vp8_fast_quantize_b_impl_sse2
)
207 sym
(vp8_fast_quantize_b_impl_sse2
):
; SHADOW_ARGS_TO_STACK is provided by the included ABI-support file.
; Presumably it makes the 7 arguments addressable through arg(n) on
; every supported ABI -- TODO confirm against the macro definition.
210 SHADOW_ARGS_TO_STACK
7
; Fetch the input pointers.
215 mov rdx
, arg
(0) ;coeff_ptr
216 mov rcx
, arg
(2) ;dequant_ptr
217 mov rdi
, arg
(4) ;round_ptr
218 mov rsi
, arg
(5) ;quant_ptr
; xmm0 / xmm4 = the 32 bytes at coeff_ptr (z, lo / hi).
220 movdqa xmm0
, XMMWORD
PTR[rdx
]
221 movdqa xmm4
, XMMWORD
PTR[rdx
+ 16]
223 movdqa xmm2
, XMMWORD
PTR[rdi
] ;round lo
224 movdqa xmm3
, XMMWORD
PTR[rdi
+ 16] ;round hi
; An arithmetic shift right by 15 replicates each word's sign bit, so
; every 16-bit lane becomes 0x0000 (z >= 0) or 0xFFFF (z < 0).
229 psraw xmm0
, 15 ;sign of z (aka sz)
230 psraw xmm4
, 15 ;sign of z (aka sz)
; Subtracting the sign mask adds 1 to the lanes where z was negative.
; NOTE(review): this yields abs(z) only if xmm1 / xmm5 already hold
; z XOR sz -- confirm the instructions that set them up.
234 psubw xmm1
, xmm0
;x = abs(z)
235 psubw xmm5
, xmm4
;x = abs(z)
; pmulhw keeps the high 16 bits of each signed 16x16-bit product, so
; each lane becomes (lane * quant) >> 16.
240 pmulhw xmm1
, XMMWORD
PTR[rsi
]
241 pmulhw xmm5
, XMMWORD
PTR[rsi
+ 16]
; rdi / rsi are re-used from here on as the two output pointers.
243 mov rdi
, arg
(1) ;qcoeff_ptr
244 mov rsi
, arg
(6) ;dqcoeff_ptr
; xmm2 / xmm3 = the 32 bytes at dequant_ptr (the round values loaded
; into these registers earlier are overwritten).
246 movdqa xmm2
, XMMWORD
PTR[rcx
]
247 movdqa xmm3
, XMMWORD
PTR[rcx
+ 16]
; Write xmm1 / xmm5 to qcoeff_ptr.
254 movdqa XMMWORD
PTR[rdi
], xmm1
255 movdqa XMMWORD
PTR[rdi
+ 16], xmm5
; rdi is re-used once more, now for the inverse scan order table.
260 mov rdi
, arg
(3) ;inv_scan_order
263 pxor xmm4
, xmm4
;clear all bits
; Comparing a register with itself is always "equal", so every lane
; becomes 0xFFFF.
267 pcmpeqw xmm4
, xmm4
;set all bits
; Mask the inverse-scan-order entries with xmm1 / xmm5, lane by lane.
; NOTE(review): presumably xmm1 / xmm5 are per-lane all-ones/all-zero
; "coefficient is nonzero" masks at this point -- confirm.
271 pand xmm1
, XMMWORD
PTR[rdi
]
272 pand xmm5
, XMMWORD
PTR[rdi
+16]
; Horizontal reduction of xmm1: each shuffle copies the upper part of
; the still-live lanes of xmm1 down into the bottom of xmm5.
;   pshufd  00001110b : dwords 2,3 -> dwords 0,1 (upper 8 bytes down)
277 pshufd xmm5
, xmm1
, 00001110b
;   pshuflw 00001110b : words 2,3 -> words 0,1
282 pshuflw xmm5
, xmm1
, 00001110b
;   pshuflw 00000001b : word 1 -> word 0
287 pshuflw xmm5
, xmm1
, 00000001b
; Write xmm2 / xmm3 to dqcoeff_ptr.
294 movdqa XMMWORD
PTR[rsi
], xmm2
;store dqcoeff
295 movdqa XMMWORD
PTR[rsi
+ 16], xmm3
;store dqcoeff