vp8/encoder/x86/quantize_sse2.asm

   1 ;
   2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
   3 ;
   4 ;  Use of this source code is governed by a BSD-style license and patent
   5 ;  grant that can be found in the LICENSE file in the root of the source
   6 ;  tree. All contributing project authors may be found in the AUTHORS
   7 ;  file in the root of the source tree.
   8 ;
   9
  10
  11 %include "vpx_ports/x86_abi_support.asm"
  12 %include "asm_enc_offsets.asm"
  13
  14
  15 ; void vp8_regular_quantize_b_sse2 | arg
  16 ;  (BLOCK  *b,                     |  0
  17 ;   BLOCKD *d)                     |  1
  18
  19 global sym(vp8_regular_quantize_b_sse2)
  20 sym(vp8_regular_quantize_b_sse2):
  21     push        rbp
  22     mov         rbp, rsp
  23     SAVE_XMM 7
  24     GET_GOT     rbx
  25
  26 %if ABI_IS_32BIT
  27     push        rdi
  28     push        rsi
  29 %else
  30   %ifidn __OUTPUT_FORMAT__,x64
  31     push        rdi
  32     push        rsi
  33   %endif
  34 %endif
  35
  36     ALIGN_STACK 16, rax
  37     %define zrun_zbin_boost   0  ;  8
  38     %define abs_minus_zbin    8  ; 32
  39     %define temp_qcoeff       40 ; 32
  40     %define qcoeff            72 ; 32
  41     %define stack_size        104
  42     sub         rsp, stack_size
  43     ; end prolog
  44
  45 %if ABI_IS_32BIT
  46     mov         rdi, arg(0)                 ; BLOCK *b
  47     mov         rsi, arg(1)                 ; BLOCKD *d
  48 %else
  49   %ifidn __OUTPUT_FORMAT__,x64
  50     mov         rdi, rcx                    ; BLOCK *b
  51     mov         rsi, rdx                    ; BLOCKD *d
  52   %else
  53     ;mov         rdi, rdi                    ; BLOCK *b
  54     ;mov         rsi, rsi                    ; BLOCKD *d
  55   %endif
  56 %endif
  57
  58     mov         rdx, [rdi + vp8_block_coeff] ; coeff_ptr
  59     mov         rcx, [rdi + vp8_block_zbin] ; zbin_ptr
  60     movd        xmm7, [rdi + vp8_block_zbin_extra] ; zbin_oq_value
  61
  62     ; z
  63     movdqa      xmm0, [rdx]
  64     movdqa      xmm4, [rdx + 16]
  65     mov         rdx, [rdi + vp8_block_round] ; round_ptr
  66
  67     pshuflw     xmm7, xmm7, 0
  68     punpcklwd   xmm7, xmm7                  ; duplicated zbin_oq_value
  69
  70     movdqa      xmm1, xmm0
  71     movdqa      xmm5, xmm4
  72
  73     ; sz
  74     psraw       xmm0, 15
  75     psraw       xmm4, 15
  76
  77     ; (z ^ sz)
  78     pxor        xmm1, xmm0
  79     pxor        xmm5, xmm4
  80
  81     ; x = abs(z)
  82     psubw       xmm1, xmm0
  83     psubw       xmm5, xmm4
  84
  85     movdqa      xmm2, [rcx]
  86     movdqa      xmm3, [rcx + 16]
  87     mov         rcx, [rdi + vp8_block_quant] ; quant_ptr
  88
  89     ; *zbin_ptr + zbin_oq_value
  90     paddw       xmm2, xmm7
  91     paddw       xmm3, xmm7
  92
  93     ; x - (*zbin_ptr + zbin_oq_value)
  94     psubw       xmm1, xmm2
  95     psubw       xmm5, xmm3
  96     movdqa      [rsp + abs_minus_zbin], xmm1
  97     movdqa      [rsp + abs_minus_zbin + 16], xmm5
  98
  99     ; add (zbin_ptr + zbin_oq_value) back
 100     paddw       xmm1, xmm2
 101     paddw       xmm5, xmm3
 102
 103     movdqa      xmm2, [rdx]
 104     movdqa      xmm6, [rdx + 16]
 105
 106     movdqa      xmm3, [rcx]
 107     movdqa      xmm7, [rcx + 16]
 108
 109     ; x + round
 110     paddw       xmm1, xmm2
 111     paddw       xmm5, xmm6
 112
 113     ; y = x * quant_ptr >> 16
 114     pmulhw      xmm3, xmm1
 115     pmulhw      xmm7, xmm5
 116
 117     ; y += x
 118     paddw       xmm1, xmm3
 119     paddw       xmm5, xmm7
 120
 121     movdqa      [rsp + temp_qcoeff], xmm1
 122     movdqa      [rsp + temp_qcoeff + 16], xmm5
 123
 124     pxor        xmm6, xmm6
 125     ; zero qcoeff
 126     movdqa      [rsp + qcoeff], xmm6
 127     movdqa      [rsp + qcoeff + 16], xmm6
 128
 129     mov         rdx, [rdi + vp8_block_zrun_zbin_boost] ; zbin_boost_ptr
 130     mov         rax, [rdi + vp8_block_quant_shift] ; quant_shift_ptr
 131     mov         [rsp + zrun_zbin_boost], rdx
 132
 133 %macro ZIGZAG_LOOP 1
 134     ; x
 135     movsx       ecx, WORD PTR[rsp + abs_minus_zbin + %1 * 2]
 136
 137     ; if (x >= zbin)
 138     sub         cx, WORD PTR[rdx]           ; x - zbin
 139     lea         rdx, [rdx + 2]              ; zbin_boost_ptr++
 140     jl          .rq_zigzag_loop_%1           ; x < zbin
 141
 142     movsx       edi, WORD PTR[rsp + temp_qcoeff + %1 * 2]
 143
 144     ; downshift by quant_shift[rc]
 145     movsx       cx, BYTE PTR[rax + %1]      ; quant_shift_ptr[rc]
 146     sar         edi, cl                     ; also sets Z bit
 147     je          .rq_zigzag_loop_%1           ; !y
 148     mov         WORD PTR[rsp + qcoeff + %1 * 2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc]
 149     mov         rdx, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost
 150 .rq_zigzag_loop_%1:
 151 %endmacro
 152 ; in vp8_default_zig_zag1d order: see vp8/common/entropy.c
 153 ZIGZAG_LOOP  0
 154 ZIGZAG_LOOP  1
 155 ZIGZAG_LOOP  4
 156 ZIGZAG_LOOP  8
 157 ZIGZAG_LOOP  5
 158 ZIGZAG_LOOP  2
 159 ZIGZAG_LOOP  3
 160 ZIGZAG_LOOP  6
 161 ZIGZAG_LOOP  9
 162 ZIGZAG_LOOP 12
 163 ZIGZAG_LOOP 13
 164 ZIGZAG_LOOP 10
 165 ZIGZAG_LOOP  7
 166 ZIGZAG_LOOP 11
 167 ZIGZAG_LOOP 14
 168 ZIGZAG_LOOP 15
 169
 170     movdqa      xmm2, [rsp + qcoeff]
 171     movdqa      xmm3, [rsp + qcoeff + 16]
 172
 173     mov         rcx, [rsi + vp8_blockd_dequant] ; dequant_ptr
 174     mov         rdi, [rsi + vp8_blockd_dqcoeff] ; dqcoeff_ptr
 175
 176     ; y ^ sz
 177     pxor        xmm2, xmm0
 178     pxor        xmm3, xmm4
 179     ; x = (y ^ sz) - sz
 180     psubw       xmm2, xmm0
 181     psubw       xmm3, xmm4
 182
 183     ; dequant
 184     movdqa      xmm0, [rcx]
 185     movdqa      xmm1, [rcx + 16]
 186
 187     mov         rcx, [rsi + vp8_blockd_qcoeff] ; qcoeff_ptr
 188
 189     pmullw      xmm0, xmm2
 190     pmullw      xmm1, xmm3
 191
 192     movdqa      [rcx], xmm2        ; store qcoeff
 193     movdqa      [rcx + 16], xmm3
 194     movdqa      [rdi], xmm0        ; store dqcoeff
 195     movdqa      [rdi + 16], xmm1
 196
 197     ; select the last value (in zig_zag order) for EOB
 198     pcmpeqw     xmm2, xmm6
 199     pcmpeqw     xmm3, xmm6
 200     ; !
 201     pcmpeqw     xmm6, xmm6
 202     pxor        xmm2, xmm6
 203     pxor        xmm3, xmm6
 204     ; mask inv_zig_zag
 205     pand        xmm2, [GLOBAL(inv_zig_zag)]
 206     pand        xmm3, [GLOBAL(inv_zig_zag + 16)]
 207     ; select the max value
 208     pmaxsw      xmm2, xmm3
 209     pshufd      xmm3, xmm2, 00001110b
 210     pmaxsw      xmm2, xmm3
 211     pshuflw     xmm3, xmm2, 00001110b
 212     pmaxsw      xmm2, xmm3
 213     pshuflw     xmm3, xmm2, 00000001b
 214     pmaxsw      xmm2, xmm3
 215     movd        eax, xmm2
 216     and         eax, 0xff
 217     mov         [rsi + vp8_blockd_eob], eax
 218
 219     ; begin epilog
 220     add         rsp, stack_size
 221     pop         rsp
 222 %if ABI_IS_32BIT
 223     pop         rsi
 224     pop         rdi
 225 %else
 226   %ifidn __OUTPUT_FORMAT__,x64
 227     pop         rsi
 228     pop         rdi
 229   %endif
 230 %endif
 231     RESTORE_GOT
 232     RESTORE_XMM
 233     pop         rbp
 234     ret
 235
 236 ; void vp8_fast_quantize_b_sse2 | arg
 237 ;  (BLOCK  *b,                  |  0
 238 ;   BLOCKD *d)                  |  1
 239
 240 global sym(vp8_fast_quantize_b_sse2)
 241 sym(vp8_fast_quantize_b_sse2):
 242     push        rbp
 243     mov         rbp, rsp
 244     GET_GOT     rbx
 245
 246 %if ABI_IS_32BIT
 247     push        rdi
 248     push        rsi
 249 %else
 250   %ifidn __OUTPUT_FORMAT__,x64
 251     push        rdi
 252     push        rsi
 253   %else
 254     ; these registers are used for passing arguments
 255   %endif
 256 %endif
 257
 258     ; end prolog
 259
 260 %if ABI_IS_32BIT
 261     mov         rdi, arg(0)                 ; BLOCK *b
 262     mov         rsi, arg(1)                 ; BLOCKD *d
 263 %else
 264   %ifidn __OUTPUT_FORMAT__,x64
 265     mov         rdi, rcx                    ; BLOCK *b
 266     mov         rsi, rdx                    ; BLOCKD *d
 267   %else
 268     ;mov         rdi, rdi                    ; BLOCK *b
 269     ;mov         rsi, rsi                    ; BLOCKD *d
 270   %endif
 271 %endif
 272
 273     mov         rax, [rdi + vp8_block_coeff]
 274     mov         rcx, [rdi + vp8_block_round]
 275     mov         rdx, [rdi + vp8_block_quant_fast]
 276
 277     ; z = coeff
 278     movdqa      xmm0, [rax]
 279     movdqa      xmm4, [rax + 16]
 280
 281     ; dup z so we can save sz
 282     movdqa      xmm1, xmm0
 283     movdqa      xmm5, xmm4
 284
 285     ; sz = z >> 15
 286     psraw       xmm0, 15
 287     psraw       xmm4, 15
 288
 289     ; x = abs(z) = (z ^ sz) - sz
 290     pxor        xmm1, xmm0
 291     pxor        xmm5, xmm4
 292     psubw       xmm1, xmm0
 293     psubw       xmm5, xmm4
 294
 295     ; x += round
 296     paddw       xmm1, [rcx]
 297     paddw       xmm5, [rcx + 16]
 298
 299     mov         rax, [rsi + vp8_blockd_qcoeff]
 300     mov         rcx, [rsi + vp8_blockd_dequant]
 301     mov         rdi, [rsi + vp8_blockd_dqcoeff]
 302
 303     ; y = x * quant >> 16
 304     pmulhw      xmm1, [rdx]
 305     pmulhw      xmm5, [rdx + 16]
 306
 307     ; x = (y ^ sz) - sz
 308     pxor        xmm1, xmm0
 309     pxor        xmm5, xmm4
 310     psubw       xmm1, xmm0
 311     psubw       xmm5, xmm4
 312
 313     ; qcoeff = x
 314     movdqa      [rax], xmm1
 315     movdqa      [rax + 16], xmm5
 316
 317     ; x * dequant
 318     movdqa      xmm2, xmm1
 319     movdqa      xmm3, xmm5
 320     pmullw      xmm2, [rcx]
 321     pmullw      xmm3, [rcx + 16]
 322
 323     ; dqcoeff = x * dequant
 324     movdqa      [rdi], xmm2
 325     movdqa      [rdi + 16], xmm3
 326
 327     pxor        xmm4, xmm4                  ;clear all bits
 328     pcmpeqw     xmm1, xmm4
 329     pcmpeqw     xmm5, xmm4
 330
 331     pcmpeqw     xmm4, xmm4                  ;set all bits
 332     pxor        xmm1, xmm4
 333     pxor        xmm5, xmm4
 334
 335     pand        xmm1, [GLOBAL(inv_zig_zag)]
 336     pand        xmm5, [GLOBAL(inv_zig_zag + 16)]
 337
 338     pmaxsw      xmm1, xmm5
 339
 340     ; now down to 8
 341     pshufd      xmm5, xmm1, 00001110b
 342
 343     pmaxsw      xmm1, xmm5
 344
 345     ; only 4 left
 346     pshuflw     xmm5, xmm1, 00001110b
 347
 348     pmaxsw      xmm1, xmm5
 349
 350     ; okay, just 2!
 351     pshuflw     xmm5, xmm1, 00000001b
 352
 353     pmaxsw      xmm1, xmm5
 354
 355     movd        eax, xmm1
 356     and         eax, 0xff
 357     mov         [rsi + vp8_blockd_eob], eax
 358
 359     ; begin epilog
 360 %if ABI_IS_32BIT
 361     pop         rsi
 362     pop         rdi
 363 %else
 364   %ifidn __OUTPUT_FORMAT__,x64
 365     pop         rsi
 366     pop         rdi
 367   %endif
 368 %endif
 369
 370     RESTORE_GOT
 371     pop         rbp
 372     ret
 373
 374 SECTION_RODATA
 375 align 16
 376 inv_zig_zag:
 377   dw 0x0001, 0x0002, 0x0006, 0x0007
 378   dw 0x0003, 0x0005, 0x0008, 0x000d
 379   dw 0x0004, 0x0009, 0x000c, 0x000e
 380   dw 0x000a, 0x000b, 0x000f, 0x0010