vp8/common/x86/idctllm_mmx.asm

   1 ;
   2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
   3 ;
   4 ;  Use of this source code is governed by a BSD-style license
   5 ;  that can be found in the LICENSE file in the root of the source
   6 ;  tree. An additional intellectual property rights grant can be found
   7 ;  in the file PATENTS.  All contributing project authors may
   8 ;  be found in the AUTHORS file in the root of the source tree.
   9 ;
  10
  11
  12 %include "vpx_ports/x86_abi_support.asm"
  13
  14 ; /****************************************************************************
  15 ; * Notes:
  16 ; *
  17 ; * This implementation makes use of 16-bit fixed-point versions of two
  18 ; * multiply constants:
  19 ; *        1.   sqrt(2) * cos (pi/8)
  20 ; *        2.   sqrt(2) * sin (pi/8)
  21 ; * Because the first constant is bigger than 1, to maintain the same 16-bit
  22 ; * fixed-point precision as the second one, we use the identity
  23 ; *        x * a = x + x*(a-1)
  24 ; * so
  25 ; *        x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) * cos(pi/8) - 1).
  26 ; *
  27 ; * For the second constant, the 16-bit version is 35468, which is bigger
  28 ; * than 32767, so in a signed 16-bit multiply it becomes a negative number
  29 ; * (35468 - 65536). Adding x back after the multiply compensates for that:
  30 ; *        (x * (unsigned)35468 >> 16) = (x * (signed)35468 >> 16) + x
  31 ; *
  32 ; **************************************************************************/
  33
  34
  35 ;void vp8_short_idct4x4llm_mmx(short *input, short *output, int pitch) -- full 4x4 inverse transform: reads 16 shorts from input, writes four vectors of 4 shorts to output, 'pitch' bytes apart
  36 global sym(vp8_short_idct4x4llm_mmx)
  37 sym(vp8_short_idct4x4llm_mmx):
  38     push        rbp
  39     mov         rbp, rsp
  40     SHADOW_ARGS_TO_STACK 3
  41     GET_GOT     rbx
  42     ; end prolog
  43         ; Structure: butterfly pass, 4x4 transpose, second butterfly pass with rounding ((v + 4) >> 3), transpose, store.
  44         mov         rax,            arg(0) ;input
  45         mov         rdx,            arg(1) ;output
  46         ; Load the 16 coefficients as four 8-byte groups (called rows 0..3 below); every MMX op handles 4 word lanes at once.
  47         movq        mm0,            [rax   ]        ; row 0
  48         movq        mm1,            [rax+ 8]        ; row 1 (ip1)
  49
  50         movq        mm2,            [rax+16]        ; row 2
  51         movq        mm3,            [rax+24]        ; row 3 (ip3)
  52         ; the input pointer is not used after the loads above, so rax is reused for pitch
  53         movsxd      rax,            dword ptr arg(2) ;pitch (sign-extended int, used as a byte offset by the stores at the end)
  54         ; ---- first pass ----
  55         psubw       mm0,            mm2             ; b1 = row0 - row2
  56         paddw       mm2,            mm2             ; 2 * row2
  57
  58         movq        mm5,            mm1             ; copy ip1
  59         paddw       mm2,            mm0             ; a1 = b1 + 2*row2 = row0 + row2
  60
  61         pmulhw      mm5,            [GLOBAL(x_s1sqr2)]       ; signed high half of ip1 * 0x8A8C
  62         paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2) (the add undoes the signed wrap, see Notes at top)
  63
  64         movq        mm7,            mm3             ; copy ip3
  65         pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)]   ; ip3 * (sqrt(2)*cos(pi/8) - 1)
  66
  67         paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
  68         psubw       mm7,            mm5             ; c1 term, held as ip3*cos(pi/8)*sqrt(2) - ip1*sin(pi/8)*sqrt(2)
  69
  70         movq        mm5,            mm1             ; copy ip1
  71         movq        mm4,            mm3             ; keep ip3; mm3 is overwritten by the multiply below
  72
  73         pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]
  74         paddw       mm5,            mm1             ; ip1 * cos(pi/8) * sqrt(2)
  75
  76         pmulhw      mm3,            [GLOBAL(x_s1sqr2)]
  77         paddw       mm3,            mm4             ; ip3 * sin(pi/8) * sqrt(2)
  78
  79         paddw       mm3,            mm5             ; d1 = ip1*cos(pi/8)*sqrt(2) + ip3*sin(pi/8)*sqrt(2)
  80         movq        mm6,            mm2             ; a1
  81
  82         movq        mm4,            mm0             ; b1
  83         paddw       mm2,            mm3             ; a1 + d1  -> row 0
  84
  85         paddw       mm4,            mm7             ; b1 + mm7 -> transposed below as row 2
  86         psubw       mm0,            mm7             ; b1 - mm7 -> transposed below as row 1
  87
  88         psubw       mm6,            mm3             ; a1 - d1  -> row 3
  89         ; 4x4 transpose of mm2/mm0/mm4/mm6; comments list the words high to low as <row><column>
  90         movq        mm1,            mm2             ; 03 02 01 00
  91         movq        mm3,            mm4             ; 23 22 21 20
  92
  93         punpcklwd   mm1,            mm0             ; 11 01 10 00
  94         punpckhwd   mm2,            mm0             ; 13 03 12 02
  95
  96         punpcklwd   mm3,            mm6             ; 31 21 30 20
  97         punpckhwd   mm4,            mm6             ; 33 23 32 22
  98
  99         movq        mm0,            mm1             ; 11 01 10 00
 100         movq        mm5,            mm2             ; 13 03 12 02
 101
 102         punpckldq   mm0,            mm3             ; 30 20 10 00
 103         punpckhdq   mm1,            mm3             ; 31 21 11 01
 104
 105         punpckldq   mm2,            mm4             ; 32 22 12 02
 106         punpckhdq   mm5,            mm4             ; 33 23 13 03
 107         ; move the last transposed vector into mm3 so mm0..mm3 mirror the first-pass inputs
 108         movq        mm3,            mm5             ; 33 23 13 03
 109         ; ---- second pass: same butterfly on the transposed data, plus rounding ----
 110         psubw       mm0,            mm2             ; b1 = mm0 - mm2
 111         paddw       mm2,            mm2             ; 2 * old mm2
 112
 113         movq        mm5,            mm1             ; copy ip1
 114         paddw       mm2,            mm0             ; a1 = old mm0 + old mm2
 115
 116         pmulhw      mm5,            [GLOBAL(x_s1sqr2)]        ; signed high half of ip1 * 0x8A8C
 117         paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)
 118
 119         movq        mm7,            mm3             ; copy ip3
 120         pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)]   ; ip3 * (sqrt(2)*cos(pi/8) - 1)
 121
 122         paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
 123         psubw       mm7,            mm5             ; c1 term, held as ip3*cos(pi/8)*sqrt(2) - ip1*sin(pi/8)*sqrt(2)
 124
 125         movq        mm5,            mm1             ; copy ip1
 126         movq        mm4,            mm3             ; keep ip3; mm3 is overwritten by the multiply below
 127
 128         pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]
 129         paddw       mm5,            mm1             ; ip1 * cos(pi/8) * sqrt(2)
 130
 131         pmulhw      mm3,            [GLOBAL(x_s1sqr2)]
 132         paddw       mm3,            mm4             ; ip3 * sin(pi/8) * sqrt(2)
 133
 134         paddw       mm3,            mm5             ; d1
 135         paddw       mm0,            [GLOBAL(fours)] ; b1 += 4: rounding bias for the >> 3 below
 136
 137         paddw       mm2,            [GLOBAL(fours)] ; a1 += 4 (each output is a1 +/- d1 or b1 +/- mm7, so each gets the bias once)
 138         movq        mm6,            mm2             ; a1
 139
 140         movq        mm4,            mm0             ; b1
 141         paddw       mm2,            mm3             ; a1 + d1  -> row 0
 142
 143         paddw       mm4,            mm7             ; b1 + mm7 -> transposed below as row 2
 144         psubw       mm0,            mm7             ; b1 - mm7 -> transposed below as row 1
 145
 146         psubw       mm6,            mm3             ; a1 - d1  -> row 3
 147         psraw       mm2,            3               ; arithmetic shift completes (v + 4) >> 3
 148
 149         psraw       mm0,            3
 150         psraw       mm4,            3
 151
 152         psraw       mm6,            3
 153         ; second 4x4 transpose, same interleave pattern as after the first pass
 154         movq        mm1,            mm2             ; 03 02 01 00
 155         movq        mm3,            mm4             ; 23 22 21 20
 156
 157         punpcklwd   mm1,            mm0             ; 11 01 10 00
 158         punpckhwd   mm2,            mm0             ; 13 03 12 02
 159
 160         punpcklwd   mm3,            mm6             ; 31 21 30 20
 161         punpckhwd   mm4,            mm6             ; 33 23 32 22
 162
 163         movq        mm0,            mm1             ; 11 01 10 00
 164         movq        mm5,            mm2             ; 13 03 12 02
 165
 166         punpckldq   mm0,            mm3             ; 30 20 10 00
 167         punpckhdq   mm1,            mm3             ; 31 21 11 01
 168
 169         punpckldq   mm2,            mm4             ; 32 22 12 02
 170         punpckhdq   mm5,            mm4             ; 33 23 13 03
 171         ; store the four result vectors (8 bytes each) to output, pitch bytes apart
 172         movq        [rdx],          mm0             ; output
 173
 174         movq        [rdx+rax],      mm1             ; output + pitch
 175         movq        [rdx+rax*2],    mm2             ; output + 2*pitch
 176
 177         add         rdx,            rax             ; rdx = output + pitch
 178         movq        [rdx+rax*2],    mm5             ; output + 3*pitch
 179
 180     ; begin epilog
 181     RESTORE_GOT
 182     UNSHADOW_ARGS
 183     pop         rbp
 184     ret
 185
 186
 187 ; void vp8_short_idct4x4llm_1_mmx(short *input, short *output, int pitch) -- DC-only path: fills four 4-short rows of output with (input[0] + 4) >> 3
 188 global sym(vp8_short_idct4x4llm_1_mmx)
 189 sym(vp8_short_idct4x4llm_1_mmx):
 190     push        rbp
 191     mov         rbp, rsp
 192     SHADOW_ARGS_TO_STACK 3
 193     GET_GOT     rbx
 194     ; end prolog
 195
 196     ; fetch the coefficient first, then the output pointer and pitch
 197     mov         rax, arg(0)                     ; rax = input
 198     movd        mm0, [rax]                      ; word lane 0 = input[0]; lane 1 is discarded by the broadcast below
 199     mov         rdx, arg(1)                     ; rdx = output
 200     movsxd      rax, dword ptr arg(2)           ; rax = pitch (the input pointer is no longer needed)
 201
 202     ; dc = (input[0] + 4) >> 3, arithmetic shift
 203     paddw       mm0, [GLOBAL(fours)]
 204     psraw       mm0, 3
 205
 206     ; replicate word lane 0 into all four word lanes
 207     punpcklwd   mm0, mm0
 208     punpckldq   mm0, mm0
 209
 210     ; write the same 8 bytes to four rows, pitch bytes apart
 211     movq        [rdx], mm0                      ; output
 212     movq        [rdx+rax], mm0                  ; output + pitch
 213     movq        [rdx+rax*2], mm0                ; output + 2*pitch
 214     add         rdx, rax                        ; rdx = output + pitch
 215     movq        [rdx+rax*2], mm0                ; output + 3*pitch
 216
 217     ; begin epilog
 218     RESTORE_GOT
 219     UNSHADOW_ARGS
 220     pop         rbp
 221     ret
 222
 223 ; void vp8_dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr, unsigned char *dst_ptr, int pitch, int stride) -- adds (input_dc + 4) >> 3 to a 4x4 block of prediction bytes and stores the clamped result
 224 global sym(vp8_dc_only_idct_add_mmx)
 225 sym(vp8_dc_only_idct_add_mmx):
 226     push        rbp
 227     mov         rbp, rsp
 228     SHADOW_ARGS_TO_STACK 5
 229     GET_GOT     rbx
 230     push        rsi
 231     push        rdi
 232     ; end prolog
 233     ; mm5 = (input_dc + 4) >> 3 replicated into all four word lanes
 234     movd        mm5, arg(0)                     ; word lane 0 = input_dc
 235     paddw       mm5, [GLOBAL(fours)]
 236     psraw       mm5, 3
 237     punpcklwd   mm5, mm5
 238     punpckldq   mm5, mm5
 239
 240     pxor        mm0, mm0                        ; mm0 = 0, used to widen bytes to words and as pack filler
 241     mov         rsi, arg(1)                     ; rsi = pred_ptr
 242     mov         rdi, arg(2)                     ; rdi = dst_ptr
 243     movsxd      rdx, dword ptr arg(3)           ; rdx = pitch, byte step between prediction rows
 244     movsxd      rax, dword ptr arg(4)           ; rax = stride, byte step between destination rows
 245
 246     ; row 0: widen 4 prediction bytes to words, add dc, clamp to 0..255, store 4 bytes
 247     movd        mm1, [rsi]
 248     punpcklbw   mm1, mm0
 249     paddsw      mm1, mm5
 250     packuswb    mm1, mm0                        ; unsigned-saturating pack back to bytes
 251     movd        [rdi], mm1
 252
 253     ; row 1
 254     movd        mm2, [rsi+rdx]
 255     punpcklbw   mm2, mm0
 256     paddsw      mm2, mm5
 257     packuswb    mm2, mm0
 258     movd        [rdi+rax], mm2
 259
 260     ; row 2
 261     movd        mm3, [rsi+rdx*2]
 262     punpcklbw   mm3, mm0
 263     paddsw      mm3, mm5
 264     packuswb    mm3, mm0
 265     movd        [rdi+rax*2], mm3
 266
 267     ; row 3: advance both pointers by one row so base + 2*step reaches the fourth row
 268     add         rdi, rax
 269     add         rsi, rdx
 270     movd        mm4, [rsi+rdx*2]
 271     punpcklbw   mm4, mm0
 272     paddsw      mm4, mm5
 273     packuswb    mm4, mm0
 274     movd        [rdi+rax*2], mm4
 275     ; begin epilog
 276     pop rdi
 277     pop rsi
 278     RESTORE_GOT
 279     UNSHADOW_ARGS
 280     pop         rbp
 281     ret
 282
 283 SECTION_RODATA ; read-only constants; each is one word value repeated 4 times (8 bytes, one MMX register)
 284 align 16
 285 x_s1sqr2: ; sqrt(2) * sin(pi/8) in 16-bit fixed point
 286     times 4 dw 0x8A8C ; 35468; negative as a signed word, so the code adds x back after pmulhw (see Notes at top)
 287 align 16
 288 x_c1sqr2less1: ; sqrt(2) * cos(pi/8) - 1 in 16-bit fixed point
 289     times 4 dw 0x4E7B ; 20091; the code adds x back after pmulhw to restore the "+1"
 290 align 16
 291 fours: ; rounding bias added before the psraw-by-3 in each function of this file
 292     times 4 dw 0x0004