vp8/common/x86/recon_mmx.asm

   1 ;
   2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
   3 ;
   4 ;  Use of this source code is governed by a BSD-style license
   5 ;  that can be found in the LICENSE file in the root of the source
   6 ;  tree. An additional intellectual property rights grant can be found
   7 ;  in the file PATENTS.  All contributing project authors may
   8 ;  be found in the AUTHORS file in the root of the source tree.
   9 ;
  10
  11
  12 %include "vpx_ports/x86_abi_support.asm"
  13 ;void vp8_recon_b_mmx(unsigned char *s, short *q, unsigned char *d, int stride)
  14 global sym(vp8_recon_b_mmx)
  15 sym(vp8_recon_b_mmx):
  16     push        rbp
  17     mov         rbp, rsp
  18     SHADOW_ARGS_TO_STACK 4
  19     push        rsi
  20     push        rdi
  21     ; end prolog
  22
  23         mov       rsi, arg(0) ;s
  24         mov       rdi, arg(2) ;d
  25         mov       rdx, arg(1) ;q
  26         movsxd    rax, dword ptr arg(3) ;stride
  27         pxor      mm0, mm0
  28
  29         movd      mm1, [rsi]
  30         punpcklbw mm1, mm0
  31         paddsw    mm1, [rdx]
  32         packuswb  mm1,  mm0              ; pack and unpack to saturate
  33         movd      [rdi], mm1
  34
  35         movd      mm2, [rsi+16]
  36         punpcklbw mm2, mm0
  37         paddsw    mm2, [rdx+32]
  38         packuswb  mm2, mm0              ; pack and unpack to saturate
  39         movd      [rdi+rax], mm2
  40
  41         movd      mm3, [rsi+32]
  42         punpcklbw mm3, mm0
  43         paddsw    mm3, [rdx+64]
  44         packuswb  mm3,  mm0              ; pack and unpack to saturate
  45         movd      [rdi+2*rax], mm3
  46
  47         add       rdi, rax
  48         movd      mm4, [rsi+48]
  49         punpcklbw mm4, mm0
  50         paddsw    mm4, [rdx+96]
  51         packuswb  mm4, mm0              ; pack and unpack to saturate
  52         movd      [rdi+2*rax], mm4
  53
  54     ; begin epilog
  55     pop rdi
  56     pop rsi
  57     UNSHADOW_ARGS
  58     pop         rbp
  59     ret
  60
  61
  62 ;void copy_mem8x8_mmx(
  63 ;    unsigned char *src,
  64 ;    int src_stride,
  65 ;    unsigned char *dst,
  66 ;    int dst_stride
  67 ;    )
  68 global sym(vp8_copy_mem8x8_mmx)
  69 sym(vp8_copy_mem8x8_mmx):
  70     push        rbp
  71     mov         rbp, rsp
  72     SHADOW_ARGS_TO_STACK 4
  73     push        rsi
  74     push        rdi
  75     ; end prolog
  76
  77         mov         rsi,        arg(0) ;src;
  78         movq        mm0,        [rsi]
  79
  80         movsxd      rax,        dword ptr arg(1) ;src_stride;
  81         mov         rdi,        arg(2) ;dst;
  82
  83         movq        mm1,        [rsi+rax]
  84         movq        mm2,        [rsi+rax*2]
  85
  86         movsxd      rcx,        dword ptr arg(3) ;dst_stride
  87         lea         rsi,        [rsi+rax*2]
  88
  89         movq        [rdi],      mm0
  90         add         rsi,        rax
  91
  92         movq        [rdi+rcx],      mm1
  93         movq        [rdi+rcx*2],    mm2
  94
  95
  96         lea         rdi,        [rdi+rcx*2]
  97         movq        mm3,        [rsi]
  98
  99         add         rdi,        rcx
 100         movq        mm4,        [rsi+rax]
 101
 102         movq        mm5,        [rsi+rax*2]
 103         movq        [rdi],      mm3
 104
 105         lea         rsi,        [rsi+rax*2]
 106         movq        [rdi+rcx],  mm4
 107
 108         movq        [rdi+rcx*2],    mm5
 109         lea         rdi,        [rdi+rcx*2]
 110
 111         movq        mm0,        [rsi+rax]
 112         movq        mm1,        [rsi+rax*2]
 113
 114         movq        [rdi+rcx],  mm0
 115         movq        [rdi+rcx*2],mm1
 116
 117     ; begin epilog
 118     pop rdi
 119     pop rsi
 120     UNSHADOW_ARGS
 121     pop         rbp
 122     ret
 123
 124
 125 ;void copy_mem8x4_mmx(
 126 ;    unsigned char *src,
 127 ;    int src_stride,
 128 ;    unsigned char *dst,
 129 ;    int dst_stride
 130 ;    )
 131 global sym(vp8_copy_mem8x4_mmx)
 132 sym(vp8_copy_mem8x4_mmx):
 133     push        rbp
 134     mov         rbp, rsp
 135     SHADOW_ARGS_TO_STACK 4
 136     push        rsi
 137     push        rdi
 138     ; end prolog
 139
 140         mov         rsi,        arg(0) ;src;
 141         movq        mm0,        [rsi]
 142
 143         movsxd      rax,        dword ptr arg(1) ;src_stride;
 144         mov         rdi,        arg(2) ;dst;
 145
 146         movq        mm1,        [rsi+rax]
 147         movq        mm2,        [rsi+rax*2]
 148
 149         movsxd      rcx,        dword ptr arg(3) ;dst_stride
 150         lea         rsi,        [rsi+rax*2]
 151
 152         movq        [rdi],      mm0
 153         movq        [rdi+rcx],      mm1
 154
 155         movq        [rdi+rcx*2],    mm2
 156         lea         rdi,        [rdi+rcx*2]
 157
 158         movq        mm3,        [rsi+rax]
 159         movq        [rdi+rcx],      mm3
 160
 161     ; begin epilog
 162     pop rdi
 163     pop rsi
 164     UNSHADOW_ARGS
 165     pop         rbp
 166     ret
 167
 168
 169 ;void copy_mem16x16_mmx(
 170 ;    unsigned char *src,
 171 ;    int src_stride,
 172 ;    unsigned char *dst,
 173 ;    int dst_stride
 174 ;    )
 175 global sym(vp8_copy_mem16x16_mmx)
 176 sym(vp8_copy_mem16x16_mmx):
 177     push        rbp
 178     mov         rbp, rsp
 179     SHADOW_ARGS_TO_STACK 4
 180     push        rsi
 181     push        rdi
 182     ; end prolog
 183
 184         mov         rsi,        arg(0) ;src;
 185         movsxd      rax,        dword ptr arg(1) ;src_stride;
 186
 187         mov         rdi,        arg(2) ;dst;
 188         movsxd      rcx,        dword ptr arg(3) ;dst_stride
 189
 190         movq        mm0,            [rsi]
 191         movq        mm3,            [rsi+8];
 192
 193         movq        mm1,            [rsi+rax]
 194         movq        mm4,            [rsi+rax+8]
 195
 196         movq        mm2,            [rsi+rax*2]
 197         movq        mm5,            [rsi+rax*2+8]
 198
 199         lea         rsi,            [rsi+rax*2]
 200         add         rsi,            rax
 201
 202         movq        [rdi],          mm0
 203         movq        [rdi+8],        mm3
 204
 205         movq        [rdi+rcx],      mm1
 206         movq        [rdi+rcx+8],    mm4
 207
 208         movq        [rdi+rcx*2],    mm2
 209         movq        [rdi+rcx*2+8],  mm5
 210
 211         lea         rdi,            [rdi+rcx*2]
 212         add         rdi,            rcx
 213
 214         movq        mm0,            [rsi]
 215         movq        mm3,            [rsi+8];
 216
 217         movq        mm1,            [rsi+rax]
 218         movq        mm4,            [rsi+rax+8]
 219
 220         movq        mm2,            [rsi+rax*2]
 221         movq        mm5,            [rsi+rax*2+8]
 222
 223         lea         rsi,            [rsi+rax*2]
 224         add         rsi,            rax
 225
 226         movq        [rdi],          mm0
 227         movq        [rdi+8],        mm3
 228
 229         movq        [rdi+rcx],      mm1
 230         movq        [rdi+rcx+8],    mm4
 231
 232         movq        [rdi+rcx*2],    mm2
 233         movq        [rdi+rcx*2+8],  mm5
 234
 235         lea         rdi,            [rdi+rcx*2]
 236         add         rdi,            rcx
 237
 238         movq        mm0,            [rsi]
 239         movq        mm3,            [rsi+8];
 240
 241         movq        mm1,            [rsi+rax]
 242         movq        mm4,            [rsi+rax+8]
 243
 244         movq        mm2,            [rsi+rax*2]
 245         movq        mm5,            [rsi+rax*2+8]
 246
 247         lea         rsi,            [rsi+rax*2]
 248         add         rsi,            rax
 249
 250         movq        [rdi],          mm0
 251         movq        [rdi+8],        mm3
 252
 253         movq        [rdi+rcx],      mm1
 254         movq        [rdi+rcx+8],    mm4
 255
 256         movq        [rdi+rcx*2],    mm2
 257         movq        [rdi+rcx*2+8],  mm5
 258
 259         lea         rdi,            [rdi+rcx*2]
 260         add         rdi,            rcx
 261
 262         movq        mm0,            [rsi]
 263         movq        mm3,            [rsi+8];
 264
 265         movq        mm1,            [rsi+rax]
 266         movq        mm4,            [rsi+rax+8]
 267
 268         movq        mm2,            [rsi+rax*2]
 269         movq        mm5,            [rsi+rax*2+8]
 270
 271         lea         rsi,            [rsi+rax*2]
 272         add         rsi,            rax
 273
 274         movq        [rdi],          mm0
 275         movq        [rdi+8],        mm3
 276
 277         movq        [rdi+rcx],      mm1
 278         movq        [rdi+rcx+8],    mm4
 279
 280         movq        [rdi+rcx*2],    mm2
 281         movq        [rdi+rcx*2+8],  mm5
 282
 283         lea         rdi,            [rdi+rcx*2]
 284         add         rdi,            rcx
 285
 286         movq        mm0,            [rsi]
 287         movq        mm3,            [rsi+8];
 288
 289         movq        mm1,            [rsi+rax]
 290         movq        mm4,            [rsi+rax+8]
 291
 292         movq        mm2,            [rsi+rax*2]
 293         movq        mm5,            [rsi+rax*2+8]
 294
 295         lea         rsi,            [rsi+rax*2]
 296         add         rsi,            rax
 297
 298         movq        [rdi],          mm0
 299         movq        [rdi+8],        mm3
 300
 301         movq        [rdi+rcx],      mm1
 302         movq        [rdi+rcx+8],    mm4
 303
 304         movq        [rdi+rcx*2],    mm2
 305         movq        [rdi+rcx*2+8],  mm5
 306
 307         lea         rdi,            [rdi+rcx*2]
 308         add         rdi,            rcx
 309
 310         movq        mm0,            [rsi]
 311         movq        mm3,            [rsi+8];
 312
 313         movq        [rdi],          mm0
 314         movq        [rdi+8],        mm3
 315
 316     ; begin epilog
 317     pop rdi
 318     pop rsi
 319     UNSHADOW_ARGS
 320     pop         rbp
 321     ret