/* theora-old/lib/x86_64/recon_mmx.c */
/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
 * by the Xiph.Org Foundation http://www.xiph.org/                  *
 *                                                                  *
 ********************************************************************

  function:
  last mod: $Id$

 ********************************************************************/
#include "codec_internal.h"

/* Local 64-bit type; 32-bit strides are widened to it for 64-bit addressing. */
typedef unsigned long long ogg_uint64_t;

/* 0x80 in every byte: XORing packed signed bytes with this flips them to
   unsigned, i.e. adds 128 to each byte. */
static const __attribute__ ((aligned(8),used)) ogg_int64_t V128 = 0x8080808080808080LL;
static void copy8x8__mmx (unsigned char *src,
                          unsigned char *dest,
                          ogg_uint32_t stride)
{
  __asm__ __volatile__ (
    "  .balign 16                      \n\t"

    "  lea         (%2, %2, 2), %%rdi  \n\t" /* rdi = 3*stride */

    "  movq        (%1), %%mm0         \n\t" /* load source rows 0-3 */
    "  movq        (%1, %2), %%mm1     \n\t"
    "  movq        (%1, %2, 2), %%mm2  \n\t"
    "  movq        (%1, %%rdi), %%mm3  \n\t"

    "  lea         (%1, %2, 4), %1     \n\t" /* src += 4*stride */

    "  movq        %%mm0, (%0)         \n\t" /* store rows 0-3 */
    "  movq        %%mm1, (%0, %2)     \n\t"
    "  movq        %%mm2, (%0, %2, 2)  \n\t"
    "  movq        %%mm3, (%0, %%rdi)  \n\t"

    "  lea         (%0, %2, 4), %0     \n\t" /* dest += 4*stride */

    "  movq        (%1), %%mm0         \n\t" /* load source rows 4-7 */
    "  movq        (%1, %2), %%mm1     \n\t"
    "  movq        (%1, %2, 2), %%mm2  \n\t"
    "  movq        (%1, %%rdi), %%mm3  \n\t"

    "  movq        %%mm0, (%0)         \n\t" /* store rows 4-7 */
    "  movq        %%mm1, (%0, %2)     \n\t"
    "  movq        %%mm2, (%0, %2, 2)  \n\t"
    "  movq        %%mm3, (%0, %%rdi)  \n\t"
      : "+a" (dest),  /* both pointers are advanced inside the asm, */
        "+c" (src)    /* so they must be in-out operands            */
      : "d" ((ogg_uint64_t)stride)
      : "memory", "rdi"
  );
}
static void recon_intra8x8__mmx (unsigned char *ReconPtr, ogg_int16_t *ChangePtr,
                                 ogg_uint32_t LineStep)
{
  __asm__ __volatile__ (
    "  .balign 16                 \n\t"

    "  movq        %[V128], %%mm0 \n\t" /* Set mm0 to 0x8080808080808080 */

    "  lea         128(%1), %%rdi \n\t" /* Endpoint in input buffer */
    "1:                           \n\t"
    "  movq        (%1), %%mm2    \n\t" /* First four input values */

    "  packsswb    8(%1), %%mm2   \n\t" /* pack with next (high) four values */
    "  por         %%mm0, %%mm0   \n\t" /* no-op */
    "  pxor        %%mm0, %%mm2   \n\t" /* Convert result to unsigned (same as add 128) */
    "  lea         16(%1), %1     \n\t" /* Step source buffer */
    "  cmp         %%rdi, %1      \n\t" /* are we done? */

    "  movq        %%mm2, (%0)    \n\t" /* store results */

    "  lea         (%0, %2), %0   \n\t" /* Step output buffer */
    "  jc          1b             \n\t" /* Loop back if we are not done */
      : "+r" (ReconPtr),
        "+r" (ChangePtr)          /* advanced inside the asm: in-out */
      : "r" ((ogg_uint64_t)LineStep),
        [V128] "m" (V128)
      : "memory", "rdi"
  );
}
static void recon_inter8x8__mmx (unsigned char *ReconPtr, unsigned char *RefPtr,
                                 ogg_int16_t *ChangePtr, ogg_uint32_t LineStep)
{
  __asm__ __volatile__ (
    "  .balign 16                 \n\t"

    "  pxor        %%mm0, %%mm0   \n\t" /* zero mm0 for byte->word unpacking */
    "  lea         128(%1), %%rdi \n\t" /* endpoint in change buffer */

    "1:                           \n\t"
    "  movq        (%2), %%mm2    \n\t" /* (+3 misaligned) 8 reference pixels */

    "  movq        (%1), %%mm4    \n\t" /* first 4 changes */
    "  movq        %%mm2, %%mm3   \n\t"
    "  movq        8(%1), %%mm5   \n\t" /* last 4 changes */
    "  punpcklbw   %%mm0, %%mm2   \n\t" /* turn first 4 refs into positive 16-bit #s */
    "  paddsw      %%mm4, %%mm2   \n\t" /* add in first 4 changes */
    "  punpckhbw   %%mm0, %%mm3   \n\t" /* turn last 4 refs into positive 16-bit #s */
    "  paddsw      %%mm5, %%mm3   \n\t" /* add in last 4 changes */
    "  add         %3, %2         \n\t" /* next row of reference pixels */
    "  packuswb    %%mm3, %%mm2   \n\t" /* pack result to unsigned 8-bit values */
    "  lea         16(%1), %1     \n\t" /* next row of changes */
    "  cmp         %%rdi, %1      \n\t" /* are we done? */

    "  movq        %%mm2, (%0)    \n\t" /* store result */

    "  lea         (%0, %3), %0   \n\t" /* next row of output */
    "  jc          1b             \n\t"
      : "+r" (ReconPtr),
        "+r" (ChangePtr),         /* advanced inside the asm: in-out */
        "+r" (RefPtr)
      : "r" ((ogg_uint64_t)LineStep)
      : "memory", "rdi"
  );
}
static void recon_inter8x8_half__mmx (unsigned char *ReconPtr, unsigned char *RefPtr1,
                                      unsigned char *RefPtr2, ogg_int16_t *ChangePtr,
                                      ogg_uint32_t LineStep)
{
  __asm__ __volatile__ (
    "  .balign 16                 \n\t"

    "  pxor        %%mm0, %%mm0   \n\t" /* zero mm0 for byte->word unpacking */
    "  lea         128(%1), %%rdi \n\t" /* endpoint in change buffer */

    "1:                           \n\t"
    "  movq        (%2), %%mm2    \n\t" /* (+3 misaligned) 8 reference pixels */
    "  movq        (%3), %%mm4    \n\t" /* (+3 misaligned) 8 reference pixels */

    "  movq        %%mm2, %%mm3   \n\t"
    "  punpcklbw   %%mm0, %%mm2   \n\t" /* mm2 = start ref1 as positive 16-bit #s */
    "  movq        %%mm4, %%mm5   \n\t"
    "  movq        (%1), %%mm6    \n\t" /* first 4 changes */
    "  punpckhbw   %%mm0, %%mm3   \n\t" /* mm3 = end ref1 as positive 16-bit #s */
    "  movq        8(%1), %%mm7   \n\t" /* last 4 changes */
    "  punpcklbw   %%mm0, %%mm4   \n\t" /* mm4 = start ref2 as positive 16-bit #s */
    "  punpckhbw   %%mm0, %%mm5   \n\t" /* mm5 = end ref2 as positive 16-bit #s */
    "  paddw       %%mm4, %%mm2   \n\t" /* mm2 = start (ref1 + ref2) */
    "  paddw       %%mm5, %%mm3   \n\t" /* mm3 = end (ref1 + ref2) */
    "  psrlw       $1, %%mm2      \n\t" /* mm2 = start (ref1 + ref2)/2 */
    "  psrlw       $1, %%mm3      \n\t" /* mm3 = end (ref1 + ref2)/2 */
    "  paddw       %%mm6, %%mm2   \n\t" /* add changes to start */
    "  paddw       %%mm7, %%mm3   \n\t" /* add changes to end */
    "  lea         16(%1), %1     \n\t" /* next row of changes */
    "  packuswb    %%mm3, %%mm2   \n\t" /* pack start|end to unsigned 8-bit */
    "  add         %4, %2         \n\t" /* next row of reference pixels */
    "  add         %4, %3         \n\t" /* next row of reference pixels */
    "  movq        %%mm2, (%0)    \n\t" /* store result */
    "  add         %4, %0         \n\t" /* next row of output */
    "  cmp         %%rdi, %1      \n\t" /* are we done? */
    "  jc          1b             \n\t"
      : "+r" (ReconPtr),
        "+r" (ChangePtr),         /* advanced inside the asm: in-out */
        "+r" (RefPtr1),
        "+r" (RefPtr2)
      : "r" ((ogg_uint64_t)LineStep)
      : "memory", "rdi"
  );
}
void dsp_mmx_recon_init(DspFunctions *funcs)
{
  TH_DEBUG("enabling accelerated x86_64 mmx recon functions.\n");
  funcs->copy8x8 = copy8x8__mmx;
  funcs->recon_intra8x8 = recon_intra8x8__mmx;
  funcs->recon_inter8x8 = recon_inter8x8__mmx;
  funcs->recon_inter8x8_half = recon_inter8x8_half__mmx;
}
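
/*
 * Usage sketch (hypothetical; everything here other than dsp_mmx_recon_init
 * and DspFunctions is an assumption, including the dsp_init and
 * cpu_has_mmx helpers): a caller fills a DspFunctions table with scalar
 * defaults and then lets this init routine override the recon entries once
 * MMX support has been detected.
 *
 *   DspFunctions funcs;
 *   dsp_init(&funcs);             // hypothetical: install scalar defaults
 *   if (cpu_has_mmx())            // hypothetical CPU-feature check
 *     dsp_mmx_recon_init(&funcs);
 */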