/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
 * by the Xiph.Org Foundation http://www.xiph.org/                  *
 *                                                                  *
 ********************************************************************

  function:
  last mod: $Id$

 ********************************************************************/
#include <stdlib.h>

#include "codec_internal.h"
#include "dsp.h"

#if defined(USE_ASM)

static const __attribute__ ((aligned(8),used)) ogg_int64_t V128 = 0x0080008000800080LL;

#define DSP_OP_AVG(a,b) ((((int)(a)) + ((int)(b)))/2)
#define DSP_OP_DIFF(a,b) (((int)(a)) - ((int)(b)))
#define DSP_OP_ABS_DIFF(a,b) abs((((int)(a)) - ((int)(b))))
#define SUB_LOOP \
"  movq        (%0), %%mm0      \n\t" /* mm0 = FiltPtr */ \
"  movq        (%1), %%mm1      \n\t" /* mm1 = ReconPtr */ \
"  movq        %%mm0, %%mm2     \n\t" /* dup to prepare for up conversion */ \
"  movq        %%mm1, %%mm3     \n\t" /* dup to prepare for up conversion */ \
  /* convert from UINT8 to INT16 */ \
"  punpcklbw   %%mm7, %%mm0     \n\t" /* mm0 = INT16(FiltPtr) */ \
"  punpcklbw   %%mm7, %%mm1     \n\t" /* mm1 = INT16(ReconPtr) */ \
"  punpckhbw   %%mm7, %%mm2     \n\t" /* mm2 = INT16(FiltPtr) */ \
"  punpckhbw   %%mm7, %%mm3     \n\t" /* mm3 = INT16(ReconPtr) */ \
  /* start calculation */ \
"  psubw       %%mm1, %%mm0     \n\t" /* mm0 = FiltPtr - ReconPtr */ \
"  psubw       %%mm3, %%mm2     \n\t" /* mm2 = FiltPtr - ReconPtr */ \
"  movq        %%mm0, (%2)      \n\t" /* write answer out */ \
"  movq        %%mm2, 8(%2)     \n\t" /* write answer out */ \
  /* increment pointers */ \
"  add         $16, %2          \n\t" \
"  add         %3, %0           \n\t" \
"  add         %4, %1           \n\t"
static void sub8x8__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr,
                         ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine,
                         ogg_uint32_t ReconPixelsPerLine)
{
  __asm__ __volatile__ (
    "  .p2align 4                   \n\t"

    "  pxor        %%mm7, %%mm7     \n\t"
    SUB_LOOP
    SUB_LOOP
    SUB_LOOP
    SUB_LOOP
    SUB_LOOP
    SUB_LOOP
    SUB_LOOP
    SUB_LOOP
    : "+r" (FiltPtr),
      "+r" (ReconPtr),
      "+r" (DctInputPtr)
    : "m" (PixelsPerLine),
      "m" (ReconPixelsPerLine)
    : "memory"
  );
}
#define SUB_128_LOOP \
"  movq        (%0), %%mm0      \n\t" /* mm0 = FiltPtr */ \
"  movq        %%mm0, %%mm2     \n\t" /* dup to prepare for up conversion */ \
  /* convert from UINT8 to INT16 */ \
"  punpcklbw   %%mm7, %%mm0     \n\t" /* mm0 = INT16(FiltPtr) */ \
"  punpckhbw   %%mm7, %%mm2     \n\t" /* mm2 = INT16(FiltPtr) */ \
  /* start calculation */ \
"  psubw       %%mm1, %%mm0     \n\t" /* mm0 = FiltPtr - 128 */ \
"  psubw       %%mm1, %%mm2     \n\t" /* mm2 = FiltPtr - 128 */ \
"  movq        %%mm0, (%1)      \n\t" /* write answer out */ \
"  movq        %%mm2, 8(%1)     \n\t" /* write answer out */ \
  /* increment pointers */ \
"  add         $16, %1          \n\t" \
"  add         %2, %0           \n\t"
static void sub8x8_128__mmx (unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
                             ogg_uint32_t PixelsPerLine)
{
  __asm__ __volatile__ (
    "  .p2align 4                   \n\t"

    "  pxor        %%mm7, %%mm7     \n\t"
    "  movq        %[V128], %%mm1   \n\t"
    SUB_128_LOOP
    SUB_128_LOOP
    SUB_128_LOOP
    SUB_128_LOOP
    SUB_128_LOOP
    SUB_128_LOOP
    SUB_128_LOOP
    SUB_128_LOOP
    : "+r" (FiltPtr),
      "+r" (DctInputPtr)
    : "m" (PixelsPerLine),
      [V128] "m" (V128)
    : "memory"
  );
}
#define SUB_AVG2_LOOP \
"  movq        (%0), %%mm0      \n\t" /* mm0 = FiltPtr */ \
"  movq        (%1), %%mm1      \n\t" /* mm1 = ReconPtr1 */ \
"  movq        (%2), %%mm4      \n\t" /* mm4 = ReconPtr2 */ \
"  movq        %%mm0, %%mm2     \n\t" /* dup to prepare for up conversion */ \
"  movq        %%mm1, %%mm3     \n\t" /* dup to prepare for up conversion */ \
"  movq        %%mm4, %%mm5     \n\t" /* dup to prepare for up conversion */ \
  /* convert from UINT8 to INT16 */ \
"  punpcklbw   %%mm7, %%mm0     \n\t" /* mm0 = INT16(FiltPtr) */ \
"  punpcklbw   %%mm7, %%mm1     \n\t" /* mm1 = INT16(ReconPtr1) */ \
"  punpcklbw   %%mm7, %%mm4     \n\t" /* mm4 = INT16(ReconPtr2) */ \
"  punpckhbw   %%mm7, %%mm2     \n\t" /* mm2 = INT16(FiltPtr) */ \
"  punpckhbw   %%mm7, %%mm3     \n\t" /* mm3 = INT16(ReconPtr1) */ \
"  punpckhbw   %%mm7, %%mm5     \n\t" /* mm5 = INT16(ReconPtr2) */ \
  /* average ReconPtr1 and ReconPtr2 */ \
"  paddw       %%mm4, %%mm1     \n\t" /* mm1 = ReconPtr1 + ReconPtr2 */ \
"  paddw       %%mm5, %%mm3     \n\t" /* mm3 = ReconPtr1 + ReconPtr2 */ \
"  psrlw       $1, %%mm1        \n\t" /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */ \
"  psrlw       $1, %%mm3        \n\t" /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */ \
"  psubw       %%mm1, %%mm0     \n\t" /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */ \
"  psubw       %%mm3, %%mm2     \n\t" /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */ \
"  movq        %%mm0, (%3)      \n\t" /* write answer out */ \
"  movq        %%mm2, 8(%3)     \n\t" /* write answer out */ \
  /* increment pointers */ \
"  add         $16, %3          \n\t" \
"  add         %4, %0           \n\t" \
"  add         %5, %1           \n\t" \
"  add         %5, %2           \n\t"
static void sub8x8avg2__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr1,
                             unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
                             ogg_uint32_t PixelsPerLine,
                             ogg_uint32_t ReconPixelsPerLine)
{
  __asm__ __volatile__ (
    "  .p2align 4                   \n\t"

    "  pxor        %%mm7, %%mm7     \n\t"
    SUB_AVG2_LOOP
    SUB_AVG2_LOOP
    SUB_AVG2_LOOP
    SUB_AVG2_LOOP
    SUB_AVG2_LOOP
    SUB_AVG2_LOOP
    SUB_AVG2_LOOP
    SUB_AVG2_LOOP
    : "+r" (FiltPtr),
      "+r" (ReconPtr1),
      "+r" (ReconPtr2),
      "+r" (DctInputPtr)
    : "m" (PixelsPerLine),
      "m" (ReconPixelsPerLine)
    : "memory"
  );
}
static ogg_uint32_t row_sad8__mmx (unsigned char *Src1, unsigned char *Src2)
{
  ogg_uint32_t MaxSad;

  __asm__ __volatile__ (
    "  .p2align 4                   \n\t"

    "  pxor        %%mm6, %%mm6     \n\t" /* zero out mm6 for unpack */
    "  pxor        %%mm7, %%mm7     \n\t" /* zero out mm7 for unpack */
    "  movq        (%1), %%mm0      \n\t" /* take 8 bytes */
    "  movq        (%2), %%mm1      \n\t"

    "  movq        %%mm0, %%mm2     \n\t"
    "  psubusb     %%mm1, %%mm0     \n\t" /* A - B */
    "  psubusb     %%mm2, %%mm1     \n\t" /* B - A */
    "  por         %%mm1, %%mm0     \n\t" /* the OR of the two gives the abs difference */

    "  movq        %%mm0, %%mm1     \n\t"

    "  punpcklbw   %%mm6, %%mm0     \n\t" /* unpack low four bytes to higher precision */
    "  punpckhbw   %%mm7, %%mm1     \n\t" /* unpack high four bytes to higher precision */

    "  movq        %%mm0, %%mm2     \n\t"
    "  movq        %%mm1, %%mm3     \n\t"
    "  psrlq       $32, %%mm2       \n\t" /* fold and add */
    "  psrlq       $32, %%mm3       \n\t"
    "  paddw       %%mm2, %%mm0     \n\t"
    "  paddw       %%mm3, %%mm1     \n\t"
    "  movq        %%mm0, %%mm2     \n\t"
    "  movq        %%mm1, %%mm3     \n\t"
    "  psrlq       $16, %%mm2       \n\t"
    "  psrlq       $16, %%mm3       \n\t"
    "  paddw       %%mm2, %%mm0     \n\t"
    "  paddw       %%mm3, %%mm1     \n\t"

    "  psubusw     %%mm0, %%mm1     \n\t"
    "  paddw       %%mm0, %%mm1     \n\t" /* mm1 = max(mm1, mm0) */
    "  movd        %%mm1, %0        \n\t"
    "  andl        $0xffff, %0      \n\t"

    : "=m" (MaxSad),
      "+r" (Src1),
      "+r" (Src2)
    :
    : "memory"
  );

  return MaxSad;
}
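
/* What the fold-and-max above computes, as a scalar sketch (hypothetical
   reference, compiled out): the SADs of bytes 0..3 and bytes 4..7 are
   formed separately and the larger half-row sum is returned: */
#if 0
static ogg_uint32_t row_sad8__c_ref (unsigned char *Src1, unsigned char *Src2)
{
  ogg_uint32_t SadLow = 0, SadHigh = 0;
  int i;
  for (i = 0; i < 4; i++) SadLow += DSP_OP_ABS_DIFF(Src1[i], Src2[i]);
  for (i = 4; i < 8; i++) SadHigh += DSP_OP_ABS_DIFF(Src1[i], Src2[i]);
  return SadLow > SadHigh ? SadLow : SadHigh;
}
#endif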
static ogg_uint32_t col_sad8x8__mmx (unsigned char *Src1, unsigned char *Src2,
                                     ogg_uint32_t stride)
{
  ogg_uint32_t MaxSad;

  __asm__ __volatile__ (
    "  .p2align 4                   \n\t"

    "  pxor        %%mm3, %%mm3     \n\t" /* zero out mm3 for unpack */
    "  pxor        %%mm4, %%mm4     \n\t" /* mm4 low sum (rows 0-3) */
    "  pxor        %%mm5, %%mm5     \n\t" /* mm5 high sum (rows 0-3) */
    "  pxor        %%mm6, %%mm6     \n\t" /* mm6 low sum (rows 4-7) */
    "  pxor        %%mm7, %%mm7     \n\t" /* mm7 high sum (rows 4-7) */
    "  mov         $4, %%edi        \n\t" /* 4 rows */
    "1:                             \n\t"
    "  movq        (%1), %%mm0      \n\t" /* take 8 bytes */
    "  movq        (%2), %%mm1      \n\t" /* take 8 bytes */

    "  movq        %%mm0, %%mm2     \n\t"
    "  psubusb     %%mm1, %%mm0     \n\t" /* A - B */
    "  psubusb     %%mm2, %%mm1     \n\t" /* B - A */
    "  por         %%mm1, %%mm0     \n\t" /* the OR of the two gives the abs difference */
    "  movq        %%mm0, %%mm1     \n\t"

    "  punpcklbw   %%mm3, %%mm0     \n\t" /* unpack to higher precision for accumulation */
    "  paddw       %%mm0, %%mm4     \n\t" /* accumulate difference... */
    "  punpckhbw   %%mm3, %%mm1     \n\t" /* unpack high four bytes to higher precision */
    "  paddw       %%mm1, %%mm5     \n\t" /* accumulate difference... */
    "  add         %3, %1           \n\t" /* Inc pointer into the new data */
    "  add         %3, %2           \n\t" /* Inc pointer into the new data */

    "  dec         %%edi            \n\t"
    "  jnz         1b               \n\t"

    "  mov         $4, %%edi        \n\t" /* 4 rows */
    "2:                             \n\t"
    "  movq        (%1), %%mm0      \n\t" /* take 8 bytes */
    "  movq        (%2), %%mm1      \n\t" /* take 8 bytes */

    "  movq        %%mm0, %%mm2     \n\t"
    "  psubusb     %%mm1, %%mm0     \n\t" /* A - B */
    "  psubusb     %%mm2, %%mm1     \n\t" /* B - A */
    "  por         %%mm1, %%mm0     \n\t" /* the OR of the two gives the abs difference */
    "  movq        %%mm0, %%mm1     \n\t"

    "  punpcklbw   %%mm3, %%mm0     \n\t" /* unpack to higher precision for accumulation */
    "  paddw       %%mm0, %%mm6     \n\t" /* accumulate difference... */
    "  punpckhbw   %%mm3, %%mm1     \n\t" /* unpack high four bytes to higher precision */
    "  paddw       %%mm1, %%mm7     \n\t" /* accumulate difference... */
    "  add         %3, %1           \n\t" /* Inc pointer into the new data */
    "  add         %3, %2           \n\t" /* Inc pointer into the new data */

    "  dec         %%edi            \n\t"
    "  jnz         2b               \n\t"

    "  psubusw     %%mm6, %%mm7     \n\t"
    "  paddw       %%mm6, %%mm7     \n\t" /* mm7 = max(mm7, mm6) */
    "  psubusw     %%mm4, %%mm5     \n\t"
    "  paddw       %%mm4, %%mm5     \n\t" /* mm5 = max(mm5, mm4) */
    "  psubusw     %%mm5, %%mm7     \n\t"
    "  paddw       %%mm5, %%mm7     \n\t" /* mm7 = max(mm5, mm7) */
    "  movq        %%mm7, %%mm6     \n\t"
    "  psrlq       $32, %%mm6       \n\t"
    "  psubusw     %%mm6, %%mm7     \n\t"
    "  paddw       %%mm6, %%mm7     \n\t" /* mm7 = max(mm7, mm7 >> 32) */
    "  movq        %%mm7, %%mm6     \n\t"
    "  psrlq       $16, %%mm6       \n\t"
    "  psubusw     %%mm6, %%mm7     \n\t"
    "  paddw       %%mm6, %%mm7     \n\t" /* mm7 = max(mm7, mm7 >> 16) */
    "  movd        %%mm7, %0        \n\t"
    "  andl        $0xffff, %0      \n\t"

    : "=r" (MaxSad),
      "+r" (Src1),
      "+r" (Src2)
    : "r" (stride)
    : "memory", "edi"
  );

  return MaxSad;
}
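
/* Scalar sketch (hypothetical reference, compiled out): each column's
   absolute differences are summed over the top four and bottom four rows
   separately, and the largest of the sixteen 4-row column sums is
   returned: */
#if 0
static ogg_uint32_t col_sad8x8__c_ref (unsigned char *Src1, unsigned char *Src2,
                                       ogg_uint32_t stride)
{
  ogg_uint32_t MaxSad = 0;
  int half, row, col;
  for (half = 0; half < 2; half++) {       /* rows 0-3, then rows 4-7 */
    ogg_uint32_t ColSad[8] = { 0 };
    for (row = 0; row < 4; row++) {
      for (col = 0; col < 8; col++)
        ColSad[col] += DSP_OP_ABS_DIFF(Src1[col], Src2[col]);
      Src1 += stride;
      Src2 += stride;
    }
    for (col = 0; col < 8; col++)
      if (ColSad[col] > MaxSad) MaxSad = ColSad[col];
  }
  return MaxSad;
}
#endif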
#define SAD_LOOP \
"  movq        (%1), %%mm0      \n\t" /* take 8 bytes */ \
"  movq        (%2), %%mm1      \n\t" \
"  movq        %%mm0, %%mm2     \n\t" \
"  psubusb     %%mm1, %%mm0     \n\t" /* A - B */ \
"  psubusb     %%mm2, %%mm1     \n\t" /* B - A */ \
"  por         %%mm1, %%mm0     \n\t" /* the OR of the two gives the abs difference */ \
"  movq        %%mm0, %%mm1     \n\t" \
"  punpcklbw   %%mm6, %%mm0     \n\t" /* unpack to higher precision for accumulation */ \
"  paddw       %%mm0, %%mm7     \n\t" /* accumulate difference... */ \
"  punpckhbw   %%mm6, %%mm1     \n\t" /* unpack high four bytes to higher precision */ \
"  add         %3, %1           \n\t" /* Inc pointer into the new data */ \
"  paddw       %%mm1, %%mm7     \n\t" /* accumulate difference... */ \
"  add         %4, %2           \n\t" /* Inc pointer into ref data */
static ogg_uint32_t sad8x8__mmx (unsigned char *ptr1, ogg_uint32_t stride1,
                                 unsigned char *ptr2, ogg_uint32_t stride2)
{
  ogg_uint32_t DiffVal;

  __asm__ __volatile__ (
    "  .p2align 4                   \n\t"
    "  pxor        %%mm6, %%mm6     \n\t" /* zero out mm6 for unpack */
    "  pxor        %%mm7, %%mm7     \n\t" /* mm7 contains the result */
    SAD_LOOP
    SAD_LOOP
    SAD_LOOP
    SAD_LOOP
    SAD_LOOP
    SAD_LOOP
    SAD_LOOP
    SAD_LOOP
    "  movq        %%mm7, %%mm0     \n\t"
    "  psrlq       $32, %%mm7       \n\t"
    "  paddw       %%mm0, %%mm7     \n\t"
    "  movq        %%mm7, %%mm0     \n\t"
    "  psrlq       $16, %%mm7       \n\t"
    "  paddw       %%mm0, %%mm7     \n\t"
    "  movd        %%mm7, %0        \n\t"
    "  andl        $0xffff, %0      \n\t"

    : "=m" (DiffVal),
      "+r" (ptr1),
      "+r" (ptr2)
    : "r" (stride1),
      "r" (stride2)
    : "memory"
  );

  return DiffVal;
}
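
/* Scalar sketch of the plain 8x8 SAD (hypothetical reference, compiled
   out); note the worst case, 64 * 255 = 16320, still fits in the 16-bit
   word lanes the MMX version accumulates in: */
#if 0
static ogg_uint32_t sad8x8__c_ref (unsigned char *ptr1, ogg_uint32_t stride1,
                                   unsigned char *ptr2, ogg_uint32_t stride2)
{
  ogg_uint32_t sad = 0;
  int i, j;
  for (i = 0; i < 8; i++) {
    for (j = 0; j < 8; j++)
      sad += DSP_OP_ABS_DIFF(ptr1[j], ptr2[j]);
    ptr1 += stride1;
    ptr2 += stride2;
  }
  return sad;
}
#endif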
static ogg_uint32_t sad8x8_thres__mmx (unsigned char *ptr1, ogg_uint32_t stride1,
                                       unsigned char *ptr2, ogg_uint32_t stride2,
                                       ogg_uint32_t thres)
{
  /* The early-out threshold is ignored; the full 8x8 SAD is computed. */
  return sad8x8__mmx (ptr1, stride1, ptr2, stride2);
}
static ogg_uint32_t sad8x8_xy2_thres__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
                                           unsigned char *RefDataPtr1,
                                           unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
                                           ogg_uint32_t thres)
{
  ogg_uint32_t DiffVal;

  __asm__ __volatile__ (
    "  .p2align 4                   \n\t"

    "  pcmpeqd     %%mm5, %%mm5     \n\t" /* fefefefefefefefe in mm5 */
    "  paddb       %%mm5, %%mm5     \n\t"

    "  pxor        %%mm6, %%mm6     \n\t" /* zero out mm6 for unpack */
    "  pxor        %%mm7, %%mm7     \n\t" /* mm7 contains the result */
    "  mov         $8, %%edi        \n\t" /* 8 rows */
    "1:                             \n\t"
    "  movq        (%1), %%mm0      \n\t" /* take 8 bytes */

    "  movq        (%2), %%mm2      \n\t"
    "  movq        (%3), %%mm3      \n\t" /* take average of mm2 and mm3 */
    "  movq        %%mm2, %%mm1     \n\t"
    "  pand        %%mm3, %%mm1     \n\t"
    "  pxor        %%mm2, %%mm3     \n\t"
    "  pand        %%mm5, %%mm3     \n\t"
    "  psrlq       $1, %%mm3        \n\t"
    "  paddb       %%mm3, %%mm1     \n\t"

    "  movq        %%mm0, %%mm2     \n\t"

    "  psubusb     %%mm1, %%mm0     \n\t" /* A - B */
    "  psubusb     %%mm2, %%mm1     \n\t" /* B - A */
    "  por         %%mm1, %%mm0     \n\t" /* the OR of the two gives the abs difference */
    "  movq        %%mm0, %%mm1     \n\t"

    "  punpcklbw   %%mm6, %%mm0     \n\t" /* unpack to higher precision for accumulation */
    "  paddw       %%mm0, %%mm7     \n\t" /* accumulate difference... */
    "  punpckhbw   %%mm6, %%mm1     \n\t" /* unpack high four bytes to higher precision */
    "  add         %4, %1           \n\t" /* Inc pointer into the new data */
    "  paddw       %%mm1, %%mm7     \n\t" /* accumulate difference... */
    "  add         %5, %2           \n\t" /* Inc pointer into ref data */
    "  add         %5, %3           \n\t" /* Inc pointer into ref data */

    "  dec         %%edi            \n\t"
    "  jnz         1b               \n\t"

    "  movq        %%mm7, %%mm0     \n\t"
    "  psrlq       $32, %%mm7       \n\t"
    "  paddw       %%mm0, %%mm7     \n\t"
    "  movq        %%mm7, %%mm0     \n\t"
    "  psrlq       $16, %%mm7       \n\t"
    "  paddw       %%mm0, %%mm7     \n\t"
    "  movd        %%mm7, %0        \n\t"
    "  andl        $0xffff, %0      \n\t"

    : "=m" (DiffVal),
      "+r" (SrcData),
      "+r" (RefDataPtr1),
      "+r" (RefDataPtr2)
    : "m" (SrcStride),
      "m" (RefStride)
    : "edi", "memory"
  );

  /* As with sad8x8_thres__mmx, the early-out threshold is not used. */
  return DiffVal;
}
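
/* The pavgb-free byte average used above relies on the identity
   (a + b) / 2 == (a & b) + (((a ^ b) & 0xfe) >> 1), which never carries
   out of a byte lane. A scalar sketch of the trick (hypothetical helper,
   compiled out): */
#if 0
static unsigned char avg2_bytes (unsigned char a, unsigned char b)
{
  /* a & b keeps the bits common to both; (a ^ b) >> 1 contributes half of
     the differing bits; the & 0xfe before the shift models the packed
     case, where it stops a bit shifting in from the neighbouring lane. */
  return (unsigned char)((a & b) + (((a ^ b) & 0xfe) >> 1));
}
#endif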
static ogg_uint32_t intra8x8_err__mmx (unsigned char *DataPtr, ogg_uint32_t Stride)
{
  ogg_uint32_t XSum;
  ogg_uint32_t XXSum;

  __asm__ __volatile__ (
    "  .p2align 4                   \n\t"

    "  pxor        %%mm5, %%mm5     \n\t"
    "  pxor        %%mm6, %%mm6     \n\t"
    "  pxor        %%mm7, %%mm7     \n\t"
    "  mov         $8, %%edi        \n\t"
    "1:                             \n\t"
    "  movq        (%2), %%mm0      \n\t" /* take 8 bytes */
    "  movq        %%mm0, %%mm2     \n\t"

    "  punpcklbw   %%mm6, %%mm0     \n\t"
    "  punpckhbw   %%mm6, %%mm2     \n\t"

    "  paddw       %%mm0, %%mm5     \n\t" /* accumulate X */
    "  paddw       %%mm2, %%mm5     \n\t"

    "  pmaddwd     %%mm0, %%mm0     \n\t" /* accumulate X^2 */
    "  pmaddwd     %%mm2, %%mm2     \n\t"

    "  paddd       %%mm0, %%mm7     \n\t"
    "  paddd       %%mm2, %%mm7     \n\t"

    "  add         %3, %2           \n\t" /* Inc pointer into src data */

    "  dec         %%edi            \n\t"
    "  jnz         1b               \n\t"

    "  movq        %%mm5, %%mm0     \n\t"
    "  psrlq       $32, %%mm5       \n\t"
    "  paddw       %%mm0, %%mm5     \n\t"
    "  movq        %%mm5, %%mm0     \n\t"
    "  psrlq       $16, %%mm5       \n\t"
    "  paddw       %%mm0, %%mm5     \n\t"
    "  movd        %%mm5, %%edi     \n\t"
    "  movsx       %%di, %%edi      \n\t" /* sign-extend the 16-bit sum */
    "  movl        %%edi, %0        \n\t"

    "  movq        %%mm7, %%mm0     \n\t"
    "  psrlq       $32, %%mm7       \n\t"
    "  paddd       %%mm0, %%mm7     \n\t"
    "  movd        %%mm7, %1        \n\t"

    : "=r" (XSum),
      "=r" (XXSum),
      "+r" (DataPtr)
    : "r" (Stride)
    : "edi", "memory"
  );

  /* Compute population variance as mis-match metric. */
  return ((XXSum<<6) - XSum*XSum);
}
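
/* Why (XXSum<<6) - XSum*XSum measures variance: with N = 64 samples,
   sigma^2 = XXSum/N - (XSum/N)^2, so
   N^2 * sigma^2 = N*XXSum - XSum^2 = (XXSum<<6) - XSum*XSum.
   The 64^2 scale factor is the same for every block, so the metric still
   orders blocks by population variance. */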
static ogg_uint32_t inter8x8_err__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
                                       unsigned char *RefDataPtr, ogg_uint32_t RefStride)
{
  ogg_uint32_t XSum;
  ogg_uint32_t XXSum;

  __asm__ __volatile__ (
    "  .p2align 4                   \n\t"

    "  pxor        %%mm5, %%mm5     \n\t"
    "  pxor        %%mm6, %%mm6     \n\t"
    "  pxor        %%mm7, %%mm7     \n\t"
    "  mov         $8, %%edi        \n\t"
    "1:                             \n\t"
    "  movq        (%2), %%mm0      \n\t" /* take 8 bytes */
    "  movq        (%3), %%mm1      \n\t"
    "  movq        %%mm0, %%mm2     \n\t"
    "  movq        %%mm1, %%mm3     \n\t"

    "  punpcklbw   %%mm6, %%mm0     \n\t"
    "  punpcklbw   %%mm6, %%mm1     \n\t"
    "  punpckhbw   %%mm6, %%mm2     \n\t"
    "  punpckhbw   %%mm6, %%mm3     \n\t"

    "  psubsw      %%mm1, %%mm0     \n\t"
    "  psubsw      %%mm3, %%mm2     \n\t"

    "  paddw       %%mm0, %%mm5     \n\t" /* accumulate X */
    "  paddw       %%mm2, %%mm5     \n\t"

    "  pmaddwd     %%mm0, %%mm0     \n\t" /* accumulate X^2 */
    "  pmaddwd     %%mm2, %%mm2     \n\t"

    "  paddd       %%mm0, %%mm7     \n\t"
    "  paddd       %%mm2, %%mm7     \n\t"

    "  add         %4, %2           \n\t" /* Inc pointer into src data */
    "  add         %5, %3           \n\t" /* Inc pointer into ref data */

    "  dec         %%edi            \n\t"
    "  jnz         1b               \n\t"

    "  movq        %%mm5, %%mm0     \n\t"
    "  psrlq       $32, %%mm5       \n\t"
    "  paddw       %%mm0, %%mm5     \n\t"
    "  movq        %%mm5, %%mm0     \n\t"
    "  psrlq       $16, %%mm5       \n\t"
    "  paddw       %%mm0, %%mm5     \n\t"
    "  movd        %%mm5, %%edi     \n\t"
    "  movsx       %%di, %%edi      \n\t" /* sign-extend the 16-bit sum */
    "  movl        %%edi, %0        \n\t"

    "  movq        %%mm7, %%mm0     \n\t"
    "  psrlq       $32, %%mm7       \n\t"
    "  paddd       %%mm0, %%mm7     \n\t"
    "  movd        %%mm7, %1        \n\t"

    : "=m" (XSum),
      "=m" (XXSum),
      "+r" (SrcData),
      "+r" (RefDataPtr)
    : "m" (SrcStride),
      "m" (RefStride)
    : "edi", "memory"
  );

  /* Compute and return population variance as mis-match metric. */
  return ((XXSum<<6) - XSum*XSum);
}
static ogg_uint32_t inter8x8_err_xy2__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
                                           unsigned char *RefDataPtr1,
                                           unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
{
  ogg_uint32_t XSum;
  ogg_uint32_t XXSum;

  __asm__ __volatile__ (
    "  .p2align 4                   \n\t"

    "  pcmpeqd     %%mm4, %%mm4     \n\t" /* fefefefefefefefe in mm4 */
    "  paddb       %%mm4, %%mm4     \n\t"
    "  pxor        %%mm5, %%mm5     \n\t"
    "  pxor        %%mm6, %%mm6     \n\t"
    "  pxor        %%mm7, %%mm7     \n\t"
    "  mov         $8, %%edi        \n\t"
    "1:                             \n\t"
    "  movq        (%2), %%mm0      \n\t" /* take 8 bytes */

    "  movq        (%3), %%mm2      \n\t"
    "  movq        (%4), %%mm3      \n\t" /* take average of mm2 and mm3 */
    "  movq        %%mm2, %%mm1     \n\t"
    "  pand        %%mm3, %%mm1     \n\t"
    "  pxor        %%mm2, %%mm3     \n\t"
    "  pand        %%mm4, %%mm3     \n\t"
    "  psrlq       $1, %%mm3        \n\t"
    "  paddb       %%mm3, %%mm1     \n\t"

    "  movq        %%mm0, %%mm2     \n\t"
    "  movq        %%mm1, %%mm3     \n\t"

    "  punpcklbw   %%mm6, %%mm0     \n\t"
    "  punpcklbw   %%mm6, %%mm1     \n\t"
    "  punpckhbw   %%mm6, %%mm2     \n\t"
    "  punpckhbw   %%mm6, %%mm3     \n\t"

    "  psubsw      %%mm1, %%mm0     \n\t"
    "  psubsw      %%mm3, %%mm2     \n\t"

    "  paddw       %%mm0, %%mm5     \n\t" /* accumulate X */
    "  paddw       %%mm2, %%mm5     \n\t"

    "  pmaddwd     %%mm0, %%mm0     \n\t" /* accumulate X^2 */
    "  pmaddwd     %%mm2, %%mm2     \n\t"

    "  paddd       %%mm0, %%mm7     \n\t"
    "  paddd       %%mm2, %%mm7     \n\t"

    "  add         %5, %2           \n\t" /* Inc pointer into src data */
    "  add         %6, %3           \n\t" /* Inc pointer into ref data */
    "  add         %6, %4           \n\t" /* Inc pointer into ref data */

    "  dec         %%edi            \n\t"
    "  jnz         1b               \n\t"

    "  movq        %%mm5, %%mm0     \n\t"
    "  psrlq       $32, %%mm5       \n\t"
    "  paddw       %%mm0, %%mm5     \n\t"
    "  movq        %%mm5, %%mm0     \n\t"
    "  psrlq       $16, %%mm5       \n\t"
    "  paddw       %%mm0, %%mm5     \n\t"
    "  movd        %%mm5, %%edi     \n\t"
    "  movsx       %%di, %%edi      \n\t" /* sign-extend the 16-bit sum */
    "  movl        %%edi, %0        \n\t"

    "  movq        %%mm7, %%mm0     \n\t"
    "  psrlq       $32, %%mm7       \n\t"
    "  paddd       %%mm0, %%mm7     \n\t"
    "  movd        %%mm7, %1        \n\t"

    : "=m" (XSum),
      "=m" (XXSum),
      "+r" (SrcData),
      "+r" (RefDataPtr1),
      "+r" (RefDataPtr2)
    : "m" (SrcStride),
      "m" (RefStride)
    : "edi", "memory"
  );

  /* Compute and return population variance as mis-match metric. */
  return ((XXSum<<6) - XSum*XSum);
}
static void restore_fpu (void)
{
  __asm__ __volatile__ (
    "  emms \n\t" /* clear the MMX state so the x87 FPU can be used again */
  );
}
void dsp_mmx_init(DspFunctions *funcs)
{
  TH_DEBUG("enabling accelerated x86_32 mmx dsp functions.\n");
  funcs->restore_fpu = restore_fpu;
  funcs->sub8x8 = sub8x8__mmx;
  funcs->sub8x8_128 = sub8x8_128__mmx;
  funcs->sub8x8avg2 = sub8x8avg2__mmx;
  funcs->row_sad8 = row_sad8__mmx;
  funcs->col_sad8x8 = col_sad8x8__mmx;
  funcs->sad8x8 = sad8x8__mmx;
  funcs->sad8x8_thres = sad8x8_thres__mmx;
  funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__mmx;
  funcs->intra8x8_err = intra8x8_err__mmx;
  funcs->inter8x8_err = inter8x8_err__mmx;
  funcs->inter8x8_err_xy2 = inter8x8_err_xy2__mmx;
}
#endif /* USE_ASM */