theora-old/lib/x86_32/idct_mmx.c

   1 /********************************************************************
   2  *                                                                  *
   3  * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
   4  * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
   5  * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
   6  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
   7  *                                                                  *
   8  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
   9  * by the Xiph.Org Foundation http://www.xiph.org/                  *
  10  *                                                                  *
  11  ********************************************************************
  12
  13   function:
  14   last mod: $Id: dsp_mmx.c 12440 2007-02-06 16:36:26Z j $
  15
  16  ********************************************************************/
  17
  18 #include "codec_internal.h"
  19
  20 #if defined(USE_ASM)
  21
  22 #define ASM asm     /* inline-assembly keyword used by RowIDCT, ColumnIDCT, Transpose and IDctSlow__mmx. NOTE(review): plain asm is a GNU spelling that strict ISO modes reject; IDctSlow__mmx also uses __asm__ directly - confirm build flags */
  23
  24 /****************************************************************************
  25 *
  26 *   Description  :     IDCT with multiple versions based on # of non 0 coeffs
  27 *
  28 *****************************************************************************
  29 */
  30
  31 // Dequantization + inverse discrete cosine transform.
  32
  33 // Constants used in MMX implementation of dequantization and idct.
  34 // All the MMX stuff works with 4 16-bit quantities at a time and
  35 // we create 11 constants of size 4 x 16 bits (a 12th, four copies of the rounding value 8, is appended at EightOffset).
  36 // The first 4 are used to mask the individual 16-bit words within a group
  37 // and are used in the address-shuffling part of the dequantization.
  38 // The last 7 are fixed-point approximations to the cosines of angles
  39 // occurring in the DCT; each of these contains 4 copies of the same value.
  40
  41 // There is only one (statically initialized) instance of this object
  42 // wrapped in an allocator object that forces its starting address
  43 // to be evenly divisible by 32.  Hence the actual object occupies 2.75
  44 // cache lines on a Pentium processor.
  45
  46 // Offsets in bytes used by the assembler code below
  47 // must of course agree with the idctConstants constructor.
  48
  49 #define MaskOffset 0        // 4 masks come in order low word to high
  50 #define CosineOffset 32     // 7 cosines come in order pi/16 * (1 ... 7)
  51 #define EightOffset 88      // byte offset of the rounding constant: CosineOffset + 7 * 8
  52 #define IdctAdjustBeforeShift 8 // rounding value held (4 copies) at EightOffset in idctconstants
  53
  54 /*
  55 UINT16 idctcosTbl[ 7] =
  56 {
  57     64277, 60547, 54491, 46341, 36410, 25080, 12785
  58 };
  59
  60 void fillidctconstants(void)
  61 {
  62     int j = 16;
  63     UINT16 * p;
  64     do
  65     {
  66         idctconstants[ --j] = 0;
  67     }
  68     while( j);
  69
  70     idctconstants[0] = idctconstants[5] = idctconstants[10] = idctconstants[15] = 65535;
  71
  72     j = 1;
  73     do
  74     {
  75         p = idctconstants + ( (j+3) << 2);
  76         p[0] = p[1] = p[2] = p[3] = idctcosTbl[ j - 1];
  77     }
  78     while( ++j <= 7);
  79
  80     idctconstants[44] = idctconstants[45] = idctconstants[46] = idctconstants[47] = IdctAdjustBeforeShift;
  81 }
  82 */
  83
  84 ogg_uint16_t idctconstants[(4+7+1) * 4] = {  /* 4 word masks + 7 cosines + 1 rounding term, 4 words (8 bytes) each. NOTE(review): no alignment is requested here although the comment above mentions a 32-byte aligned object */
  85     65535,     0,     0,     0,     0, 65535,     0,     0,  /* masks 0 and 1 (MaskOffset + 0, + 8) */
  86         0,     0, 65535,     0,     0,     0,     0, 65535,  /* masks 2 and 3 (MaskOffset + 16, + 24) */
  87     64277, 64277, 64277, 64277, 60547, 60547, 60547, 60547,  /* C1, C2 (CosineOffset + 0, + 8) */
  88     54491, 54491, 54491, 54491, 46341, 46341, 46341, 46341,  /* C3, C4 */
  89     36410, 36410, 36410, 36410, 25080, 25080, 25080, 25080,  /* C5, C6 */
  90     12785, 12785, 12785, 12785,     8,     8,     8,     8,  /* C7, then the rounding value at EightOffset */
  91 };
  92
  93 /* Dequantization + inverse DCT.
  94
  95    Dequantization multiplies user's 16-bit signed indices (range -512 to +511)
  96    by unsigned 16-bit quantization table entries.
  97    These table entries are upscaled by 4, max is 30 * 128 * 4 < 2^14.
  98    Result is scaled signed DCT coefficients (abs value < 2^15).
  99
 100    In the data stream, the coefficients are sent in order of increasing
 101    total (horizontal + vertical) frequency.  The exact picture is as follows:
 102
 103     00 01 05 06  16 17 33 34
 104     02 04 07 15  20 32 35 52
 105     03 10 14 21  31 36 51 53
 106     11 13 22 30  37 50 54 65
 107
 108     12 23 27 40  47 55 64 66
 109     24 26 41 46  56 63 67 74
 110     25 42 45 57  62 70 73 75
 111     43 44 60 61  71 72 76 77
 112
 113    Here the position in the matrix corresponds to the (horiz,vert)
 114    frequency indices and the octal entry in the matrix is the position
 115    of the coefficient in the data stream.  Thus the coefficients are sent
 116    in sort of a diagonal "snake".
 117
 118    The dequantization stage "uncurls the snake" and stores the expanded
 119    coefficients in more convenient positions.  These are not exactly the
 120    natural positions given above but take into account our implementation
 121    of the idct, which basically requires two one-dimensional idcts and
 122    two transposes.
 123
 124    We fold the first transpose into the storage of the expanded coefficients.
 125    We don't actually do a full transpose because this would require doubling
 126    the size of the idct buffer; rather, we just transpose each of the 4x4
 127    subblocks.  Using slightly varying addressing schemes in each of the
 128    four 4x8 idcts then allows these transforms to be done in place.
 129
 130    Transposing the 4x4 subblocks in the matrix above gives
 131
 132     00 02 03 11  16 20 31 37
 133     01 04 10 13  17 32 36 50
 134     05 07 14 22  33 35 51 54
 135     06 15 21 30  34 52 53 65
 136
 137     12 24 25 43  47 56 62 71
 138     23 26 42 44  55 63 70 72
 139     27 41 45 60  64 67 73 76
 140     40 46 57 61  66 74 75 77
 141
 142    Finally, we reverse the words in each 4 word group to clarify
 143    direction of shifts.
 144
 145     11 03 02 00  37 31 20 16
 146     13 10 04 01  50 36 32 17
 147     22 14 07 05  54 51 35 33
 148     30 21 15 06  65 53 52 34
 149
 150     43 25 24 12  71 62 56 47
 151     44 42 26 23  72 70 63 55
 152     60 45 41 27  76 73 67 64
 153     61 57 46 40  77 75 74 66
 154
 155    This matrix then shows the 16 4x16 destination words in terms of
 156    the 16 4x16 input words.
 157
 158    We implement this algorithm by manipulation of mmx registers,
 159    which seems to be the fastest way to proceed.  It is completely
 160    hand-written; there does not seem to be enough recurrence to
 161    reasonably compartmentalize any of it.  Hence the resulting
 162    program is ugly and bloated.  Furthermore, due to the absence of
 163    register pressure, it is boring and artless.  I hate it.
 164
 165    The idct itself is more interesting.  Since the two-dimensional dct
 166    basis functions are products of the one-dimensional dct basis functions,
 167    we can compute an inverse (or forward) dct via two 1-D transforms,
 168    on rows then on columns.  To exploit MMX parallelism, we actually do
 169    both operations on columns, interposing a (partial) transpose between
 170    the two 1-D transforms, the first transpose being done by the expansion
 171    described above.
 172
 173    The 8-sample one-dimensional DCT is a standard orthogonal expansion using
 174    the (unnormalized) basis functions
 175
 176     b[k]( i) = cos( pi * k * (2i + 1) / 16);
 177
 178    here k = 0 ... 7 is the frequency and i = 0 ... 7 is the spatial coordinate.
 179    To normalize, b[0] should be multiplied by 1/sqrt( 8) and the other b[k]
 180    should be multiplied by 1/2.
 181
 182    The 8x8 two-dimensional DCT is just the product of one-dimensional DCTs
 183    in each direction.  The (unnormalized) basis functions are
 184
 185     B[k,l]( i, j) = b[k]( i) * b[l]( j);
 186
 187    this time k and l are the horizontal and vertical frequencies,
 188    i and j are the horizontal and vertical spatial coordinates;
 189    all indices vary from 0 ... 7 (as above)
 190    and there are now 4 cases of normalization.
 191
 192    Our 1-D idct expansion uses constants C1 ... C7 given by
 193
 194     (*)  Ck = C(-k) = cos( pi * k/16) = S(8-k) = -S(k-8) = sin( pi * (8-k)/16)
 195
 196    and the following 1-D algorithm transforming I0 ... I7  to  R0 ... R7 :
 197
 198    A = (C1 * I1) + (C7 * I7)        B = (C7 * I1) - (C1 * I7)
 199    C = (C3 * I3) + (C5 * I5)        D = (C3 * I5) - (C5 * I3)
 200    A. = C4 * (A - C)                B. = C4 * (B - D)
 201    C. = A + C                       D. = B + D
 202
 203    E = C4 * (I0 + I4)               F = C4 * (I0 - I4)
 204    G = (C2 * I2) + (C6 * I6)        H = (C6 * I2) - (C2 * I6)
 205    E. = E - G
 206    G. = E + G
 207
 208    A.. = F + A.                 B.. = B. - H
 209    F.  = F - A.                 H.  = B. + H
 210
 211    R0 = G. + C. R1 = A.. + H.   R3 = E. + D.    R5 = F. + B..
 212    R7 = G. - C. R2 = A.. - H.   R4 = E. - D.    R6 = F. - B..
 213
 214    It is due to Vetterli and Ligtenberg and may be found in the JPEG
 215    reference book by Pennebaker and Mitchell.
 216
 217    Correctness of the algorithm follows from (*) together with the
 218    addition formulas for sine and cosine:
 219
 220     cos( A + B) = cos( A) * cos( B)  -  sin( A) * sin( B)
 221     sin( A + B) = sin( A) * cos( B)  +  cos( A) * sin( B)
 222
 223    Note that this implementation absorbs the difference in normalization
 224    between the 0th and higher frequencies, although the results produced
 225    are actually twice as big as they should be.  Since we do this for each
 226    dimension, the 2-D idct results are 4x the desired results.  Finally,
 227    taking into account that the dequantization multiplies by 4 as well,
 228    our actual results are 16x too big.  We fix this by shifting the final
 229    results right by 4 bits.
 230
 231    High precision version approximates C1 ... C7 to 16 bits.
 232    Since MMX only provides a signed multiply, C1 ... C5 appear to be
 233    negative and multiplies involving them must be adjusted to compensate
 234    for this.  C6 and C7 do not require this adjustment since
 235    they are < 1/2 and are correctly treated as positive numbers.
 236
 237    Following macro does four 8-sample one-dimensional idcts in parallel.
 238    This is actually not such a difficult program to write once you
 239    make a couple of observations (I of course was unable to make these
 240    observations until I'd half-written a couple of other versions).
 241
 242     1. Everything is easy once you are done with the multiplies.
 243        This is because, given X and Y in registers, one may easily
 244        calculate X+Y and X-Y using just those 2 registers.
 245
 246     2. You always need at least 2 extra registers to calculate products,
 247        so storing 2 temporaries is inevitable.  C. and D. seem to be
 248        the best candidates.
 249
 250     3. The products should be calculated in decreasing order of complexity
 251        (which translates into register pressure).  Since C1 ... C5 require
 252        adjustment (and C6, C7 do not), we begin by calculating C and D.
 253 */
 254
 255 /**************************************************************************************
 256  *
 257  *      Routine:        BeginIDCT
 258  *
 259  *      Description:    The Macro does IDct on 4 1-D Dcts
 260  *
 261  *      Input:          None
 262  *
 263  *      Output:         None
 264  *
 265  *      Return:         None
 266  *
 267  *      Special Note:   None
 268  *
 269  *      Error:          None
 270  *
 271  ***************************************************************************************
 272  */
 273
 274 #define MtoSTR(s) #s    /* turn the argument into a string literal so it can be pasted into asm text */
 275
 276 #define Dump    "call MMX_dump\n"   /* asm fragment calling MMX_dump; the only MMX_dump visible in this file is commented out */
 277
 278 #define BeginIDCT "#BeginIDCT\n"    /* shared first part of RowIDCT and ColumnIDCT; notation (A, B, C., ...) follows the 1-D algorithm in the comment above */ \
 279                                     \
 280     "   movq    "   I(3)","r2"\n"   /* r2 = I3 */ \
 281                                     \
 282     "   movq    "   C(3)","r6"\n"   /* r6 = C3 */ \
 283     "   movq    "   r2","r4"\n"     /* r4 = I3 */ \
 284     "   movq    "   J(5)","r7"\n"   /* r7 = I5 */ \
 285     "   pmulhw  "   r6","r4"\n"     /* r4 = C3*I3 - I3 (C1..C5 act as negative in the signed multiply) */ \
 286     "   movq    "   C(5)","r1"\n"   /* r1 = C5 */ \
 287     "   pmulhw  "   r7","r6"\n"     /* r6 = C3*I5 - I5 */ \
 288     "   movq    "   r1","r5"\n"     /* r5 = C5 */ \
 289     "   pmulhw  "   r2","r1"\n"     /* r1 = C5*I3 - I3 */ \
 290     "   movq    "   I(1)","r3"\n"   /* r3 = I1 */ \
 291     "   pmulhw  "   r7","r5"\n"     /* r5 = C5*I5 - I5 */ \
 292     "   movq    "   C(1)","r0"\n"   /* r0 = C1 (all eight registers now in use) */ \
 293     "   paddw   "   r2","r4"\n"     /* r4 = C3*I3 */ \
 294     "   paddw   "   r7","r6"\n"     /* r6 = C3*I5 */ \
 295     "   paddw   "   r1","r2"\n"     /* r2 = C5*I3 */ \
 296     "   movq    "   J(7)","r1"\n"   /* r1 = I7 */ \
 297     "   paddw   "   r5","r7"\n"     /* r7 = C5*I5 */ \
 298     "   movq    "   r0","r5"\n"     /* r5 = C1 */ \
 299     "   pmulhw  "   r3","r0"\n"     /* r0 = C1*I1 - I1 */ \
 300     "   paddsw  "   r7","r4"\n"     /* r4 = C = C3*I3 + C5*I5 */ \
 301     "   pmulhw  "   r1","r5"\n"     /* r5 = C1*I7 - I7 */ \
 302     "   movq    "   C(7)","r7"\n"   /* r7 = C7 */ \
 303     "   psubsw  "   r2","r6"\n"     /* r6 = D = C3*I5 - C5*I3 */ \
 304     "   paddw   "   r3","r0"\n"     /* r0 = C1*I1 */ \
 305     "   pmulhw  "   r7","r3"\n"     /* r3 = C7*I1 (no adjustment needed for C6, C7) */ \
 306     "   movq    "   I(2)","r2"\n"   /* r2 = I2 */ \
 307     "   pmulhw  "   r1","r7"\n"     /* r7 = C7*I7 */ \
 308     "   paddw   "   r1","r5"\n"     /* r5 = C1*I7 */ \
 309     "   movq    "   r2","r1"\n"     /* r1 = I2 */ \
 310     "   pmulhw  "   C(2)","r2"\n"   /* r2 = C2*I2 - I2 */ \
 311     "   psubsw  "   r5","r3"\n"     /* r3 = B = C7*I1 - C1*I7 */ \
 312     "   movq    "   J(6)","r5"\n"   /* r5 = I6 */ \
 313     "   paddsw  "   r7","r0"\n"     /* r0 = A = C1*I1 + C7*I7 */ \
 314     "   movq    "   r5","r7"\n"     /* r7 = I6 */ \
 315     "   psubsw  "   r4","r0"\n"     /* r0 = A - C */ \
 316     "   pmulhw  "   C(2)","r5"\n"   /* r5 = C2*I6 - I6 */ \
 317     "   paddw   "   r1","r2"\n"     /* r2 = C2*I2 */ \
 318     "   pmulhw  "   C(6)","r1"\n"   /* r1 = C6*I2 */ \
 319     "   paddsw  "   r4","r4"\n"     /* r4 = C + C */ \
 320     "   paddsw  "   r0","r4"\n"     /* r4 = C. = A + C */ \
 321     "   psubsw  "   r6","r3"\n"     /* r3 = B - D */ \
 322     "   paddw   "   r7","r5"\n"     /* r5 = C2*I6 */ \
 323     "   paddsw  "   r6","r6"\n"     /* r6 = D + D */ \
 324     "   pmulhw  "   C(6)","r7"\n"   /* r7 = C6*I6 */ \
 325     "   paddsw  "   r3","r6"\n"     /* r6 = D. = B + D */ \
 326     "   movq    "   r4","I(1)"\n"   /* save C. at I(1) */ \
 327     "   psubsw  "   r5","r1"\n"     /* r1 = H = C6*I2 - C2*I6 */ \
 328     "   movq    "   C(4)","r4"\n"   /* r4 = C4 */ \
 329     "   movq    "   r3","r5"\n"     /* r5 = B - D */ \
 330     "   pmulhw  "   r4","r3"\n"     /* r3 = (C4 - 1) * (B - D) */ \
 331     "   paddsw  "   r2","r7"\n"     /* r7 = G = C6*I6 + C2*I2 */ \
 332     "   movq    "   r6","I(2)"\n"   /* save D. at I(2) */ \
 333     "   movq    "   r0","r2"\n"     /* r2 = A - C */ \
 334     "   movq    "   I(0)","r6"\n"   /* r6 = I0 */ \
 335     "   pmulhw  "   r4","r0"\n"     /* r0 = (C4 - 1) * (A - C) */ \
 336     "   paddw   "   r3","r5"\n"     /* r5 = B. = C4 * (B - D) */ \
 337     "\n"                            \
 338     "   movq    "   J(4)","r3"\n"   /* r3 = I4 */ \
 339     "   psubsw  "   r1","r5"\n"     /* r5 = B.. = B. - H */ \
 340     "   paddw   "   r0","r2"\n"     /* r2 = A. = C4 * (A - C) */ \
 341     "   psubsw  "   r3","r6"\n"     /* r6 = I0 - I4 */ \
 342     "   movq    "   r6","r0"\n"     /* r0 = I0 - I4 */ \
 343     "   pmulhw  "   r4","r6"\n"     /* r6 = (C4 - 1) * (I0 - I4) */ \
 344     "   paddsw  "   r3","r3"\n"     /* r3 = I4 + I4 */ \
 345     "   paddsw  "   r1","r1"\n"     /* r1 = H + H */ \
 346     "   paddsw  "   r0","r3"\n"     /* r3 = I0 + I4 */ \
 347     "   paddsw  "   r5","r1"\n"     /* r1 = H. = B. + H */ \
 348     "   pmulhw  "   r3","r4"\n"     /* r4 = (C4 - 1) * (I0 + I4) */ \
 349     "   paddsw  "   r0","r6"\n"     /* r6 = F = C4 * (I0 - I4) */ \
 350     "   psubsw  "   r2","r6"\n"     /* r6 = F. = F - A. */ \
 351     "   paddsw  "   r2","r2"\n"     /* r2 = A. + A. */ \
 352     "   movq    "   I(1)","r0"\n"   /* r0 = C. */ \
 353     "   paddsw  "   r6","r2"\n"     /* r2 = A.. = F + A. */ \
 354     "   paddw   "   r3","r4"\n"     /* r4 = E = C4 * (I0 + I4) */ \
 355     "   psubsw  "   r1","r2"\n"     /* r2 = R2 = A.. - H. */ \
 356     "#end BeginIDCT\n"  /* on exit: r0 = C., r1 = H., r2 = R2, r4 = E, r5 = B.., r6 = F., r7 = G, I(2) = D. */
 357 // end BeginIDCT macro (38 cycles).
 358
 359
 360 // Two versions of the end of the idct depending on whether we're feeding
 361 // into a transpose or dividing the final results by 16 and storing them.
 362
 363 /**************************************************************************************
 364  *
 365  *      Routine:        RowIDCT
 366  *
 367  *      Description:    The Macro does 1-D IDct on 4 Rows
 368  *
 369  *      Input:          None
 370  *
 371  *      Output:         None
 372  *
 373  *      Return:         None
 374  *
 375  *      Special Note:   None
 376  *
 377  *      Error:          None
 378  *
 379  ***************************************************************************************
 380  */
 381
 382 // RowIDCT gets ready to transpose.
 383
 384 #define RowIDCT ASM("\n" /* first 1-D pass: results R0..R7 stay in registers (R1 in I(1)) for Transpose */ \
 385     "#RowIDCT\n"                                        \
 386     BeginIDCT   /* leaves r0 = C., r1 = H., r2 = R2, r4 = E, r5 = B.., r6 = F., r7 = G, I(2) = D. */ \
 387     "\n"                                                \
 388     "   movq    "I(2)","r3"\n"  /* r3 = D. */           \
 389     "   psubsw  "r7","r4"\n"    /* r4 = E. = E - G */   \
 390     "   paddsw  "r1","r1"\n"    /* r1 = H. + H. */      \
 391     "   paddsw  "r7","r7"\n"    /* r7 = G + G */        \
 392     "   paddsw  "r2","r1"\n"    /* r1 = R1 = A.. + H. */\
 393     "   paddsw  "r4","r7"\n"    /* r7 = G. = E + G */   \
 394     "   psubsw  "r3","r4"\n"    /* r4 = R4 = E. - D. */ \
 395     "   paddsw  "r3","r3"\n"    /* r3 = D. + D. */      \
 396     "   psubsw  "r5","r6"\n"    /* r6 = R6 = F. - B.. */\
 397     "   paddsw  "r5","r5"\n"    /* r5 = B.. + B.. */    \
 398     "   paddsw  "r4","r3"\n"    /* r3 = R3 = E. + D. */ \
 399     "   paddsw  "r6","r5"\n"    /* r5 = R5 = F. + B.. */\
 400     "   psubsw  "r0","r7"\n"    /* r7 = R7 = G. - C. */ \
 401     "   paddsw  "r0","r0"\n"    /* r0 = C. + C. */      \
 402     "   movq    "r1","I(1)"\n"  /* save R1 */           \
 403     "   paddsw  "r7","r0"\n"    /* r0 = R0 = G. + C. */ \
 404     "#end RowIDCT"                                                                              \
 405 );
 406 // end RowIDCT macro (8 + 38 = 46 cycles)
 407
 408
 409 /**************************************************************************************
 410  *
 411  *      Routine:        ColumnIDCT
 412  *
 413  *      Description:    The Macro does 1-D IDct on 4 columns
 414  *
 415  *      Input:          None
 416  *
 417  *      Output:         None
 418  *
 419  *      Return:         None
 420  *
 421  *      Special Note:   None
 422  *
 423  *      Error:          None
 424  *
 425  ***************************************************************************************
 426  */
 427 // Column IDCT normalizes and stores final results.
 428
 429 #define ColumnIDCT ASM("\n" /* second 1-D pass: add the rounding value, shift right by 4, store to I()/J() */ \
 430     "#ColumnIDCT\n"                                         \
 431     BeginIDCT   /* leaves r0 = C., r1 = H., r2 = R2, r4 = E, r5 = B.., r6 = F., r7 = G, I(2) = D. */ \
 432     "\n"                                                    \
 433     "   paddsw  "Eight","r2"\n"     /* r2 = R2 + 8; the bias also carries into R1 below */ \
 434     "   paddsw  "r1","r1"\n"        /* r1 = H. + H. */      \
 435     "   paddsw  "r2","r1"\n"        /* r1 = R1 = A.. + H. */\
 436     "   psraw   ""$4"","r2"\n"      /* r2 = NR2 */          \
 437     "   psubsw  "r7","r4"\n"        /* r4 = E. = E - G */   \
 438     "   psraw   ""$4"","r1"\n"      /* r1 = NR1 */          \
 439     "   movq    "I(2)","r3"\n"  /* r3 = D. */               \
 440     "   paddsw  "r7","r7"\n"        /* r7 = G + G */        \
 441     "   movq    "r2","I(2)"\n"  /* store NR2 at I2 */       \
 442     "   paddsw  "r4","r7"\n"        /* r7 = G. = E + G */   \
 443     "   movq    "r1","I(1)"\n"  /* store NR1 at I1 */       \
 444     "   psubsw  "r3","r4"\n"        /* r4 = R4 = E. - D. */ \
 445     "   paddsw  "Eight","r4"\n"     /* r4 = R4 + 8; the bias also carries into R3 below */ \
 446     "   paddsw  "r3","r3"\n"        /* r3 = D. + D. */      \
 447     "   paddsw  "r4","r3"\n"        /* r3 = R3 = E. + D. */ \
 448     "   psraw   ""$4"","r4"\n"      /* r4 = NR4 */          \
 449     "   psubsw  "r5","r6"\n"        /* r6 = R6 = F. - B.. */\
 450     "   psraw   ""$4"","r3"\n"      /* r3 = NR3 */          \
 451     "   paddsw  "Eight","r6"\n"     /* r6 = R6 + 8; the bias also carries into R5 below */ \
 452     "   paddsw  "r5","r5"\n"        /* r5 = B.. + B.. */    \
 453     "   paddsw  "r6","r5"\n"        /* r5 = R5 = F. + B.. */\
 454     "   psraw   ""$4"","r6"\n"      /* r6 = NR6 */          \
 455     "   movq    "r4","J(4)"\n"  /* store NR4 at J4 */       \
 456     "   psraw   ""$4"","r5"\n"      /* r5 = NR5 */          \
 457     "   movq    "r3","I(3)"\n"  /* store NR3 at I3 */       \
 458     "   psubsw  "r0","r7"\n"        /* r7 = R7 = G. - C. */ \
 459     "   paddsw  "Eight","r7"\n"     /* r7 = R7 + 8; the bias also carries into R0 below */ \
 460     "   paddsw  "r0","r0"\n"        /* r0 = C. + C. */      \
 461     "   paddsw  "r7","r0"\n"        /* r0 = R0 = G. + C. */ \
 462     "   psraw   ""$4"","r7"\n"      /* r7 = NR7 */          \
 463     "   movq    "r6","J(6)"\n"  /* store NR6 at J6 */       \
 464     "   psraw   ""$4"","r0"\n"      /* r0 = NR0 */          \
 465     "   movq    "r5","J(5)"\n"  /* store NR5 at J5 */       \
 466     "   movq    "r7","J(7)"\n"  /* store NR7 at J7 */       \
 467     "   movq    "r0","I(0)"\n"  /* store NR0 at I0 */       \
 468     "#end ColumnIDCT\n"                                                                         \
 469 );
 470 // end ColumnIDCT macro (38 + 19 = 57 cycles)
 471
 472 /**************************************************************************************
 473  *
 474  *      Routine:        Transpose
 475  *
 476  *      Description:    The Macro does two 4x4 transposes in place.
 477  *
 478  *      Input:          None
 479  *
 480  *      Output:         None
 481  *
 482  *      Return:         None
 483  *
 484  *      Special Note:   None
 485  *
 486  *      Error:          None
 487  *
 488  ***************************************************************************************
 489  */
 490
 491 /* Following macro does two 4x4 transposes in place.
 492
 493   At entry (we assume):
 494
 495     r0 = a3 a2 a1 a0
 496     I(1) = b3 b2 b1 b0
 497     r2 = c3 c2 c1 c0
 498     r3 = d3 d2 d1 d0
 499
 500     r4 = e3 e2 e1 e0
 501     r5 = f3 f2 f1 f0
 502     r6 = g3 g2 g1 g0
 503     r7 = h3 h2 h1 h0
 504
 505    At exit, we have:
 506
 507     I(0) = d0 c0 b0 a0
 508     I(1) = d1 c1 b1 a1
 509     I(2) = d2 c2 b2 a2
 510     I(3) = d3 c3 b3 a3
 511
 512     J(4) = h0 g0 f0 e0
 513     J(5) = h1 g1 f1 e1
 514     J(6) = h2 g2 f2 e2
 515     J(7) = h3 g3 f3 e3
 516
 517    I(0) I(1) I(2) I(3)  is the transpose of r0 I(1) r2 r3.
 518    J(4) J(5) J(6) J(7)  is the transpose of r4 r5 r6 r7.
 519
 520    Since r1 is free at entry, we calculate the Js first. */
 521
 522
 523 #define Transpose ASM("\n#Transpose\n"      /* register contents below use the a..h row names from the comment above */ \
 524                                             \
 525     "   movq        "r4","r1"\n"            /* r1 = e3 e2 e1 e0 */ \
 526     "   punpcklwd   "r5","r4"\n"            /* r4 = f1 e1 f0 e0 */ \
 527     "   movq        "r0","I(0)"\n"          /* save a3 a2 a1 a0 at I(0) to free r0 */ \
 528     "   punpckhwd   "r5","r1"\n"            /* r1 = f3 e3 f2 e2 */ \
 529     "   movq        "r6","r0"\n"            /* r0 = g3 g2 g1 g0 */ \
 530     "   punpcklwd   "r7","r6"\n"            /* r6 = h1 g1 h0 g0 */ \
 531     "   movq        "r4","r5"\n"            /* r5 = f1 e1 f0 e0 */ \
 532     "   punpckldq   "r6","r4"\n"            /* r4 = h0 g0 f0 e0 = J(4) */ \
 533     "   punpckhdq   "r6","r5"\n"            /* r5 = h1 g1 f1 e1 = J(5) */ \
 534     "   movq        "r1","r6"\n"            /* r6 = f3 e3 f2 e2 */ \
 535     "   movq        "r4","J(4)"\n"          /* store J(4) */ \
 536     "   punpckhwd   "r7","r0"\n"            /* r0 = h3 g3 h2 g2 */ \
 537     "   movq        "r5","J(5)"\n"          /* store J(5) */ \
 538     "   punpckhdq   "r0","r6"\n"            /* r6 = h3 g3 f3 e3 = J(7) */ \
 539     "   movq        "I(0)","r4"\n"          /* r4 = a3 a2 a1 a0 */ \
 540     "   punpckldq   "r0","r1"\n"            /* r1 = h2 g2 f2 e2 = J(6) */ \
 541     "   movq        "I(1)","r5"\n"          /* r5 = b3 b2 b1 b0 */ \
 542     "   movq        "r4","r0"\n"            /* r0 = a3 a2 a1 a0 */ \
 543     "   movq        "r6","J(7)"\n"          /* store J(7) */ \
 544     "   punpcklwd   "r5","r0"\n"            /* r0 = b1 a1 b0 a0 */ \
 545     "   movq        "r1","J(6)"\n"          /* store J(6) */ \
 546     "   punpckhwd   "r5","r4"\n"            /* r4 = b3 a3 b2 a2 */ \
 547     "   movq        "r2","r5"\n"            /* r5 = c3 c2 c1 c0 */ \
 548     "   punpcklwd   "r3","r2"\n"            /* r2 = d1 c1 d0 c0 */ \
 549     "   movq        "r0","r1"\n"            /* r1 = b1 a1 b0 a0 */ \
 550     "   punpckldq   "r2","r0"\n"            /* r0 = d0 c0 b0 a0 = I(0) */ \
 551     "   punpckhdq   "r2","r1"\n"            /* r1 = d1 c1 b1 a1 = I(1) */ \
 552     "   movq        "r4","r2"\n"            /* r2 = b3 a3 b2 a2 */ \
 553     "   movq        "r0","I(0)"\n"          /* store I(0) */ \
 554     "   punpckhwd   "r3","r5"\n"            /* r5 = d3 c3 d2 c2 */ \
 555     "   movq        "r1","I(1)"\n"          /* store I(1) */ \
 556     "   punpckhdq   "r5","r4"\n"            /* r4 = d3 c3 b3 a3 = I(3) */ \
 557     "   punpckldq   "r5","r2"\n"            /* r2 = d2 c2 b2 a2 = I(2) */ \
 558                                             \
 559     "   movq        "r4","I(3)"\n"          /* store I(3) */ \
 560                                             \
 561     "   movq        "r2","I(2)"\n"          /* store I(2) */ \
 562     "#end Transpose\n"                                          \
 563 );
 564 // end Transpose macro (19 cycles).
 565
 566 /*
 567 static void MMX_dump()
 568 {
 569     ASM
 570     ("\
 571         movq    %mm0,(%edi)\n\
 572         movq    %mm1,8(%edi)\n\
 573         movq    %mm2,16(%edi)\n\
 574         movq    %mm3,24(%edi)\n\
 575         movq    %mm4,32(%edi)\n\
 576         movq    %mm5,40(%edi)\n\
 577         movq    %mm6,48(%edi)\n\
 578         movq    %mm7,56(%edi)\n\
 579         ret"
 580     );
 581 }
 582 */
 583
 584 /**************************************************************************************
 585  *
 586  *      Routine:        MMX_idct
 587  *
 588  *      Description:    Perform IDCT on a 8x8 block
 589  *
 590  *      Input:          Pointer to input and output buffer
 591  *
 592  *      Output:         None
 593  *
 594  *      Return:         None
 595  *
 596  *      Special Note:   The input coefficients are in ZigZag order
 597  *
 598  *      Error:          None
 599  *
 600  ***************************************************************************************
 601  */
 602 void IDctSlow__mmx(  Q_LIST_ENTRY * InputData,
 603                 ogg_int16_t *QuantMatrix,
 604                 ogg_int16_t * OutputData ) {
 605
 606 #   define MIDM(M,I)    MtoSTR(M+I*8(%ecx))
 607 #   define M(I)         MIDM( MaskOffset , I )
 608 #   define MIDC(M,I)    MtoSTR(M+(I-1)*8(%ecx))
 609 #   define C(I)         MIDC( CosineOffset , I )
 610 #   define MIDEight(M)  MtoSTR(M(%ecx))
 611 #   define Eight        MIDEight(EightOffset)
 612
 613 #   define r0   "%mm0"
 614 #   define r1   "%mm1"
 615 #   define r2   "%mm2"
 616 #   define r3   "%mm3"
 617 #   define r4   "%mm4"
 618 #   define r5   "%mm5"
 619 #   define r6   "%mm6"
 620 #   define r7   "%mm7"
 621
 622     __asm__ __volatile__ (
 623     /* eax = quantized input */
 624     /* esi = quantization table */
 625     /* edx = destination (= idct buffer) */
 626     /* ecx = idctconstants */
 627     ""
 628     :
 629     :"a"(InputData), "S"(QuantMatrix), "d"(OutputData), "c"(idctconstants)
 630     );
 631
 632     ASM(
 633     "movq   (%eax), "r0"\n"
 634     "pmullw (%esi), "r0"\n"     /* r0 = 03 02 01 00 */
 635     "movq   16(%eax), "r1"\n"
 636     "pmullw 16(%esi), "r1"\n"   /* r1 = 13 12 11 10 */
 637     "movq   "M(0)", "r2"\n"     /* r2 = __ __ __ FF */
 638     "movq   "r0", "r3"\n"       /* r3 = 03 02 01 00 */
 639     "movq   8(%eax), "r4"\n"
 640     "psrlq  $16, "r0"\n"        /* r0 = __ 03 02 01 */
 641     "pmullw 8(%esi), "r4"\n"    /* r4 = 07 06 05 04 */
 642     "pand   "r2", "r3"\n"       /* r3 = __ __ __ 00 */
 643     "movq   "r0", "r5"\n"       /* r5 = __ 03 02 01 */
 644     "movq   "r1", "r6"\n"       /* r6 = 13 12 11 10 */
 645     "pand   "r2", "r5"\n"       /* r5 = __ __ __ 01 */
 646     "psllq  $32, "r6"\n"        /* r6 = 11 10 __ __ */
 647     "movq   "M(3)", "r7"\n"     /* r7 = FF __ __ __ */
 648     "pxor   "r5", "r0"\n"       /* r0 = __ 03 02 __ */
 649     "pand   "r6", "r7"\n"       /* r7 = 11 __ __ __ */
 650     "por    "r3", "r0"\n"       /* r0 = __ 03 02 00 */
 651     "pxor   "r7", "r6"\n"       /* r6 = __ 10 __ __ */
 652     "por    "r7", "r0"\n"       /* r0 = 11 03 02 00 = R0 */
 653     "movq   "M(3)", "r7"\n"     /* r7 = FF __ __ __ */
 654     "movq   "r4", "r3"\n"       /* r3 = 07 06 05 04 */
 655     "movq   "r0", (%edx)\n"     /* write R0 = r0 */
 656     "pand   "r2", "r3"\n"       /* r3 = __ __ __ 04 */
 657     "movq   32(%eax), "r0"\n"
 658     "psllq  $16, "r3"\n"        /* r3 = __ __ 04 __ */
 659     "pmullw 32(%esi), "r0"\n"   /* r0 = 23 22 21 20 */
 660     "pand   "r1", "r7"\n"       /* r7 = 13 __ __ __ */
 661     "por    "r3", "r5"\n"       /* r5 = __ __ 04 01 */
 662     "por    "r6", "r7"\n"       /* r7 = 13 10 __ __ */
 663     "movq   24(%eax), "r3"\n"
 664     "por    "r5", "r7"\n"       /* r7 = 13 10 04 01 = R1 */
 665     "pmullw 24(%esi), "r3"\n"   /* r3 = 17 16 15 14 */
 666     "psrlq  $16, "r4"\n"        /* r4 = __ 07 06 05 */
 667     "movq   "r7", 16(%edx)\n"   /* write R1 = r7 */
 668     "movq   "r4", "r5"\n"       /* r5 = __ 07 06 05 */
 669     "movq   "r0", "r7"\n"       /* r7 = 23 22 21 20 */
 670     "psrlq  $16, "r4"\n"        /* r4 = __ __ 07 06 */
 671     "psrlq  $48, "r7"\n"        /* r7 = __ __ __ 23 */
 672     "movq   "r2", "r6"\n"       /* r6 = __ __ __ FF */
 673     "pand   "r2", "r5"\n"       /* r5 = __ __ __ 05 */
 674     "pand   "r4", "r6"\n"       /* r6 = __ __ __ 06 */
 675     "movq   "r7", 80(%edx)\n"   /* partial R9 = __ __ __ 23 */
 676     "pxor   "r6", "r4"\n"       /* r4 = __ __ 07 __ */
 677     "psrlq  $32, "r1"\n"        /* r1 = __ __ 13 12 */
 678     "por    "r5", "r4"\n"       /* r4 = __ __ 07 05 */
 679     "movq   "M(3)", "r7"\n"     /* r7 = FF __ __ __ */
 680     "pand   "r2", "r1"\n"       /* r1 = __ __ __ 12 */
 681     "movq   48(%eax), "r5"\n"
 682     "psllq  $16, "r0"\n"        /* r0 = 22 21 20 __ */
 683     "pmullw 48(%esi), "r5"\n"   /* r5 = 33 32 31 30 */
 684     "pand   "r0", "r7"\n"       /* r7 = 22 __ __ __ */
 685     "movq   "r1", 64(%edx)\n"   /* partial R8 = __ __ __ 12 */
 686     "por    "r4", "r7"\n"       /* r7 = 22 __ 07 05 */
 687     "movq   "r3", "r4"\n"       /* r4 = 17 16 15 14 */
 688     "pand   "r2", "r3"\n"       /* r3 = __ __ __ 14 */
 689     "movq   "M(2)", "r1"\n"     /* r1 = __ FF __ __ */
 690     "psllq  $32, "r3"\n"        /* r3 = __ 14 __ __ */
 691     "por    "r3", "r7"\n"       /* r7 = 22 14 07 05 = R2 */
 692     "movq   "r5", "r3"\n"       /* r3 = 33 32 31 30 */
 693     "psllq  $48, "r3"\n"        /* r3 = 30 __ __ __ */
 694     "pand   "r0", "r1"\n"       /* r1 = __ 21 __ __ */
 695     "movq   "r7", 32(%edx)\n"   /* write R2 = r7 */
 696     "por    "r3", "r6"\n"       /* r6 = 30 __ __ 06 */
 697     "movq   "M(1)", "r7"\n"     /* r7 = __ __ FF __ */
 698     "por    "r1", "r6"\n"       /* r6 = 30 21 __ 06 */
 699     "movq   56(%eax), "r1"\n"
 700     "pand   "r4", "r7"\n"       /* r7 = __ __ 15 __ */
 701     "pmullw 56(%esi), "r1"\n"   /* r1 = 37 36 35 34 */
 702     "por    "r6", "r7"\n"       /* r7 = 30 21 15 06 = R3 */
 703     "pand   "M(1)", "r0"\n"     /* r0 = __ __ 20 __ */
 704     "psrlq  $32, "r4"\n"        /* r4 = __ __ 17 16 */
 705     "movq   "r7", 48(%edx)\n"   /* write R3 = r7 */
 706     "movq   "r4", "r6"\n"       /* r6 = __ __ 17 16 */
 707     "movq   "M(3)", "r7"\n"     /* r7 = FF __ __ __ */
 708     "pand   "r2", "r4"\n"       /* r4 = __ __ __ 16 */
 709     "movq   "M(1)", "r3"\n"     /* r3 = __ __ FF __ */
 710     "pand   "r1", "r7"\n"       /* r7 = 37 __ __ __ */
 711     "pand   "r5", "r3"\n"       /* r3 = __ __ 31 __ */
 712     "por    "r4", "r0"\n"       /* r0 = __ __ 20 16 */
 713     "psllq  $16, "r3"\n"        /* r3 = __ 31 __ __ */
 714     "por    "r0", "r7"\n"       /* r7 = 37 __ 20 16 */
 715     "movq   "M(2)", "r4"\n"     /* r4 = __ FF __ __ */
 716     "por    "r3", "r7"\n"       /* r7 = 37 31 20 16 = R4 */
 717     "movq   80(%eax), "r0"\n"
 718     "movq   "r4", "r3"\n"       /* r3 = __ __ FF __ */
 719     "pmullw 80(%esi), "r0"\n"   /* r0 = 53 52 51 50 */
 720     "pand   "r5", "r4"\n"       /* r4 = __ 32 __ __ */
 721     "movq   "r7", 8(%edx)\n"    /* write R4 = r7 */
 722     "por    "r4", "r6"\n"       /* r6 = __ 32 17 16 */
 723     "movq   "r3", "r4"\n"       /* r4 = __ FF __ __ */
 724     "psrlq  $16, "r6"\n"        /* r6 = __ __ 32 17 */
 725     "movq   "r0", "r7"\n"       /* r7 = 53 52 51 50 */
 726     "pand   "r1", "r4"\n"       /* r4 = __ 36 __ __ */
 727     "psllq  $48, "r7"\n"        /* r7 = 50 __ __ __ */
 728     "por    "r4", "r6"\n"       /* r6 = __ 36 32 17 */
 729     "movq   88(%eax), "r4"\n"
 730     "por    "r6", "r7"\n"       /* r7 = 50 36 32 17 = R5 */
 731     "pmullw 88(%esi), "r4"\n"   /* r4 = 57 56 55 54 */
 732     "psrlq  $16, "r3"\n"        /* r3 = __ __ FF __ */
 733     "movq   "r7", 24(%edx)\n"   /* write R5 = r7 */
 734     "pand   "r1", "r3"\n"       /* r3 = __ __ 35 __ */
 735     "psrlq  $48, "r5"\n"        /* r5 = __ __ __ 33 */
 736     "pand   "r2", "r1"\n"       /* r1 = __ __ __ 34 */
 737     "movq   104(%eax), "r6"\n"
 738     "por    "r3", "r5"\n"       /* r5 = __ __ 35 33 */
 739     "pmullw 104(%esi), "r6"\n"  /* r6 = 67 66 65 64 */
 740     "psrlq  $16, "r0"\n"        /* r0 = __ 53 52 51 */
 741     "movq   "r4", "r7"\n"       /* r7 = 57 56 55 54 */
 742     "movq   "r2", "r3"\n"       /* r3 = __ __ __ FF */
 743     "psllq  $48, "r7"\n"        /* r7 = 54 __ __ __ */
 744     "pand   "r0", "r3"\n"       /* r3 = __ __ __ 51 */
 745     "pxor   "r3", "r0"\n"       /* r0 = __ 53 52 __ */
 746     "psllq  $32, "r3"\n"        /* r3 = __ 51 __ __ */
 747     "por    "r5", "r7"\n"       /* r7 = 54 __ 35 33 */
 748     "movq   "r6", "r5"\n"       /* r5 = 67 66 65 64 */
 749     "pand   "M(1)", "r6"\n"     /* r6 = __ __ 65 __ */
 750     "por    "r3", "r7"\n"       /* r7 = 54 51 35 33 = R6 */
 751     "psllq  $32, "r6"\n"        /* r6 = 65 __ __ __ */
 752     "por    "r1", "r0"\n"       /* r0 = __ 53 52 34 */
 753     "movq   "r7", 40(%edx)\n"   /* write R6 = r7 */
 754     "por    "r6", "r0"\n"       /* r0 = 65 53 52 34 = R7 */
 755     "movq   120(%eax), "r7"\n"
 756     "movq   "r5", "r6"\n"       /* r6 = 67 66 65 64 */
 757     "pmullw 120(%esi), "r7"\n"  /* r7 = 77 76 75 74 */
 758     "psrlq  $32, "r5"\n"        /* r5 = __ __ 67 66 */
 759     "pand   "r2", "r6"\n"       /* r6 = __ __ __ 64 */
 760     "movq   "r5", "r1"\n"       /* r1 = __ __ 67 66 */
 761     "movq   "r0", 56(%edx)\n"   /* write R7 = r0 */
 762     "pand   "r2", "r1"\n"       /* r1 = __ __ __ 66 */
 763     "movq   112(%eax), "r0"\n"
 764     "movq   "r7", "r3"\n"       /* r3 = 77 76 75 74 */
 765     "pmullw 112(%esi), "r0"\n"  /* r0 = 73 72 71 70 */
 766     "psllq  $16, "r3"\n"        /* r3 = 76 75 74 __ */
 767     "pand   "M(3)", "r7"\n"     /* r7 = 77 __ __ __ */
 768     "pxor   "r1", "r5"\n"       /* r5 = __ __ 67 __ */
 769     "por    "r5", "r6"\n"       /* r6 = __ __ 67 64 */
 770     "movq   "r3", "r5"\n"       /* r5 = 76 75 74 __ */
 771     "pand   "M(3)", "r5"\n"     /* r5 = 76 __ __ __ */
 772     "por    "r1", "r7"\n"       /* r7 = 77 __ __ 66 */
 773     "movq   96(%eax), "r1"\n"
 774     "pxor   "r5", "r3"\n"       /* r3 = __ 75 74 __ */
 775     "pmullw 96(%esi), "r1"\n"   /* r1 = 63 62 61 60 */
 776     "por    "r3", "r7"\n"       /* r7 = 77 75 74 66 = R15 */
 777     "por    "r5", "r6"\n"       /* r6 = 76 __ 67 64 */
 778     "movq   "r0", "r5"\n"       /* r5 = 73 72 71 70 */
 779     "movq   "r7", 120(%edx)\n"  /* store R15 = r7 */
 780     "psrlq  $16, "r5"\n"        /* r5 = __ 73 72 71 */
 781     "pand   "M(2)", "r5"\n"     /* r5 = __ 73 __ __ */
 782     "movq   "r0", "r7"\n"       /* r7 = 73 72 71 70 */
 783     "por    "r5", "r6"\n"       /* r6 = 76 73 67 64 = R14 */
 784     "pand   "r2", "r0"\n"       /* r0 = __ __ __ 70 */
 785     "pxor   "r0", "r7"\n"       /* r7 = 73 72 71 __ */
 786     "psllq  $32, "r0"\n"        /* r0 = __ 70 __ __ */
 787     "movq   "r6", 104(%edx)\n"  /* write R14 = r6 */
 788     "psrlq  $16, "r4"\n"        /* r4 = __ 57 56 55 */
 789     "movq   72(%eax), "r5"\n"
 790     "psllq  $16, "r7"\n"        /* r7 = 72 71 __ __ */
 791     "pmullw 72(%esi), "r5"\n"   /* r5 = 47 46 45 44 */
 792     "movq   "r7", "r6"\n"       /* r6 = 72 71 __ __ */
 793     "movq   "M(2)", "r3"\n"     /* r3 = __ FF __ __ */
 794     "psllq  $16, "r6"\n"        /* r6 = 71 __ __ __ */
 795     "pand   "M(3)", "r7"\n"     /* r7 = 72 __ __ __ */
 796     "pand   "r1", "r3"\n"       /* r3 = __ 62 __ __ */
 797     "por    "r0", "r7"\n"       /* r7 = 72 70 __ __ */
 798     "movq   "r1", "r0"\n"       /* r0 = 63 62 61 60 */
 799     "pand   "M(3)", "r1"\n"     /* r1 = 63 __ __ __ */
 800     "por    "r3", "r6"\n"       /* r6 = 71 62 __ __ */
 801     "movq   "r4", "r3"\n"       /* r3 = __ 57 56 55 */
 802     "psrlq  $32, "r1"\n"        /* r1 = __ __ 63 __ */
 803     "pand   "r2", "r3"\n"       /* r3 = __ __ __ 55 */
 804     "por    "r1", "r7"\n"       /* r7 = 72 70 63 __ */
 805     "por    "r3", "r7"\n"       /* r7 = 72 70 63 55 = R13 */
 806     "movq   "r4", "r3"\n"       /* r3 = __ 57 56 55 */
 807     "pand   "M(1)", "r3"\n"     /* r3 = __ __ 56 __ */
 808     "movq   "r5", "r1"\n"       /* r1 = 47 46 45 44 */
 809     "movq   "r7", 88(%edx)\n"   /* write R13 = r7 */
 810     "psrlq  $48, "r5"\n"        /* r5 = __ __ __ 47 */
 811     "movq   64(%eax), "r7"\n"
 812     "por    "r3", "r6"\n"       /* r6 = 71 62 56 __ */
 813     "pmullw 64(%esi), "r7"\n"   /* r7 = 43 42 41 40 */
 814     "por    "r5", "r6"\n"       /* r6 = 71 62 56 47 = R12 */
 815     "pand   "M(2)", "r4"\n"     /* r4 = __ 57 __ __ */
 816     "psllq  $32, "r0"\n"        /* r0 = 61 60 __ __ */
 817     "movq   "r6", 72(%edx)\n"   /* write R12 = r6 */
 818     "movq   "r0", "r6"\n"       /* r6 = 61 60 __ __ */
 819     "pand   "M(3)", "r0"\n"     /* r0 = 61 __ __ __ */
 820     "psllq  $16, "r6"\n"        /* r6 = 60 __ __ __ */
 821     "movq   40(%eax), "r5"\n"
 822     "movq   "r1", "r3"\n"       /* r3 = 47 46 45 44 */
 823     "pmullw 40(%esi), "r5"\n"   /* r5 = 27 26 25 24 */
 824     "psrlq  $16, "r1"\n"        /* r1 = __ 47 46 45 */
 825     "pand   "M(1)", "r1"\n"     /* r1 = __ __ 46 __ */
 826     "por    "r4", "r0"\n"       /* r0 = 61 57 __ __ */
 827     "pand   "r7", "r2"\n"       /* r2 = __ __ __ 40 */
 828     "por    "r1", "r0"\n"       /* r0 = 61 57 46 __ */
 829     "por    "r2", "r0"\n"       /* r0 = 61 57 46 40 = R11 */
 830     "psllq  $16, "r3"\n"        /* r3 = 46 45 44 __ */
 831     "movq   "r3", "r4"\n"       /* r4 = 46 45 44 __ */
 832     "movq   "r5", "r2"\n"       /* r2 = 27 26 25 24 */
 833     "movq   "r0", 112(%edx)\n"  /* write R11 = r0 */
 834     "psrlq  $48, "r2"\n"        /* r2 = __ __ __ 27 */
 835     "pand   "M(2)", "r4"\n"     /* r4 = __ 45 __ __ */
 836     "por    "r2", "r6"\n"       /* r6 = 60 __ __ 27 */
 837     "movq   "M(1)", "r2"\n"     /* r2 = __ __ FF __ */
 838     "por    "r4", "r6"\n"       /* r6 = 60 45 __ 27 */
 839     "pand   "r7", "r2"\n"       /* r2 = __ __ 41 __ */
 840     "psllq  $32, "r3"\n"        /* r3 = 44 __ __ __ */
 841     "por    80(%edx), "r3"\n"   /* r3 = 44 __ __ 23 */
 842     "por    "r2", "r6"\n"       /* r6 = 60 45 41 27 = R10 */
 843     "movq   "M(3)", "r2"\n"     /* r2 = FF __ __ __ */
 844     "psllq  $16, "r5"\n"        /* r5 = 26 25 24 __ */
 845     "movq   "r6", 96(%edx)\n"   /* store R10 = r6 */
 846     "pand   "r5", "r2"\n"       /* r2 = 26 __ __ __ */
 847     "movq   "M(2)", "r6"\n"     /* r6 = __ FF __ __ */
 848     "pxor   "r2", "r5"\n"       /* r5 = __ 25 24 __ */
 849     "pand   "r7", "r6"\n"       /* r6 = __ 42 __ __ */
 850     "psrlq  $32, "r2"\n"        /* r2 = __ __ 26 __ */
 851     "pand   "M(3)", "r7"\n"     /* r7 = 43 __ __ __ */
 852     "por    "r2", "r3"\n"       /* r3 = 44 __ 26 23 */
 853     "por    64(%edx), "r7"\n"   /* r7 = 43 __ __ 12 */
 854     "por    "r3", "r6"\n"       /* r6 = 44 42 26 23 = R9 */
 855     "por    "r5", "r7"\n"       /* r7 = 43 25 24 12 = R8 */
 856     "movq   "r6", 80(%edx)\n"   /* store R9 = r6 */
 857     "movq   "r7", 64(%edx)\n"   /* store R8 = r7 */
 858     );
 859     /* 123c  ( / 64 coeffs  < 2c / coeff) */
 860 #   undef M
 861
 862 /* Done w/dequant + descramble + partial transpose; now do the idct itself. */
 863
 864 #   define I( K)    MtoSTR(K*16(%edx))
 865 #   define J( K)    MtoSTR(((K - 4)*16)+8(%edx))
 866
 867     RowIDCT         /* 46 c */
 868     Transpose       /* 19 c */
 869
 870 #   undef I
 871 #   undef J
 872 #   define I( K)    MtoSTR((K*16)+64(%edx))
 873 #   define J( K)    MtoSTR(((K-4)*16)+72(%edx))
 874
 875     RowIDCT         /* 46 c */
 876     Transpose       /* 19 c */
 877
 878 #   undef I
 879 #   undef J
 880 #   define I( K)    MtoSTR((K * 16)(%edx))
 881 #   define J( K)    I( K)
 882
 883     ColumnIDCT      /* 57 c */
 884
 885 #   undef I
 886 #   undef J
 887 #   define I( K)    MtoSTR((K*16)+8(%edx))
 888 #   define J( K)    I( K)
 889
 890     ColumnIDCT      /* 57 c */
 891
 892 #   undef I
 893 #   undef J
 894     /* 368 cycles  ( / 64 coeff  <  6 c / coeff) */
 895
 896     ASM("emms\n");
 897 }
 898
 899 /**************************************************************************************
 900  *
 901  *      Routine:        IDct10__mmx (MMX_idct10)
 902  *
 903  *      Description:    Perform IDCT on an 8x8 block with at most 10 nonzero coefficients
 904  *
 905  *      Input:          Pointers to the quantized input, the quantization matrix and the output buffer
 906  *
 907  *      Output:         None
 908  *
 909  *      Return:         None
 910  *
 911  *      Special Note:   The input coefficients are in transposed ZigZag order
 912  *
 913  *      Error:          None
 914  *
 915  ***************************************************************************************
 916  */
 917 /* --------------------------------------------------------------- */
 918 // This macro does four 4-sample one-dimensional idcts in parallel.  Inputs
 919 // 4 thru 7 are assumed to be zero.
     //
     // It expands to a bare string fragment (no ASM() wrapper); RowIDCT_10 and
     // ColumnIDCT_10 paste it in front of their own instructions.  I(), C() and
     // r0..r7 must be defined where it is expanded.  The names used in the
     // comments (A, B., E, G, ...) are those of the full 1-D IDCT formulas listed
     // in the IDCT 3 comment further down, evaluated with I4..I7 = 0.
     // For C1..C5 the pmulhw is followed by adding the operand back; as the
     // RowIDCT_3 comments spell out, the multiply alone yields C*x - x.
     //
     // On exit:  r0 = C.    r1 = H.    r2 = A.. - H. (= R2)    r4 = E
     //           r5 = B..   r6 = F.    r7 = G     (r3 is scratch)
     //           I(1) holds C. and I(2) holds D. (the inputs there are overwritten).
 920 #define BeginIDCT_10 "#BeginIDCT_10\n"  \
 921     "   movq    "I(3)","r2"\n"          /* r2 = I3 */ \
 922                                         \
 923     "   movq    "C(3)","r6"\n"          /* r6 = C3 */ \
 924     "   movq    "r2","r4"\n"            /* r4 = I3 */ \
 925                                         \
 926     "   movq    "C(5)","r1"\n"          /* r1 = C5 */ \
 927     "   pmulhw  "r6","r4"\n"            /* r4 = C3 * I3 - I3 */ \
 928                                         \
 929     "   movq    "I(1)","r3"\n"          /* r3 = I1 */ \
 930     "   pmulhw  "r2","r1"\n"            /* r1 = C5 * I3 - I3 */ \
 931                                         \
 932     "   movq    "C(1)","r0"\n"          /* r0 = C1 */ \
 933     "   paddw   "r2","r4"\n"            /* r4 = C3 * I3 = C */ \
 934                                         \
 935     "   pxor    "r6","r6"\n"            /* r6 = 0 */ \
 936     "   paddw   "r1","r2"\n"            /* r2 = C5 * I3 */ \
 937                                         \
 938     "   movq    "I(2)","r5"\n"          /* r5 = I2 */ \
 939     "   pmulhw  "r3","r0"\n"            /* r0 = C1 * I1 - I1 */ \
 940                                         \
 941     "   movq    "r5","r1"\n"            /* r1 = I2 */ \
 942     "   paddw   "r3","r0"\n"            /* r0 = C1 * I1 = A */ \
 943                                         \
 944     "   pmulhw  "C(7)","r3"\n"          /* r3 = C7 * I1 = B */ \
 945     "   psubsw  "r2","r6"\n"            /* r6 = -(C5 * I3) = D */ \
 946                                         \
 947     "   pmulhw  "C(2)","r5"\n"          /* r5 = C2 * I2 - I2 */ \
 948     "   psubsw  "r4","r0"\n"            /* r0 = A - C */ \
 949                                         \
 950     "   movq    "I(2)","r7"\n"          /* r7 = I2 */ \
 951     "   paddsw  "r4","r4"\n"            /* r4 = C + C */ \
 952                                         \
 953     "   paddw   "r5","r7"\n"            /* r7 = C2 * I2 = G */ \
 954     "   paddsw  "r0","r4"\n"            /* r4 = A + C = C. */ \
 955                                         \
 956     "   pmulhw  "C(6)","r1"\n"          /* r1 = C6 * I2 = H */ \
 957     "   psubsw  "r6","r3"\n"            /* r3 = B - D */ \
 958                                         \
 959     "   movq    "r4","I(1)"\n"          /* save C. at I1 */ \
 960     "   paddsw  "r6","r6"\n"            /* r6 = D + D */ \
 961                                         \
 962     "   movq    "C(4)","r4"\n"          /* r4 = C4 */ \
 963     "   paddsw  "r3","r6"\n"            /* r6 = B + D = D. */ \
 964                                         \
 965     "   movq    "r3","r5"\n"            /* r5 = B - D */ \
 966     "   pmulhw  "r4","r3"\n"            /* r3 = C4 * (B - D) - (B - D) */ \
 967                                         \
 968     "   movq    "r6","I(2)"\n"          /* save D. at I2 */ \
 969     "   movq    "r0","r2"\n"            /* r2 = A - C */ \
 970                                         \
 971     "   movq    "I(0)","r6"\n"          /* r6 = I0 */ \
 972     "   pmulhw  "r4","r0"\n"            /* r0 = C4 * (A - C) - (A - C) */ \
 973                                         \
 974     "   paddw   "r3","r5"\n"            /* r5 = C4 * (B - D) = B. */ \
 975     "   paddw   "r0","r2"\n"            /* r2 = C4 * (A - C) = A. */ \
 976                                         \
 977     "   psubsw  "r1","r5"\n"            /* r5 = B. - H = B.. */ \
 978     "   pmulhw  "r4","r6"\n"            /* r6 = C4 * I0 - I0 */ \
 979                                         \
 980     "   paddw   "I(0)","r6"\n"          /* r6 = C4 * I0 = E = F */ \
 981     "   paddsw  "r1","r1"\n"            /* r1 = H + H */ \
 982                                         \
 983     "   movq    "r6","r4"\n"            /* r4 = E */ \
 984     "   paddsw  "r5","r1"\n"            /* r1 = B. + H = H. */ \
 985                                         \
 986     "   psubsw  "r2","r6"\n"            /* r6 = F - A. = F. */ \
 987     "   paddsw  "r2","r2"\n"            /* r2 = A. + A. */ \
 988                                         \
 989     "   movq    "I(1)","r0"\n"          /* r0 = C. (reloaded from I1) */ \
 990     "   paddsw  "r6","r2"\n"            /* r2 = F + A. = A.. */ \
 991                                         \
 992     "   psubsw  "r1","r2"\n"            /* r2 = A.. - H. = R2 */ \
 993     "#end BeginIDCT_10\n"
 994 // end BeginIDCT_10 macro (25 cycles).
 995
     // RowIDCT_10: one complete asm statement that runs BeginIDCT_10 and then
     // finishes the butterflies without any rounding or shifting.
     // On exit r0..r7 hold R0..R7 of the 1-D IDCT (r2 = R2 comes straight from
     // BeginIDCT_10); R1 is additionally written back to I(1).
     // The expansion already ends in ");", so call sites use the macro name as
     // a bare statement.
 996 #define RowIDCT_10 ASM("\n"                                 \
 997     "#RowIDCT_10\n"                                         \
 998     BeginIDCT_10                                            \
 999     "\n"                                                    \
1000     "   movq    "I(2)","r3"\n"  /* r3 = D. */               \
1001     "   psubsw  "r7","r4"\n"        /* r4 = E. = E - G */   \
1002     "   paddsw  "r1","r1"\n"        /* r1 = H. + H. */      \
1003     "   paddsw  "r7","r7"\n"        /* r7 = G + G */        \
1004     "   paddsw  "r2","r1"\n"        /* r1 = R1 = A.. + H. */\
1005     "   paddsw  "r4","r7"\n"        /* r7 = G. = E + G */   \
1006     "   psubsw  "r3","r4"\n"        /* r4 = R4 = E. - D. */ \
1007     "   paddsw  "r3","r3"\n"        /* r3 = D. + D. */      \
1008     "   psubsw  "r5","r6"\n"        /* r6 = R6 = F. - B.. */\
1009     "   paddsw  "r5","r5"\n"        /* r5 = B.. + B.. */    \
1010     "   paddsw  "r4","r3"\n"        /* r3 = R3 = E. + D. */ \
1011     "   paddsw  "r6","r5"\n"        /* r5 = R5 = F. + B.. */\
1012     "   psubsw  "r0","r7"\n"        /* r7 = R7 = G. - C. */ \
1013     "   paddsw  "r0","r0"\n"        /* r0 = C. + C. */      \
1014     "   movq    "r1","I(1)"\n"  /* save R1 */               \
1015     "   paddsw  "r7","r0"\n"        /* r0 = R0 = G. + C. */ \
1016     "#end RowIDCT_10\n"                                                                         \
1017 );
1018 // end RowIDCT_10 macro (the call site quotes 33 cycles; "8 + 38 = 46" is the figure quoted for the full RowIDCT)
1019
1020 // Column IDCT normalizes and stores final results.
     //
     // ColumnIDCT_10: BeginIDCT_10 followed by the same butterflies as
     // RowIDCT_10, except that the constant Eight is folded into every result
     // and each result is shifted right arithmetically by 4 before it is
     // stored, i.e. NRk = (Rk + Eight) >> 4.
     // NR0..NR3 are stored to I(0)..I(3) and NR4..NR7 to J(4)..J(7).
     // The expansion already ends in ");".
1021
1022 #define ColumnIDCT_10 ASM("\n"                          \
1023     "#ColumnIDCT_10\n"                                  \
1024     BeginIDCT_10                                        \
1025     "\n"                                                \
1026     "   paddsw  "Eight","r2"\n" /* r2 = R2 + Eight */   \
1027     "   paddsw  "r1","r1"\n"    /* r1 = H. + H. */      \
1028     "   paddsw  "r2","r1"\n"    /* r1 = R1 = A.. + H. */\
1029     "   psraw   ""$4"","r2"\n"      /* r2 = NR2 */      \
1030     "   psubsw  "r7","r4"\n"    /* r4 = E. = E - G */   \
1031     "   psraw   ""$4"","r1"\n"      /* r1 = NR1 */      \
1032     "   movq    "I(2)","r3"\n"  /* r3 = D. */           \
1033     "   paddsw  "r7","r7"\n"    /* r7 = G + G */        \
1034     "   movq    "r2","I(2)"\n"  /* store NR2 at I2 */   \
1035     "   paddsw  "r4","r7"\n"    /* r7 = G. = E + G */   \
1036     "   movq    "r1","I(1)"\n"  /* store NR1 at I1 */   \
1037     "   psubsw  "r3","r4"\n"    /* r4 = R4 = E. - D. */ \
1038     "   paddsw  "Eight","r4"\n" /* r4 = R4 + Eight */   \
1039     "   paddsw  "r3","r3"\n"    /* r3 = D. + D. */      \
1040     "   paddsw  "r4","r3"\n"    /* r3 = R3 = E. + D. */ \
1041     "   psraw   ""$4"","r4"\n"      /* r4 = NR4 */      \
1042     "   psubsw  "r5","r6"\n"    /* r6 = R6 = F. - B.. */\
1043     "   psraw   ""$4"","r3"\n"      /* r3 = NR3 */      \
1044     "   paddsw  "Eight","r6"\n" /* r6 = R6 + Eight */   \
1045     "   paddsw  "r5","r5"\n"    /* r5 = B.. + B.. */    \
1046     "   paddsw  "r6","r5"\n"    /* r5 = R5 = F. + B.. */\
1047     "   psraw   ""$4"","r6"\n"      /* r6 = NR6 */      \
1048     "   movq    "r4","J(4)"\n"  /* store NR4 at J4 */   \
1049     "   psraw   ""$4"","r5"\n"      /* r5 = NR5 */      \
1050     "   movq    "r3","I(3)"\n"  /* store NR3 at I3 */   \
1051     "   psubsw  "r0","r7"\n"    /* r7 = R7 = G. - C. */ \
1052     "   paddsw  "Eight","r7"\n" /* r7 = R7 + Eight */   \
1053     "   paddsw  "r0","r0"\n"    /* r0 = C. + C. */      \
1054     "   paddsw  "r7","r0"\n"    /* r0 = R0 = G. + C. */ \
1055     "   psraw   ""$4"","r7"\n"      /* r7 = NR7 */      \
1056     "   movq    "r6","J(6)"\n"  /* store NR6 at J6 */   \
1057     "   psraw   ""$4"","r0"\n"      /* r0 = NR0 */      \
1058     "   movq    "r5","J(5)"\n"  /* store NR5 at J5 */   \
1059                                                         \
1060     "   movq    "r7","J(7)"\n"  /* store NR7 at J7 */   \
1061                                                         \
1062     "   movq    "r0","I(0)"\n"  /* store NR0 at I0 */   \
1063     "#end ColumnIDCT_10\n"                                                              \
1064 );
1065 // end ColumnIDCT_10 macro (the call sites quote 44 cycles; "38 + 19 = 57" is the figure quoted for the full ColumnIDCT)
1066 /* --------------------------------------------------------------- */
1067
1068
1069 /* --------------------------------------------------------------- */
1070 /* IDCT 10 */
     /*
      * IDct10__mmx: dequantize and inverse-transform a block in which only the
      * first ten 16-bit entries of InputData are used.
      *
      *   InputData   - quantized coefficients; 24 bytes (offsets 0, 8 and 16)
      *                 are loaded, and of the last group only words 10 and 11
      *                 (in the comment notation below) are kept.
      *                 Assumes Q_LIST_ENTRY is 16 bits wide -- TODO confirm.
      *   QuantMatrix - quantization table, multiplied word-for-word (pmullw)
      *                 with the loaded input words.
      *   OutputData  - destination / work buffer, addressed through %edx.
      *
      * Steps:
      *   1. Dequantize and scatter the ten products into the first 8 bytes of
      *      the first four 16-byte rows of OutputData (R0..R3 at byte offsets
      *      0, 16, 32 and 48).
      *   2. RowIDCT_10 followed by Transpose on those rows.
      *   3. ColumnIDCT_10 twice: once with I(K) at K*16 and once at K*16+8,
      *      i.e. on words 0..3 and then words 4..7 of each 16-byte row.
      *   4. emms.
      *
      * NOTE(review): the pointer arguments are bound to %eax/%esi/%edx/%ecx by
      * an empty asm statement, and the later asm statements assume those
      * registers still hold them; none of the asm statements declares
      * clobbers.  Confirm this is safe for the compilers and optimization
      * levels in use.
      */
1071 void IDct10__mmx( Q_LIST_ENTRY * InputData,
1072              ogg_int16_t *QuantMatrix,
1073              ogg_int16_t * OutputData ) {
1074
     /* Addressing helpers, all relative to %ecx (idctconstants):
      *   M(I)  - 8-byte mask number I, starting at MaskOffset
      *   C(I)  - 8-byte cosine constant number I (1-based), starting at CosineOffset
      *   Eight - the constant at EightOffset, added before the final >> 4
      * MtoSTR, MaskOffset, CosineOffset and EightOffset are defined outside
      * this function. */
1075 #   define MIDM(M,I)    MtoSTR(M+I*8(%ecx))
1076 #   define M(I)         MIDM( MaskOffset , I )
1077 #   define MIDC(M,I)    MtoSTR(M+(I-1)*8(%ecx))
1078 #   define C(I)         MIDC( CosineOffset , I )
1079 #   define MIDEight(M)  MtoSTR(M(%ecx))
1080 #   define Eight        MIDEight(EightOffset)
1081
     /* Short names for the eight MMX registers used by the macros above. */
1082 #   define r0   "%mm0"
1083 #   define r1   "%mm1"
1084 #   define r2   "%mm2"
1085 #   define r3   "%mm3"
1086 #   define r4   "%mm4"
1087 #   define r5   "%mm5"
1088 #   define r6   "%mm6"
1089 #   define r7   "%mm7"
1090
     /* Empty asm whose only purpose is to place the arguments in fixed registers. */
1091     __asm__ __volatile__ (
1092     /* eax = quantized input */
1093     /* esi = quantization table */
1094     /* edx = destination (= idct buffer) */
1095     /* ecx = idctconstants */
1096     ""
1097     :
1098     :"a"(InputData), "S"(QuantMatrix), "d"(OutputData), "c"(idctconstants)
1099     );
1100
     /* Dequantize + descramble.  In the comments each register is shown as four
      * 16-bit words, most significant first; "__" is a zeroed word and "FF" a
      * mask word. */
1101     ASM(
1102     "movq   (%eax), "r0"\n"
1103     "pmullw (%esi), "r0"\n"     /* r0 = 03 02 01 00 */
1104     "movq   16(%eax), "r1"\n"
1105     "pmullw 16(%esi), "r1"\n"   /* r1 = 13 12 11 10 */
1106     "movq   "M(0)", "r2"\n"     /* r2 = __ __ __ FF */
1107     "movq   "r0", "r3"\n"       /* r3 = 03 02 01 00 */
1108     "movq   8(%eax), "r4"\n"
1109     "psrlq  $16, "r0"\n"        /* r0 = __ 03 02 01 */
1110     "pmullw 8(%esi), "r4"\n"    /* r4 = 07 06 05 04 */
1111     "pand   "r2", "r3"\n"       /* r3 = __ __ __ 00 */
1112     "movq   "r0", "r5"\n"       /* r5 = __ 03 02 01 */
1113     "pand   "r2", "r5"\n"       /* r5 = __ __ __ 01 */
1114     "psllq  $32, "r1"\n"        /* r1 = 11 10 __ __ */
1115     "movq   "M(3)", "r7"\n"     /* r7 = FF __ __ __ */
1116     "pxor   "r5", "r0"\n"       /* r0 = __ 03 02 __ */
1117     "pand   "r1", "r7"\n"       /* r7 = 11 __ __ __ */
1118     "por    "r3", "r0"\n"       /* r0 = __ 03 02 00 */
1119     "pxor   "r7", "r1"\n"       /* r1 = __ 10 __ __ */
1120     "por    "r7", "r0"\n"       /* r0 = 11 03 02 00 = R0 */
1121     "movq   "r4", "r3"\n"       /* r3 = 07 06 05 04 */
1122     "movq   "r0", (%edx)\n"     /* write R0 = r0 */
1123     "pand   "r2", "r3"\n"       /* r3 = __ __ __ 04 */
1124     "psllq  $16, "r3"\n"        /* r3 = __ __ 04 __ */
1125     "por    "r3", "r5"\n"       /* r5 = __ __ 04 01 */
1126     "por    "r5", "r1"\n"       /* r1 = __ 10 04 01 = R1 */
1127     "psrlq  $16, "r4"\n"        /* r4 = __ 07 06 05 */
1128     "movq   "r1", 16(%edx)\n"   /* write R1 = r1 */
1129     "movq   "r4", "r5"\n"       /* r5 = __ 07 06 05 */
1130     "psrlq  $16, "r4"\n"        /* r4 = __ __ 07 06 */
1131     "movq   "r2", "r6"\n"       /* r6 = __ __ __ FF */
1132     "pand   "r2", "r5"\n"       /* r5 = __ __ __ 05 */
1133     "pand   "r4", "r6"\n"       /* r6 = __ __ __ 06 */
1134     "pxor   "r6", "r4"\n"       /* r4 = __ __ 07 __ */
1135     "por    "r5", "r4"\n"       /* r4 = __ __ 07 05 */
1136     "movq   "r4", 32(%edx)\n"   /* write R2 = r4 */
1137     "movq   "r6", 48(%edx)\n"   /* write R3 = r6 */
1138     );
1139 #   undef M
1140
1141 /* Done w/dequant + descramble + partial transpose; now do the idct itself. */
1142
     /* Row pass: I(K) is the first 8 bytes of 16-byte row K. */
1143 #   define I( K)    MtoSTR((K*16)(%edx))
1144 #   define J( K)    MtoSTR(((K - 4) * 16)+8(%edx))
1145
1146     RowIDCT_10      /* 33 c */
1147     Transpose       /* 19 c */
1148
1149 #   undef I
1150 #   undef J
     /* The 64-coefficient routine above runs a second RowIDCT/Transpose pass at
      * this point with I/J offset by +64/+72; that pass is left disabled here. */
1151 //# define I( K)    [edx + (  K      * 16) + 64]
1152 //# define J( K)    [edx + ( (K - 4) * 16) + 72]
1153
1154 //  RowIDCT         ; 46 c
1155 //  Transpose       ; 19 c
1156
1157 //# undef I
1158 //# undef J
     /* Column pass over words 0..3 of each 16-byte row; J is an alias of I, so
      * results 4..7 go to rows 4..7. */
1159 #   define I( K)    MtoSTR((K * 16)(%edx))
1160 #   define J( K)    I( K)
1161
1162     ColumnIDCT_10       /* 44 c */
1163
1164 #   undef I
1165 #   undef J
     /* Column pass over words 4..7 of each 16-byte row (+8 bytes). */
1166 #   define I( K)    MtoSTR((K * 16)+8(%edx))
1167 #   define J( K)    I( K)
1168
1169     ColumnIDCT_10       /* 44 c */
1170
1171 #   undef I
1172 #   undef J
1173
     /* Leave MMX state so that x87 floating point can be used again. */
1174     ASM("emms\n");
1175 }
1176
1177 /**************************************************************************************
1178  *
1179  *      Routine:        IDct3__mmx (MMX_idct3)
1180  *
1181  *      Description:    Perform IDCT on an 8x8 block with at most 3 nonzero coefficients
1182  *
1183  *      Input:          Pointers to the quantized input, the quantization matrix and the output buffer
1184  *
1185  *      Output:         None
1186  *
1187  *      Return:         None
1188  *
1189  *      Special Note:   Only works for three nonzero coefficients.
1190  *
1191  *      Error:          None
1192  *
1193  ***************************************************************************************
1194  */
1195 /***************************************************************************************
1196     In IDCT 3, we are dealing with only three nonzero coefficients in the 8x8 block.
1197     When we work in the order RowIDCT -> ColumnIDCT, we only have to do 1-D row
1198     IDCTs on the first two rows; the remaining six rows stay zero anyway.
1199     After the row IDCTs, since every column could have nonzero coefficients, we need
1200     to do eight 1-D column IDCTs. However, each column has at most two nonzero
1201     coefficients, coefficient 0 and coefficient 1. The same holds for the coefficients
1202     of the two 1-D row IDCTs. For this reason, the process of a 1-D IDCT is simplified
1203
1204     from a full version:
1205
1206     A = (C1 * I1) + (C7 * I7)       B = (C7 * I1) - (C1 * I7)
1207     C = (C3 * I3) + (C5 * I5)       D = (C3 * I5) - (C5 * I3)
1208     A. = C4 * (A - C)               B. = C4 * (B - D)
1209     C. = A + C                      D. = B + D
1210
1211     E = C4 * (I0 + I4)              F = C4 * (I0 - I4)
1212     G = (C2 * I2) + (C6 * I6)       H = (C6 * I2) - (C2 * I6)
1213     E. = E - G
1214     G. = E + G
1215
1216     A.. = F + A.                    B.. = B. - H
1217     F.  = F - A.                    H.  = B. + H
1218
1219     R0 = G. + C.    R1 = A.. + H.   R3 = E. + D.    R5 = F. + B..
1220     R7 = G. - C.    R2 = A.. - H.   R4 = E. - D.    R6 = F. - B..
1221
1222     To:
1223
1224
1225     A = (C1 * I1)                   B = (C7 * I1)
1226     C = 0                           D = 0
1227     A. = C4 * A                     B. = C4 * B
1228     C. = A                          D. = B
1229
1230     E = C4 * I0                     F = E
1231     G = 0                           H = 0
1232     E. = E
1233     G. = E
1234
1235     A.. = E + A.                    B.. = B.
1236     F.  = E - A.                    H.  = B.
1237
1238     R0 = E + A      R1 = E + A. + B.    R3 = E + B      R5 = E - A. + B.
1239     R7 = E - A      R2 = E + A. - B.    R4 = E - B      R6 = E - A. - B.
1240
1241 ******************************************************************************************/
1242
     // RowIDCT_3: row pass of the 3-coefficient IDCT.  It evaluates the reduced
     // formulas from the comment above (only I0 and I1 are read) on four 16-bit
     // lanes at once, with no rounding or shifting.
     // On exit r0..r7 hold R0..R7; R1 is additionally stored back to I(1).
     // The expansion already ends in ");".
1243 #define RowIDCT_3 ASM("\n"\
1244     "#RowIDCT_3\n"\
1245     "   movq        "I(1)","r7"\n"  /* r7 = I1                      */  \
1246     "   movq        "C(1)","r0"\n"  /* r0 = C1                      */  \
1247     "   movq        "C(7)","r3"\n"  /* r3 = C7                      */  \
1248     "   pmulhw      "r7","r0"\n"    /* r0 = C1 * I1 - I1            */  \
1249     "   pmulhw      "r7","r3"\n"    /* r3 = C7 * I1 = B, D.         */  \
1250     "   movq        "I(0)","r6"\n"  /* r6 = I0                      */  \
1251     "   movq        "C(4)","r4"\n"  /* r4 = C4                      */  \
1252     "   paddw       "r7","r0"\n"    /* r0 = C1 * I1 = A, C.         */  \
1253     "   movq        "r6","r1"\n"    /* make a copy of I0            */  \
1254     "   pmulhw      "r4","r6"\n"    /* r6 = C4 * I0 - I0            */  \
1255     "   movq        "r0","r2"\n"    /* make a copy of A             */  \
1256     "   movq        "r3","r5"\n"    /* make a copy of B             */  \
1257     "   pmulhw      "r4","r2"\n"    /* r2 = C4 * A - A              */  \
1258     "   pmulhw      "r4","r5"\n"    /* r5 = C4 * B - B              */  \
1259     "   paddw       "r1","r6"\n"    /* r6 = C4 * I0 = E, F          */  \
1260     "   movq        "r6","r4"\n"    /* r4 = E                       */  \
1261     "   paddw       "r0","r2"\n"    /* r2 = A.                      */  \
1262     "   paddw       "r3","r5"\n"    /* r5 = B.                      */  \
1263     "   movq        "r6","r7"\n"    /* r7 = E                       */  \
1264     "   movq        "r5","r1"\n"    /* r1 = B.                      */  \
1265     /*  Register contents at this point:  */   \
1266     /*  r0 = A      r3 = B      */   \
1267     /*  r2 = A.     */   \
1268     /*  r5 = B.     */   \
1269     /*  r6 = E      */   \
1270     /*  r4 = E      */   \
1271     /*  r7 = E      */   \
1272     /*  r1 = B.     */   \
1273     "   psubw       "r2","r6"\n"    /* r6 = E - A.                  */  \
1274     "   psubw       "r3","r4"\n"    /* r4 = E - B ----R4            */  \
1275     "   psubw       "r0","r7"\n"    /* r7 = E - A ----R7            */  \
1276     "   paddw       "r2","r2"\n"    /* r2 = A. + A.                 */  \
1277     "   paddw       "r3","r3"\n"    /* r3 = B + B                   */  \
1278     "   paddw       "r0","r0"\n"    /* r0 = A + A                   */  \
1279     "   paddw       "r6","r2"\n"    /* r2 = E + A.                  */  \
1280     "   paddw       "r4","r3"\n"    /* r3 = E + B ----R3            */  \
1281     "   psubw       "r1","r2"\n"    /* r2 = E + A. - B. ----R2      */  \
1282     "   psubw       "r5","r6"\n"    /* r6 = E - A. - B. ----R6      */  \
1283     "   paddw       "r1","r1"\n"    /* r1 = B. + B.                 */  \
1284     "   paddw       "r5","r5"\n"    /* r5 = B. + B.                 */  \
1285     "   paddw       "r7","r0"\n"    /* r0 = E + A ----R0            */  \
1286     "   paddw       "r2","r1"\n"    /* r1 = E + A. + B. -----R1     */  \
1287     "   movq        "r1","I(1)"\n"  /* save R1 at I1                */  \
1288     "   paddw       "r6","r5"\n"    /* r5 = E - A. + B. -----R5     */  \
1289     "#end RowIDCT_3\n"\
1290 );
1291 //End of RowIDCT_3
1292
     // ColumnIDCT_3: column pass of the 3-coefficient IDCT.  Same arithmetic as
     // RowIDCT_3 (only I0 and I1 are read), but Eight is added to E before the
     // butterflies so that every result carries the rounding bias, and each
     // result is shifted right arithmetically by 4 before it is stored:
     // R0..R3 go to I(0)..I(3), R4..R7 to J(4)..J(7).
     // The expansion already ends in ");".
     // R1 is stored to I(1) exactly once; an earlier revision stored it a
     // second time with the register and address unchanged.
1293 #define ColumnIDCT_3 ASM("\n"\
1294     "#ColumnIDCT_3\n"\
1295     "   movq        "I(1)","r7"\n"  /* r7 = I1                      */  \
1296     "   movq        "C(1)","r0"\n"  /* r0 = C1                      */  \
1297     "   movq        "C(7)","r3"\n"  /* r3 = C7                      */  \
1298     "   pmulhw      "r7","r0"\n"    /* r0 = C1 * I1 - I1            */  \
1299     "   pmulhw      "r7","r3"\n"    /* r3 = C7 * I1 = B, D.         */  \
1300     "   movq        "I(0)","r6"\n"  /* r6 = I0                      */  \
1301     "   movq        "C(4)","r4"\n"  /* r4 = C4                      */  \
1302     "   paddw       "r7","r0"\n"    /* r0 = C1 * I1 = A, C.         */  \
1303     "   movq        "r6","r1"\n"    /* make a copy of I0            */  \
1304     "   pmulhw      "r4","r6"\n"    /* r6 = C4 * I0 - I0            */  \
1305     "   movq        "r0","r2"\n"    /* make a copy of A             */  \
1306     "   movq        "r3","r5"\n"    /* make a copy of B             */  \
1307     "   pmulhw      "r4","r2"\n"    /* r2 = C4 * A - A              */  \
1308     "   pmulhw      "r4","r5"\n"    /* r5 = C4 * B - B              */  \
1309     "   paddw       "r1","r6"\n"    /* r6 = C4 * I0 = E, F          */  \
1310     "   movq        "r6","r4"\n"    /* r4 = E                       */  \
1311     "   paddw       "Eight","r6"\n" /* r6 = E + 8 (bias for shift)  */  \
1312     "   paddw       "Eight","r4"\n" /* r4 = E + 8 (bias for shift)  */  \
1313     "   paddw       "r0","r2"\n"    /* r2 = A.                      */  \
1314     "   paddw       "r3","r5"\n"    /* r5 = B.                      */  \
1315     "   movq        "r6","r7"\n"    /* r7 = E + 8                   */  \
1316     "   movq        "r5","r1"\n"    /* r1 = B.                      */  \
1317     /*  Register contents at this point ("E" below means E + 8):  */   \
1318     /*  r0 = A      r3 = B      */   \
1319     /*  r2 = A.     */   \
1320     /*  r5 = B.     */   \
1321     /*  r6 = E      */   \
1322     /*  r4 = E      */   \
1323     /*  r7 = E      */   \
1324     /*  r1 = B.     */   \
1325     "   psubw       "r2","r6"\n"    /* r6 = E - A.                  */  \
1326     "   psubw       "r3","r4"\n"    /* r4 = E - B ----R4            */  \
1327     "   psubw       "r0","r7"\n"    /* r7 = E - A ----R7            */  \
1328     "   paddw       "r2","r2"\n"    /* r2 = A. + A.                 */  \
1329     "   paddw       "r3","r3"\n"    /* r3 = B + B                   */  \
1330     "   paddw       "r0","r0"\n"    /* r0 = A + A                   */  \
1331     "   paddw       "r6","r2"\n"    /* r2 = E + A.                  */  \
1332     "   paddw       "r4","r3"\n"    /* r3 = E + B ----R3            */  \
1333     "   psraw        $4,"r4"\n"     /* shift                        */  \
1334     "   movq        "r4","J(4)"\n"  /* store R4 at J4               */  \
1335     "   psraw       $4,"r3"\n"      /* shift                        */  \
1336     "   movq        "r3","I(3)"\n"  /* store R3 at I3               */  \
1337     "   psubw       "r1","r2"\n"    /* r2 = E + A. - B. ----R2      */  \
1338     "   psubw       "r5","r6"\n"    /* r6 = E - A. - B. ----R6      */  \
1339     "   paddw       "r1","r1"\n"    /* r1 = B. + B.                 */  \
1340     "   paddw       "r5","r5"\n"    /* r5 = B. + B.                 */  \
1341     "   paddw       "r7","r0"\n"    /* r0 = E + A ----R0            */  \
1342     "   paddw       "r2","r1"\n"    /* r1 = E + A. + B. -----R1     */  \
1343     "   psraw       $4,"r7"\n"      /* shift                        */  \
1344     "   psraw       $4,"r2"\n"      /* shift                        */  \
1345     "   psraw       $4,"r0"\n"      /* shift                        */  \
1346     "   psraw       $4,"r1"\n"      /* shift                        */  \
1347     "   movq        "r7","J(7)"\n"  /* store R7 to J7               */  \
1348     "   movq        "r0","I(0)"\n"  /* store R0 to I0               */  \
1349     "   movq        "r1","I(1)"\n"  /* store R1 to I1               */  \
1350     "   movq        "r2","I(2)"\n"  /* store R2 to I2               */  \
1352     "   paddw       "r6","r5"\n"    /* r5 = E - A. + B. -----R5     */  \
1353     "   psraw       $4,"r5"\n"      /* shift                        */  \
1354     "   movq        "r5","J(5)"\n"  /* store R5 at J5               */  \
1355     "   psraw       $4,"r6"\n"      /* shift                        */  \
1356     "   movq        "r6","J(6)"\n"  /* store R6 at J6               */  \
1357     "#end ColumnIDCT_3\n"\
1358 );
1359 //End of ColumnIDCT_3
1360 /* IDct3__mmx: MMX dequantization + inverse DCT built from RowIDCT_3, Transpose and two ColumnIDCT_3 passes; results are stored through OutputData. NOTE(review): presumably the variant for blocks with at most 3 non-zero coefficients - confirm with callers. */
1361 void IDct3__mmx( Q_LIST_ENTRY * InputData,    /* in:  quantized coefficients; the pre-pass below loads only the first 8 bytes */
1362             ogg_int16_t *QuantMatrix,          /* in:  dequantization multipliers; likewise only the first 8 bytes are loaded here */
1363             ogg_int16_t * OutputData ) {       /* out: idct buffer, addressed at byte offsets K*16 and K*16+8 from its start */
1364     /* M(I), C(I) and Eight expand to %ecx-relative operand strings, 8 bytes per table entry (C is 1-based); MtoSTR and the *Offset names are defined outside this function. */
1365 #   define MIDM(M,I)    MtoSTR(M+I*8(%ecx))
1366 #   define M(I)         MIDM( MaskOffset , I )
1367 #   define MIDC(M,I)    MtoSTR(M+(I-1)*8(%ecx))
1368 #   define C(I)         MIDC( CosineOffset , I )
1369 #   define MIDEight(M)  MtoSTR(M(%ecx))
1370 #   define Eight        MIDEight(EightOffset)
1371     /* r0..r7: MMX register names as string literals, pasted between the asm template fragments below. */
1372 #   define r0   "%mm0"
1373 #   define r1   "%mm1"
1374 #   define r2   "%mm2"
1375 #   define r3   "%mm3"
1376 #   define r4   "%mm4"
1377 #   define r5   "%mm5"
1378 #   define r6   "%mm6"
1379 #   define r7   "%mm7"
1380     /* The template of this asm is empty: it exists only so that the input constraints make the compiler load the four pointers into fixed registers. */
1381     __asm__ __volatile__ (
1382     /* eax = quantized input */
1383     /* esi = quantization table */
1384     /* edx = destination (= idct buffer) */
1385     /* ecx = idctconstants */
1386     ""
1387     :
1388     :"a"(InputData), "S"(QuantMatrix), "d"(OutputData), "c"(idctconstants)
1389     );
1390     /* NOTE(review): the ASM block below uses %eax/%esi/%edx/%ecx and MMX registers without declaring operands or clobbers, so it relies on the compiler leaving those registers intact after the statement above - confirm for the compilers in use. */
1391     ASM(
1392     "movq   (%eax), "r0"\n"     /* r0 = first 8 bytes (four 16-bit words) of the input */
1393     "pmullw (%esi), "r0"\n"     /* r0 = 03 02 01 00 (each word multiplied by its quantizer) */
1394     "movq   "M(0)", "r2"\n"     /* r2 = __ __ __ FF (mask entry 0 of idctconstants) */
1395     "movq   "r0", "r3"\n"       /* r3 = 03 02 01 00 */
1396     "psrlq  $16, "r0"\n"        /* r0 = __ 03 02 01 */
1397     "pand   "r2", "r3"\n"       /* r3 = __ __ __ 00 */
1398     "movq   "r0", "r5"\n"       /* r5 = __ 03 02 01 */
1399     "pand   "r2", "r5"\n"       /* r5 = __ __ __ 01 */
1400     "pxor   "r5", "r0"\n"       /* r0 = __ 03 02 __ (word 01 cleared) */
1401     "por    "r3", "r0"\n"       /* r0 = __ 03 02 00 */
1402     "movq   "r0", (%edx)\n"     /* write R0 = r0 (byte offset 0 of the idct buffer) */
1403     "movq   "r5", 16(%edx)\n"   /* write R1 = r5 (byte offset 16 of the idct buffer) */
1404     );
1405 #   undef M
1406
1407 /* Done partial transpose; now do the idct itself. */
1408     /* Row pass addressing: I(K) is byte offset K*16 from %edx, J(K) is byte offset (K-4)*16+8. */
1409 #   define I( K)    MtoSTR(K*16(%edx))
1410 #   define J( K)    MtoSTR(((K - 4)*16)+8(%edx))
1411
1412     RowIDCT_3       /* 33 c */
1413     Transpose       /* 19 c */
1414     /* The //-commented lines below are a disabled second RowIDCT/Transpose pass addressing the buffer at +64/+72 bytes; NOTE(review): presumably unnecessary for this reduced variant - confirm before deleting. */
1415 #   undef I
1416 #   undef J
1417 //# define I( K)    [edx + (  K      * 16) + 64]
1418 //# define J( K)    [edx + ( (K - 4) * 16) + 72]
1419
1420 //  RowIDCT         ; 46 c
1421 //  Transpose       ; 19 c
1422
1423 //# undef I
1424 //# undef J
1425 #   define I( K)    MtoSTR((K * 16)(%edx))
1426 #   define J( K)    I( K)
1427     /* First column pass: I(K) and J(K) both address byte offset K*16, i.e. the first 8 bytes of each 16-byte step. */
1428     ColumnIDCT_3    /* 44 c */
1429
1430 #   undef I
1431 #   undef J
1432 #   define I( K)    MtoSTR((K*16)+8(%edx))
1433 #   define J( K)    I( K)
1434     /* Second column pass: same macro, with every address shifted by 8 bytes (the second 8 bytes of each 16-byte step). */
1435     ColumnIDCT_3    /* 44 c */
1436
1437 #   undef I
1438 #   undef J
1439     /* emms clears the MMX state so that x87 floating-point code can run after this function returns. */
1440     ASM("emms\n");
1441 }
1442
1443
1444 /* Point the IDct3, IDct10 and IDctSlow slots of the caller's DspFunctions table at the MMX routines of this file. */
1445 void dsp_mmx_idct_init(DspFunctions *funcs)
1446 {
1447   TH_DEBUG("enabling accelerated x86_32 mmx idct functions.\n");
1448   funcs->IDct3 = IDct3__mmx;          /* slots are independent; listed smallest variant first */
1449   funcs->IDct10 = IDct10__mmx;
1450   funcs->IDctSlow = IDctSlow__mmx;
1451 }
1452
1453 #endif /* USE_ASM */