quicktime/ffmpeg/libavcodec/i386/fdct_mmx.c

   1 /*
   2  * MMX optimized forward DCT
   3  * The gcc porting is Copyright (c) 2001 Fabrice Bellard.
   4  * cleanup/optimizations are Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  * SSE2 optimization is Copyright (c) 2004 Denes Balatoni.
   6  *
   7  * from  fdctam32.c - AP922 MMX(3D-Now) forward-DCT
   8  *
   9  *  Intel Application Note AP-922 - fast, precise implementation of DCT
  10  *        http://developer.intel.com/vtune/cbts/appnotes.htm
  11  *
  12  * Also of inspiration:
  13  * a page about fdct at http://www.geocities.com/ssavekar/dct.htm
  14  * Skal's fdct at http://skal.planet-d.net/coding/dct.html
  15  */
  16 #include "../common.h"
  17 #include "../dsputil.h"
  18 #include "mmx.h"
  19
   /* Shorthand for GCC's `aligned` attribute; `align` is the requested alignment in bytes. */
   20 #define ATTR_ALIGN(align) __attribute__ ((__aligned__ (align)))
  21
  22 //////////////////////////////////////////////////////////////////////
  23 //
  24 // constants for the forward DCT
  25 // -----------------------------
  26 //
  27 // Be sure to check that your compiler is aligning all constants to QWORD
  28 // (8-byte) memory boundaries!  Otherwise the unaligned memory access will
  29 // severely stall MMX execution.
  30 //
  31 //////////////////////////////////////////////////////////////////////
  32
   /* Fixed-point scaling parameters shared by the column pass and the row passes. */
   33 #define BITS_FRW_ACC    3 // extra bits of intermediate precision; 2 or 3 for accuracy
   34 #define SHIFT_FRW_COL   BITS_FRW_ACC // left shift applied to the inputs in fdct_col
   35 #define SHIFT_FRW_ROW   (BITS_FRW_ACC + 17 - 3) // right shift applied at the end of each row pass (17 here)
   36 #define RND_FRW_ROW             (1 << (SHIFT_FRW_ROW-1)) // rounding bias: half of (1 << SHIFT_FRW_ROW)
  37 //#define RND_FRW_COL           (1 << (SHIFT_FRW_COL-1))
  38
  39 //concatenated table, for forward DCT transformation
   /*
    * Tangent constants used by fdct_col, each replicated across the four
    * 16-bit lanes of one 64-bit word.  fdct_col addresses the three rows as
    * fdct_tg_all_16 + 0, + 4 and + 8.
    */
   40 static const int16_t fdct_tg_all_16[] ATTR_ALIGN(8) = {
   41     13036, 13036, 13036, 13036,         // tan(1*pi/16) * (1<<16) + 0.5
   42     27146, 27146, 27146, 27146,         // tan(2*pi/16) * (1<<16) + 0.5
   43     -21746, -21746, -21746, -21746,     // (tan(3*pi/16) - 1) * (1<<16) + 0.5; fdct_col adds the multiplicand back after pmulhw
   44 };
  45
   /*
    * cos(pi/4) replicated across four 16-bit lanes.  fdct_col shifts the
    * operands left by SHIFT_FRW_COL + 1 before multiplying by this table
    * with pmulhw.
    */
   46 static const int16_t ocos_4_16[4] ATTR_ALIGN(8) = {
   47     23170, 23170, 23170, 23170, // cos(pi/4) * (1<<15) + 0.5
   48 };
  49
   /* Four 16-bit lanes each holding 1; fdct_col ORs it (por) into several results, setting bit 0 of every lane. */
   50 static const int64_t fdct_one_corr ATTR_ALIGN(8) = 0x0001000100010001LL;
  51
   /* Rounding bias for the MMX row passes: RND_FRW_ROW in both 32-bit lanes, added before the psrad by SHIFT_FRW_ROW. */
   52 static const int32_t fdct_r_row[2] ATTR_ALIGN(8) = {RND_FRW_ROW, RND_FRW_ROW };
  53
   /*
    * Rounding bias for the SSE2 row pass: RND_FRW_ROW in all four 32-bit
    * lanes, 16-byte aligned because fdct_row_sse2 loads it with movdqa.
    * The array is wrapped in an anonymous struct and the alignment attribute
    * is applied to both the member and the object.
    * Note: unlike the MMX tables this object is not declared `static`, so it
    * has external linkage.
    */
   54 struct
   55 {
   56  const int32_t fdct_r_row_sse2[4] ATTR_ALIGN(16);
   57 } fdct_r_row_sse2 ATTR_ALIGN(16)=
   58 {{
   59  RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW
   60 }};
  61 //static const long fdct_r_row_sse2[4] ATTR_ALIGN(16) = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW};
  62
   /*
    * Coefficient table for the MMX/MMX2 row passes.
    *
    * Layout: 8 groups of 32 int16_t values, one group per row of the block.
    * ff_fdct_mmx and ff_fdct_mmx2 start at group 0 and advance the table
    * pointer by 32 after every row; within a group the row routines read
    * eight 4-word vectors at table + 0, 4, ..., 28.
    *
    * Groups 4, 5, 6 and 7 repeat groups 0, 3, 2 and 1 respectively.
    */
   63 static const int16_t tab_frw_01234567[] ATTR_ALIGN(8) = {  // forward_dct coeff table
   64   16384,   16384,   22725,   19266,
   65   16384,   16384,   12873,    4520,
   66   21407,    8867,   19266,   -4520,
   67   -8867,  -21407,  -22725,  -12873,
   68   16384,  -16384,   12873,  -22725,
   69  -16384,   16384,    4520,   19266,
   70    8867,  -21407,    4520,  -12873,
   71   21407,   -8867,   19266,  -22725,
   72
   73   22725,   22725,   31521,   26722,
   74   22725,   22725,   17855,    6270,
   75   29692,   12299,   26722,   -6270,
   76  -12299,  -29692,  -31521,  -17855,
   77   22725,  -22725,   17855,  -31521,
   78  -22725,   22725,    6270,   26722,
   79   12299,  -29692,    6270,  -17855,
   80   29692,  -12299,   26722,  -31521,
   81
   82   21407,   21407,   29692,   25172,
   83   21407,   21407,   16819,    5906,
   84   27969,   11585,   25172,   -5906,
   85  -11585,  -27969,  -29692,  -16819,
   86   21407,  -21407,   16819,  -29692,
   87  -21407,   21407,    5906,   25172,
   88   11585,  -27969,    5906,  -16819,
   89   27969,  -11585,   25172,  -29692,
   90
   91   19266,   19266,   26722,   22654,
   92   19266,   19266,   15137,    5315,
   93   25172,   10426,   22654,   -5315,
   94  -10426,  -25172,  -26722,  -15137,
   95   19266,  -19266,   15137,  -26722,
   96  -19266,   19266,    5315,   22654,
   97   10426,  -25172,    5315,  -15137,
   98   25172,  -10426,   22654,  -26722,
   99
  100   16384,   16384,   22725,   19266,
  101   16384,   16384,   12873,    4520,
  102   21407,    8867,   19266,   -4520,
  103   -8867,  -21407,  -22725,  -12873,
  104   16384,  -16384,   12873,  -22725,
  105  -16384,   16384,    4520,   19266,
  106    8867,  -21407,    4520,  -12873,
  107   21407,   -8867,   19266,  -22725,
  108
  109   19266,   19266,   26722,   22654,
  110   19266,   19266,   15137,    5315,
  111   25172,   10426,   22654,   -5315,
  112  -10426,  -25172,  -26722,  -15137,
  113   19266,  -19266,   15137,  -26722,
  114  -19266,   19266,    5315,   22654,
  115   10426,  -25172,    5315,  -15137,
  116   25172,  -10426,   22654,  -26722,
  117
  118   21407,   21407,   29692,   25172,
  119   21407,   21407,   16819,    5906,
  120   27969,   11585,   25172,   -5906,
  121  -11585,  -27969,  -29692,  -16819,
  122   21407,  -21407,   16819,  -29692,
  123  -21407,   21407,    5906,   25172,
  124   11585,  -27969,    5906,  -16819,
  125   27969,  -11585,   25172,  -29692,
  126
  127   22725,   22725,   31521,   26722,
  128   22725,   22725,   17855,    6270,
  129   29692,   12299,   26722,   -6270,
  130  -12299,  -29692,  -31521,  -17855,
  131   22725,  -22725,   17855,  -31521,
  132  -22725,   22725,    6270,   26722,
  133   12299,  -29692,    6270,  -17855,
  134   29692,  -12299,   26722,  -31521,
  135 };
 136
  /*
   * Coefficient table for the SSE2 row pass, 16-byte aligned because
   * fdct_row_sse2 loads it with movdqa.
   *
   * The initializer is produced by expanding TABLE_SSE2 eight times, each
   * time with a different set of C1..C7 values; one expansion contributes
   * 32 int16_t values (four 8-word vectors), for 256 values in total.
   * TABLE_SSE2 ends with a trailing comma so that expansions can simply be
   * concatenated.
   *
   * fdct_row_sse2 in this file only addresses byte offsets 0..255, i.e. the
   * first four expansions; each of those is shared by two rows.
   *
   * TABLE_SSE2 and the final C1..C7 definitions are not #undef'd after the
   * table.  Like fdct_r_row_sse2, the object is not declared `static`.
   */
  137 struct
  138 {
  139  const int16_t tab_frw_01234567_sse2[256] ATTR_ALIGN(16);
  140 } tab_frw_01234567_sse2 ATTR_ALIGN(16) =
  141 {{
  142 //static const int16_t tab_frw_01234567_sse2[] ATTR_ALIGN(16) = {  // forward_dct coeff table
  143 #define TABLE_SSE2 C4,  C4,  C1,  C3, -C6, -C2, -C1, -C5, \
  144                    C4,  C4,  C5,  C7,  C2,  C6,  C3, -C7, \
  145                   -C4,  C4,  C7,  C3,  C6, -C2,  C7, -C5, \
  146                    C4, -C4,  C5, -C1,  C2, -C6,  C3, -C1,
  147 // c1..c7 * cos(pi/4) * 2^15
  148 #define C1 22725
  149 #define C2 21407
  150 #define C3 19266
  151 #define C4 16384
  152 #define C5 12873
  153 #define C6 8867
  154 #define C7 4520
  155 TABLE_SSE2
  156
  157 #undef C1
  158 #undef C2
  159 #undef C3
  160 #undef C4
  161 #undef C5
  162 #undef C6
  163 #undef C7
  164 #define C1 31521
  165 #define C2 29692
  166 #define C3 26722
  167 #define C4 22725
  168 #define C5 17855
  169 #define C6 12299
  170 #define C7 6270
  171 TABLE_SSE2
  172
  173 #undef C1
  174 #undef C2
  175 #undef C3
  176 #undef C4
  177 #undef C5
  178 #undef C6
  179 #undef C7
  180 #define C1 29692
  181 #define C2 27969
  182 #define C3 25172
  183 #define C4 21407
  184 #define C5 16819
  185 #define C6 11585
  186 #define C7 5906
  187 TABLE_SSE2
  188
  189 #undef C1
  190 #undef C2
  191 #undef C3
  192 #undef C4
  193 #undef C5
  194 #undef C6
  195 #undef C7
  196 #define C1 26722
  197 #define C2 25172
  198 #define C3 22654
  199 #define C4 19266
  200 #define C5 15137
  201 #define C6 10426
  202 #define C7 5315
  203 TABLE_SSE2
  204
  205 #undef C1
  206 #undef C2
  207 #undef C3
  208 #undef C4
  209 #undef C5
  210 #undef C6
  211 #undef C7
  212 #define C1 22725
  213 #define C2 21407
  214 #define C3 19266
  215 #define C4 16384
  216 #define C5 12873
  217 #define C6 8867
  218 #define C7 4520
  219 TABLE_SSE2
  220
  221 #undef C1
  222 #undef C2
  223 #undef C3
  224 #undef C4
  225 #undef C5
  226 #undef C6
  227 #undef C7
  228 #define C1 26722
  229 #define C2 25172
  230 #define C3 22654
  231 #define C4 19266
  232 #define C5 15137
  233 #define C6 10426
  234 #define C7 5315
  235 TABLE_SSE2
  236
  237 #undef C1
  238 #undef C2
  239 #undef C3
  240 #undef C4
  241 #undef C5
  242 #undef C6
  243 #undef C7
  244 #define C1 29692
  245 #define C2 27969
  246 #define C3 25172
  247 #define C4 21407
  248 #define C5 16819
  249 #define C6 11585
  250 #define C7 5906
  251 TABLE_SSE2
  252
  253 #undef C1
  254 #undef C2
  255 #undef C3
  256 #undef C4
  257 #undef C5
  258 #undef C6
  259 #undef C7
  260 #define C1 31521
  261 #define C2 29692
  262 #define C3 26722
  263 #define C4 22725
  264 #define C5 17855
  265 #define C6 12299
  266 #define C7 6270
  267 TABLE_SSE2
  268 }};
 269
 270
  /*
   * Column pass of the forward DCT for four adjacent columns.
   *
   * Each row is read as one 64-bit word (four int16_t values) from
   * in + offset + r * 8 for r = 0..7, and each result row is written to
   * out + offset + r * 8.  The callers in this file invoke it twice per
   * block, with offset 0 and offset 4.
   *
   * in     - source block, addressed with a stride of 8 int16_t per row
   * out    - destination block, same layout as `in`
   * offset - index of the first of the four columns to process
   *
   * Inputs are pre-scaled with left shifts by SHIFT_FRW_COL (or
   * SHIFT_FRW_COL + 1 for the operands multiplied by ocos_4_16).  The
   * constants come from fdct_tg_all_16, ocos_4_16 and fdct_one_corr.
   * Result rows are stored in the order 2, 4, 0, 6, 1, 3, 5, 7.
   *
   * The instruction macros (movq_m2r, paddsw_r2r, ...) come from mmx.h,
   * which is not shown here.  The sequence uses mm0..mm7 and is hand
   * interleaved, so the statement order must not be changed casually.
   */
  271 static always_inline void fdct_col(const int16_t *in, int16_t *out, int offset)
  272 {
  273     movq_m2r(*(in + offset + 1 * 8), mm0);
  274     movq_m2r(*(in + offset + 6 * 8), mm1);
  275     movq_r2r(mm0, mm2);
  276     movq_m2r(*(in + offset + 2 * 8), mm3);
  277     paddsw_r2r(mm1, mm0);
  278     movq_m2r(*(in + offset + 5 * 8), mm4);
  279     psllw_i2r(SHIFT_FRW_COL, mm0);
  280     movq_m2r(*(in + offset + 0 * 8), mm5);
  281     paddsw_r2r(mm3, mm4);
  282     paddsw_m2r(*(in + offset + 7 * 8), mm5);
  283     psllw_i2r(SHIFT_FRW_COL, mm4);
  284     movq_r2r(mm0, mm6);
  285     psubsw_r2r(mm1, mm2);
  286     movq_m2r(*(fdct_tg_all_16 + 4), mm1);
  287     psubsw_r2r(mm4, mm0);
  288     movq_m2r(*(in + offset + 3 * 8), mm7);
  289     pmulhw_r2r(mm0, mm1);
  290     paddsw_m2r(*(in + offset + 4 * 8), mm7);
  291     psllw_i2r(SHIFT_FRW_COL, mm5);
  292     paddsw_r2r(mm4, mm6);
  293     psllw_i2r(SHIFT_FRW_COL, mm7);
  294     movq_r2r(mm5, mm4);
  295     psubsw_r2r(mm7, mm5);
  296     paddsw_r2r(mm5, mm1);
  297     paddsw_r2r(mm7, mm4);
  298     por_m2r(fdct_one_corr, mm1);
  299     psllw_i2r(SHIFT_FRW_COL + 1, mm2);
  300     pmulhw_m2r(*(fdct_tg_all_16 + 4), mm5);
  301     movq_r2r(mm4, mm7);
  302     psubsw_m2r(*(in + offset + 5 * 8), mm3);
  303     psubsw_r2r(mm6, mm4);
  304     movq_r2m(mm1, *(out + offset + 2 * 8));
  305     paddsw_r2r(mm6, mm7);
  306     movq_m2r(*(in + offset + 3 * 8), mm1);
  307     psllw_i2r(SHIFT_FRW_COL + 1, mm3);
  308     psubsw_m2r(*(in + offset + 4 * 8), mm1);
  309     movq_r2r(mm2, mm6);
  310     movq_r2m(mm4, *(out + offset + 4 * 8));
  311     paddsw_r2r(mm3, mm2);
  312     pmulhw_m2r(*ocos_4_16, mm2);
  313     psubsw_r2r(mm3, mm6);
  314     pmulhw_m2r(*ocos_4_16, mm6);
  315     psubsw_r2r(mm0, mm5);
  316     por_m2r(fdct_one_corr, mm5);
  317     psllw_i2r(SHIFT_FRW_COL, mm1);
  318     por_m2r(fdct_one_corr, mm2);
  319     movq_r2r(mm1, mm4);
  320     movq_m2r(*(in + offset + 0 * 8), mm3);
  321     paddsw_r2r(mm6, mm1);
  322     psubsw_m2r(*(in + offset + 7 * 8), mm3);
  323     psubsw_r2r(mm6, mm4);
  324     movq_m2r(*(fdct_tg_all_16 + 0), mm0);
  325     psllw_i2r(SHIFT_FRW_COL, mm3);
  326     movq_m2r(*(fdct_tg_all_16 + 8), mm6);
  327     pmulhw_r2r(mm1, mm0);
  328     movq_r2m(mm7, *(out + offset + 0 * 8));
  329     pmulhw_r2r(mm4, mm6);
  330     movq_r2m(mm5, *(out + offset + 6 * 8));
  331     movq_r2r(mm3, mm7);
  332     movq_m2r(*(fdct_tg_all_16 + 8), mm5);
  333     psubsw_r2r(mm2, mm7);
  334     paddsw_r2r(mm2, mm3);
  335     pmulhw_r2r(mm7, mm5);
  336     paddsw_r2r(mm3, mm0);
  337     paddsw_r2r(mm4, mm6);
  338     pmulhw_m2r(*(fdct_tg_all_16 + 0), mm3);
  339     por_m2r(fdct_one_corr, mm0);
  340     paddsw_r2r(mm7, mm5);
  341     psubsw_r2r(mm6, mm7);
  342     movq_r2m(mm0, *(out + offset + 1 * 8));
  343     paddsw_r2r(mm4, mm5);
  344     movq_r2m(mm7, *(out + offset + 3 * 8));
  345     psubsw_r2r(mm1, mm3);
  346     movq_r2m(mm5, *(out + offset + 5 * 8));
  347     movq_r2m(mm3, *(out + offset + 7 * 8));
  348 }
 349
 350
 351 static always_inline void fdct_row_sse2(const int16_t *in, int16_t *out)
 352 {
 353     asm volatile(
 354         ".macro FDCT_ROW_SSE2_H1 i t   \n\t"
 355         "movq      \\i(%0), %%xmm2     \n\t"
 356         "movq      \\i+8(%0), %%xmm0   \n\t"
 357         "movdqa    \\t+32(%1), %%xmm3  \n\t"
 358         "movdqa    \\t+48(%1), %%xmm7  \n\t"
 359         "movdqa    \\t(%1), %%xmm4     \n\t"
 360         "movdqa    \\t+16(%1), %%xmm5  \n\t"
 361         ".endm                         \n\t"
 362         ".macro FDCT_ROW_SSE2_H2 i t   \n\t"
 363         "movq      \\i(%0), %%xmm2     \n\t"
 364         "movq      \\i+8(%0), %%xmm0   \n\t"
 365         "movdqa    \\t+32(%1), %%xmm3  \n\t"
 366         "movdqa    \\t+48(%1), %%xmm7  \n\t"
 367         ".endm                         \n\t"
 368         ".macro FDCT_ROW_SSE2 i        \n\t"
 369         "movq      %%xmm2, %%xmm1      \n\t"
 370         "pshuflw   $27, %%xmm0, %%xmm0 \n\t"
 371         "paddsw    %%xmm0, %%xmm1      \n\t"
 372         "psubsw    %%xmm0, %%xmm2      \n\t"
 373         "punpckldq %%xmm2, %%xmm1      \n\t"
 374         "pshufd    $78, %%xmm1, %%xmm2 \n\t"
 375         "pmaddwd   %%xmm2, %%xmm3      \n\t"
 376         "pmaddwd   %%xmm1, %%xmm7      \n\t"
 377         "pmaddwd   %%xmm5, %%xmm2      \n\t"
 378         "pmaddwd   %%xmm4, %%xmm1      \n\t"
 379         "paddd     %%xmm7, %%xmm3      \n\t"
 380         "paddd     %%xmm2, %%xmm1      \n\t"
 381         "paddd     %%xmm6, %%xmm3      \n\t"
 382         "paddd     %%xmm6, %%xmm1      \n\t"
 383         "psrad     %3, %%xmm3          \n\t"
 384         "psrad     %3, %%xmm1          \n\t"
 385         "packssdw  %%xmm3, %%xmm1      \n\t"
 386         "movdqa    %%xmm1, \\i(%4)     \n\t"
 387         ".endm                         \n\t"
 388         "movdqa    (%2), %%xmm6        \n\t"
 389         "FDCT_ROW_SSE2_H1 0 0 \n\t"
 390         "FDCT_ROW_SSE2 0 \n\t"
 391         "FDCT_ROW_SSE2_H2 64 0 \n\t"
 392         "FDCT_ROW_SSE2 64 \n\t"
 393
 394         "FDCT_ROW_SSE2_H1 16 64 \n\t"
 395         "FDCT_ROW_SSE2 16 \n\t"
 396         "FDCT_ROW_SSE2_H2 112 64 \n\t"
 397         "FDCT_ROW_SSE2 112 \n\t"
 398
 399         "FDCT_ROW_SSE2_H1 32 128 \n\t"
 400         "FDCT_ROW_SSE2 32 \n\t"
 401         "FDCT_ROW_SSE2_H2 96 128 \n\t"
 402         "FDCT_ROW_SSE2 96 \n\t"
 403
 404         "FDCT_ROW_SSE2_H1 48 192 \n\t"
 405         "FDCT_ROW_SSE2 48 \n\t"
 406         "FDCT_ROW_SSE2_H2 80 192 \n\t"
 407         "FDCT_ROW_SSE2 80 \n\t"
 408         :
 409         : "r" (in), "r" (tab_frw_01234567_sse2.tab_frw_01234567_sse2), "r" (fdct_r_row_sse2.fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out)
 410     );
 411 }
 412
  /*
   * Row pass for a single row of eight int16_t values, MMX2 variant
   * (uses pshufw).
   *
   * in    - the eight input values of the row
   * out   - receives the eight results (out + 0 and out + 4)
   * table - 32 coefficients for this row, read as eight 4-word vectors at
   *         table + 0, 4, ..., 28
   *
   * Inline notes assume the mmx.h macros (not shown here) take their
   * operands in (source, destination) order.
   */
  413 static always_inline void fdct_row_mmx2(const int16_t *in, int16_t *out, const int16_t *table)
  414 {
  415     pshufw_m2r(*(in + 4), mm5, 0x1B);   // 0x1B selects the four words of in[4..7] in reverse order
  416     movq_m2r(*(in + 0), mm0);
  417     movq_r2r(mm0, mm1);
  418     paddsw_r2r(mm5, mm0);               // in[0..3] + reversed in[4..7]
  419     psubsw_r2r(mm5, mm1);               // in[0..3] - reversed in[4..7]
  420     movq_r2r(mm0, mm2);
  421     punpckldq_r2r(mm1, mm0);
  422     punpckhdq_r2r(mm1, mm2);
  423     movq_m2r(*(table + 0), mm1);
  424     movq_m2r(*(table + 4), mm3);
  425     movq_m2r(*(table + 8), mm4);
  426     movq_m2r(*(table + 12), mm5);
  427     movq_m2r(*(table + 16), mm6);
  428     movq_m2r(*(table + 20), mm7);
  429     pmaddwd_r2r(mm0, mm1);
  430     pmaddwd_r2r(mm2, mm3);
  431     pmaddwd_r2r(mm0, mm4);
  432     pmaddwd_r2r(mm2, mm5);
  433     pmaddwd_r2r(mm0, mm6);
  434     pmaddwd_r2r(mm2, mm7);
  435     pmaddwd_m2r(*(table + 24), mm0);
  436     pmaddwd_m2r(*(table + 28), mm2);
  437     paddd_r2r(mm1, mm3);
  438     paddd_r2r(mm4, mm5);
  439     paddd_r2r(mm6, mm7);
  440     paddd_r2r(mm0, mm2);
  441     movq_m2r(*fdct_r_row, mm0);         // rounding bias, added to all four sums below
  442     paddd_r2r(mm0, mm3);
  443     paddd_r2r(mm0, mm5);
  444     paddd_r2r(mm0, mm7);
  445     paddd_r2r(mm0, mm2);
  446     psrad_i2r(SHIFT_FRW_ROW, mm3);
  447     psrad_i2r(SHIFT_FRW_ROW, mm5);
  448     psrad_i2r(SHIFT_FRW_ROW, mm7);
  449     psrad_i2r(SHIFT_FRW_ROW, mm2);
  450     packssdw_r2r(mm5, mm3);
  451     packssdw_r2r(mm2, mm7);
  452     movq_r2m(mm3, *(out + 0));
  453     movq_r2m(mm7, *(out + 4));
  454 }
 455
  /*
   * Row pass for a single row of eight int16_t values, plain MMX variant.
   *
   * in    - the eight input values of the row
   * out   - receives the eight results (out + 0 and out + 4)
   * table - 32 coefficients for this row, read as eight 4-word vectors at
   *         table + 0, 4, ..., 28
   *
   * The only difference from fdct_row_mmx2 is the opening sequence, which
   * builds the reversed upper half of the row with movd / punpcklwd / psrlq
   * instead of pshufw.  From the table loads onward the two routines are
   * identical.
   *
   * Inline notes assume the mmx.h macros (not shown here) take their
   * operands in (source, destination) order.
   */
  456 static always_inline void fdct_row_mmx(const int16_t *in, int16_t *out, const int16_t *table)
  457 {
  458 //FIXME: reorder the instructions (I don't have an old MMX-only CPU here to benchmark ...)
  459     movd_m2r(*(in + 6), mm1);
  460     punpcklwd_m2r(*(in + 4), mm1);
  461     movq_r2r(mm1, mm2);
  462     psrlq_i2r(0x20, mm1);
  463     movq_m2r(*(in + 0), mm0);
  464     punpcklwd_r2r(mm2, mm1);            // mm1 now holds in[7], in[6], in[5], in[4]
  465     movq_r2r(mm0, mm5);
  466     paddsw_r2r(mm1, mm0);               // in[0..3] + reversed in[4..7]
  467     psubsw_r2r(mm1, mm5);               // in[0..3] - reversed in[4..7]
  468     movq_r2r(mm0, mm2);
  469     punpckldq_r2r(mm5, mm0);
  470     punpckhdq_r2r(mm5, mm2);
  471     movq_m2r(*(table + 0), mm1);
  472     movq_m2r(*(table + 4), mm3);
  473     movq_m2r(*(table + 8), mm4);
  474     movq_m2r(*(table + 12), mm5);
  475     movq_m2r(*(table + 16), mm6);
  476     movq_m2r(*(table + 20), mm7);
  477     pmaddwd_r2r(mm0, mm1);
  478     pmaddwd_r2r(mm2, mm3);
  479     pmaddwd_r2r(mm0, mm4);
  480     pmaddwd_r2r(mm2, mm5);
  481     pmaddwd_r2r(mm0, mm6);
  482     pmaddwd_r2r(mm2, mm7);
  483     pmaddwd_m2r(*(table + 24), mm0);
  484     pmaddwd_m2r(*(table + 28), mm2);
  485     paddd_r2r(mm1, mm3);
  486     paddd_r2r(mm4, mm5);
  487     paddd_r2r(mm6, mm7);
  488     paddd_r2r(mm0, mm2);
  489     movq_m2r(*fdct_r_row, mm0);         // rounding bias, added to all four sums below
  490     paddd_r2r(mm0, mm3);
  491     paddd_r2r(mm0, mm5);
  492     paddd_r2r(mm0, mm7);
  493     paddd_r2r(mm0, mm2);
  494     psrad_i2r(SHIFT_FRW_ROW, mm3);
  495     psrad_i2r(SHIFT_FRW_ROW, mm5);
  496     psrad_i2r(SHIFT_FRW_ROW, mm7);
  497     psrad_i2r(SHIFT_FRW_ROW, mm2);
  498     packssdw_r2r(mm5, mm3);
  499     packssdw_r2r(mm2, mm7);
  500     movq_r2m(mm3, *(out + 0));
  501     movq_r2m(mm7, *(out + 4));
  502 }
 503
  /*
   * Forward DCT of an 8x8 block of int16_t values, computed in place,
   * using the plain MMX row routine.
   *
   * block - 64 coefficients, 8 per row; overwritten with the result
   *
   * The column pass writes into an 8-byte aligned scratch buffer of
   * 64 int16_t (columns 0..3, then columns 4..7); the row pass then reads
   * the scratch rows and writes the results back into `block`.
   *
   * No emms() call appears in this function.
   * NOTE(review): presumably the caller is expected to reset the MMX state
   * - confirm.
   */
  504 void ff_fdct_mmx(int16_t *block)
  505 {
  506     int64_t align_tmp[16] ATTR_ALIGN(8);
  507     int16_t * const block_tmp= (int16_t*)align_tmp;
  508     int16_t *block1, *out;
  509     const int16_t *table;
  510     int i;
  511
  512     block1 = block_tmp;
  513     fdct_col(block, block1, 0);
  514     fdct_col(block, block1, 4);
  515
  516     block1 = block_tmp;
  517     table = tab_frw_01234567;
  518     out = block;
  519     for(i=8;i>0;i--) {
  520         fdct_row_mmx(block1, out, table);
  521         block1 += 8;    // next scratch row
  522         table += 32;    // next 32-entry coefficient group
  523         out += 8;       // next output row
  524     }
  525 }
 526
  /*
   * Forward DCT of an 8x8 block of int16_t values, computed in place,
   * using the MMX2 (pshufw based) row routine.
   *
   * block - 64 coefficients, 8 per row; overwritten with the result
   *
   * Identical in structure to ff_fdct_mmx: the column pass goes into an
   * 8-byte aligned scratch buffer, and the row pass goes from the scratch
   * buffer back into `block`.  Only the row routine differs.
   *
   * No emms() call appears in this function.
   * NOTE(review): presumably the caller is expected to reset the MMX state
   * - confirm.
   */
  527 void ff_fdct_mmx2(int16_t *block)
  528 {
  529     int64_t align_tmp[16] ATTR_ALIGN(8);
  530     int16_t * const block_tmp= (int16_t*)align_tmp;
  531     int16_t *block1, *out;
  532     const int16_t *table;
  533     int i;
  534
  535     block1 = block_tmp;
  536     fdct_col(block, block1, 0);
  537     fdct_col(block, block1, 4);
  538
  539     block1 = block_tmp;
  540     table = tab_frw_01234567;
  541     out = block;
  542     for(i=8;i>0;i--) {
  543         fdct_row_mmx2(block1, out, table);
  544         block1 += 8;    // next scratch row
  545         table += 32;    // next 32-entry coefficient group
  546         out += 8;       // next output row
  547     }
  548 }
 549
  /*
   * Forward DCT of an 8x8 block of int16_t values, computed in place,
   * using the MMX column pass and the SSE2 row pass.
   *
   * block - 64 coefficients, 8 per row; overwritten with the result.
   *         fdct_row_sse2 stores every output row with movdqa, so `block`
   *         must be 16-byte aligned.
   *
   * The scratch buffer is only 8-byte aligned; fdct_row_sse2 reads it with
   * 8-byte movq loads.  All eight rows are handled by the single
   * fdct_row_sse2 call, so there is no per-row loop here.
   *
   * No emms() call appears in this function.
   * NOTE(review): presumably the caller is expected to reset the MMX state
   * - confirm.
   */
  550 void ff_fdct_sse2(int16_t *block)
  551 {
  552     int64_t align_tmp[16] ATTR_ALIGN(8);
  553     int16_t * const block_tmp= (int16_t*)align_tmp;
  554     int16_t *block1;
  555
  556     block1 = block_tmp;
  557     fdct_col(block, block1, 0);
  558     fdct_col(block, block1, 4);
  559
  560     fdct_row_sse2(block1, block);
  561 }
 562